D Solutions ch. 7 - Nearest neighbours
Solutions to the exercises of chapter 7.
D.1 Exercise 1
Load libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(RColorBrewer)
library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(corrplot)
## corrplot 0.92 loaded
Prepare for parallel processing
registerDoMC(detectCores())
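As an optional sanity check (not part of the original exercise), the number of parallel workers actually registered can be queried with getDoParWorkers() from the foreach package loaded above.
# Optional check: number of workers registered for parallel processing
getDoParWorkers()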
Load data
load("data/wheat_seeds/wheat_seeds.Rda")
Partition data
set.seed(42)
trainIndex <- createDataPartition(y=variety, times=1, p=0.7, list=F)
varietyTrain <- variety[trainIndex]
morphTrain <- morphometrics[trainIndex,]
varietyTest <- variety[-trainIndex]
morphTest <- morphometrics[-trainIndex,]
summary(varietyTrain)
## Canadian Kama Rosa
## 49 49 49
summary(varietyTest)
## Canadian Kama Rosa
## 21 21 21
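As an alternative (and purely optional) view of the class balance shown above, the partitions can be summarised as proportions using base R.
# Optional: class proportions in each partition (expected to be roughly equal)
prop.table(table(varietyTrain))
prop.table(table(varietyTest))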
Data check: zero and near-zero predictors
nzv <- nearZeroVar(morphTrain, saveMetrics=T)
nzv
## freqRatio percentUnique zeroVar nzv
## area 1.500000 95.91837 FALSE FALSE
## perimeter 1.333333 86.39456 FALSE FALSE
## compactness 1.000000 91.83673 FALSE FALSE
## kernLength 1.000000 91.15646 FALSE FALSE
## kernWidth 1.000000 91.83673 FALSE FALSE
## asymCoef 2.000000 99.31973 FALSE FALSE
## grooveLength 1.333333 76.19048 FALSE FALSE
Data check: are all predictors on the same scale?
summary(morphTrain)
## area perimeter compactness kernLength
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.34 1st Qu.:13.46 1st Qu.:0.8577 1st Qu.:5.264
## Median :14.46 Median :14.40 Median :0.8734 Median :5.541
## Mean :14.87 Mean :14.56 Mean :0.8724 Mean :5.624
## 3rd Qu.:17.10 3rd Qu.:15.65 3rd Qu.:0.8881 3rd Qu.:5.979
## Max. :20.97 Max. :17.25 Max. :0.9153 Max. :6.675
## kernWidth asymCoef grooveLength
## Min. :2.630 Min. :0.903 Min. :4.519
## 1st Qu.:2.958 1st Qu.:2.372 1st Qu.:5.046
## Median :3.259 Median :3.597 Median :5.222
## Mean :3.267 Mean :3.659 Mean :5.406
## 3rd Qu.:3.557 3rd Qu.:4.799 3rd Qu.:5.862
## Max. :4.032 Max. :8.456 Max. :6.550
featurePlot(x = morphTrain,
            y = varietyTrain,
            plot = "box",
            ## Pass in options to bwplot()
            scales = list(y = list(relation="free"),
                          x = list(rot = 90)),
            layout = c(3,3))
Data check: pairwise correlations between predictors
corMat <- cor(morphTrain)
corrplot(corMat, order="hclust", tl.cex=1)
highCorr <- findCorrelation(corMat, cutoff=0.75)
length(highCorr)
## [1] 4
names(morphTrain)[highCorr]
## [1] "area" "perimeter" "kernWidth" "kernLength"
Data check: skewness
featurePlot(x = morphTrain,
            y = varietyTrain,
            plot = "density",
            ## Pass in options to xyplot() to
            ## make it prettier
            scales = list(x = list(relation="free"),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            layout = c(3, 3),
            auto.key = list(columns = 3))
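The density plots above give a visual impression of skewness. If a numeric summary is preferred, the skewness() function from the e1071 package (a suggested dependency of caret, assumed here to be installed) can be applied to each predictor.
# Optional numeric check of skewness for each predictor
library(e1071)
apply(morphTrain, 2, skewness)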
Create a ‘grid’ of values of k for evaluation:
tuneParam <- data.frame(k=seq(1,50,2))
Generate a list of seeds for reproducibility (optional) based on grid size
set.seed(42)
<- vector(mode = "list", length = 101)
seeds for(i in 1:100) seeds[[i]] <- sample.int(1000, length(tuneParam$k))
101]] <- sample.int(1000,1) seeds[[
Set training parameters. In the example in chapter 7, pre-processing was performed outside the cross-validation process to save time for the purposes of the demonstration. Here we have a relatively small data set, so we can perform pre-processing within each iteration of the cross-validation process. We specify the option preProcOptions=list(cutoff=0.75) to set a value for the pairwise correlation coefficient cutoff.
<- trainControl(method="repeatedcv",
train_ctrl number = 10,
repeats = 10,
preProcOptions=list(cutoff=0.75),
seeds = seeds)
Run training
knnFit <- train(morphTrain, varietyTrain,
                method="knn",
                preProcess = c("center", "scale", "corr"),
                tuneGrid=tuneParam,
                trControl=train_ctrl)
knnFit
## k-Nearest Neighbors
##
## 147 samples
## 7 predictor
## 3 classes: 'Canadian', 'Kama', 'Rosa'
##
## Pre-processing: centered (3), scaled (3), remove (4)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 133, 132, 133, 132, 132, 133, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8640238 0.7955794
## 3 0.8411667 0.7614211
## 5 0.8544524 0.7813197
## 7 0.8646429 0.7966349
## 9 0.8743810 0.8111795
## 11 0.8771429 0.8154320
## 13 0.8777619 0.8162978
## 15 0.8804762 0.8204404
## 17 0.8852857 0.8276139
## 19 0.8839048 0.8255536
## 21 0.8846190 0.8266306
## 23 0.8846190 0.8266223
## 25 0.8839048 0.8255454
## 27 0.8853333 0.8277076
## 29 0.8908095 0.8359544
## 31 0.8907619 0.8358447
## 33 0.8921429 0.8379874
## 35 0.8874762 0.8309461
## 37 0.8894762 0.8339626
## 39 0.8888095 0.8329793
## 41 0.8880952 0.8319026
## 43 0.8880476 0.8317839
## 45 0.8907619 0.8358610
## 47 0.8874286 0.8308694
## 49 0.8866667 0.8297155
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 33.
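The selected value of k and its resampled performance can also be extracted programmatically, which is handy when the fit is used in a script rather than read off the printed summary; bestTune and getTrainPerf() are part of caret.
# Best k chosen by resampling, and its resampled accuracy and kappa
knnFit$bestTune
getTrainPerf(knnFit)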
Plot cross-validation accuracy as a function of k
plot(knnFit)
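By default plot() on a train object shows the accuracy profile; passing the metric argument plots Cohen's kappa instead, should a complementary view be wanted.
# Optional: plot the kappa profile rather than accuracy
plot(knnFit, metric = "Kappa")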
Predict the class (wheat variety) of the observations in the test set.
test_pred <- predict(knnFit, morphTest)
confusionMatrix(test_pred, varietyTest)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canadian Kama Rosa
## Canadian 20 3 0
## Kama 1 17 0
## Rosa 0 1 21
##
## Overall Statistics
##
## Accuracy : 0.9206
## 95% CI : (0.8244, 0.9737)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.881
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canadian Class: Kama Class: Rosa
## Sensitivity 0.9524 0.8095 1.0000
## Specificity 0.9286 0.9762 0.9762
## Pos Pred Value 0.8696 0.9444 0.9545
## Neg Pred Value 0.9750 0.9111 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3175 0.2698 0.3333
## Detection Prevalence 0.3651 0.2857 0.3492
## Balanced Accuracy 0.9405 0.8929 0.9881
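If individual statistics are needed downstream, the result of confusionMatrix() can be stored and indexed rather than only printed; overall and byClass are components of the returned object.
# Store the confusion matrix and extract selected statistics
cm <- confusionMatrix(test_pred, varietyTest)
cm$overall["Accuracy"]
cm$byClass[, "Balanced Accuracy"]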