Lab 3A: Regression (Easy)

1. Explore the Data

  1. Load Iris data
data(iris)
  1. Create scatterplot matrix
plot(iris[1:4])

  1. Load corrgram package
library(corrgram)
  1. Create correlogram
corrgram(iris[1:4])

  1. Inspect correlation coefficients
cor(iris[1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
  1. Get correlation coefficient
# Pearson correlation between petal length and width (strong, ~0.96)
cor(
  x = iris$Petal.Length, 
  y = iris$Petal.Width)
## [1] 0.9628654
  1. Create a scatterplot of petal length vs width
# Fixed axis limits so later prediction plots are directly comparable
plot(
  x = iris$Petal.Length, 
  y = iris$Petal.Width,
  xlim = c(0.25, 7),
  ylim = c(0.25, 2.5))

2. Create Training and Test Sets

  1. Set the random seed to make randomness reproducible
# Fix the RNG state so the random split is reproducible across runs
set.seed(42)
  1. Randomly sample 100 of 150 row indexes
# Draw 100 distinct row positions (sampling without replacement)
indexes <- sample(
  x = 1:150, 
  size = 100)
  1. Create training set from indexes
train <- iris[indexes, ]
  1. Create test set from remaining indexes
# Negative indexing keeps the 50 rows NOT chosen for training
test <- iris[-indexes, ]

3. Predict with Simple Linear Regression

  1. Create a simple linear regression model
# One-predictor linear regression: Petal.Width as a function of
# Petal.Length, fit on the 100 training rows only
simpleModel <- lm(
  formula = Petal.Width ~ Petal.Length,
  data = train)
  1. Draw simple linear regression line on plot
# Re-draw the base scatterplot so the regression line can be overlaid
plot(
  x = iris$Petal.Length, 
  y = iris$Petal.Width,
  xlim = c(0.25, 7),
  ylim = c(0.25, 2.5))
  
# Overlay the fitted line using the training-set fitted values.
# Use the fitted() accessor rather than simpleModel$fitted, which
# only works via fragile partial matching of the fitted.values
# component name.
lines(
  x = train$Petal.Length,
  y = fitted(simpleModel), 
  col = "red",
  lwd = 3)

  1. Summarize the model
summary(simpleModel)
## 
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56627 -0.12399 -0.01245  0.13209  0.64001 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.34411    0.04996  -6.888 5.47e-10 ***
## Petal.Length  0.41257    0.01192  34.617  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2119 on 98 degrees of freedom
## Multiple R-squared:  0.9244, Adjusted R-squared:  0.9236 
## F-statistic:  1198 on 1 and 98 DF,  p-value: < 2.2e-16
  1. Create predictions with test set
# Score the 50 held-out test rows with the simple model
simplePredictions <- predict(
  object = simpleModel,
  newdata = test)
  1. Plot predictions on scatterplot
# Base scatterplot of all 150 observations
plot(
  x = iris$Petal.Length, 
  y = iris$Petal.Width,
  xlim = c(0.25, 7),
  ylim = c(0.25, 2.5))
  
# Blue crosses: simple-model predictions for the test rows
points(
  x = test$Petal.Length,
  y = simplePredictions,
  col = "blue",
  pch = 4,
  lwd = 2)

# Red dots: the actual test-set petal widths, for visual comparison
points(
  x = test$Petal.Length,
  y = test$Petal.Width,
  col = "red",
  pch = 16)

  1. Compute Root Mean Squared Error for simple linear predictions
# RMSE = square root of the mean squared prediction error on the test set
simpleRMSE <- sqrt(mean((test$Petal.Width - simplePredictions)^2))
  1. Inspect RMSE
print(simpleRMSE)
## [1] 0.1960294

4. Predict with Multiple Regression

  1. Create multiple linear regression model
# Multiple regression: "." expands to every other column, i.e.
# Sepal.Length + Sepal.Width + Petal.Length + Species (Species is a
# factor, so lm() creates dummy variables for it)
multipleModel <- lm(
  formula = Petal.Width ~ .,
  data = train)
  1. Inspect the model
summary(multipleModel)
## 
## Call:
## lm(formula = Petal.Width ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.61056 -0.08035 -0.01463  0.09287  0.45260 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.28170    0.22889  -1.231 0.221503    
## Sepal.Length      -0.10639    0.05639  -1.887 0.062271 .  
## Sepal.Width        0.19705    0.05894   3.343 0.001189 ** 
## Petal.Length       0.26896    0.06718   4.004 0.000125 ***
## Speciesversicolor  0.55074    0.17149   3.211 0.001808 ** 
## Speciesvirginica   0.92273    0.23227   3.973 0.000139 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1788 on 94 degrees of freedom
## Multiple R-squared:  0.9484, Adjusted R-squared:  0.9456 
## F-statistic: 345.4 on 5 and 94 DF,  p-value: < 2.2e-16
  1. Create predictions with test set
# Score the held-out test rows with the multiple-regression model
multiplePredictions <- predict(
  object = multipleModel,
  newdata = test)
  1. Plot predictions on scatterplot
# Base scatterplot of all 150 observations
plot(
  x = iris$Petal.Length, 
  y = iris$Petal.Width,
  xlim = c(0.25, 7),
  ylim = c(0.25, 2.5))
  
# Blue crosses: multiple-model predictions for the test rows
points(
  x = test$Petal.Length,
  y = multiplePredictions,
  col = "blue",
  pch = 4,
  lwd = 2)

# Red dots: the actual test-set petal widths, for visual comparison
points(
  x = test$Petal.Length,
  y = test$Petal.Width,
  col = "red",
  pch = 16)

  1. Compute RMSE for model
# Test-set RMSE for the multiple-regression model (lower is better)
multipleRMSE <- sqrt(mean((test$Petal.Width - multiplePredictions)^2))
  1. Display RMSE for model
print(multipleRMSE)
## [1] 0.144159

5. Predict with Neural Network Regression

  1. Create normalization function
# Rescale a numeric vector linearly onto [-0.5, 0.5]
# (min maps to -0.5, max maps to +0.5)
normalize <- function(x) {
  rng <- range(x)
  ((x - rng[1]) / (rng[2] - rng[1])) - 0.5
}
  1. Create denormalize function
# Invert normalize(): map values x from [-0.5, 0.5] back onto the
# original scale of reference vector y
denormalize <- function(x, y) {
  rng <- range(y)
  (x + 0.5) * (rng[2] - rng[1]) + rng[1]
}
  1. Scale iris data set
# Copy of iris with each of the four numeric columns rescaled to
# [-0.5, 0.5]; the Species factor passes through unchanged.
# lapply() preserves the column names, so the result has the same
# structure as iris.
scaledIris <- data.frame(
  lapply(iris[1:4], normalize),
  Species = iris$Species)
  1. Create training set from indexes
# Reuse the SAME index vector as the earlier split, so the scaled and
# unscaled train/test sets contain exactly the same rows
scaledTrain <- scaledIris[indexes, ]
  1. Create test set from remaining indexes
scaledTest <- scaledIris[-indexes, ]
  1. Load the neural network package
library(nnet)
## Warning: package 'nnet' was built under R version 3.4.1
  1. Create a neural network regressor
# Single-hidden-layer neural network regressor:
#   linout = TRUE  -> linear output unit (regression, not classification)
#   skip = TRUE    -> add direct input-to-output skip-layer connections
#   size = 4       -> four hidden units
#   decay = 0.0001 -> small weight decay for regularization
#   maxit = 500    -> cap on optimizer iterations
neuralRegressor <- nnet(
  formula = Petal.Width ~ .,
  data = scaledTrain,
  linout = TRUE,
  skip = TRUE,
  size = 4,
  decay = 0.0001,
  maxit = 500)
## # weights:  34
## initial  value 97.925222 
## iter  10 value 0.557610
## iter  20 value 0.519856
## iter  30 value 0.511567
## iter  40 value 0.492292
## iter  50 value 0.484937
## iter  60 value 0.477067
## iter  70 value 0.473046
## iter  80 value 0.471776
## iter  90 value 0.468606
## iter 100 value 0.459243
## iter 110 value 0.448696
## iter 120 value 0.443407
## iter 130 value 0.439249
## iter 140 value 0.436952
## iter 150 value 0.435550
## iter 160 value 0.433243
## iter 170 value 0.428749
## iter 180 value 0.426459
## iter 190 value 0.425556
## iter 200 value 0.424250
## iter 210 value 0.423499
## iter 220 value 0.423308
## iter 230 value 0.423046
## iter 240 value 0.422602
## iter 250 value 0.421156
## iter 260 value 0.420692
## iter 270 value 0.420173
## iter 280 value 0.419807
## iter 290 value 0.419495
## iter 300 value 0.419240
## iter 310 value 0.418807
## iter 320 value 0.418300
## iter 330 value 0.417126
## iter 340 value 0.416725
## iter 350 value 0.416234
## iter 360 value 0.415865
## iter 370 value 0.415529
## iter 380 value 0.414839
## iter 390 value 0.413638
## iter 400 value 0.411991
## iter 410 value 0.407332
## iter 420 value 0.404744
## iter 430 value 0.402823
## iter 440 value 0.401268
## iter 450 value 0.400604
## iter 460 value 0.399889
## iter 470 value 0.399352
## iter 480 value 0.398884
## iter 490 value 0.398843
## iter 500 value 0.398745
## final  value 0.398745 
## stopped after 500 iterations
  1. Load neural network tools
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 3.4.2
  1. Plot the neural network
plotnet(neuralRegressor)

  1. Predict new values
# Predict on the scaled test set; outputs are on the normalized
# [-0.5, 0.5] scale
scaledPredictions <- predict(
  object = neuralRegressor, 
  newdata = scaledTest)
  1. Denormalize predictions
# Map predictions back to the original Petal.Width units, using the
# full iris column as the reference range (the same range used to
# normalize)
neuralPredictions <- denormalize(
  x = scaledPredictions, 
  y = iris$Petal.Width)
  1. Plot predictions on scatterplot
# Base scatterplot of all 150 observations
plot(
  x = iris$Petal.Length, 
  y = iris$Petal.Width,
  xlim = c(0.25, 7),
  ylim = c(0.25, 2.5))
  
# Blue crosses: denormalized neural-network predictions for the test
# rows (test and scaledTest share the same row indexes, so pairing
# test$Petal.Length with these predictions is valid)
points(
  x = test$Petal.Length,
  y = neuralPredictions,
  col = "blue",
  pch = 4,
  lwd = 2)

# Red dots: the actual test-set petal widths, for visual comparison
points(
  x = test$Petal.Length,
  y = test$Petal.Width,
  col = "red",
  pch = 16)

  1. Compute RMSE for neural regression
# Test-set RMSE on the original scale (predictions were denormalized)
neuralRMSE <- sqrt(mean((test$Petal.Width - neuralPredictions)^2))
  1. Display RMSE for neural regression
print(neuralRMSE)
## [1] 0.1364079

6. Evaluate Regression Models

  1. Compare the RMSE for all three models
print(simpleRMSE)
## [1] 0.1960294
print(multipleRMSE)
## [1] 0.144159
print(neuralRMSE)
## [1] 0.1364079