Lab 3A: Regression (Easy)
1. Explore the Data
- Load Iris data
data(iris)
- Create scatterplot matrix
plot(iris[1:4])
- Load corrgram package
library(corrgram)
- Create correlogram
corrgram(iris[1:4])
- Inspect correlation coefficients
cor(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
- Get correlation coefficient
cor(
x = iris$Petal.Length,
y = iris$Petal.Width)
## [1] 0.9628654
- Create a scatterplot of petal length vs width
plot(
x = iris$Petal.Length,
y = iris$Petal.Width,
xlim = c(0.25, 7),
ylim = c(0.25, 2.5))
2. Create Training and Test Sets
- Set the random seed to make randomness reproducible
set.seed(42)
- Randomly sample 100 of 150 row indexes
# Draw 100 distinct row indices (sampling without replacement) out of the
# 150 iris rows — a 100/50 train/test split.
indexes <- sample(
x = 1:150,
size = 100)
- Create training set from indexes
train <- iris[indexes, ]
- Create test set from remaining indexes
test <- iris[-indexes, ]
3. Predict with Simple Linear Regression
- Create a simple linear regression model
# Fit petal width as a linear function of petal length, using only the
# training rows.
simpleModel <- lm(
formula = Petal.Width ~ Petal.Length,
data = train)
- Draw simple linear regression line on plot
# Re-draw the scatterplot, then overlay the fitted regression line.
plot(
x = iris$Petal.Length,
y = iris$Petal.Width,
xlim = c(0.25, 7),
ylim = c(0.25, 2.5))
# The fitted values all lie on the regression line, so connecting them with
# lines() traces that line even though train rows are not sorted by
# Petal.Length (the segments overlap along the same line).
lines(
x = train$Petal.Length,
y = simpleModel$fitted,
col = "red",
lwd = 3)
- Summarize the model
summary(simpleModel)
##
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56627 -0.12399 -0.01245 0.13209 0.64001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.34411 0.04996 -6.888 5.47e-10 ***
## Petal.Length 0.41257 0.01192 34.617 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2119 on 98 degrees of freedom
## Multiple R-squared: 0.9244, Adjusted R-squared: 0.9236
## F-statistic: 1198 on 1 and 98 DF, p-value: < 2.2e-16
- Create predictions with test set
simplePredictions <- predict(
object = simpleModel,
newdata = test)
- Plot predictions on scatterplot
plot(
x = iris$Petal.Length,
y = iris$Petal.Width,
xlim = c(0.25, 7),
ylim = c(0.25, 2.5))
points(
x = test$Petal.Length,
y = simplePredictions,
col = "blue",
pch = 4,
lwd = 2)
points(
x = test$Petal.Length,
y = test$Petal.Width,
col = "red",
pch = 16)
- Compute Root Mean Squared Error for simple linear predictions
# Root Mean Squared Error on the held-out test rows (same units as Petal.Width).
simpleRMSE <- sqrt(mean((test$Petal.Width - simplePredictions)^2))
- Inspect RMSE
print(simpleRMSE)
## [1] 0.1960294
4. Predict with Multiple Regression
- Create multiple linear regression model
# Regress petal width on every other column. Species is a factor, so lm()
# expands it into dummy variables (versicolor, virginica) automatically —
# see the two Species rows in the summary output below.
multipleModel <- lm(
formula = Petal.Width ~ .,
data = train)
- Inspect the model
summary(multipleModel)
##
## Call:
## lm(formula = Petal.Width ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.61056 -0.08035 -0.01463 0.09287 0.45260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.28170 0.22889 -1.231 0.221503
## Sepal.Length -0.10639 0.05639 -1.887 0.062271 .
## Sepal.Width 0.19705 0.05894 3.343 0.001189 **
## Petal.Length 0.26896 0.06718 4.004 0.000125 ***
## Speciesversicolor 0.55074 0.17149 3.211 0.001808 **
## Speciesvirginica 0.92273 0.23227 3.973 0.000139 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1788 on 94 degrees of freedom
## Multiple R-squared: 0.9484, Adjusted R-squared: 0.9456
## F-statistic: 345.4 on 5 and 94 DF, p-value: < 2.2e-16
- Create predictions with test set
multiplePredictions <- predict(
object = multipleModel,
newdata = test)
- Plot predictions on scatterplot
plot(
x = iris$Petal.Length,
y = iris$Petal.Width,
xlim = c(0.25, 7),
ylim = c(0.25, 2.5))
points(
x = test$Petal.Length,
y = multiplePredictions,
col = "blue",
pch = 4,
lwd = 2)
points(
x = test$Petal.Length,
y = test$Petal.Width,
col = "red",
pch = 16)
- Compute RMSE for model
multipleRMSE <- sqrt(mean((test$Petal.Width - multiplePredictions)^2))
- Display RMSE for model
print(multipleRMSE)
## [1] 0.144159
5. Predict with Neural Network Regression
- Create normalization function
# Zero-centered min-max scaling: map a numeric vector linearly so that its
# minimum becomes -0.5 and its maximum becomes 0.5.
normalize <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1]) - 0.5
}
- Create denormalize function
# Invert normalize(): map values x from the [-0.5, 0.5] scale back to the
# original units, using reference vector y to recover the min and range.
denormalize <- function(x, y) {
  lo <- min(y)
  span <- max(y) - lo
  (x + 0.5) * span + lo
}
- Scale iris data set
# Rescale each numeric column to [-0.5, 0.5] before fitting the network;
# Species is kept as an (unscaled) factor.
# NOTE(review): min/max are computed on the FULL data set before the
# train/test split, so a little information leaks from test into training —
# acceptable for a lab, but in practice scale with training-set statistics.
scaledIris <- data.frame(
Sepal.Length = normalize(iris$Sepal.Length),
Sepal.Width = normalize(iris$Sepal.Width),
Petal.Length = normalize(iris$Petal.Length),
Petal.Width = normalize(iris$Petal.Width),
Species = iris$Species)
- Create training set from indexes
scaledTrain <- scaledIris[indexes, ]
- Create test set from remaining indexes
scaledTest <- scaledIris[-indexes, ]
- Load the neural network package
library(nnet)
## Warning: package 'nnet' was built under R version 3.4.1
- Create a neural network regressor
# Single-hidden-layer network with 4 hidden units. linout = TRUE gives a
# linear output unit (regression rather than classification); skip = TRUE
# adds direct input-to-output connections; decay applies a small weight
# penalty to regularize; training stops after at most 500 iterations.
neuralRegressor <- nnet(
formula = Petal.Width ~ .,
data = scaledTrain,
linout = TRUE,
skip = TRUE,
size = 4,
decay = 0.0001,
maxit = 500)
## # weights: 34
## initial value 97.925222
## iter 10 value 0.557610
## iter 20 value 0.519856
## iter 30 value 0.511567
## iter 40 value 0.492292
## iter 50 value 0.484937
## iter 60 value 0.477067
## iter 70 value 0.473046
## iter 80 value 0.471776
## iter 90 value 0.468606
## iter 100 value 0.459243
## iter 110 value 0.448696
## iter 120 value 0.443407
## iter 130 value 0.439249
## iter 140 value 0.436952
## iter 150 value 0.435550
## iter 160 value 0.433243
## iter 170 value 0.428749
## iter 180 value 0.426459
## iter 190 value 0.425556
## iter 200 value 0.424250
## iter 210 value 0.423499
## iter 220 value 0.423308
## iter 230 value 0.423046
## iter 240 value 0.422602
## iter 250 value 0.421156
## iter 260 value 0.420692
## iter 270 value 0.420173
## iter 280 value 0.419807
## iter 290 value 0.419495
## iter 300 value 0.419240
## iter 310 value 0.418807
## iter 320 value 0.418300
## iter 330 value 0.417126
## iter 340 value 0.416725
## iter 350 value 0.416234
## iter 360 value 0.415865
## iter 370 value 0.415529
## iter 380 value 0.414839
## iter 390 value 0.413638
## iter 400 value 0.411991
## iter 410 value 0.407332
## iter 420 value 0.404744
## iter 430 value 0.402823
## iter 440 value 0.401268
## iter 450 value 0.400604
## iter 460 value 0.399889
## iter 470 value 0.399352
## iter 480 value 0.398884
## iter 490 value 0.398843
## iter 500 value 0.398745
## final value 0.398745
## stopped after 500 iterations
- Load neural network tools
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 3.4.2
- Plot the neural network
plotnet(neuralRegressor)
- Predict new values
scaledPredictions <- predict(
object = neuralRegressor,
newdata = scaledTest)
- Denormalize predictions
# Map the scaled predictions back to the original Petal.Width units, using
# the same reference column that normalize() scaled from, so they can be
# compared against the unscaled test$Petal.Width.
neuralPredictions <- denormalize(
x = scaledPredictions,
y = iris$Petal.Width)
- Plot predictions on scatterplot
plot(
x = iris$Petal.Length,
y = iris$Petal.Width,
xlim = c(0.25, 7),
ylim = c(0.25, 2.5))
points(
x = test$Petal.Length,
y = neuralPredictions,
col = "blue",
pch = 4,
lwd = 2)
points(
x = test$Petal.Length,
y = test$Petal.Width,
col = "red",
pch = 16)
- Compute RMSE for neural regression
neuralRMSE <- sqrt(mean((test$Petal.Width - neuralPredictions)^2))
- Display RMSE for neural regression
print(neuralRMSE)
## [1] 0.1364079
6. Evaluate Regression Models
- Compare the RMSE for all three models
print(simpleRMSE)
## [1] 0.1960294
print(multipleRMSE)
## [1] 0.144159
print(neuralRMSE)
## [1] 0.1364079