Lab 5A: Statistical Modeling (Easy)

  1. Load the Iris data set.
# Load the built-in iris data set, naming it as a string.
data("iris")
  1. Peek at the data.
# Show the first six rows of the data frame.
head(x = iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
  1. Look at unique species.
# List each distinct value of the Species column once.
unique(x = iris[["Species"]])
## [1] setosa     versicolor virginica 
## Levels: setosa versicolor virginica

Create a Gaussian Distribution Model

  1. Create a density plot of sepal width.
# Estimate the density of sepal width and draw it as a new plot.
plot(x = density(x = iris$Sepal.Width))

  1. Get the mean
# Sample mean of sepal width; used as the mean of the Gaussian model.
irisMean <- mean(x = iris[["Sepal.Width"]])
  1. Print the mean
# Echo the sample mean to the console.
print(x = irisMean)
## [1] 3.057333
  1. Get the standard deviation
# Sample standard deviation of sepal width; the spread of the Gaussian model.
irisStdDev <- sd(x = iris[["Sepal.Width"]])
  1. Print the standard deviation
# Echo the sample standard deviation to the console.
print(x = irisStdDev)
## [1] 0.4358663
  1. Create points along x-axis of the distribution
# Evenly spaced grid of 100 x-values spanning the observed sepal widths,
# from the smallest to the largest measurement. The argument is spelled
# `length.out` in full: the abbreviated `length` only works through partial
# argument matching.
distributionX <- seq(
  from = min(iris$Sepal.Width),
  to = max(iris$Sepal.Width),
  length.out = 100)
  1. Compute the y-axis height of each point
# Height of the normal density curve at every grid point, using the sample
# mean and standard deviation as the distribution's parameters.
distributionY <- dnorm(distributionX, mean = irisMean, sd = irisStdDev)
  1. Add the distribution to the plot
# Redraw the empirical density, then overlay the fitted normal curve in red.
plot(x = density(x = iris$Sepal.Width))
lines(distributionX, distributionY, col = "red")

  1. Generate (simulate) new values from the model.
# Draw 10,000 random samples from the fitted normal model. The parameters
# are the already-computed irisMean and irisStdDev, so the simulated values
# come from exactly the same model as the density curve built from them,
# rather than from a second, separately computed mean and sd.
# No seed is set, so the draws (and any statistics computed from them)
# differ from run to run.
values <- rnorm(
  n = 10000,
  mean = irisMean,
  sd = irisStdDev)
  1. Add the density of the generated values to the plot.
# Start one fresh figure with the empirical density. A single plot() call is
# enough; calling it twice in a row only produces a redundant extra figure.
plot(density(iris$Sepal.Width))

# Fitted normal curve in red.
lines(
  x = distributionX,
  y = distributionY,
  col = "red")

# Density of the simulated values in blue.
lines(
  x = density(values),
  col = "blue")

  1. Get mean of generated values
# Mean of the simulated sample; should be close to the model mean.
mean(x = values)
## [1] 3.059143
  1. Get standard deviation of generated values
# Standard deviation of the simulated sample; should be close to the model sd.
sd(x = values)
## [1] 0.4397619

Create a Simple Linear Regression Model

  1. Create a scatterplot matrix.
# Pairwise scatterplots of the first four columns (Species is left out).
plot(x = iris[, 1:4])

  1. Create a scatterplot of petal length vs. petal width.
# Petal length on the x-axis against petal width on the y-axis.
plot(iris$Petal.Length, iris$Petal.Width)

  1. Create a linear regression model.
# Least-squares fit of petal width as a linear function of petal length.
model <- lm(Petal.Width ~ Petal.Length, data = iris)
  1. Draw linear regression model on the scatterplot.
# Scatterplot of the raw observations.
plot(
  x = iris$Petal.Length,
  y = iris$Petal.Width)

# Overlay the model's fitted values as a thick red line. fitted() is the
# standard extractor; `model$fitted` only resolves because `$` partially
# matches the `fitted.values` component of the model object.
lines(
  x = iris$Petal.Length,
  y = fitted(model),
  col = "red",
  lwd = 3)

  1. Get the correlation coefficient.
# Correlation between petal length and petal width.
cor(iris$Petal.Length, iris$Petal.Width)
## [1] 0.9628654
  1. Summarize the model.
# Print coefficients, residual summary and fit statistics for the model.
summary(object = model)
## 
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56515 -0.12358 -0.01898  0.13288  0.64272 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.363076   0.039762  -9.131  4.7e-16 ***
## Petal.Length  0.415755   0.009582  43.387  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2065 on 148 degrees of freedom
## Multiple R-squared:  0.9271, Adjusted R-squared:  0.9266 
## F-statistic:  1882 on 1 and 148 DF,  p-value: < 2.2e-16
  1. Create new petal lengths to predict.
# New petal lengths to score; the column name matches the model's predictor.
unknownLengths <- data.frame(Petal.Length = c(2, 5, 7))
  1. Predict new unknown values from the model.
# Predicted petal widths for the new petal lengths.
predict(model, newdata = unknownLengths)
##         1         2         3 
## 0.4684353 1.7157016 2.5472124