Lab 6A: Handling Big Data (Easy)
- Set working directory
# Relative paths used later (e.g. "Iris.csv") resolve against this directory.
# NOTE(review): hard-coded absolute Windows path; adjust it for your machine.
setwd("C:/Workshop/Data")
- Load the ff package
library(ff)
- Read a CSV file as an ff data frame
# Import the CSV into an ffdf. read.csv.ffdf() is ff's convenience wrapper
# that forwards to read.table.ffdf() with FUN = "read.csv".
irisff <- read.csv.ffdf(file = "Iris.csv")
- Inspect the class
# Should report "ffdf", the class of the object returned by read.table.ffdf().
class(irisff)
## [1] "ffdf"
- Inspect the column names
# Lists the column names of the ffdf (four measurements plus Species).
names(irisff)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
- Inspect the first few rows
# Show rows 1 through 5, all columns.
irisff[seq_len(5), ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
- Load the biglm package
library(biglm)
- Create a linear regression model
# Fit a linear regression of Petal.Width on Petal.Length using the ffdf as data.
# Keep the call spelled exactly as written: summary(model) echoes it verbatim
# in its header line.
model <- biglm(
formula = Petal.Width ~ Petal.Length,
data = irisff)
- Summarize the model
# Prints the model call, the sample size, and for each term the coefficient,
# 95% confidence interval, standard error and p-value.
summary(model)
## Large data regression model: biglm(formula = Petal.Width ~ Petal.Length, data = irisff)
## Sample size = 150
## Coef (95% CI) SE p
## (Intercept) -0.3631 -0.4426 -0.2836 0.0398 0
## Petal.Length 0.4158 0.3966 0.4349 0.0096 0
- Create a scatterplot
# Scatterplot of petal width against petal length.
# The empty `[]` pulls each ff column into a regular R vector for plotting.
plot(
  irisff$Petal.Length[],
  irisff$Petal.Width[],
  xlab = "Petal Length (cm)",
  ylab = "Petal Width (cm)",
  main = "Iris Petal Length vs. Width"
)
- Get y-intercept from model
# Intercept of the fitted line. Use the coef() accessor and look the term up
# by name instead of indexing position [1, 1] of the summary's internal matrix,
# so the value stays correct if terms are added or reordered.
b <- coef(model)[["(Intercept)"]]
- Get slope from model
# Slope of the fitted line. Use the coef() accessor and look the term up by
# name instead of indexing position [2, 1] of the summary's internal matrix,
# so the value stays correct if terms are added or reordered.
m <- coef(model)[["Petal.Length"]]
- Draw a regression line on plot
# Redraw the scatterplot, then overlay the fitted line y = m * x + b.
plot(
  irisff$Petal.Length[],
  irisff$Petal.Width[],
  xlab = "Petal Length (cm)",
  ylab = "Petal Width (cm)",
  main = "Iris Petal Length vs. Width"
)

# Evaluate the fitted equation at every observed petal length and connect the
# resulting points; they are collinear, so this traces the regression line
# across the range of the data.
lines(
  irisff$Petal.Length[],
  b + m * irisff$Petal.Length[],
  lwd = 3,
  col = "red"
)
- Predict new values with the model (NOTE: biglm expects the response column, Petal.Width, to be present in newdata; zeros are used as placeholder values)
# Predict petal width for three new petal lengths.
predict(
  model,
  newdata = data.frame(
    Petal.Length = c(2, 5, 7),
    # Placeholder response values: the column is supplied only so that newdata
    # carries every variable named in the model formula.
    Petal.Width = numeric(3)
  )
)
## [,1]
## 1 0.4684353
## 2 1.7157016
## 3 2.5472124