1. Import the OS library.
import os
2. Set the working directory.
os.chdir("C:\\Workshop\\Data")
3. Import the pandas library as "pd".
import pandas as pd
4. Read the Iris CSV file into a data frame called iris.
iris = pd.read_csv("Iris.csv")
1. Inspect the iris data set.
iris.head()
2. Import the matplotlib pyplot library as "plt".
import matplotlib.pyplot as plt
3. Create a scatterplot matrix of the iris data set.
Note: The semicolon at the end prevents text output from being displayed with the plot.
pd.plotting.scatter_matrix(
    frame = iris,
    alpha = 1,
    s = 100,
    diagonal = 'none');
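Note: pandas only draws something on the diagonal for 'hist' (histograms) or 'kde' (density curves); any other value, such as 'none' here, simply leaves the diagonal panels blank.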
4. Create a correlation matrix of the iris data set.
# numeric_only skips the categorical Species column (required in newer pandas)
correlations = iris.corr(numeric_only = True)
print(correlations)
5. Import the seaborn library as "sns".
import seaborn as sns
6. Create a correlogram using the correlation matrix.
sns.heatmap(
    data = correlations,
    cmap = sns.diverging_palette(
        h_neg = 10,
        h_pos = 220,
        as_cmap = True));
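Note: If you would like each cell labeled with its correlation coefficient, heatmap also accepts an annot argument; for example:
sns.heatmap(
    data = correlations,
    annot = True,
    cmap = sns.diverging_palette(
        h_neg = 10,
        h_pos = 220,
        as_cmap = True));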
7. Question: Which variable is most strongly correlated with Petal Width?
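Note: One way to check your answer is to sort the Petal_Width column of the correlation matrix computed above (ignoring Petal_Width's perfect correlation with itself):
correlations.Petal_Width.sort_values(ascending = False)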
8. Get the correlation between petal length and width.
iris.Petal_Length.corr(iris.Petal_Width)
9. Create a scatterplot of petal width (y) vs. petal length (x).
plt.scatter(
    x = iris.Petal_Length,
    y = iris.Petal_Width)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
1. Inspect the iris data set.
iris.head()
2. Create a data frame named X containing all variables except for petal width.
X = iris.iloc[:, iris.columns != "Petal_Width"]
3. Inspect the features X.
X.head()
4. Convert the categorical variable Species into a set of one-hot encoded variables.
dummies = pd.get_dummies(X.Species)
5. Inspect the one-hot encoded variables.
dummies.head()
6. Append the one-hot-encoded species variables to the features data set X.
X = pd.concat([X, dummies], axis = 1)
7. Drop the Species column from the features data frame X.
X = X.drop("Species", 1)
8. Inspect the features data frame X.
X.head()
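Note: Steps 4 through 7 can also be collapsed into a single call. The sketch below (X_alt is just an illustrative name) should produce an equivalent data frame, except that the new columns will carry a Species_ prefix:
X_alt = pd.get_dummies(
    iris.iloc[:, iris.columns != "Petal_Width"],
    columns = ["Species"])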
9. Create a series named y containing just the labels (i.e. Petal_Width).
y = iris.Petal_Width
10. Inspect the series of labels y.
y.head()
1. Import the numpy library as "np".
import numpy as np
2. Set the random number seed to 234.
np.random.seed(234)
3. Import the train_test_split function from sklearn.
from sklearn.model_selection import train_test_split
4. Randomly sample 80% of the rows for the training set and 20% of the rows for the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.80,
    test_size = 0.20)
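Note: Seeding numpy in step 2 works because train_test_split falls back on numpy's global random state; the seed can instead be passed directly, which should yield the same split provided nothing else has drawn from numpy's global generator in between:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.80,
    test_size = 0.20,
    random_state = 234)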
5. Inspect the shape of the training and test sets using the shape property.
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)
1. Import the linear regression class from sklearn.
from sklearn.linear_model import LinearRegression
2. Create a simple linear regression model.
simple_model = LinearRegression()
3. Create a data frame named x1_train containing only the petal length feature from the training set.
x1_train = X_train.loc[:, ["Petal_Length"]]
4. Create a data frame named x1_test containing only the petal length feature from the test set.
x1_test = X_test.loc[:, ["Petal_Length"]]
5. Train the model using the training data.
Note: You should be using x1_train as your training data.
simple_model.fit(
    X = x1_train,
    y = y_train)
6. Draw the regression line on top of a scatterplot of petal width (y) vs. petal length (x).
plt.scatter(
    x = iris.Petal_Length,
    y = iris.Petal_Width,
    color = "black")
plt.plot(
    x1_test,
    simple_model.predict(x1_test),
    color = "blue",
    linewidth = 3)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
7. Inspect the slope (m) and y-intercept (b) parameter estimates.
print("y-intercept (b): ", simple_model.intercept_)
print("Slope (m): ", simple_model.coef_[0])
8. Question: How do you interpret these two values?
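Note: Recall that a simple linear regression predicts y = m * x + b. As a sanity check, you can compute a prediction by hand and compare it to the model's output (3.0 is just an arbitrary example petal length):
x_example = 3.0
print(simple_model.intercept_ + simple_model.coef_[0] * x_example)
print(simple_model.predict(pd.DataFrame({"Petal_Length": [x_example]})))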
9. Predict the labels of the test set using the model.
simple_predictions = simple_model.predict(x1_test)
10. Visualize the prediction error.
# Plot the training set (hollow black circles)
plt.scatter(
    x = x1_train.Petal_Length,
    y = y_train,
    facecolors = "none",
    edgecolors = "black")
# Plot the predictions (blue x's)
plt.scatter(
    x = x1_test.Petal_Length,
    y = simple_predictions,
    color = "blue",
    marker = 'x')
# Plot the correct answers (green dots)
plt.scatter(
    x = x1_test.Petal_Length,
    y = y_test,
    color = "green")
# Plot the errors (red lines)
plt.plot(
    [x1_test.Petal_Length, x1_test.Petal_Length],
    [simple_predictions, y_test],
    color = "red",
    zorder = 0)
# Finish the plot
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
11. Question: How do you interpret this graph?
12. Compute the root mean squared error (RMSE) of these predictions.
simple_rmse = np.sqrt(np.mean((y_test - simple_predictions) ** 2))
print(simple_rmse)
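Note: scikit-learn can compute the same quantity; assuming the imports above, this should print an identical value:
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_test, simple_predictions)))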
1. Create a linear regression model.
multiple_model = LinearRegression()
2. Train the model using all features of the training data.
multiple_model.fit(
    X = X_train,
    y = y_train)
3. Inspect the parameter estimates.
print("{:<12}: {: .3f}"
.format("y-intercept", multiple_model.intercept_))
for i, column_name in enumerate(X_train.columns):
print("{:<12}: {: .3f}".format(
column_name,
multiple_model.coef_[i]))
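Note: If you prefer a tabular view, the same coefficients can be wrapped in a pandas Series indexed by feature name:
print(pd.Series(
    multiple_model.coef_,
    index = X_train.columns))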
4. Question: How do you interpret these values?
5. Predict output values for the input values in the test set.
multiple_predictions = multiple_model.predict(X_test)
6. Visualize the prediction error.
# Plot the training set (hollow black circles)
plt.scatter(
    x = X_train.Petal_Length,
    y = y_train,
    facecolors = "none",
    edgecolors = "black")
# Plot the predictions (blue x's)
plt.scatter(
    x = X_test.Petal_Length,
    y = multiple_predictions,
    color = "blue",
    marker = 'x')
# Plot the correct answers (green dots)
plt.scatter(
    x = X_test.Petal_Length,
    y = y_test,
    color = "green")
# Plot the errors (red lines)
plt.plot(
    [X_test.Petal_Length, X_test.Petal_Length],
    [multiple_predictions, y_test],
    color = "red",
    zorder = 0)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
7. Question: How do you interpret this graph?
8. Compute the root mean squared error (RMSE) of these predictions.
multiple_rmse = np.sqrt(np.mean((y_test - multiple_predictions) ** 2))
print(multiple_rmse)
9. Question: How do you interpret this value?
1. Import the standard scaler from sklearn.
from sklearn.preprocessing import StandardScaler
2. Create standard scalers for the features (X) and the labels (y).
X_scaler = StandardScaler()
y_scaler = StandardScaler()
3. Fit the scalers to the full data set.
X_scaler.fit(X)
y_scaler.fit(y.values.reshape(-1, 1))
4. Scale the training and test data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))
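Note: As a quick sanity check, the scaled training features should have column means near 0 and standard deviations near 1 (not exactly, since the scalers were fit on the full data set rather than the training subset):
print(X_train_scaled.mean(axis = 0))
print(X_train_scaled.std(axis = 0))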
5. Import the neural network regressor class from sklearn.
from sklearn.neural_network import MLPRegressor
6. Create a neural network regressor with 4 hidden nodes, a tanh activation function, an LBFGS solver, and 1000 maximum iterations.
neural_model = MLPRegressor(
    hidden_layer_sizes = (4,),
    activation = "tanh",
    solver = "lbfgs",
    max_iter = 1000)
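Note: The solver starts from randomly initialized weights, so your results will vary from run to run; MLPRegressor also accepts a random_state argument (e.g. random_state = 234) if you need reproducible results.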
7. Train the model with the training set.
neural_model.fit(
    X = X_train_scaled,
    y = y_train_scaled.ravel())
8. Predict output values for the test set.
scaled_predictions = neural_model.predict(X_test_scaled)
9. Unscale the predictions.
# inverse_transform expects a 2-D array, so reshape the predictions, then flatten the result
neural_predictions = y_scaler.inverse_transform(
    scaled_predictions.reshape(-1, 1)).ravel()
10. Visualize the prediction error.
# Plot the training set (hollow black circles)
plt.scatter(
    x = X_train.Petal_Length,
    y = y_train,
    facecolors = "none",
    edgecolors = "black")
# Plot the predictions (blue x's)
plt.scatter(
    x = X_test.Petal_Length,
    y = neural_predictions,
    color = "blue",
    marker = 'x')
# Plot the correct answers (green dots)
plt.scatter(
    x = X_test.Petal_Length,
    y = y_test,
    color = "green")
# Plot the errors (red lines)
plt.plot(
    [X_test.Petal_Length, X_test.Petal_Length],
    [neural_predictions, y_test],
    color = "red",
    zorder = 0)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
11. Compute the root mean squared error (RMSE) of these predictions.
neural_rmse = np.sqrt(np.mean((y_test - neural_predictions) ** 2))
12. Inspect the RMSE of these predictions.
print(neural_rmse)
1. Compare all three results.
print("Simple RMSE: ", simple_rmse)
print("Multiple RMSE: ", multiple_rmse)
print("Neural RMSE: ", neural_rmse)
2. Question: Which of these models would you choose? Why?