Applied Machine Learning with Python: Programming Assignment 2 Submission
In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
np.random.seed(0)
n = 15
x = np.linspace(0,10,n) + np.random.randn(n)/5
y = np.sin(x)+x/6 + np.random.randn(n)/10
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)
# You can use this function to help you visualize the dataset by
# plotting a scatterplot of the data points
# in the training and test sets.
#def part1_scatter():
#    import matplotlib.pyplot as plt
#    %matplotlib notebook
#    plt.figure()
#    plt.scatter(X_train, y_train, label='training data')
#    plt.scatter(X_test, y_test, label='test data')
#    plt.legend(loc=4);
# NOTE: Uncomment the function below to visualize the data, but be sure
# to **re-comment it before submitting this assignment to the autograder**.
#part1_scatter()
In [26]:
def answer_one():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    DEGREES = [1, 3, 6, 9]
    N_POINTS = 100
    result = np.zeros([len(DEGREES), N_POINTS])
    predict = np.linspace(0, 10, N_POINTS).reshape(-1, 1)
    X_tr = X_train.reshape(-1, 1)
    for i, deg in enumerate(DEGREES):
        # Expand features to the given polynomial degree, fit on the
        # training data, then predict over the evaluation grid.
        poly = PolynomialFeatures(degree=deg)
        X_ = poly.fit_transform(X_tr)
        predict_ = poly.transform(predict)
        reg = LinearRegression()
        reg.fit(X_, y_train)
        result[i, :] = reg.predict(predict_)
    return result
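In [ ]:
# Quick sanity check on answer_one (a suggested check, not part of the
# graded answer): one row of 100 grid predictions per degree in [1, 3, 6, 9].
print(answer_one().shape)   # expected: (4, 100)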
In [25]:
# feel free to use the function plot_one() to replicate the figure
# from the prompt once you have completed question one
#def plot_one(degree_predictions):
#    import matplotlib.pyplot as plt
#    %matplotlib notebook
#    plt.figure(figsize=(10,5))
#    plt.plot(X_train, y_train, 'o', label='training data', markersize=10)
#    plt.plot(X_test, y_test, 'o', label='test data', markersize=10)
#    for i, degree in enumerate([1,3,6,9]):
#        plt.plot(np.linspace(0,10,100), degree_predictions[i], alpha=0.8, lw=2, label='degree={}'.format(degree))
#    plt.ylim(-1,2.5)
#    plt.legend(loc=4)
#plot_one(answer_one())
In [27]:
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    DEGREES = np.arange(0, 10)
    r2_train = np.zeros(len(DEGREES))
    r2_test = np.zeros(len(DEGREES))
    X_tr = X_train.reshape(-1, 1)
    X_tst = X_test.reshape(-1, 1)
    for i, deg in enumerate(DEGREES):
        # Fit the polynomial expansion on the training data only and
        # apply the same transform to the test data.
        poly = PolynomialFeatures(degree=deg)
        X_tr_ = poly.fit_transform(X_tr)
        X_tst_ = poly.transform(X_tst)
        reg = LinearRegression()
        reg.fit(X_tr_, y_train)
        r2_train[i] = r2_score(y_train, reg.predict(X_tr_))
        r2_test[i] = r2_score(y_test, reg.predict(X_tst_))
    return r2_train, r2_test
answer_two()
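In [ ]:
# Optional visualization (uncomment to run; re-comment before submitting):
# a minimal sketch of the train/test R^2 curves from answer_two, assuming
# the degree range 0-9 used above.
#import matplotlib.pyplot as plt
#r2_train, r2_test = answer_two()
#plt.figure()
#plt.plot(np.arange(0, 10), r2_train, 'o-', label='training R^2')
#plt.plot(np.arange(0, 10), r2_test, 'o-', label='test R^2')
#plt.xlabel('polynomial degree')
#plt.ylabel('R^2 score')
#plt.legend(loc=4)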
In [28]:
def answer_three():
    # Heuristically classify the degrees scored in answer_two: low R^2 on
    # both sets indicates underfitting, a high training R^2 with a low
    # test R^2 indicates overfitting, and high R^2 on both indicates good
    # generalization. The sorted scores supply the low/high cutoffs.
    r2_train, r2_test = answer_two()
    r2_train_sorted = np.sort(r2_train)
    r2_test_sorted = np.sort(r2_test)
    Underfitting = 0
    Overfitting = 0
    Good_Generalization = 0
    for deg, data in enumerate(zip(r2_train, r2_test)):
        if data[0] < r2_train_sorted[5] and data[1] < r2_test_sorted[5]:
            Underfitting = deg
        if data[0] >= r2_train_sorted[5] and data[1] < r2_test_sorted[5]:
            Overfitting = deg
        if data[0] >= r2_train_sorted[5] and data[1] >= r2_test_sorted[7]:
            Good_Generalization = deg
    return Underfitting, Overfitting, Good_Generalization
answer_three()
In [29]:
def answer_four():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import r2_score

    ALPHA = 0.01
    MAX_ITER = 10000
    DEGREE = 12

    # Set up degree-12 polynomial features, fitting on training data only
    X_tr = X_train.reshape(-1, 1)
    X_tst = X_test.reshape(-1, 1)
    poly = PolynomialFeatures(degree=DEGREE)
    X_tr_ = poly.fit_transform(X_tr)
    X_tst_ = poly.transform(X_tst)

    # Fit non-regularized ordinary least squares
    ols = LinearRegression()
    ols.fit(X_tr_, y_train)
    LinearRegression_R2_test_score = r2_score(y_test, ols.predict(X_tst_))

    # Fit Lasso-regularized regression on the same features
    linlasso = Lasso(alpha=ALPHA, max_iter=MAX_ITER).fit(X_tr_, y_train)
    Lasso_R2_test_score = r2_score(y_test, linlasso.predict(X_tst_))

    return LinearRegression_R2_test_score, Lasso_R2_test_score
answer_four()
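In [ ]:
# A suggested follow-up (not part of the graded answer): refit the same
# Lasso model as in answer_four and count its non-zero coefficients, to
# see how L1 regularization zeroes out most of the degree-12 terms.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso

poly12 = PolynomialFeatures(degree=12)
X_tr12 = poly12.fit_transform(X_train.reshape(-1, 1))
lasso12 = Lasso(alpha=0.01, max_iter=10000).fit(X_tr12, y_train)
print('non-zero coefficients:', np.sum(lasso12.coef_ != 0))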
In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#mush_df = pd.read_csv('readonly/mushrooms.csv')
mush_df = pd.read_csv('mushrooms.csv')
mush_df2 = pd.get_dummies(mush_df)
X_mush = mush_df2.iloc[:,2:]
y_mush = mush_df2.iloc[:,1]
# use the variables X_train2, y_train2 for Question 5
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_mush, y_mush, random_state=0)
# For performance reasons in Questions 6 and 7, we will create a smaller version of the
# entire mushroom dataset for use in those questions. For simplicity we'll just re-use
# the 25% test split created above as the representative subset.
#
# Use the variables X_subset, y_subset for Questions 6 and 7.
X_subset = X_test2
y_subset = y_test2
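In [ ]:
# Quick sanity check (a suggested check, not part of the assignment):
# X_subset re-uses the 25% test split, so it should hold about one third
# as many rows as X_train2 (25% vs. 75% of the full dataset).
print(X_train2.shape, X_subset.shape)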
In [9]:
def answer_five():
    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(random_state=0).fit(X_train2, y_train2)
    # Indices of the five largest feature importances, in descending order
    top_five = clf.feature_importances_.argsort()[::-1][:5]
    return list(X_train2.columns[top_five])
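In [ ]:
# The five most important feature names according to the fitted tree (a
# quick look; the exact names depend on the one-hot encoding above).
print(answer_five())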
In [10]:
def answer_six():
    from sklearn.svm import SVC
    from sklearn.model_selection import validation_curve

    # Six gamma values spaced logarithmically from 1e-4 to 10
    param_range = np.logspace(-4, 1, 6)
    clf = SVC(kernel='rbf', C=1, random_state=0)
    train_scores, test_scores = validation_curve(
        clf, X_subset, y_subset,
        param_name='gamma', param_range=param_range,
        cv=3, scoring='accuracy')
    # Average the three cross-validation folds for each gamma
    training_scores = np.mean(train_scores, axis=1)
    testing_scores = np.mean(test_scores, axis=1)
    return training_scores, testing_scores
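In [ ]:
# Optional visualization (uncomment to run; re-comment before submitting):
# a minimal sketch of the validation curve from answer_six, with gamma on
# a log scale.
#import matplotlib.pyplot as plt
#train_means, test_means = answer_six()
#gammas = np.logspace(-4, 1, 6)
#plt.figure()
#plt.semilogx(gammas, train_means, 'o-', label='training accuracy')
#plt.semilogx(gammas, test_means, 'o-', label='cross-validation accuracy')
#plt.xlabel('gamma')
#plt.ylabel('accuracy')
#plt.legend(loc=4)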
In [11]:
def answer_seven():
    # Heuristically classify the gamma values scored in answer_six: low
    # accuracy on both sets indicates underfitting, high training accuracy
    # with low cross-validation accuracy indicates overfitting, and the
    # best accuracy on both indicates good generalization.
    param_range = np.logspace(-4, 1, 6)
    training_scores, test_scores = answer_six()
    train_scores_sorted = np.sort(training_scores)
    test_scores_sorted = np.sort(test_scores)
    Underfitting = 0
    Overfitting = 0
    Good_Generalization = 0
    max_train_scores = np.max(training_scores)
    max_test_scores = np.max(test_scores)
    for gam, data in zip(param_range, zip(training_scores, test_scores)):
        if data[0] <= train_scores_sorted[1] and data[1] <= test_scores_sorted[1]:
            Underfitting = gam
        if data[0] > train_scores_sorted[1] and data[1] <= test_scores_sorted[1]:
            Overfitting = gam
        if data[0] == max_train_scores and data[1] == max_test_scores:
            Good_Generalization = gam
    return Underfitting, Overfitting, Good_Generalization
answer_seven()