Lecture 18: Cross Validation

  • We can overfit the data

  • We fit models by minimizing training error
  • We use test error to estimate generalization error

  • 5-fold cross validation estimates generalization error using only the training data
  • It resembles the bootstrap in that both resample the data, but CV splits it into disjoint folds (see the sketch below)
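
The bootstrap comparison above can be made concrete; a minimal sketch contrasting the two resampling schemes on ten row indices:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils import resample

idx = np.arange(10)
# Bootstrap: sample with replacement, so some rows repeat and others are left out
print(resample(idx, random_state=0))
# 5-fold CV: disjoint validation folds that together cover every row exactly once
for tr_ind, va_ind in KFold(n_splits=5).split(idx):
    print("train:", tr_ind, "validate:", va_ind)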

Train/Test Split

# Shuffle the rows; frac=1.0 keeps every row, just in random order
shuffled_data = data.sample(frac=1.0)
shuffled_data

# Hold out the last 5% of the shuffled rows as a test set
split_point = int(shuffled_data.shape[0] * 0.95)
tr = shuffled_data.iloc[:split_point]
te = shuffled_data.iloc[split_point:]

len(tr) + len(te) == len(data)  # sanity check: the split covers every row exactly once

# Equivalent split using scikit-learn (10% test set, seeded for reproducibility)
from sklearn.model_selection import train_test_split
tr, te = train_test_split(data, test_size=0.1, random_state=83)

Don't evaluate the model on the test set while developing it: repeatedly checking test error effectively fits the model to the test set, defeating its purpose.

SKLearn Pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

model = Pipeline([
    # Keep only the listed columns, passing them through unchanged
    ("SelectColumns", ColumnTransformer([("keep", "passthrough", ["cylinders", "displacement"])])),
    ("LinearModel", LinearRegression())
])

model['SelectColumns']  # pipeline stages can be accessed by their names

# Fitting the pipeline fits every stage in order, ending with the linear model
model.fit(tr, tr['mpg'])
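
The notes call an rmse helper that is never defined; a minimal sketch, assuming NumPy-compatible inputs:

import numpy as np

def rmse(y, y_hat):
    # Root mean squared error between observed and predicted values
    return np.sqrt(np.mean((y - y_hat) ** 2))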

Y_hat = model.predict(tr)
Y = tr['mpg']
print("Training Error (RMSE):", rmse(Y, Y_hat))

models = {"c+d": model}  # register the first model under an abbreviated name

quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration"]
model = Pipeline([
    ("SelectColumns", ColumnTransformer([("keep", "passthrough", quantitative_features)])),
    ("LinearModel", LinearRegression())
])
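
The imputation step introduced next suggests that fitting fails here on missing values (the classic MPG data has rows with unknown horsepower). A hedged sketch of the fit-and-register step, where the key "quant" is an illustrative name:

try:
    model.fit(tr, tr['mpg'])
    models["quant"] = model  # "quant" is an illustrative key name
except ValueError as e:
    # LinearRegression cannot handle NaNs; this motivates the SimpleImputer below
    print(e)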

# SimpleImputer fills missing values (column means by default) before fitting
from sklearn.impute import SimpleImputer
model = Pipeline([
    ("SelectColumns", ColumnTransformer([("keep", "passthrough", quantitative_features)])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LinearRegression())
])
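
With imputation in place the pipeline fits even when values are missing; fit and register it so it appears in the comparison (the key name is illustrative):

model.fit(tr, tr['mpg'])
models["quant+impute"] = model  # illustrative key name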

Cross Validation

import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import clone

def cross_validate_rmse(model):
    # Clone so cross validation never touches the caller's fitted model
    model = clone(model)
    five_fold = KFold(n_splits=5)
    rmse_values = []
    # Each iteration trains on 4 folds and validates on the held-out fold
    for tr_ind, va_ind in five_fold.split(tr):
        model.fit(tr.iloc[tr_ind, :], tr['mpg'].iloc[tr_ind])
        rmse_values.append(rmse(tr['mpg'].iloc[va_ind], model.predict(tr.iloc[va_ind, :])))
    return np.mean(rmse_values)

cross_validate_rmse(model)
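
scikit-learn can also do this in one call; a sketch using the built-in scorer (the sign is flipped because scorers are maximized):

import numpy as np
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, tr, tr['mpg'], cv=5,
                         scoring="neg_root_mean_squared_error")
print(np.mean(-scores))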

import plotly.graph_objects as go

def compare_models(models):
    # Compute the training error for each model
    training_rmse = [rmse(tr['mpg'], model.predict(tr)) for model in models.values()]
    # Compute the cross-validation error for each model
    validation_rmse = [cross_validate_rmse(model) for model in models.values()]
    # Compute the test error for each model (don't use this for model selection!)
    test_rmse = [rmse(te['mpg'], model.predict(te)) for model in models.values()]
    names = list(models.keys())
    fig = go.Figure([
        go.Bar(x=names, y=training_rmse, name="Training RMSE"),
        go.Bar(x=names, y=validation_rmse, name="CV RMSE"),
        go.Bar(x=names, y=test_rmse, name="Test RMSE", opacity=.3)])
    return fig
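
Calling the helper on the registered models renders the grouped bar chart:

fig = compare_models(models)
fig.show()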

  • Compare the training and CV RMSE across models

  • A model whose training error keeps falling while its CV error rises is an example of overfitting