Lecture 19: Regularization


import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

models = {}

quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year"]


for i in range(len(quantitative_features)):
    # The features to include in the ith model
    features = quantitative_features[:(i+1)]
    # The name we are giving to the ith model
    name = ",".join([name[0] for name in features])
    # The pipeline for the ith model
    model = Pipeline([
        ("SelectColumns", ColumnTransformer([
            ("keep", "passthrough", features),
        ])),
        ("Imputation", SimpleImputer()),
        ("LinearModel", LinearRegression())
    ])
    # Fit the pipeline
    model.fit(tr, tr['mpg'])
    # Saving the ith model
    models[name] = model

K-fold Cross Validation

from sklearn.model_selection import cross_val_score
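
As a reminder of the API, here is a minimal sketch of k-fold cross-validation on the first model fit above (the "c" model uses only cylinders; "neg_root_mean_squared_error" is sklearn's built-in negated-RMSE scoring string):

# 5-fold CV: each score is the negated RMSE on one held-out fold.
scores = cross_val_score(models["c"], tr, tr['mpg'],
                         cv=5, scoring="neg_root_mean_squared_error")
print(-scores.mean())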

Overfitting

  • In the train vs. test error plot, the blue training error gets really small as complexity grows while the test error climbs: the model is fitting noise.

Regularization

  • Penalize models that overfit by adding a cost for model complexity to the objective.
  • Equivalently, constrain the model to use less complexity.

  • Geometrically, we minimize the loss subject to a complexity constraint region.

  • The best solution sits where the loss contours first touch the constraint region (see the objectives written out after this list).

  • L1 norm: the constraint region is a diamond whose corners lie on the axes, so solutions tend to land on corners and zero out coefficients.
  • This is LASSO.

  • L2 norm: the constraint region is a ball.
  • It doesn't really stick to the corners, so coefficients shrink toward zero but rarely hit it exactly: this is ridge.

  • Different norms give different regularization behavior.

  • The penalty weight λ plays the same role as the complexity constraint: by Lagrangian duality, each constraint level corresponds to some λ.

  • Standardize all your features first, so the penalty treats every coefficient on the same scale.
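
To make the penalty concrete, these are the two regularized least-squares objectives in standard notation (sklearn calls the penalty weight alpha rather than λ):

Ridge (L2): $\hat{\theta} = \arg\min_{\theta} \frac{1}{n} \sum_{i=1}^{n} \left(y_i - x_i^\top \theta\right)^2 + \lambda \lVert \theta \rVert_2^2$

LASSO (L1): $\hat{\theta} = \arg\min_{\theta} \frac{1}{n} \sum_{i=1}^{n} \left(y_i - x_i^\top \theta\right)^2 + \lambda \lVert \theta \rVert_1$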

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

ridge_model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", StandardScaler(), quantitative_features),
        ("origin_encoder", OneHotEncoder(), ["origin"]),
        ("text", CountVectorizer(), "name")
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", Ridge(alpha=10))
])
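The sweep below uses the rmse_score helper defined earlier in the lecture. A minimal sketch of what it could look like, assuming sklearn's callable-scorer signature (estimator, X, y):

from sklearn.metrics import mean_squared_error

# Assumed reconstruction of the lecture's rmse_score helper: a callable
# scorer that cross_val_score accepts via its `scoring` argument.
def rmse_score(model, X, y):
    return np.sqrt(mean_squared_error(y, model.predict(X)))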

alphas = np.linspace(0.5, 20, 30)
cv_values = []
train_values = []
test_values = []
for alpha in alphas:
    # Update the regularization strength inside the pipeline
    ridge_model.set_params(LinearModel__alpha=alpha)
    # 5-fold cross-validated RMSE on the training set
    cv_values.append(np.mean(cross_val_score(ridge_model, tr, tr['mpg'], scoring=rmse_score, cv=5)))
    # Refit on all of the training data, then score train and test
    ridge_model.fit(tr, tr['mpg'])
    train_values.append(rmse_score(ridge_model, tr, tr['mpg']))
    test_values.append(rmse_score(ridge_model, te, te['mpg']))
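
Before plotting, we can read off the CV-minimizing alpha directly; a small sketch using the arrays just computed:

# Choose the alpha with the lowest cross-validated RMSE.
best_alpha = alphas[np.argmin(cv_values)]
print(f"Best alpha by CV: {best_alpha:.2f}")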

Cross-Validation to Tune the Regularization Parameter

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=alphas, y=train_values, mode="lines+markers", name="Train"))
fig.add_trace(go.Scatter(x=alphas, y=cv_values, mode="lines+markers", name="CV"))
fig.add_trace(go.Scatter(x=alphas, y=test_values, mode="lines+markers", name="Test"))
fig.update_layout(xaxis_title=r"$\alpha$", yaxis_title="RMSE")

Ridge with CV

from sklearn.linear_model import RidgeCV

alphas = np.linspace(0.5, 3, 30)

ridge_model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", StandardScaler(), quantitative_features),
        ("origin_encoder", OneHotEncoder(), ["origin"]),
        ("text", CountVectorizer(), "name")
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", RidgeCV(alphas=alphas))
])
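
RidgeCV runs the alpha sweep internally during fit. A sketch of fitting it and reading off the selected value, following the same pattern as the LassoCV cell below (alpha_ is RidgeCV's attribute for the chosen penalty):

ridge_model.fit(tr, tr['mpg'])
models["RidgeCV"] = ridge_model
# The alpha RidgeCV selected from the grid
print(ridge_model.named_steps["LinearModel"].alpha_)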

  • In the model comparison, the red CV error shrinks for the regularized model relative to the unregularized fits.

Lasso CV

from sklearn.linear_model import Lasso, LassoCV

lasso_model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", StandardScaler(), quantitative_features),
        ("origin_encoder", OneHotEncoder(), ["origin"]),
        ("text", CountVectorizer(), "name")
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LassoCV(cv=3))
])

lasso_model.fit(tr, tr['mpg'])
models["LassoCV"] = lasso_model
compare_models(models)
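
Because the L1 constraint region's corners lie on the axes, many of the fitted coefficients should be exactly zero. A quick check (sketch; coef_ is LassoCV's fitted-coefficient attribute):

coefs = lasso_model.named_steps["LinearModel"].coef_
print(f"{np.sum(coefs == 0)} of {len(coefs)} coefficients are exactly zero")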