# Lecture 19: Regularization

# Fitted pipelines, keyed by a short name built from the initials of the features used.
models = {}

# Quantitative columns, in the order they are added to successive models.
quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year"]

# Fit one linear model per prefix of quantitative_features:
# the first model uses only the first column, the last one uses all of them.
for i in range(len(quantitative_features)):
    # The features to include in the ith model
    features = quantitative_features[:(i+1)]
    # The name we are giving to the ith model: the first letter of each
    # feature joined by commas (e.g. "c,d,h" for the first three columns)
    name = ",".join(feature[0] for feature in features)
    # The pipeline for the ith model: keep only `features` unchanged,
    # impute missing values, then fit ordinary least squares
    model = Pipeline([
        ("SelectColumns", ColumnTransformer([
            ("keep", "passthrough", features),
        ])),
        ("Imputation", SimpleImputer()),
        ("LinearModel", LinearRegression())
    ])
    # Fit the pipeline. The whole `tr` frame is passed as X; the
    # SelectColumns step is what restricts the model to `features`.
    # NOTE(review): `tr` is defined outside this section -- presumably the
    # training DataFrame containing an 'mpg' column; confirm.
    model.fit(tr, tr['mpg'])
    # Saving the ith model
    models[name] = model


## K-fold Cross Validation

from sklearn.model_selection import cross_val_score



## Overfitting

• The blue (training error) is really small

## Regularization

• penalize overfit models
• use less complexity

• complexity

• Best solution

• L1 norm
• LASSO

• L2 norm
• The solution doesn't really stick to the corners, so coefficients shrink but are rarely exactly zero

• Different regularization

• $\lambda$ is equivalent to a complexity constraint via the Lagrangian (a larger $\lambda$ corresponds to a smaller allowed complexity)

# Ridge regression pipeline:
#   1. SelectColumns - standardize the quantitative columns, one-hot encode
#      "origin", and turn the "name" text column into token counts
#   2. Imputation    - fill in missing values with SimpleImputer's defaults
#   3. LinearModel   - Ridge regression
ridge_model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", StandardScaler(), quantitative_features),
        ("origin_encoder", OneHotEncoder(), ["origin"]),
        # The column is given as a bare string (not a list) so the
        # vectorizer receives a 1-D sequence of strings
        ("text", CountVectorizer(), "name")
    ])),
    ("Imputation", SimpleImputer()),
    # alpha=10 is only the initial penalty; it can be changed later with
    # ridge_model.set_params(LinearModel__alpha=...)
    ("LinearModel", Ridge(alpha=10))
])

# Sweep the ridge penalty over 30 evenly spaced values in [0.5, 20] and
# record the train, cross-validation, and test scores for each one.
alphas = np.linspace(0.5, 20, 30)
cv_values = []
train_values = []
test_values = []
for alpha in alphas:
    # Update the penalty of the "LinearModel" step in place
    ridge_model.set_params(LinearModel__alpha=alpha)
    # Mean of the 5-fold cross-validation scores on the training data
    cv_values.append(np.mean(cross_val_score(ridge_model, tr, tr['mpg'], scoring=rmse_score, cv=5)))
    # Refit on the full training data before scoring on train and test
    ridge_model.fit(tr, tr['mpg'])
    # rmse_score is called as scorer(estimator, X, y), the same form that
    # cross_val_score uses for its `scoring` callable
    train_values.append(rmse_score(ridge_model, tr, tr['mpg']))
    test_values.append(rmse_score(ridge_model, te, te['mpg']))


## Cross-Validation to Tune the Regularization Parameter

# Plot the train, CV, and test scores against the ridge penalty,
# one line-and-marker trace per series (in that order).
fig = go.Figure(data=[
    go.Scatter(x=alphas, y=train_values, mode="lines+markers", name="Train"),
    go.Scatter(x=alphas, y=cv_values, mode="lines+markers", name="CV"),
    go.Scatter(x=alphas, y=test_values, mode="lines+markers", name="Test"),
])
# Kept as the final expression so a notebook cell still displays the figure
fig.update_layout(
    xaxis_title=r"$\alpha$",
    yaxis_title="CV RMSE",
)


## Ridge with CV

from sklearn.linear_model import RidgeCV

# Candidate penalties handed to RidgeCV: 30 evenly spaced values in [0.5, 3].
# This rebinds `alphas`, replacing the wider grid used for the manual sweep.
alphas = np.linspace(start=0.5, stop=3, num=30)

# Same preprocessing as the Ridge pipeline, but the final step is RidgeCV,
# which receives the whole grid of candidate penalties.
ridge_model = Pipeline(
    steps=[
        (
            "SelectColumns",
            ColumnTransformer(
                transformers=[
                    ("keep", StandardScaler(), quantitative_features),
                    ("origin_encoder", OneHotEncoder(), ["origin"]),
                    ("text", CountVectorizer(), "name"),
                ]
            ),
        ),
        ("Imputation", SimpleImputer()),
        ("LinearModel", RidgeCV(alphas=alphas)),
    ]
)


• The red CV is shrinking

## Lasso CV

from sklearn.linear_model import Lasso, LassoCV

# Lasso pipeline: same column preprocessing and imputation as the ridge
# pipelines, with LassoCV (3-fold) as the final estimator.
lasso_model = Pipeline(
    steps=[
        (
            "SelectColumns",
            ColumnTransformer(
                transformers=[
                    ("keep", StandardScaler(), quantitative_features),
                    ("origin_encoder", OneHotEncoder(), ["origin"]),
                    ("text", CountVectorizer(), "name"),
                ]
            ),
        ),
        ("Imputation", SimpleImputer()),
        ("LinearModel", LassoCV(cv=3)),
    ]
)

# Fit on the training data, register the model under "LassoCV", and
# compare it with the models stored so far.
lasso_model.fit(tr, tr['mpg'])
models["LassoCV"] = lasso_model
compare_models(models)