 
plt.figure()
plt.boxplot(X)
plt.xticks(np.arange(1, X.shape[1] + 1), features,
		   rotation=30, ha="right")
plt.ylabel("MEDV")
 
 
 
 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
ridge = Ridge().fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
print("{:.2f}".format(ridge.score(X_test_scaled, y_test)))
0.63
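As a quick check, the fitted scaler exposes the statistics it learned from the training split only (a minimal sketch using the standard StandardScaler attributes):
print(scaler.mean_[:3])   # per-feature means estimated on X_train
print(scaler.scale_[:3])  # per-feature standard deviations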
 
 
Shortcuts:
est.fit_transform(X) == est.fit(X).transform(X) (mostly)
est.fit_predict(X) == est.fit(X).predict(X) (mostly)
scores = cross_val_score(RidgeCV(), X_train, y_train, cv=10)
print2(np.mean(scores), np.std(scores))
(0.717, 0.125)
scores = cross_val_score(RidgeCV(), X_train_scaled, y_train, cv=10)
print2(np.mean(scores), np.std(scores))
(0.718, 0.127)
scores = cross_val_score(KNeighborsRegressor(), X_train, y_train, cv=10)
print2(np.mean(scores), np.std(scores))
(0.499, 0.146)
scores = cross_val_score(KNeighborsRegressor(), X_train_scaled, y_train, cv=10)
print2(np.mean(scores), np.std(scores))
(0.750, 0.106)
print(X.shape)
(100, 10000)
# select most informative 5% of features
from sklearn.feature_selection import SelectPercentile, f_regression
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print(X_selected.shape)
(100, 500)
np.mean(cross_val_score(Ridge(), X_selected, y))
0.90
ridge = Ridge().fit(X_selected, y)
X_test_selected = select.transform(X_test)  # X_test: held-out data from the same distribution
ridge.score(X_test_selected, y_test)
-0.18
# BAD!
select.fit(X, y)  # includes the cv test parts!
X_sel = select.transform(X)
scores = []
for train, test in cv.split(X, y):  # cv: any CV splitter, e.g. KFold()
    ridge = Ridge().fit(X_sel[train], y[train])
    score = ridge.score(X_sel[test], y[test])
    scores.append(score)
# GOOD!
scores = []
for train, test in cv.split(X, y):
    select.fit(X[train], y[train])
    X_sel_train = select.transform(X[train])
    ridge = Ridge().fit(X_sel_train, y[train])
    X_sel_test = select.transform(X[test])
    score = ridge.score(X_sel_test, y[test])
    scores.append(score)
Need to include preprocessing in cross-validation!
Information leak:
 
No information leak:
 
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = \
	train_test_split(X, y, random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
ridge = Ridge().fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
print2(ridge.score(X_test_scaled, y_test))
(0.635)
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), Ridge())
pipe.fit(X_train, y_train)
print2(pipe.score(X_test, y_test))
(0.635)
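A fitted pipeline also exposes its steps (named after the lowercased class names when built with make_pipeline), which is handy for inspecting the final model; a minimal sketch:
print(pipe.named_steps.keys())          # dict_keys(['standardscaler', 'ridge'])
print(pipe.named_steps['ridge'].coef_)  # coefficients of the Ridge step, fit on scaled data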
 
# BAD!
select.fit(X, y)  # includes the cv test parts!
X_sel = select.transform(X)
scores = []
for train, test in cv.split(X, y):
    ridge = Ridge().fit(X_sel[train], y[train])
    score = ridge.score(X_sel[test], y[test])
    scores.append(score)
Same as:
select.fit(X, y)
X_selected = select.transform(X)
np.mean(cross_val_score(Ridge(), X_selected, y))
0.90
# GOOD!
scores = []
for train, test in cv.split(X, y):
    select.fit(X[train], y[train])
    X_sel_train = select.transform(X[train])
    ridge = Ridge().fit(X_sel_train, y[train])
    X_sel_test = select.transform(X[test])
    score = ridge.score(X_sel_test, y[test])
    scores.append(score)
Same as:
pipe = make_pipeline(select, Ridge())
np.mean(cross_val_score(pipe, X, y))
-0.079
from sklearn.pipeline import make_pipeline
knn_pipe = make_pipeline(StandardScaler(),
                         KNeighborsRegressor())
print(knn_pipe)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsregressor', KNeighborsRegressor())])
from sklearn.pipeline import Pipeline
pipe = Pipeline([("scaler", StandardScaler()),
                 ("regressor", KNeighborsRegressor())])
knn_pipe = make_pipeline(StandardScaler(),
                         KNeighborsRegressor())
param_grid = \
    {'kneighborsregressor__n_neighbors': range(1, 10)}
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(knn_pipe, param_grid, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
print2(grid.score(X_test, y_test))
{'kneighborsregressor__n_neighbors': 7}
(0.600)
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, random_state=0)
from sklearn.preprocessing import PolynomialFeatures
pipe = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(),
    Ridge())
param_grid = {'polynomialfeatures__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid,
                    n_jobs=-1, return_train_score=True)
grid.fit(X_train, y_train)
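To see how the two tuned parameters interact, the search results can be reshaped into a degree-by-alpha table; a minimal sketch:
import pandas as pd
res = pd.DataFrame(grid.cv_results_)
table = res.pivot(index='param_polynomialfeatures__degree',
                  columns='param_ridge__alpha',
                  values='mean_test_score')
print(table.round(3))
print(grid.best_params_)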
pipe = Pipeline([('scaler', StandardScaler()),
                 ('regressor', Ridge())])
param_grid = {'scaler': [StandardScaler(), MinMaxScaler(),
                         'passthrough'],
              'regressor': [Ridge(), Lasso()],
              'regressor__alpha': np.logspace(-3, 3, 7)}
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)
from sklearn.tree import DecisionTreeRegressor
pipe = Pipeline([('scaler', StandardScaler()),
                 ('regressor', Ridge())])
# check out searchgrid for more convenience
param_grid = [{'regressor': [DecisionTreeRegressor()],
               'regressor__max_depth': [2, 3, 4],
               'scaler': ['passthrough']},
              {'regressor': [Ridge()],
               'regressor__alpha': [0.1, 1],
               'scaler': [StandardScaler(), MinMaxScaler(),
                          'passthrough']}
             ]
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)
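After fitting, best_params_ shows which branch of the grid won and best_estimator_ is the refitted winning pipeline:
print(grid.best_params_)
print(grid.best_estimator_)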
 
 
Only applicable for positive \(x\)!
 
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='box-cox')
# for any data: Yeo-Johnson
pt.fit(X)
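For data that is not strictly positive, the Yeo-Johnson variant (the default method) can be used instead; a minimal sketch:
pt = PowerTransformer(method='yeo-johnson')  # handles zero and negative values
X_transformed = pt.fit_transform(X)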
 
 
(Figures: feature distributions before and after the power transformation)
df = pd.DataFrame({
    'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
    'salary': [103, 89, 142, 54, 63, 219],
    'vegan': ['No', 'No','No','Yes', 'Yes', 'No']})
| | boro | salary | vegan |
|---|---|---|---|
| 0 | Manhattan | 103 | No | 
| 1 | Queens | 89 | No | 
| 2 | Manhattan | 142 | No | 
| 3 | Brooklyn | 54 | Yes | 
| 4 | Brooklyn | 63 | Yes | 
| 5 | Bronx | 219 | No | 
df['boro_ordinal'] = df.boro.astype("category").cat.codes
| | boro | salary | vegan | boro_ordinal |
|---|---|---|---|---|
| 0 | Manhattan | 103 | No | 2 | 
| 1 | Queens | 89 | No | 3 | 
| 2 | Manhattan | 142 | No | 2 | 
| 3 | Brooklyn | 54 | Yes | 1 | 
| 4 | Brooklyn | 63 | Yes | 1 | 
| 5 | Bronx | 219 | No | 0 | 
 
 
| | boro | salary | vegan |
|---|---|---|---|
| 0 | Manhattan | 103 | No | 
| 1 | Queens | 89 | No | 
| 2 | Manhattan | 142 | No | 
| 3 | Brooklyn | 54 | Yes | 
| 4 | Brooklyn | 63 | Yes | 
| 5 | Bronx | 219 | No | 
pd.get_dummies(df)
| | salary | boro_Bronx | boro_Brooklyn | boro_Manhattan | boro_Queens | vegan_No | vegan_Yes |
|---|---|---|---|---|---|---|---|
| 0 | 103 | 0 | 0 | 1 | 0 | 1 | 0 | 
| 1 | 89 | 0 | 0 | 0 | 1 | 1 | 0 | 
| 2 | 142 | 0 | 0 | 1 | 0 | 1 | 0 | 
| 3 | 54 | 0 | 1 | 0 | 0 | 0 | 1 | 
| 4 | 63 | 0 | 1 | 0 | 0 | 0 | 1 | 
| 5 | 219 | 1 | 0 | 0 | 0 | 1 | 0 | 
| | boro | salary | vegan |
|---|---|---|---|
| 0 | Manhattan | 103 | No | 
| 1 | Queens | 89 | No | 
| 2 | Manhattan | 142 | No | 
| 3 | Brooklyn | 54 | Yes | 
| 4 | Brooklyn | 63 | Yes | 
| 5 | Bronx | 219 | No | 
pd.get_dummies(df, columns=['boro'])
| | salary | vegan | boro_Bronx | boro_Brooklyn | boro_Manhattan | boro_Queens |
|---|---|---|---|---|---|---|
| 0 | 103 | No | 0 | 0 | 1 | 0 | 
| 1 | 89 | No | 0 | 0 | 0 | 1 | 
| 2 | 142 | No | 0 | 0 | 1 | 0 | 
| 3 | 54 | Yes | 0 | 1 | 0 | 0 | 
| 4 | 63 | Yes | 0 | 1 | 0 | 0 | 
| 5 | 219 | No | 1 | 0 | 0 | 0 | 
| | boro | salary | vegan |
|---|---|---|---|
| 0 | Manhattan | 103 | No | 
| 1 | Queens | 89 | No | 
| 2 | Manhattan | 142 | No | 
| 3 | Brooklyn | 54 | Yes | 
| 4 | Brooklyn | 63 | Yes | 
| 5 | Bronx | 219 | No | 
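df_ordinal is not built above; presumably it is df with boro replaced by the integer category codes shown earlier, e.g.:
df_ordinal = df.copy()
df_ordinal['boro'] = df_ordinal.boro.astype('category').cat.codes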
pd.get_dummies(df_ordinal, columns=['boro'])
| | salary | vegan | boro_0 | boro_1 | boro_2 | boro_3 |
|---|---|---|---|---|---|---|
| 0 | 103 | No | 0 | 0 | 1 | 0 | 
| 1 | 89 | No | 0 | 0 | 0 | 1 | 
| 2 | 142 | No | 0 | 0 | 1 | 0 | 
| 3 | 54 | Yes | 0 | 1 | 0 | 0 | 
| 4 | 63 | Yes | 0 | 1 | 0 | 0 | 
| 5 | 219 | No | 1 | 0 | 0 | 0 | 
df = pd.DataFrame({
    'boro': ['Manhattan', 'Queens', 'Manhattan',
             'Brooklyn', 'Brooklyn', 'Bronx'],
    'salary': [103, 89, 142, 54, 63, 219],
    'vegan': ['No', 'No','No','Yes', 'Yes', 'No']})
df_dummies = pd.get_dummies(df, columns=['boro'])
| | salary | vegan | boro_Bronx | boro_Brooklyn | boro_Manhattan | boro_Queens |
|---|---|---|---|---|---|---|
| 0 | 103 | No | 0 | 0 | 1 | 0 | 
| 1 | 89 | No | 0 | 0 | 0 | 1 | 
| 2 | 142 | No | 0 | 0 | 1 | 0 | 
| 3 | 54 | Yes | 0 | 1 | 0 | 0 | 
| 4 | 63 | Yes | 0 | 1 | 0 | 0 | 
| 5 | 219 | No | 1 | 0 | 0 | 0 | 
df = pd.DataFrame({
    'boro': ['Brooklyn', 'Manhattan', 'Brooklyn',
             'Queens', 'Brooklyn', 'Staten Island'],
    'salary': [61, 146, 142, 212, 98, 47],
    'vegan': ['Yes', 'No','Yes','No', 'Yes', 'No']})
df_dummies = pd.get_dummies(df, columns=['boro'])
| | salary | vegan | boro_Brooklyn | boro_Manhattan | boro_Queens | boro_Staten Island |
|---|---|---|---|---|---|---|
| 0 | 61 | Yes | 1 | 0 | 0 | 0 | 
| 1 | 146 | No | 0 | 1 | 0 | 0 | 
| 2 | 142 | Yes | 1 | 0 | 0 | 0 | 
| 3 | 212 | No | 0 | 0 | 1 | 0 | 
| 4 | 98 | Yes | 1 | 0 | 0 | 0 | 
| 5 | 47 | No | 0 | 0 | 0 | 1 | 
df = pd.DataFrame({
    'boro': ['Manhattan', 'Queens', 'Manhattan',
             'Brooklyn', 'Brooklyn', 'Bronx'],
    'salary': [103, 89, 142, 54, 63, 219],
    'vegan': ['No', 'No','No','Yes', 'Yes', 'No']})
df['boro'] = pd.Categorical(df.boro,
                            categories=['Manhattan', 'Queens', 'Brooklyn',
                                        'Bronx', 'Staten Island'])
pd.get_dummies(df, columns=['boro'])
| | salary | vegan | boro_Manhattan | boro_Queens | boro_Brooklyn | boro_Bronx | boro_Staten Island |
|---|---|---|---|---|---|---|---|
| 0 | 103 | No | 1 | 0 | 0 | 0 | 0 | 
| 1 | 89 | No | 0 | 1 | 0 | 0 | 0 | 
| 2 | 142 | No | 1 | 0 | 0 | 0 | 0 | 
| 3 | 54 | Yes | 0 | 0 | 1 | 0 | 0 | 
| 4 | 63 | Yes | 0 | 0 | 1 | 0 | 0 | 
| 5 | 219 | No | 0 | 0 | 0 | 1 | 0 | 
OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],
                   'boro': ['Manhattan', 'Queens', 'Manhattan',
                            'Brooklyn', 'Brooklyn', 'Bronx']})
ce = OneHotEncoder().fit(df)
print(ce.transform(df).toarray())
[[0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]]
Note that the numeric salary column was one-hot encoded as well (one column per distinct salary value), which is rarely what you want.
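The pd.Categorical trick above has a scikit-learn analogue: OneHotEncoder accepts an explicit category list and can ignore values it has not seen; a minimal sketch using the borough list from before:
boros = ['Manhattan', 'Queens', 'Brooklyn', 'Bronx', 'Staten Island']
ohe_fixed = OneHotEncoder(categories=[boros], handle_unknown='ignore')
ohe_fixed.fit(df[['boro']])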
OneHotEncoder + ColumnTransformer
categorical = df.dtypes == object
preprocess = make_column_transformer(
    (StandardScaler(), ~categorical),
    (OneHotEncoder(), categorical))
model = make_pipeline(preprocess, LogisticRegression())
model
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  salary     True
boro      False
dtype: bool),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  salary    False
boro       True
dtype: bool)])),
                ('logisticregression', LogisticRegression())])
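Instead of a boolean Series, columns can also be selected lazily with make_column_selector, which keeps the repr tidier; a minimal sketch:
from sklearn.compose import make_column_transformer, make_column_selector
preprocess = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include='number')),
    (OneHotEncoder(), make_column_selector(dtype_include=object)))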
 
 
from sklearn.datasets import fetch_openml
data = fetch_openml("house_sales", as_frame=True)
X = data.frame.drop(['date', 'price'], axis=1)
y = data.frame['price']
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.columns
Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')
X_train.head()
| | bedrooms | bathrooms | sqft_living | sqft_lot | floors | ... | zipcode | lat | long | sqft_living15 | sqft_lot15 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5945 | 4.0 | 2.25 | 1810.0 | 9240.0 | 2.0 | ... | 98055.0 | 47.4362 | -122.187 | 1660.0 | 9240.0 | 
| 8423 | 3.0 | 2.50 | 1600.0 | 2788.0 | 2.0 | ... | 98031.0 | 47.4034 | -122.187 | 1720.0 | 3605.0 | 
| 13488 | 4.0 | 2.50 | 1720.0 | 8638.0 | 2.0 | ... | 98003.0 | 47.2704 | -122.313 | 1870.0 | 7455.0 | 
| 20731 | 2.0 | 2.25 | 1240.0 | 705.0 | 2.0 | ... | 98027.0 | 47.5321 | -122.073 | 1240.0 | 750.0 | 
| 2358 | 3.0 | 2.00 | 1280.0 | 13356.0 | 1.0 | ... | 98042.0 | 47.3715 | -122.074 | 1590.0 | 8071.0 | 
import category_encoders as ce
te = ce.TargetEncoder(cols='zipcode').fit(X_train,  y_train)
te.transform(X_train).head()
| | bedrooms | bathrooms | sqft_living | sqft_lot | floors | ... | zipcode | lat | long | sqft_living15 | sqft_lot15 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5945 | 4.0 | 2.25 | 1810.0 | 9240.0 | 2.0 | ... | 305061.113861 | 47.4362 | -122.187 | 1660.0 | 9240.0 | 
| 8423 | 3.0 | 2.50 | 1600.0 | 2788.0 | 2.0 | ... | 303052.073892 | 47.4034 | -122.187 | 1720.0 | 3605.0 | 
| 13488 | 4.0 | 2.50 | 1720.0 | 8638.0 | 2.0 | ... | 290589.201970 | 47.2704 | -122.313 | 1870.0 | 7455.0 | 
| 20731 | 2.0 | 2.25 | 1240.0 | 705.0 | 2.0 | ... | 618687.511785 | 47.5321 | -122.073 | 1240.0 | 750.0 | 
| 2358 | 3.0 | 2.00 | 1280.0 | 13356.0 | 1.0 | ... | 314250.081967 | 47.3715 | -122.074 | 1590.0 | 8071.0 | 
y_train.groupby(X_train.zipcode).mean()[X_train.head().zipcode]
zipcode
98055.0    305061.113861
98031.0    303052.073892
98003.0    290589.201970
98027.0    618687.511785
98042.0    314250.081967
Name: price, dtype: float64
X = data.frame.drop(['date', 'price', 'zipcode'], axis=1)
scores = cross_val_score(Ridge(), X, y)
print(f"{np.mean(scores):.2f}")
0.69
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
X = data.frame.drop(['date', 'price'], axis=1)
ct = make_column_transformer((OneHotEncoder(), ['zipcode']), remainder='passthrough')
pipe_ohe = make_pipeline(ct, Ridge())
scores = cross_val_score(pipe_ohe, X, y)
print(f"{np.mean(scores):.2f}")
0.53
pipe_target = make_pipeline(ce.TargetEncoder(cols='zipcode'), Ridge())
scores = cross_val_score(pipe_target, X, y)
print(f"{np.mean(scores):.2f}")
0.79
 
 
from sklearn.linear_model import LogisticRegressionCV
X_train, X_test, y_train, y_test = \
    train_test_split(X_, y, stratify=y)  # X_: feature matrix containing missing values
nan_columns = np.any(np.isnan(X_train), axis=0)
X_drop_columns = X_train[:, ~nan_columns]
scores = cross_val_score(LogisticRegressionCV(cv=5),
                         X_drop_columns, y_train, cv=10)
np.mean(scores)
0.772
 
 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
nan_columns = np.any(np.isnan(X_train), axis = 0)
X_drop_columns = X_train[:,~nan_columns]
logreg = make_pipeline(StandardScaler(),
                       LogisticRegression())
scores = cross_val_score(logreg, X_drop_columns,
                         y_train, cv = 10)
print(np.mean(scores))
from sklearn.impute import SimpleImputer
mean_pipe = make_pipeline(SimpleImputer(), StandardScaler(),
                          LogisticRegression())
scores = cross_val_score(mean_pipe, X_train, y_train, cv=10)
print(np.mean(scores))
0.794
0.729
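SimpleImputer defaults to the per-column mean; other strategies and a missingness indicator are also available. A minimal sketch:
imp = SimpleImputer(strategy='median', add_indicator=True)
X_imp = imp.fit_transform(X_train)  # appends a binary missing-indicator column for each feature that had NaNs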
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)
# pairwise squared distances between training points, ignoring missing dimensions
distances = np.zeros((X_train.shape[0], X_train.shape[0]))
for i, x1 in enumerate(X_train):
    for j, x2 in enumerate(X_train):
        dist = (x1 - x2) ** 2
        nan_mask = np.isnan(dist)
        # mean over observed dimensions, rescaled to the full feature count
        distances[i, j] = dist[~nan_mask].mean() * X_train.shape[1]
neighbors = np.argsort(distances, axis=1)[:, 1:]  # each row: other points, nearest first
n_neighbors = 3
X_train_knn = X_train.copy()
for feature in range(X_train.shape[1]):
    has_missing_value = np.isnan(X_train[:, feature])
    for row in np.where(has_missing_value)[0]:
        # mean of the nearest neighbors that actually observed this feature
        neighbor_features = X_train[neighbors[row], feature]
        non_nan_neighbors = \
            neighbor_features[~np.isnan(neighbor_features)]
        X_train_knn[row, feature] = \
            non_nan_neighbors[:n_neighbors].mean()
scores = cross_val_score(logreg, X_train_knn, y_train, cv=10)
np.mean(scores)
0.849
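The loop above mirrors what KNNImputer does; in practice the imputer can go straight into a pipeline so imputation is refit inside every CV split. A minimal sketch:
from sklearn.linear_model import LogisticRegression
knn_imp_pipe = make_pipeline(KNNImputer(n_neighbors=3), StandardScaler(),
                             LogisticRegression())
scores = cross_val_score(knn_imp_pipe, X_train, y_train, cv=10)
print(np.mean(scores))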
 
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
# start from a simple mean-imputed fill so the forest never sees NaN
X_imputed = SimpleImputer().fit_transform(X_train)
for i in range(10):
    last = X_imputed.copy()
    for feature in range(X_train.shape[1]):
        inds_not_f = np.arange(X_train.shape[1])
        inds_not_f = inds_not_f[inds_not_f != feature]
        f_missing = np.isnan(X_train[:, feature])
        rf.fit(X_imputed[~f_missing][:, inds_not_f],
               X_train[~f_missing, feature])
        X_imputed[f_missing, feature] = rf.predict(
            X_imputed[f_missing][:, inds_not_f])
    if np.linalg.norm(last - X_imputed) < .5:  # stop once the imputed values stabilize
        break
scores = cross_val_score(logreg, X_imputed, y_train, cv=10)
np.mean(scores)
0.855
 
!pip install fancyimpute
sklearn's IterativeImputer can work well.
fancyimpute provides fancier features.
from fancyimpute import IterativeImputer
imputer = IterativeImputer(n_iter=5)
X_complete = imputer.fit_transform(X_train)
scores = cross_val_score(logreg, X_complete,
                         y_train, cv=10)
np.mean(scores)
0.866
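A sketch of the scikit-learn route mentioned above (IterativeImputer sits behind an experimental import flag; its default estimator is BayesianRidge, and any other regressor can be passed via estimator=):
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
it_pipe = make_pipeline(IterativeImputer(), StandardScaler(), LogisticRegression())
scores = cross_val_score(it_pipe, X_train, y_train, cv=10)
print(np.mean(scores))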