Explain model globally
Explain model locally
Naive:
coef_ for linear models
feature_importances_ for tree-based models
Drop-feature importance:
\[ I^{drop}_i = Acc(f, X, y) - Acc(f', X_{-i}, y) \]
import numpy as np
from sklearn.model_selection import cross_val_score

def drop_feature_importance(est, X, y):
    # cross-validated score with all features as the baseline
    base_score = np.mean(cross_val_score(est, X, y))
    scores = []
    for feature in range(X.shape[1]):
        # remove one column and re-run cross-validation
        mask = np.ones(X.shape[1], 'bool')
        mask[feature] = False
        X_new = X[:, mask]
        this_score = np.mean(cross_val_score(est, X_new, y))
        # importance = how much the score drops without the feature
        scores.append(base_score - this_score)
    return np.array(scores)
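A quick sanity check on synthetic data (a sketch continuing from the function above; make_regression and Ridge are stand-ins chosen here, not part of the original example):
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X_syn, y_syn = make_regression(n_samples=500, n_features=8, n_informative=3,
                               random_state=0)
importances = drop_feature_importance(Ridge(), X_syn, y_syn)
print(np.argsort(importances)[::-1])  # features ranked by cross-validated score drop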
Idea: measure marginal influence of one feature.
\[ I^{perm}_i = Acc(f, X, y) - E_{x_i}[Acc(f(x_i, X_{-i}), y)] \]
import numpy as np

def permutation_importance(est, X, y, n_repeat=100):
    # score with intact features as the baseline
    baseline_score = est.score(X, y)
    scores = np.zeros((X.shape[1], n_repeat))
    for f_idx in range(X.shape[1]):
        for repeat in range(n_repeat):
            X_new = X.copy()
            # shuffle one column in place (np.random.shuffle returns None)
            np.random.shuffle(X_new[:, f_idx])
            feature_score = est.score(X_new, y)
            scores[f_idx, repeat] = baseline_score - feature_score
    return scores
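Recent scikit-learn versions (0.22+) ship this as sklearn.inspection.permutation_importance; a minimal sketch, assuming some already-fitted estimator est and a held-out set X_test, y_test:
from sklearn import inspection

# Bunch with per-feature score drops over n_repeats shuffles
result = inspection.permutation_importance(est, X_test, y_test,
                                           n_repeats=10, random_state=0)
print(result.importances_mean)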
Model comparison on a regression task: 100,000 points, 8 features
lasso = LassoCV().fit(X_train, y_train)
lasso.score(X_test, y_test)
0.545
ridge = RidgeCV().fit(X_train, y_train)
ridge.score(X_test, y_test)
0.545
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)
0.545
param_grid = {'max_leaf_nodes': range(5, 40, 5)}
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=10, n_jobs=3)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)
0.545
rf = RandomForestRegressor(min_samples_leaf=5).fit(X_train, y_train)
rf.score(X_test, y_test)
0.542
from sklearn.ensemble.partial_dependence import plot_partial_dependence
boston = load_boston()
X_train, X_test, y_train, y_test = \
    train_test_split(boston.data, boston.target, random_state=0)
gbrt = GradientBoostingRegressor().fit(X_train, y_train)
fig, axs = plot_partial_dependence(
    gbrt, X_train, np.argsort(gbrt.feature_importances_)[-6:],
    feature_names=boston.feature_names, n_jobs=3, grid_resolution=50)
# passing a pair of features produces a two-way (2D) partial dependence plot
plot_partial_dependence(
    gbrt, X_train, [np.argsort(gbrt.feature_importances_)[-2:]],
    feature_names=boston.feature_names, n_jobs=3, grid_resolution=50)
from sklearn.ensemble.partial_dependence import plot_partial_dependence
# for multi-class models, one set of plots per class (label=i)
for i in range(3):
    fig, axs = plot_partial_dependence(
        gbrt, X_train, range(4), n_cols=4,
        feature_names=iris.feature_names, grid_resolution=50, label=i)
from sklearn.preprocessing import scale
boston = load_boston()
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=0)
X_train_scaled = scale(X_train)
cov = np.cov(X_train_scaled, rowvar=False)
from scipy.cluster import hierarchy
order = np.array(hierarchy.dendrogram(
    hierarchy.ward(cov), no_plot=True)['ivl'], dtype="int")
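The ordering is typically used to re-plot the covariance matrix with correlated features grouped together (a sketch; the matplotlib plotting details are an assumption, not from the original):
import matplotlib.pyplot as plt

# reorder rows and columns so clustered (correlated) features sit next to each other
plt.matshow(cov[order][:, order], cmap='bwr')
plt.xticks(range(len(order)), boston.feature_names[order], rotation=90)
plt.yticks(range(len(order)), boston.feature_names[order])
plt.colorbar()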
from sklearn.feature_selection import f_regression
f_values, p_values = f_regression(X, y)
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFpr
from sklearn.linear_model import RidgeCV
select = SelectKBest(k=2, score_func=f_regression)
select.fit(X_train, y_train)
print(X_train.shape)
print(select.transform(X_train).shape)
(379, 13)
(379, 2)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

all_features = make_pipeline(StandardScaler(), RidgeCV())
np.mean(cross_val_score(all_features, X_train, y_train, cv=10))
0.718
select_2 = make_pipeline(StandardScaler(),
                         SelectKBest(k=2, score_func=f_regression),
                         RidgeCV())
np.mean(cross_val_score(select_2, X_train, y_train, cv=10))
0.624
from sklearn.feature_selection import mutual_info_regression
# feature 3 (CHAS) is binary, so mark it as discrete
scores = mutual_info_regression(X_train, y_train,
                                discrete_features=[3])
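The mutual-information scores plug into the same univariate selectors as f_regression (a sketch; k=2 is an arbitrary choice here, not from the original):
# univariate selection using mutual information instead of the F statistic
select_mi = SelectKBest(k=2, score_func=mutual_info_regression)
select_mi.fit(X_train, y_train)
print(boston.feature_names[select_mi.get_support()])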
from sklearn.linear_model import LassoCV
X_train_scaled = scale(X_train)
lasso = LassoCV().fit(X_train_scaled, y_train)
print(lasso.coef_)
[-0.881 0.951 -0.082 0.59 -1.69 2.639 -0.146 -2.796 1.695 -1.614 -2.133 0.729 -3.615]
from sklearn.linear_model import Lasso
X_train_scaled = scale(X_train)
lasso = Lasso().fit(X_train_scaled, y_train)
print(lasso.coef_)
[-0. 0. -0. 0. -0. 2.529 -0. -0. -0. -0.228 -1.701 0.132 -3.606]
SelectFromModel
from sklearn.feature_selection import SelectFromModel
select_lassocv = SelectFromModel(LassoCV(), threshold=1e-5)
select_lassocv.fit(X_train, y_train)
print(select_lassocv.transform(X_train).shape)
(379, 11)
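get_support() gives the boolean mask of the columns that survived the threshold (continuing from the fitted selector above):
# features whose LassoCV coefficient magnitude exceeds the threshold
print(boston.feature_names[select_lassocv.get_support()])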
pipe_lassocv = make_pipeline(StandardScaler(),
                             select_lassocv, RidgeCV())
np.mean(cross_val_score(pipe_lassocv, X_train, y_train, cv=10))
np.mean(cross_val_score(all_features, X_train, y_train, cv=10))
0.717
0.718
# could grid-search alpha in lasso
select_lasso = SelectFromModel(Lasso())
pipe_lasso = make_pipeline(StandardScaler(), select_lasso, RidgeCV())
np.mean(cross_val_score(pipe_lasso, X_train, y_train, cv=10))
0.671
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
# create ranking among all features by selecting only one
rfe = RFE(LinearRegression(), n_features_to_select=1)
rfe.fit(X_train_scaled, y_train)
rfe.ranking_
array([ 9, 8, 13, 11, 5, 2, 12, 4, 7, 6, 3, 10, 1])
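The ranking can be mapped back to feature names, most important first (a sketch; ranking_ assigns 1 to the feature kept longest):
# features ordered from most to least important according to RFE
print(boston.feature_names[np.argsort(rfe.ranking_)])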
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
rfe = RFECV(LinearRegression(), cv=10)
rfe.fit(X_train_scaled, y_train)
print(rfe.support_)
print(boston.feature_names[rfe.support_])
[ True True False True True True False True True True True True True]
['CRIM' 'ZN' 'CHAS' 'NOX' 'RM' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']
pipe_rfe_ridgecv = make_pipeline(StandardScaler(),
                                 RFECV(LinearRegression(), cv=10),
                                 RidgeCV())
np.mean(cross_val_score(pipe_rfe_ridgecv, X_train, y_train, cv=10))
0.710
from sklearn.preprocessing import PolynomialFeatures
pipe_rfe_ridgecv = make_pipeline(StandardScaler(),
                                 PolynomialFeatures(),
                                 RFECV(LinearRegression(), cv=10),
                                 RidgeCV())
np.mean(cross_val_score(pipe_rfe_ridgecv, X_train, y_train, cv=10))
0.820
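To see how aggressively RFECV prunes the polynomial expansion, fit the pipeline and inspect the RFECV step (a sketch; 'rfecv' is the step name make_pipeline derives from the class name):
pipe_rfe_ridgecv.fit(X_train, y_train)
rfecv_step = pipe_rfe_ridgecv.named_steps['rfecv']
# number of expanded polynomial features kept vs. total generated
print(rfecv_step.n_features_, "of", len(rfecv_step.support_))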
SequentialFeatureSelector from mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector
# backward elimination: start from all 13 features and greedily drop down to 7
sfs = SequentialFeatureSelector(LinearRegression(),
                                forward=False, k_features=7)
sfs.fit(X_train_scaled, y_train)
Features: 7/7
print(sfs.k_feature_idx_)
print(boston.feature_names[np.array(sfs.k_feature_idx_)])
(1, 4, 5, 7, 9, 10, 12)
['ZN' 'NOX' 'RM' 'DIS' 'TAX' 'PTRATIO' 'LSTAT']
sfs.k_score_
0.725
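Forward selection is the same call with forward=True (a sketch; k_features=7 is again an arbitrary choice):
# greedy forward selection: start from an empty set and add one feature at a time
sfs_forward = SequentialFeatureSelector(LinearRegression(),
                                        forward=True, k_features=7)
sfs_forward.fit(X_train_scaled, y_train)
print(sfs_forward.k_feature_idx_, sfs_forward.k_score_)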