calibration_curve
with sklearnUsing subsample of covertype dataset
from sklearn.linear_model import LogisticRegressionCV
print(X_train.shape)
print(np.bincount(y_train))
lr = LogisticRegressionCV().fit(X_train, y_train)
# (52292, 54)
# [19036 33256]
lr.C_
array([ 2.783])
print(lr.predict_proba(X_test)[:10])
print(y_test[:10])
# [[ 0.681 0.319]
# [ 0.049 0.951]
# [ 0.706 0.294]
# [ 0.537 0.463]
# [ 0.819 0.181]
# [ 0. 1. ]
# [ 0.794 0.206]
# [ 0.676 0.324]
# [ 0.727 0.273]
# [ 0.597 0.403]]
# [0 1 0 1 1 1 0 0 0 1]
from sklearn.calibration import calibration_curve
probs = lr.predict_proba(X_test)[:, 1]
prob_true, prob_pred = calibration_curve(y_test, probs, n_bins=5)
print(prob_true)
print(prob_pred)
# [ 0.2 0.303 0.458 0.709 0.934]
# [ 0.138 0.306 0.498 0.701 0.926]
\[ BS = \frac{\sum_{i=1}^{n} (\widehat{p} (y_i)-y_i)^{2}}{n}\]
\[ f_{callib}(s(x)) \approx p(y)\]
\[f_{platt} = \frac{1}{1 + exp(-ws(x))}\]
from sklearn.calibration import CalibratedClassifierCV
X_train_sub, X_val, y_train_sub, y_val = \
train_test_split(X_train, y_train,
stratify=y_train, random_state=0)
rf = RandomForestClassifier(n_estimators=100).fit(X_train_sub, y_train_sub)
scores = rf.predict_proba(X_test)[:, 1]
plot_calibration_curve(y_test, scores, n_bins=20)
cal_rf = CalibratedClassifierCV(rf, cv="prefit",
method='sigmoid')
cal_rf.fit(X_val, y_val)
scores_sigm = cal_rf.predict_proba(X_test)[:, 1]
cal_rf_iso = CalibratedClassifierCV(rf, cv="prefit",
method='isotonic')
cal_rf_iso.fit(X_val, y_val)
scores_iso = cal_rf_iso.predict_proba(X_test)[:, 1]
cal_rf_iso_cv = CalibratedClassifierCV(rf, method='isotonic')
cal_rf_iso_cv.fit(X_train, y_train)
scores_iso_cv = cal_rf_iso_cv.predict_proba(X_test)[:, 1]
data = load_breast_cancer()
lr = LogisticRegression(solver='lbfgs', max_iter=10000)
X_train, X_test, y_train, y_test = \
train_test_split(data.data, data.target,
stratify=data.target, random_state=0)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.91 0.92 0.92 53 1 0.96 0.94 0.95 90 accuracy 0.94 143 macro avg 0.93 0.93 0.93 143 weighted avg 0.94 0.94 0.94 143
y_pred = lr.predict_proba(X_test)[:, 1] > .85
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.84 1.00 0.91 53 1 1.00 0.89 0.94 90 accuracy 0.93 143 macro avg 0.92 0.94 0.93 143 weighted avg 0.94 0.93 0.93 143
from sklearn.datasets import fetch_openml
data = fetch_openml('mammography', as_frame=True)
X, y = data.data, data.target
print(X.shape)
(11183, 6)
print(y.value_counts())
-1 10923 1 260 Name: class, dtype: int64
lr = LogisticRegression(solver='lbfgs')
scores = cross_validate(lr, X_train, y_train, cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
0.918, 0.631
rf = RandomForestClassifier(n_estimators=100)
scores = cross_validate(rf, X_train, y_train, cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
0.943, 0.738
Change the training procedure
!pip install imbalanced-learn
# or conda install ...
Extends sklearn
API
data_resampled, targets_resampled = obj.resample(data, targets)
data_resampled, targets_resampled = \
obj.fit_resample(data, targets)
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(replacement=False)
X_train_subsample, y_train_subsample = \
rus.fit_resample(X_train, y_train)
print(X_train.shape)
print(X_train_subsample.shape)
print(np.bincount(y_train_subsample))
(8387, 6) (392, 6) [196 196]
from imblearn.pipeline import make_pipeline as make_imb_pipeline
undersample_pipe = make_imb_pipeline(
RandomUnderSampler(random_state=57), lr)
scores = cross_validate(undersample_pipe,
X_train, y_train, cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
# was 0.918, 0.631 without
0.921, 0.580
undersample_pipe_rf = \
make_imb_pipeline(RandomUnderSampler(random_state=0),
RandomForestClassifier(n_estimators=100))
scores = cross_validate(undersample_pipe_rf,
X_train, y_train, cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
# was 0.943, 0.738 without
0.945, 0.594
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train_oversample, y_train_oversample = \
ros.fit_resample(X_train, y_train)
print(X_train.shape)
print(X_train_oversample.shape)
print(np.bincount(y_train_oversample))
(8387, 6) (16382, 6) [8191 8191]
oversample_pipe = make_imb_pipeline(
RandomOverSampler(random_state=0), lr)
scores = cross_validate(oversample_pipe,
X_train, y_train, cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
# was 0.918, 0.631 without
0.925, 0.570
oversample_pipe_rf = \
make_imb_pipeline(RandomOverSampler(random_state=0),
RandomForestClassifier(n_estimators=100))
scores = cross_validate(oversample_pipe_rf,
X_train, y_train, cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
# was 0.943, 0.738 without
0.923, 0.708
FPR or Precision? \[ \large\text{FPR} = \frac{\text{FP}}{\text{FP}+\text{TN}}\] \[ \large\text{Precision} = \frac{\text{TP}}{\text{TP}+\text{FP}}\]
\[\min_{w \in ℝ^{p}}-C \sum_{i=1}^n\log(\exp(-y_iw^T \textbf{x}_i) + 1) + ||w||_2^2\] \[ \min_{w \in \mathbb{R}^p} -C \sum_{i=1}^n c_{y_i} \log(\exp(-y_i w^T \mathbf{x}_i) + 1) + ||w||^2_2 \] Similar for linear and non-linear SVM
Gini Index: \[H_\text{gini}(X_m) = \sum_{k\in\mathcal{Y}} p_{mk} (1 - p_{mk})\] \[H_\text{gini}(X_m) = \sum_{k\in\mathcal{Y}} c_k p_{mk} (1 - p_{mk})\] Prediction: Weighted vote
lr = LogisticRegression(solver='lbfgs',
class_weight='balanced')
scores = cross_validate(lr, X_train, y_train, cv=10,
scoring=('roc_auc',
'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
0.923, 0.564
rf = RandomForestClassifier(n_estimators=100,
class_weight='balanced')
scores = cross_validate(rf, X_train, y_train, cv=10,
scoring=('roc_auc',
'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
>>> 0.915, 0.707
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_features='sqrt')
from imblearn.ensemble import BalancedBaggingClassifier
resampled_rf = \
BalancedBaggingClassifier(base_estimator=tree,
n_estimators=100, random_state=0)
scores = cross_validate(resampled_rf, X_train, y_train,
cv=10,
scoring=('roc_auc', 'average_precision'))
roc = scores['test_roc_auc'].mean()
avep = scores['test_average_precision'].mean()
print(f"{roc:.3f}, {avep:.3f}")
0.949, 0.668
(based on nearest neighbour heuristics from the 70's)
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours(n_neighbors=5)
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)
enn_mode = EditedNearestNeighbours(kind_sel="mode",
n_neighbors=5)
X_train_enn_mode, y_train_enn_mode = \
enn_mode.fit_resample(X_train, y_train)
enn_pipe = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5),
LogisticRegression(solver='lbfgs'))
scores = cross_val_score(enn_pipe, X_train, y_train, cv=10, scoring='roc_auc')
print(f"{np.mean(scores):.3f}")
0.921
enn_pipe_rf = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5),
RandomForestClassifier(n_estimators=100))
scores = cross_val_score(enn_pipe_rf, X_train, y_train, cv=10, scoring='roc_auc')
print(f"{np.mean(scores):.3f}")
0.941
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour()
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train, y_train)
print(X_train_cnn.shape)
print(np.bincount(y_train_cnn))
(553, 6) [357 196]
cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(),
LogisticRegression(solver='lbfgs'))
scores = cross_val_score(cnn_pipe, X_train, y_train, cv=10, scoring='roc_auc')
print(f"{np.mean(scores):.3f}")
0.916
rf = RandomForestClassifier(n_estimators=100, random_state=0)
cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(), rf)
scores = cross_val_score(cnn_pipe, X_train, y_train, cv=10, scoring='roc_auc')
print(f"{np.mean(scores):.2f}")
0.93
from imblearn.over_sampling import SMOTE
smote_pipe = make_imb_pipeline(SMOTE(),
LogisticRegression(solver='lbfgs'))
scores = cross_val_score(smote_pipe, X_train, y_train,
cv=10, scoring='roc_auc')
print(f"{np.mean(scores):.3f}")
0.922
smote_pipe_rf = \
make_imb_pipeline(SMOTE(),
RandomForestClassifier(n_estimators=100))
scores = cross_val_score(smote_pipe_rf,X_train,y_train,
cv=10, scoring='roc_auc')
print(f"{np.mean(scores):.3f}")
0.943
param_grid = {'smote__k_neighbors': [3, 5, 7, 9, 11, 15, 31]}
search = GridSearchCV(smote_pipe_rf, param_grid,
cv=10, scoring="roc_auc")
search.fit(X_train, y_train)
print(f"{search.score(X_test, y_test):.3f}")
0.959