from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

def shuffle_split_pimped(clf, X_train, y_train, n):
    # plausible reconstruction of the elided body: n stratified shuffle
    # splits, each scored by ROC AUC (suits the imbalanced classes here)
    cv = StratifiedShuffleSplit(n_splits=n, test_size=0.2, random_state=0)
    r = cross_val_score(clf, X_train, y_train, cv=cv, scoring='roc_auc')
    return r
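The histograms below compare how many missing values each class carries per row; feature_nan is not defined in this excerpt, but it is presumably the per-row NaN count, along these lines (a sketch assuming X_train is a pandas DataFrame):

# Assumption: feature_nan counts the missing features in each training row,
# aligned with y_train so it can be masked by class below.
feature_nan = X_train.isnull().sum(axis=1)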
import matplotlib.pyplot as plt

f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))
# normed= was removed from matplotlib; density= is the current argument
ax1.hist(feature_nan[y_train==1], bins=30, density=True, facecolor='red', alpha=0.55)
ax1.set_title("Bankrupt NaN")
ax2.hist(feature_nan[y_train==0], bins=30, density=True, facecolor='blue', alpha=0.55)
ax2.set_title("Healthy NaN")
plt.show()
from hyperopt import hp, tpe, fmin
from xgboost import XGBClassifier

def objective(params):
    # hyperopt hands back raw floats; cast the integer-valued parameters
    # and round the rest before passing them to XGBoost
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': round(params['gamma'], 4),
        'colsample_bytree': round(params['colsample_bytree'], 4),
        'scale_pos_weight': int(params['scale_pos_weight']),
        'n_estimators': int(params['n_estimators']),
        'learning_rate': round(params['learning_rate'], 4),
        'subsample': round(params['subsample'], 4),
    }
    clf = XGBClassifier(n_jobs=4, eval_metric="auc", **params)
    score = shuffle_split_pimped(clf, X_train, y_train, 8).mean()
    print("Score {:.4f} params {}".format(score, params))
    return -score  # fmin minimizes, so return the negated AUC
space = {
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.6),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 10, 100, 1),
    'n_estimators': hp.quniform('n_estimators', 200, 400, 20),
    'learning_rate': hp.uniform('learning_rate', 0.04, 0.15),
    'subsample': hp.uniform('subsample', 0.7, 1.0),
}
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)
Score 0.9196 params {'max_depth': 4, 'gamma': '0.0824', 'colsample_bytree': '0.8314', 'scale_pos_weight': 94, 'n_estimators': 380, 'learning_rate': '0.0641', 'eta': '0.0315', 'subsample': '0.7748', 'num_boost_round': 60}
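Each evaluation prints a line like the one above. When the search finishes, fmin returns the best raw samples keyed by the labels in space, so the integer-valued entries still need casting before a final fit; a sketch of that hand-off (the refit step is assumed here, not part of the original):

# quniform returns floats, so cast the integer-valued hyperparameters back
for key in ('max_depth', 'scale_pos_weight', 'n_estimators'):
    best[key] = int(best[key])

final_clf = XGBClassifier(n_jobs=4, eval_metric="auc", **best)
final_clf.fit(X_train, y_train)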
Repeat N_rounds times:
SMOTE (Synthetic Minority Over-Sampling Technique)
For each point p in the minority class S:
Compute its k nearest neighbors in S
Randomly choose r ≤ k of the neighbors (with replacement)
Choose a random point along the line segment joining p and each of the r selected neighbors
Add these synthetic points to the dataset with class S (a code sketch follows this list)
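A minimal NumPy sketch of those four steps (illustrative only; the function name smote_sketch and the n_synthetic parameter are invented here, and the minority points are assumed to sit in a 2-D array X_min):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_sketch(X_min, n_synthetic, k=5):
    # k nearest neighbors of each minority point, within the minority class only
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_min)
    _, idx = nn.kneighbors(X_min)                # idx[:, 0] is the point itself
    new_points = []
    for _ in range(n_synthetic):
        i = np.random.randint(len(X_min))        # pick a minority point p
        j = idx[i, np.random.randint(1, k + 1)]  # one of its k neighbors (with replacement)
        lam = np.random.rand()                   # random position on the segment
        new_points.append(X_min[i] + lam * (X_min[j] - X_min[i]))
    return np.vstack(new_points)                 # synthetic points, all class S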
ENN (Edited Nearest Neighbor)
For each point p in the dataset:
Compute its k nearest neighbors
If the majority class among those neighbors disagrees with p's label, remove p from the dataset
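Both steps are implemented in the imbalanced-learn package; a minimal sketch of the combined pipeline, assuming the X_train and y_train from earlier (the method is named fit_resample in recent releases, fit_sample in older ones):

from imblearn.combine import SMOTEENN

# SMOTE over-sampling followed by ENN cleaning in one call
X_res, y_res = SMOTEENN(random_state=0).fit_resample(X_train, y_train)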