Adding a callback to a sklearn GridSearch
GridSearch in scikit-learn is just awesome, no doubt about that. But what is unfortunate is that it only reports one metric in the results, and you can't store any intermediate information or perform actions during the search (such as saving every model, or computing metrics other than the chosen one).
One (and perhaps the only?) way to achieve that is to create your own custom scoring function, which does something more than just scoring...
Here is an example showing how far I have got.
import json
import logging
import pickle
import time

from sklearn import metrics

log = logging.getLogger(__name__)

def log_metrics_and_params(results, model_savepath):
    # log the computed metrics together with the saved model's path
    to_write = {}
    to_write['results'] = results
    to_write['model_savepath'] = model_savepath
    log.info('%s', json.dumps(to_write))

def save_model(clf):
    # pickle the model under a timestamped filename
    timestring = "".join(str(time.time()).split("."))
    model_savepath = 'model_' + timestring + '.pk'
    with open(model_savepath, 'wb') as ofile:
        pickle.dump(clf, ofile)
    return model_savepath

def get_train_metrics():
    # currently impossible:
    # X_train and y_train live in GridSearchCV's scope, not in the scorer's
    pass

def get_val_metrics(y_pred, y_true):
    return get_metrics(y_pred, y_true)

def get_metrics(y_pred, y_true):
    # compute more than just one metric
    chosen_metrics = {
        'conf_mat': metrics.confusion_matrix,
        'accuracy': metrics.accuracy_score,
        'auc': metrics.roc_auc_score,
    }
    results = {}
    for metric_name, metric_func in chosen_metrics.items():
        try:
            # sklearn metrics expect (y_true, y_pred) argument order
            inter_res = metric_func(y_true, y_pred)
        except Exception as ex:
            inter_res = None
            log.error("Couldn't evaluate %s because of %s", metric_name, ex)
        results[metric_name] = inter_res
    if results['conf_mat'] is not None:
        # numpy arrays are not JSON serializable
        results['conf_mat'] = results['conf_mat'].tolist()
    return results

def _my_scorer(clf, X_val, y_true_val):
    # scikit-learn calls a callable scorer as scorer(estimator, X, y);
    # do all the extra work here and return one metric as the score
    y_pred_val = clf.predict(X_val)
    results = get_val_metrics(y_pred_val, y_true_val)
    model_savepath = save_model(clf)
    log_metrics_and_params(results, model_savepath)
    return results['accuracy']
and then just call the grid search:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(clf, tuned_params, scoring=_my_scorer)
gs.fit(X, y)
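As a variation on the same trick (my own sketch, the names RecordingScorer and records are made up): since scoring accepts any callable, a small callable class can collect the intermediate results in memory instead of going through the log. Note this is only reliable with n_jobs=1; parallel workers would each mutate their own pickled copy of the scorer.

class RecordingScorer:
    def __init__(self):
        self.records = []  # one metrics dict per scorer call

    def __call__(self, clf, X_val, y_true_val):
        y_pred_val = clf.predict(X_val)
        results = get_val_metrics(y_pred_val, y_true_val)
        self.records.append(results)
        return results['accuracy']

my_scorer = RecordingScorer()
gs = GridSearchCV(clf, tuned_params, scoring=my_scorer)
gs.fit(X, y)
# my_scorer.records now holds all the collected metric dicts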
Unfortunately, I wasn't able to get X_train and y_train into the scorer's scope, so I can't compute metrics on the training data :-( .
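One partial workaround I'm aware of (a sketch, assuming a scikit-learn version where GridSearchCV accepts return_train_score): with that flag set, the scorer is also invoked on each training split, so _my_scorer receives X_train / y_train as its X / y arguments. The caveat is that the scorer can't tell a training call apart from a validation call.

# with return_train_score=True the scorer also runs on each training split
gs = GridSearchCV(clf, tuned_params, scoring=_my_scorer,
                  return_train_score=True)
gs.fit(X, y)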