Source code for olpy

"""This is the main module of the OLPy package.
It compares the performance of the various algorithms and returns the
result to the user in the desired format.
"""

# Names exported by a star-import of this module: the two entry-point
# functions defined below and the names imported from this package.
__all__ = ['olpy_parse_args', 'run_experiments', 'classifiers', 'datasets', 'exceptions', 'preprocessing', 'utils']
# Package version; it is also the string printed by the ``--version``
# command-line flag built in ``olpy_parse_args``.
__version__ = '1.0.0'
# Package author.
__author__ = 'Boladji Vinny'


from . import classifiers, datasets, preprocessing, utils, exceptions

import argparse
import time
import os
import joblib
import pathlib
import sklearn

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV as __GridSearchCV



def olpy_parse_args(argv=None):
    """Parse the command-line arguments passed to the main program.

    Args:
        argv (:obj:`list` of :obj:`str`, optional): the argument strings
            to parse. Defaults to None, in which case :mod:`argparse`
            reads them from ``sys.argv[1:]``.

    Returns:
        :obj:`argparse.Namespace`: the set of arguments parsed. It exposes
        the attributes ``train_set`` and ``test_set`` (files already opened
        for reading by ``argparse.FileType``), ``label``, ``models``, ``n``,
        ``s``, ``o``, ``bias``, ``use_weights``, ``weights``, ``cv``,
        ``dump_dir`` and ``v``.

    Raises:
        SystemExit: raised by :mod:`argparse` when the arguments are
            invalid (unknown option, model outside the allowed choices,
            unreadable dataset file) and after serving ``--help`` or
            ``--version``.
    """
    model_choices = [
        'all', 'alma', 'arow', 'cw', 'scw', 'scw2', 'iellip', 'narow',
        'nherd', 'ogd', 'pa', 'pa1', 'pa2', 'perceptron', 'sop', 'romma',
        'aromma',
    ]

    parser = argparse.ArgumentParser(
        prog="OLPy",
        description='After receiving input from the user, this program '
                    'trains a series of Online Machine Learning models for '
                    'binary classification.'
    )

    # Positional arguments: the two datasets. FileType opens the files
    # while parsing, so the namespace holds file objects, not paths.
    parser.add_argument(
        'train_set',
        metavar='TRAINING SET',
        help='CSV file containing the training dataset.',
        type=argparse.FileType('r')
    )
    parser.add_argument(
        'test_set',
        metavar='TESTING SET',
        help='CSV file containing the test dataset.',
        type=argparse.FileType('r')
    )

    parser.add_argument(
        '-l', '--label',
        type=str,
        default='Label',
        help='index of the target variable. (default: %(default)s)'
    )
    # NOTE: when the option is omitted the value is the *string* 'all',
    # whereas an explicit ``--models ...`` yields a list of strings.
    parser.add_argument(
        '--models',
        type=str,
        nargs='+',
        default='all',
        help='the list of models to try from or use %(default)s',
        choices=model_choices
    )
    parser.add_argument(
        '-n',
        type=int,
        default=1,
        help='the number of iterations to run. (default: %(default)s)'
    )
    parser.add_argument(
        '-s',
        type=int,
        default=None,
        help='the random seed to use in training the models. '
             '(default: %(default)s)'
    )
    parser.add_argument(
        '-o',
        type=str,
        default='experiment-results.csv',
        help='file to which the reports would be saved '
             '(default: %(default)s)'
    )
    parser.add_argument(
        '-b', '--bias',
        help="whether or not a bias should be used for the training.",
        action="store_true"
    )
    parser.add_argument(
        '-w', '--use-weights',
        help="whether or not weights should be used while training the "
             "models.",
        action="store_true"
    )
    parser.add_argument(
        '--weights',
        help="custom weights to use with the training",
        type=float,
        nargs='+'
    )
    parser.add_argument(
        '--cv',
        help="whether or not hyper-parameter tuning through cross "
             "validation should be done.",
        action="store_true"
    )
    parser.add_argument(
        '-d', '--dump-dir',
        type=pathlib.Path,
        help="output directory for dumping the models. (default: %(default)s)"
    )
    # Each repetition of -v increases the verbosity level by one.
    parser.add_argument(
        '-v',
        help='represents the verbosity level of the application. '
             '(default: %(default)d)',
        action="count",
        default=0
    )
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s {}'.format(__version__)
    )

    return parser.parse_args(argv)
def _olpy_model_spec(name, seed=None):
    """Return ``(estimator, param_grid)`` for the model called `name`.

    Args:
        name (:obj:`str`): lower-case model identifier (e.g. ``'alma'``).
        seed (:obj:`int`, optional): value forwarded as ``random_state``
            to the classifier constructor. Defaults to None.

    Returns:
        :obj:`tuple`: a freshly built classifier and the dictionary of
        hyper-parameter values explored when cross validation is enabled.

    Raises:
        ValueError: if `name` is not one of the supported models.
    """
    def powers_of_two():
        return [2 ** i for i in range(-4, 5)]

    def tenths(stop):
        return list(np.arange(0.1, stop, 0.1))

    def etas():
        return list(np.arange(0.50, 1, 0.05))

    # Model identifier -> (class name in ``classifiers``, CV search grid).
    specs = {
        'alma': ('ALMA', {
            'C': powers_of_two(),
            'p': range(2, 12, 2),
            'alpha': etas()
        }),
        'arow': ('AROW', {'r': powers_of_two()}),
        'cw': ('CW', {'a': tenths(1), 'eta': etas()}),
        'scw': ('SCW', {'C': powers_of_two(), 'eta': etas()}),
        'scw2': ('SCW2', {'C': powers_of_two(), 'eta': etas()}),
        'iellip': ('IELLIP', {
            'a': tenths(1.1),
            'b': tenths(1.1),
            'c': tenths(1.0)
        }),
        'narow': ('NAROW', {'a': tenths(1.1)}),
        'nherd': ('NHerd', {'a': tenths(1.1), 'C': powers_of_two()}),
        'ogd': ('OGD', {'C': powers_of_two()}),
        'pa': ('PA', {}),
        'pa1': ('PA_I', {'C': powers_of_two()}),
        'pa2': ('PA_II', {'C': powers_of_two()}),
        'perceptron': ('Perceptron', {}),
        'sop': ('SecondOrderPerceptron', {'a': tenths(1.1)}),
        'romma': ('ROMMA', {}),
        'aromma': ('aROMMA', {}),
    }
    try:
        class_name, grid = specs[name]
    except KeyError:
        raise ValueError(
            "Unknown model '{}'; expected one of: {}".format(
                name, ', '.join(specs))
        ) from None
    return getattr(classifiers, class_name)(random_state=seed), grid


def run_experiments(
    train_file,
    test_file,
    models,
    n_iterations=1,
    label='Label',
    bias=False,
    use_weights=False,
    weights=None,
    cv=False,
    model_dir=None,
    verbose=False,
    output_file=None,
    seed=None,
):
    """Run an experiment using the data passed by the user.

    Given the parameters passed by the user, this function trains every
    requested model on the training set, evaluates it on the test set and
    reports the results at the specified destinations (files and/or
    console).

    Args:
        train_file (:obj:`str`): path to the training dataset file (any
            input accepted by ``pandas.read_csv`` works, including an
            already opened file).
        test_file (:obj:`str`): path to the testing dataset file.
        models (:obj:`list`): a list of models to try out, or ``'all'``
            (alone or inside the list) to try every available model.
        n_iterations (:obj:`int`, optional): number of iterations to run
            each model for. Defaults to 1.
        label (:obj:`str`, optional): the column index of the output
            variable. Defaults to 'Label'.
        bias (:obj:`bool`, optional): whether a bias should be used or
            not. Defaults to False.
        use_weights (:obj:`bool`, optional): whether weights should be
            used while training the models. Defaults to False.
        weights (:obj:`numpy.ndarray`, optional): an array representing
            the weights to use during the training process. This only
            works when `use_weights` is set to True; when it is missing or
            has fewer than two entries, balanced class weights are
            computed from the training labels instead.
        cv (:obj:`bool`, optional): whether cross validation will be ran
            or not. Defaults to False.
        model_dir (:obj:`str`, optional): the directory to which the dumps
            of the models will be saved. It is created when missing.
            Defaults to None.
        verbose (:obj:`bool`, optional): whether the program should
            produce output or not; an integer verbosity level is accepted
            too. Defaults to False.
        output_file (:obj:`str`): path to the output file if any.
        seed (:obj:`int`, optional): random-generator seed. Defaults to
            None.

    Returns:
        None

    Raises:
        FileNotFoundError: if one of the files passed does not exist.
        KeyError: if the specified label does not exist in the data.
        ValueError: if one of the requested models is unknown.
    """
    # Load the datasets
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    Y_train = train_data.loc[:, label].to_numpy()
    Y_test = test_data.loc[:, label].to_numpy()

    # Scaling the dataset to avoid numerical issues. The scaler is fitted
    # on the training data only and then re-used for the test data, so
    # both sets go through the exact same transformation.
    scaler = sklearn.preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(train_data.drop(columns=[label]))
    X_test = scaler.transform(test_data.drop(columns=[label]))

    if bias:
        # The bias column is prepended *after* scaling: min-max scaling a
        # constant column of ones would turn it into zeros and silently
        # disable the bias term.
        X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
        X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

    # Check the oversampling now
    class_weight = None
    if use_weights:
        if weights is not None and len(weights) >= 2:
            class_weight = np.array(weights)
        else:
            class_weight = sklearn.utils.class_weight.compute_class_weight(
                class_weight='balanced',
                classes=np.unique(Y_train),
                y=Y_train
            )

    # A bare string is a single model name, not a sequence of characters.
    if isinstance(models, str):
        models = [models]
    # First we replace all by the list of available models
    if 'all' in models:
        models = [
            'alma', 'arow', 'cw', 'scw', 'scw2', 'iellip', 'narow', 'nherd',
            'ogd', 'pa', 'pa1', 'pa2', 'perceptron', 'sop', 'romma', 'aromma'
        ]
    model_names = list(models)

    # Build the estimators and their search grids, index-aligned with
    # ``model_names`` so every report line carries the right name.
    models_ = []
    params_ = []
    for name in model_names:
        estimator, grid = _olpy_model_spec(name.lower(), seed)
        models_.append(estimator)
        params_.append(grid)

    summary = pd.DataFrame(
        np.zeros((len(models_), 10)),
        columns=[
            'Training-Time', 'Prediction-Time', 'Accuracy', 'F1-Score',
            'Recall', 'ROC_AUC-Score', 'FP', 'FN', 'TP', 'TN'
        ])
    summary.insert(0, 'Model', model_names)

    if verbose > 0:
        print(
            "%9s\t%8s\t%8s\t%8s\t%8s\t%8s\t%8s\t%8s\t%8s\t%8s\n" % (
                'algorithm', 'train time (s)', 'test time (s)', 'accuracy',
                'f1-score', 'roc-auc', 'true positive', 'true negative',
                'false positive', 'false negative'
            ))

    dump_dir = None
    if model_dir is not None:
        # Accept plain strings as documented, and make sure the target
        # directory exists before the first dump.
        dump_dir = pathlib.Path(model_dir)
        dump_dir.mkdir(parents=True, exist_ok=True)

    # Passing every label seen in the data keeps the confusion matrix
    # 2x2 for binary problems even when the test set (or the predictions)
    # contain a single class.
    all_labels = np.unique(np.concatenate([Y_train, Y_test]))

    # The estimators are told one verbosity level less than the program;
    # clamp at zero so a quiet run never hands them a negative level.
    # NOTE(review): assumes the classifiers treat 0 as "silent" - confirm.
    search_verbosity = max(int(verbose) - 1, 0)

    best_params_parts = ["Best params: \n"]
    for i, (name, model) in enumerate(zip(model_names, models_)):
        if use_weights:
            model.set_params(class_weight=class_weight)

        if cv:
            search = __GridSearchCV(model, params_[i], n_jobs=-1)
            search.fit(X_train, Y_train, verbose=search_verbosity)

            # After collecting, let's save, report and proceed
            model.set_params(**search.best_params_)
            best_params_parts.append(
                name + "\n" + str(search.best_params_) + "\n\n")

        # Set the number of iterations now
        model.set_params(num_iterations=n_iterations)

        training_start = time.time()
        model.fit(X_train, Y_train, verbose=False)
        duration = time.time() - training_start

        # Scoring is kept outside of the timed prediction window.
        scores = model.decision_function(X_test)

        test_start = time.time()
        preds = model.predict(X_test)
        preds_duration = time.time() - test_start

        acc = sklearn.metrics.accuracy_score(Y_test, preds)
        f1 = sklearn.metrics.f1_score(Y_test, preds)
        recall = sklearn.metrics.recall_score(Y_test, preds)
        # Rates are normalised per true class (rows of the matrix).
        tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
            Y_test, preds, labels=all_labels, normalize='true'
        ).ravel()

        # ROC would not compute if for instance we have only one class in
        # the test data.
        # This is the case for the svmguide3 dataset bundled with the
        # package.
        try:
            roc = sklearn.metrics.roc_auc_score(Y_test, scores)
        except ValueError:
            roc = np.nan

        if verbose:
            print(
                "%-12s\t%-3f\t%-3f\t%-5f\t%-5f\t%-5f\t%-5f\t%-5f\t%-5f\t%-5f"
                % (name, duration, preds_duration, acc, f1, roc,
                   tp, tn, fp, fn))

        row = {
            'Training-Time': duration,
            'Prediction-Time': preds_duration,
            'Accuracy': acc,
            'F1-Score': f1,
            'Recall': recall,
            'ROC_AUC-Score': roc,
            'TP': tp,
            'TN': tn,
            'FP': fp,
            'FN': fn,
        }
        for column, value in row.items():
            summary.loc[i, column] = value

        if dump_dir is not None:
            # Save the model
            joblib.dump(model, dump_dir / (name + '.dump'))

    if cv:
        print()
        print()
        print("".join(best_params_parts))

    if output_file is not None:
        summary.to_csv(output_file, index=False)