# Source code for pairpro.evaluate_model

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import sklearn

# testing function

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier

# Short labels for the candidate models.
# NOTE(review): nine labels are listed here ('SVM' last) while `classifiers`
# below holds eight estimators because SVC is commented out -- confirm that
# any code pairing the two sequences tolerates the extra label.
names = [
    'LR',
    'KNN',
    'DT',
    'NB',
    'RF',
    'Bagging',
    'AB',
    'GB',
    'SVM',
]

# Estimator instances (hyperparameters optimized)
classifiers = [
    # Logistic regression, default settings
    LogisticRegression(),
    # k-nearest neighbours (neighbors optimized iteratively)
    KNeighborsClassifier(n_neighbors=20),
    # Single decision tree
    DecisionTreeClassifier(max_features=None),
    # Gaussian naive Bayes
    GaussianNB(),
    # Random forest (with optuna)
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        max_samples=0.3,
        max_features=0.5,
        min_weight_fraction_leaf=0,
        min_samples_split=17,
    ),
    # Bagging ensemble wrapped around a random forest (with optuna)
    BaggingClassifier(
        RandomForestClassifier(
            n_estimators=200,
            max_depth=None,
            min_weight_fraction_leaf=0.000215,
        ),
        max_samples=0.5,
        max_features=0.5,
    ),
    # AdaBoost (with optuna)
    AdaBoostClassifier(
        n_estimators=53,
        learning_rate=0.156,
    ),
    # Gradient boosting (with optuna)
    GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
    ),
    # C-support vector classification (9) -- currently disabled
    #     SVC(),
]


def _write_scale_header(report):
    '''
    Writes the legend of metric value ranges that opens every report.

    Args:
        report: writable text file object
    '''
    report.write('Evaluation Scale:' + '\n')
    report.write('0.0% <=Accuracy<= 100.0%' + '\n')
    report.write('0.0 <=AUC<= 1.0' + '\n')  # area under curve
    report.write('0.0 <=auPR<= 1.0' + '\n')  # average_Precision
    report.write('0.0 <=F1_Score<= 1.0' + '\n')
    report.write('-1.0 <=MCC<= 1.0' + '\n')
    report.write('_______________________________________' + '\n')


def evaluate_model(model, target: list, dataframe):
    '''
    Takes a trained model and test data and tests the model.
    Runs a single or multi-class Classifier depending on input: when
    'structure_match' is not in target the single-target metrics
    (accuracy, AUC, auPR, F1, MCC, recall) are reported, otherwise the
    two prediction columns are scored together (accuracy plus mean
    precision, F1 and recall).

    Side effects: overwrites 'evaluationResults.txt' in the working
    directory and writes './data/user_predictions.csv' (the './data'
    directory is created if it does not exist). The multi-target branch
    also prints its scores to stdout.

    Args:
        model: trained sklearn classifier; the single-target branch
            additionally calls predict_proba on it
        target: target column name(s) for classifier (list)
        dataframe: pandas dataframe holding 'query' and 'subject'
            columns, the target column(s) and the feature columns

    Returns:
        preds: vector of predictions (numpy array)
        results_df: dataframe of 'query'/'subject' plus the prediction
            column(s), identical to what is saved to csv

    Raises:
        AssertionError: if model does not look like an sklearn object
    '''
    import os

    from sklearn.metrics import accuracy_score, \
        average_precision_score, precision_score, \
        auc, \
        roc_curve, f1_score, recall_score, matthews_corrcoef
    from sklearn.preprocessing import StandardScaler

    # Validate before touching any file. Raised explicitly (not `assert`)
    # so the check survives `python -O`; AssertionError is kept so callers
    # that already expect it keep working.
    if "sklearn" not in str(type(model)):
        raise AssertionError(
            'model must be an sklearn estimator, got {0}'.format(type(model)))

    single_target = 'structure_match' not in target

    # need to figure out what seq 1 and seq2 are called
    # Copy the identifier columns so adding prediction columns below writes
    # to an independent frame rather than a selection of the caller's data.
    results_df = dataframe[['query', 'subject']].copy()
    feature_frame = dataframe.drop(columns=['query', 'subject'])

    # split into input and output feature(s)
    test_X = feature_frame.drop(columns=target).values
    test_y = feature_frame[target].values
    if single_target:
        test_y = test_y.reshape(-1, 1)

    # scale data
    # NOTE(review): the scaler is fit on the evaluation data itself; if the
    # model was trained on features scaled by a different fitted scaler the
    # inputs will not match -- confirm against the training pipeline.
    test_X = StandardScaler().fit_transform(test_X)

    # vector of predictions
    preds = model.predict(test_X)

    if single_target:
        # Calculate ROC Curve and Area the Curve
        proba_y = model.predict_proba(test_X)[:, 1]
        fpr, tpr, _ = roc_curve(test_y, proba_y, pos_label=True)
        roc_auc = auc(fpr, tpr)

        # calculate scoring metrics
        accuracy = 100 * accuracy_score(y_pred=preds, y_true=test_y)
        avg_precision = average_precision_score(
            y_true=test_y, y_score=proba_y, pos_label=1)
        f1 = f1_score(y_true=test_y, y_pred=preds, pos_label=True)
        mcc = matthews_corrcoef(y_true=test_y, y_pred=preds)
        recall = recall_score(y_true=test_y, y_pred=preds, pos_label=True)

        report_lines = [
            'Accuracy: {0:.4f}%\n'.format(accuracy),
            'AUC: {0:.4f}\n'.format(roc_auc),
            'auPR: {0:.4f}\n'.format(avg_precision),  # average_Precision
            'F1_Score: {0:.4f}\n'.format(f1),
            'MCC: {0:.4f}\n'.format(mcc),
            'Recall: {0:.4f}\n'.format(recall),
            '_______________________________________' + '\n',
        ]

        # merge dataframes together to report results
        results_df['prediction'] = preds
    else:
        # one prediction column per target: hmmer first, structure second
        hmmer_preds = preds[:, 0]
        structure_preds = preds[:, 1]

        # calculate scoring metrics (average=None gives one score per target)
        accuracy = 100 * accuracy_score(y_pred=preds, y_true=test_y)
        precision = precision_score(
            y_pred=preds, y_true=test_y, average=None, pos_label=True)
        f1 = f1_score(
            y_true=test_y, y_pred=preds, pos_label=True, average=None)
        recall = recall_score(
            y_true=test_y, y_pred=preds, pos_label=True, average=None)

        print(accuracy)
        print(precision)
        print(f1)
        print(recall)

        report_lines = [
            'Accuracy: {0:.4f}%\n'.format(accuracy),
            'Mean Precision: {0:.4f}\n'.format(np.mean(precision)),
            'Mean F1_Score: {0:.4f}\n'.format(np.mean(f1)),
            'Mean Recall: {0:.4f}\n'.format(np.mean(recall)),
            '_______________________________________' + '\n',
        ]

        # merge dataframes together to report results
        results_df['hmmer_prediction'] = hmmer_preds
        results_df['structure_prediction'] = structure_preds
        results_df['hmmer_structure_match'] = \
            results_df['hmmer_prediction'] == \
            results_df['structure_prediction']

    # The context manager guarantees the report is flushed and closed.
    with open('evaluationResults.txt', 'w') as report:
        _write_scale_header(report)
        report.writelines(report_lines)

    # save to csv; make sure the output directory exists first
    os.makedirs('./data', exist_ok=True)
    results_df.to_csv('./data/user_predictions.csv')

    return preds, results_df