# Source code for pairpro.evaluate_input_cleaning

"""
This module cleans the dataframe from user input for the upstream classifier.
It keeps both protein sequences for reporting results.
"""

from sklearn.utils import resample
import pandas as pd


# Columns that survive cleaning: the alignment features, plus the two
# protein columns that are kept for reporting results.
# input_cleaning_wrapper appends 'structure_match' to this list at run time,
# so it has to remain a mutable list.
columns_to_keep = [
    # alignment score and percent-identity metrics
    "bit_score",
    "local_gap_compressed_percent_id",
    "scaled_local_query_percent_id",
    "scaled_local_symmetric_percent_id",
    # alignment length and coverage for the query and the subject
    "query_align_len",
    "query_align_cov",
    "subject_align_len",
    "subject_align_cov",
    # protein lengths
    "query_len",
    "subject_len",
    "hmmer_match",
    # bit score divided by protein length (see normalize_bit_scores)
    "norm_bit_score_query",
    "norm_bit_score_subject",
    # the protein pair itself
    "query",
    "subject",
]


def normalize_bit_scores(dataframe):
    '''
    Adds two length-normalized bit score columns to the dataframe.

    The bit score is divided by the query length and by the subject length;
    the results are stored as 'norm_bit_score_query' and
    'norm_bit_score_subject'. The columns are written onto the dataframe
    that was passed in.

    Args:
        dataframe: pandas dataframe with 'bit_score', 'query_len' and
            'subject_len' columns

    Returns:
        the same pandas dataframe, with the two new columns added
    '''
    # (new column, length column the bit score is divided by)
    normalizations = (
        ('norm_bit_score_query', 'query_len'),
        ('norm_bit_score_subject', 'subject_len'),
    )
    for new_column, length_column in normalizations:
        scores = dataframe['bit_score']
        dataframe[new_column] = scores / dataframe[length_column]
    return dataframe
def check_input_type(dataframe):
    '''
    Verifies that the input is a pandas dataframe.

    Args:
        dataframe: object expected to be a pandas dataframe (subclasses of
            pandas.DataFrame are accepted)

    Returns:
        the input object, unchanged

    Raises:
        AssertionError: if the input is not a pandas dataframe
    '''
    # isinstance replaces the old substring match on str(type(...)), so
    # DataFrame subclasses pass and the check no longer depends on how the
    # type happens to be printed.
    if not isinstance(dataframe, pd.DataFrame):
        # Raised explicitly instead of through an assert statement so the
        # check still runs under `python -O`; the exception type and message
        # are the same as before.
        raise AssertionError('Not a pandas dataframe!')
    return dataframe
def clean_input_columns(dataframe, columns=None):
    '''
    Removes every column that is not in the list of columns to keep.

    Args:
        dataframe: pandas dataframe
        columns: optional collection of column names to keep; when omitted
            the module-level columns_to_keep list is used

    Returns:
        pandas dataframe holding only the kept columns. If there is nothing
        to remove, the input dataframe itself is returned.
    '''
    keep = columns_to_keep if columns is None else columns
    # Collect the unwanted labels once. dict.fromkeys de-duplicates them while
    # preserving order, so a dataframe with repeated column names no longer
    # fails when the same label would be dropped a second time.
    unwanted = list(dict.fromkeys(
        title for title in dataframe.columns if title not in keep))
    if not unwanted:
        return dataframe
    # One drop call instead of one dataframe copy per removed column.
    return dataframe.drop(columns=unwanted)
def verify_input_columns(dataframe, columns=None):
    '''
    Checks that every column we want to keep is present in the dataframe.

    Args:
        dataframe: pandas dataframe
        columns: optional collection of required column names; when omitted
            the module-level columns_to_keep list is used

    Returns:
        the input pandas dataframe, unchanged

    Raises:
        KeyError: if one or more required columns are missing. The message
            lists every missing column.
    '''
    required = columns_to_keep if columns is None else columns
    # Gather all missing names so the caller sees the full list at once
    # rather than a bare KeyError for the first one.
    missing = [title for title in required if title not in dataframe]
    if missing:
        raise KeyError(
            'Dataframe is missing required columns: {}'.format(missing))
    return dataframe
def check_input_nans(dataframe):
    '''
    Reports whether the dataframe contains NaN values and removes the
    affected rows.

    A message with the number of rows holding at least one NaN (or a message
    saying there are none) is printed before the rows are dropped.

    Args:
        dataframe: pandas dataframe

    Returns:
        pandas dataframe without the rows that contained NaN values
    '''
    # True for each row that has a NaN in any column.
    row_has_nan = dataframe.isna().any(axis=1)
    nan_row_count = int(row_has_nan.sum())

    if nan_row_count:
        message = 'Dataframe has {} rows with NaN values!'.format(
            nan_row_count)
    else:
        message = "DataFrame does not have any NaN values."
    print(message)

    return dataframe.dropna()
def verify_protein_pairs(dataframe):
    '''
    Checks that the input data describes both proteins of a pair.

    The check is made on the length columns: 'query_len' must be present for
    the query protein and 'subject_len' for the subject protein.

    Args:
        dataframe: pandas dataframe

    Returns:
        the input pandas dataframe, unchanged

    Raises:
        AssertionError: if 'query_len' or 'subject_len' is missing
    '''
    required = (
        ('query_len', 'Dataframe missing query sequence!'),
        ('subject_len', 'Dataframe missing subject sequence!'),
    )
    for column, message in required:
        if column not in dataframe:
            # Raised explicitly instead of through an assert statement so
            # the validation still runs under `python -O`; exception type
            # and messages are the same as before.
            raise AssertionError(message)
    return dataframe
def input_cleaning_wrapper(dataframe, structure):
    '''
    Takes in a pandas dataframe and runs it through each of the cleaning and
    verification steps.

    Steps, in order: type check, bit score normalization, removal of
    unneeded columns, verification that the needed columns are present,
    removal of rows with NaN values, and the protein pair check. The shape
    of the result is printed at the end.

    Args:
        dataframe: pandas dataframe. The normalized bit score columns are
            added to this object by normalize_bit_scores.
        structure: when truthy, 'structure_match' is treated as a required
            column for this call

    Returns:
        pandas dataframe, cleaned and verified

    Raises:
        AssertionError: if the input is not a pandas dataframe or the
            protein pair check fails
        KeyError: if a required column is missing
    '''
    # Check the type first, so a wrong input is reported as such instead of
    # failing inside the bit score arithmetic.
    checked = check_input_type(dataframe)

    # 'structure_match' is required for this call only. It used to be
    # appended to the module-level list on every call and never removed, so
    # the list kept growing and later calls with structure=False still
    # demanded the column. Add it at most once and undo it afterwards.
    added_structure = (
        bool(structure) and 'structure_match' not in columns_to_keep)
    if added_structure:
        columns_to_keep.append('structure_match')
    try:
        # normalize bit scores
        normed = normalize_bit_scores(checked)
        # clean out unnecessary columns
        clean = clean_input_columns(normed)
        # verify necessary columns are present
        verified = verify_input_columns(clean)
        # drop rows with NaN's
        without_nans = check_input_nans(verified)
        # verify every protein has a pair
        result = verify_protein_pairs(without_nans)
    finally:
        if added_structure:
            columns_to_keep.remove('structure_match')

    print('The new shape of the dataframe is:{}'.format(result.shape))
    return result