# Source code for pairpro.train_val_input_cleaning

"""
This module takes a dataframe from the data scraping component
and cleans it so that it can be passed through a machine
learning algorithm.
"""

from sklearn.utils import resample
import pandas as pd

# Whitelist of dataframe columns retained for use as features.
# NOTE: this is deliberately a mutable list; input_cleaning_wrapper
# appends 'structure_match' to it when called with a truthy `structure`.
columns_to_keep = [
    "pair_id",
    "bit_score",
    "local_gap_compressed_percent_id",
    "scaled_local_query_percent_id",
    "scaled_local_symmetric_percent_id",
    "query_align_len",
    "query_align_cov",
    "subject_align_len",
    "subject_align_cov",
    "m_protein_len",
    "t_protein_len",
    "m_protein_seq",
    "t_protein_seq",
    "hmmer_match",
    "norm_bit_score_m",
    "norm_bit_score_t",
]


def normalize_bit_scores(dataframe):
    '''
    Adds two length-normalized bit score columns to the dataframe.

    'norm_bit_score_m' is 'bit_score' divided by 'm_protein_len' and
    'norm_bit_score_t' is 'bit_score' divided by 't_protein_len'.
    The columns are written into the dataframe that was passed in.

    Args:
        pandas dataframe containing 'bit_score', 'm_protein_len'
        and 't_protein_len' columns

    Returns:
        the same pandas dataframe, with the two new columns added
    '''
    bit_scores = dataframe['bit_score']

    # (length column, output column) for each protein of the pair
    length_to_output = (
        ('m_protein_len', 'norm_bit_score_m'),
        ('t_protein_len', 'norm_bit_score_t'),
    )
    for length_column, output_column in length_to_output:
        dataframe[output_column] = bit_scores / dataframe[length_column]

    return dataframe


def check_input_type(dataframe):
    '''
    Takes in input dataframe and verifies that it is the correct data type.

    Args:
        pandas dataframe (or an instance of a subclass of it)

    Returns:
        the same pandas dataframe, unchanged

    Raises:
        AssertionError: if the input is not a pandas dataframe
    '''
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed when Python runs with -O, which would silently disable this
    # check. AssertionError is kept so existing callers see the same
    # exception type. isinstance() also accepts DataFrame subclasses, which
    # a comparison against the type's string representation would reject.
    if not isinstance(dataframe, pd.DataFrame):
        raise AssertionError('Not a pandas dataframe!')

    return dataframe


def clean_input_columns(dataframe):
    '''
    Cleans out columns that are not in
    the predefined list of features (columns_to_keep).

    Args:
        pandas dataframe

    Returns:
        pandas dataframe without the unwanted columns; the input object
        itself is returned when there is nothing to drop
    '''
    # Collect each unwanted label once. dict.fromkeys de-duplicates while
    # keeping order, so a frame with repeated column labels is handled
    # (dropping label by label would fail on the second occurrence, since
    # the first drop already removes every column with that label).
    unwanted = list(dict.fromkeys(
        title for title in dataframe if title not in columns_to_keep))

    if not unwanted:
        return dataframe

    # A single drop call avoids copying the frame once per removed column.
    return dataframe.drop(columns=unwanted)


def verify_input_columns(dataframe):
    '''
    Verifies that every column listed in columns_to_keep
    is present in the dataframe.

    Args:
        pandas dataframe

    Returns:
        the same pandas dataframe, unchanged

    Raises:
        KeyError: if one or more required columns are missing; the
        message lists all of the missing column names
    '''
    missing = [title for title in columns_to_keep if title not in dataframe]

    if missing:
        # Same exception type as before, but now it says what is missing.
        raise KeyError(
            'Dataframe is missing required columns: {}'.format(missing))

    return dataframe


def check_input_nans(dataframe):
    '''
    Reports whether the input dataframe contains NaN values
    (printing the number of affected rows) and removes those rows.

    Args:
        pandas dataframe

    Returns:
        pandas dataframe with every row containing a NaN dropped
    '''
    # Boolean mask over rows: True where the row holds at least one NaN
    row_has_nan = dataframe.isna().any(axis=1)
    nan_row_count = int(row_has_nan.sum())

    if nan_row_count > 0:
        print('Dataframe has {} rows with NaN values!'.format(nan_row_count))
    else:
        print("DataFrame does not have any NaN values.")

    return dataframe.dropna()


def verify_protein_pairs(dataframe):
    '''
    Checks that the input data describes both proteins of a pair by
    verifying that the 'm_protein_len' and 't_protein_len' columns
    are present.

    Args:
        pandas dataframe

    Returns:
        the same pandas dataframe, unchanged

    Raises:
        AssertionError: if either length column is missing
    '''
    # Explicit raises instead of `assert` statements so the checks still
    # run under `python -O`; AssertionError and the messages are kept
    # as-is so existing callers observe the same failure.
    required = (
        ('m_protein_len', 'Dataframe missing mesophillic sequence!'),
        ('t_protein_len', 'Dataframe missing thermophillic sequence!'),
    )
    for column, message in required:
        if column not in dataframe:
            raise AssertionError(message)

    return dataframe


def input_cleaning_wrapper(dataframe, structure):
    '''
    Takes in a pandas dataframe and runs it through each of the cleaning
    and verification steps, printing the shape of the result.

    Args:
        dataframe: pandas dataframe
        structure: if truthy, 'structure_match' is treated as a required
            feature column for this call

    Returns:
        pandas dataframe
    '''
    # The cleaning helpers read the module-level columns_to_keep list.
    # Add 'structure_match' only if it is not already there, and undo the
    # change afterwards, so that one call with structure=True does not
    # leak into later calls (duplicate entries, or a structure=False call
    # that unexpectedly requires the column).
    structure_column = 'structure_match'
    added = bool(structure) and structure_column not in columns_to_keep
    if added:
        columns_to_keep.append(structure_column)

    try:
        # normalize bit scores
        normed = normalize_bit_scores(dataframe)

        # check type of dataframe
        check = check_input_type(normed)

        # clean out unnecessary columns
        clean = clean_input_columns(check)

        # verify necessary columns are present
        verify_input = verify_input_columns(clean)

        # check for NaN's
        check_nans = check_input_nans(verify_input)

        # verify every protein has a pair
        verify_pairs = verify_protein_pairs(check_nans)
    finally:
        # Restore the shared list even if a step raised.
        if added:
            columns_to_keep.remove(structure_column)

    print('The new shape of the dataframe is:{}'.format(verify_pairs.shape))

    return verify_pairs