# Source code for pairpro.train_val_featuregen

"""
This module utilizes iFeatureOmega, a feature generation
package for proteins and nucleic acids.
"""

import os

import iFeatureOmegaCLI
import numpy as np
import pandas as pd


# Descriptor names, one per entry. Presumably these are the iFeatureOmega
# descriptor identifiers meant to be passed as ``descriptors`` to the
# functions in this module -- TODO confirm; nothing in this file reads
# the list directly.
feature_list = [
    'AAC',
    'GAAC',
    'DistancePair',
    'CTDC',
    'CTDT',
    'CTDD',
    'CTriad',
    'GDPC type 1',
    'GDPC type 2',
    'CKSAAGP type 1',
    'CKSAAGP type 2',
    'PseKRAAC type 2',
    'PseKRAAC type 3A',
    'PseKRAAC type 7',
    'PseKRAAC type 9',
    'Geary',
    'APAAC',
    'QSOrder',
]


def get_fasta_from_dataframe(
        dataframe, output_file_a: str, output_file_b: str):
    '''
    Writes the two protein sequence columns of a dataframe to fasta files.

    Every row yields one record per file, formatted as
    ``>{pair_id}\\n{sequence}\\n``. Both files are written below the
    relative directory ``./tmp``, which is created if it does not exist.

    Args:
        dataframe (pandas dataframe): must provide the columns 'pair_id',
            'm_protein_seq' and 't_protein_seq'.
        output_file_a (str): file name (inside ./tmp) that receives the
            'm_protein_seq' (meso) sequences.
        output_file_b (str): file name (inside ./tmp) that receives the
            't_protein_seq' (thermo) sequences.

    Returns:
        list: ``[output_file_a, output_file_b]`` exactly as passed in.

    Raises:
        KeyError: if one of the required columns is missing.
    '''
    # (file name, sequence column) pairs: meso first, then thermo.
    targets = (
        (output_file_a, 'm_protein_seq'),
        (output_file_b, 't_protein_seq'),
    )

    for file_name, seq_column in targets:
        path = f'./tmp/{file_name}'
        # Opening the file fails if the target directory is absent, so
        # make sure it exists first.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as f:
            # Column-wise iteration yields the same values as iterrows()
            # without building one Series per row.
            f.writelines(
                '>{}\n{}\n'.format(pair_id, sequence)
                for pair_id, sequence in zip(
                    dataframe['pair_id'], dataframe[seq_column]))

    # NOTE(review): the bare names are returned, not the './tmp/...' paths
    # that were written. Kept as-is for compatibility -- confirm that
    # callers resolve the files relative to ./tmp.
    return [output_file_a, output_file_b]
def get_protein_descriptors(fasta_file: str, descriptors=None):
    '''
    Generates the requested descriptors for the sequences in a fasta file.

    Args:
        fasta_file (str): fasta file handed to ``iFeatureOmegaCLI.iProtein``.
        descriptors (list of str, optional): descriptor names to compute.
            Defaults to no descriptors.

    Returns:
        dict: maps each descriptor name to the value of
        ``protein.encodings`` right after that descriptor was computed
        (presumably a pandas dataframe -- TODO confirm against
        iFeatureOmega).

    Raises:
        AssertionError: if the number of results differs from the number
            of requested descriptors, e.g. when a name is listed twice.
    '''
    # None instead of a mutable [] default; behaves the same for callers.
    if descriptors is None:
        descriptors = []

    # create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)

    # The relative file name means the parameter file is looked up from the
    # current working directory. The return value was never used, so it is
    # no longer bound to a name.
    # NOTE(review): the original author was unsure why this call is needed;
    # it is kept in case get_descriptor depends on it.
    protein.import_parameters('protein_parameters.json')

    protein_descriptors = {}
    for descriptor in descriptors:
        # get_descriptor publishes its result through protein.encodings.
        protein.get_descriptor(descriptor)
        protein_descriptors[f'{descriptor}'] = protein.encodings

    # Duplicate names collapse onto one key, which this check catches.
    assert len(protein_descriptors) == len(descriptors), (
        'expected one result per requested descriptor')

    return protein_descriptors
def clean_new_dataframe(dataframe):
    '''
    Removes artifact columns and columns holding non-finite values.

    Steps, in order:
      1. drop every column whose name contains 'index' or 'Unnamed'
         (artifacts of feature generation),
      2. turn +inf / -inf into NaN,
      3. drop every column that contains at least one NaN.

    The input dataframe is not modified; a new dataframe is returned.

    Args:
        dataframe (pandas dataframe)

    Returns:
        pandas dataframe without artifact columns and without NaN/inf
        values. It may have zero columns if every column was dropped.
    '''
    # drop indexing columns created by feature gen; na=False keeps the
    # mask boolean when a column label is not a string
    artifact_mask = dataframe.columns.str.contains(
        'index|Unnamed', na=False)
    dataframe = dataframe.drop(columns=dataframe.columns[artifact_mask])

    # `like=` is a literal substring match and would never see the
    # alternation, so the check has to use `regex=`.
    assert dataframe.filter(regex='index|Unnamed').shape[1] == 0

    # turn inf into NaN, then drop every column containing a NaN
    dataframe = dataframe.replace([np.inf, -np.inf], np.nan)
    dataframe = dataframe.dropna(axis=1, how='any')

    # assert NaN's are removed; this also holds for a dataframe that has
    # no columns left
    assert not dataframe.isna().to_numpy().any()

    return dataframe
def create_new_dataframe(dataframe, output_files: list, descriptors=None):
    '''
    Creates a new dataframe with descriptor features added.

    The sequences of ``dataframe`` are written to two fasta files, the
    requested descriptors are computed for each file, the two results are
    combined per descriptor (difference or ratio, see
    ``compute_descriptor_ratio`` below) and the combined features are
    merged onto the input rows by row position.

    Args:
        dataframe (pandas dataframe): input rows; forwarded to
            ``get_fasta_from_dataframe``.
        output_files (list): the two fasta file names,
            ``output_files[0]`` and ``output_files[1]``.
        descriptors (list of str, optional): descriptor names to compute.
            Defaults to no descriptors.

    Returns:
        pandas dataframe: the input columns plus the feature columns,
        after ``clean_new_dataframe`` has been applied.
    '''
    # None instead of a mutable [] default; behaves the same for callers.
    if descriptors is None:
        descriptors = []

    fasta_files = get_fasta_from_dataframe(
        dataframe, output_files[0], output_files[1])

    def compute_descriptor_ratio(fasta_files, descriptors=None):
        '''
        Generates the descriptors for each of the two fasta files and
        combines them per descriptor.

        Args:
            fasta_files (list of str): the two fasta files.
            descriptors (list of str, optional): descriptor names.

        Returns:
            dict: descriptor name -> first-file value minus second-file
            value when the name contains 'AAC', otherwise first-file
            value divided by second-file value.
        '''
        if descriptors is None:
            descriptors = []

        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}
        for key in desc_a:
            # The former separate 'GAAC' branch could never run because
            # 'GAAC' contains 'AAC'; it did the same subtraction anyway.
            # NOTE(review): this substring test also matches names such
            # as 'APAAC' and 'PseKRAAC type 2' -- confirm that is wanted.
            if 'AAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            else:
                feature_dict[key] = desc_a[key] / desc_b[key]

        return feature_dict

    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)

    df = dataframe.reset_index()

    for desc in descriptors:
        # Rows are matched by position, so the feature frame's own index
        # is discarded rather than turned into an 'index' column. Those
        # repeated 'index' columns were removed by clean_new_dataframe
        # anyway, and from the third merge on they collided on the
        # 'index_x'/'index_y' suffixes, which pandas >= 2.0 rejects with
        # a MergeError.
        features = feature_dict[desc].reset_index(drop=True)
        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    df = clean_new_dataframe(df)

    return df