"""
This module utilizes iFeatureOmega, a feature generation
package for proteins and nucleic acids.
"""
import numpy as np
import iFeatureOmegaCLI
import pandas as pd
# Descriptor names, one per entry. Presumably intended as the `descriptors`
# argument of the feature-generation functions in this module; the list is
# not referenced anywhere in the code visible here (TODO confirm usage).
feature_list = [
    'AAC',
    'GAAC',
    'DistancePair',
    'CTDC',
    'CTDT',
    'CTDD',
    'CTriad',
    'GDPC type 1',
    'GDPC type 2',
    'CKSAAGP type 1',
    'CKSAAGP type 2',
    'PseKRAAC type 2',
    'PseKRAAC type 3A',
    'PseKRAAC type 7',
    'PseKRAAC type 9',
    'Geary',
    'APAAC',
    'QSOrder',
]
def get_fasta_from_dataframe(
        dataframe, output_file_a: str, output_file_b: str):
    '''
    Generates fasta file type from pandas dataframe.

    Two FASTA files are written under ``./tmp/`` (the directory is created
    when missing). Every record uses the row's ``pair_id`` as header line.

    Args:
        dataframe (pandas dataframe): must provide the columns
            ``pair_id``, ``m_protein_seq`` and ``t_protein_seq``.
        output_file_a (str): file name for the ``m_protein_seq`` records.
        output_file_b (str): file name for the ``t_protein_seq`` records.

    Returns:
        list: ``[output_file_a, output_file_b]`` exactly as passed in.
    '''
    import os

    out_dir = './tmp'
    # Opening a file for writing does not create parent directories, so
    # make sure the target directory is there first.
    os.makedirs(out_dir, exist_ok=True)

    # (file name, sequence column); the original comments call the m_
    # column the "meso" sequence and the t_ column the "thermo" sequence.
    targets = (
        (output_file_a, 'm_protein_seq'),
        (output_file_b, 't_protein_seq'),
    )
    for file_name, seq_column in targets:
        with open(f'{out_dir}/{file_name}', 'w') as f:
            # Column-wise iteration avoids building a Series per row.
            f.writelines(
                f'>{pair_id}\n{sequence}\n'
                for pair_id, sequence in zip(
                    dataframe['pair_id'], dataframe[seq_column]))

    # NOTE(review): the returned names do not carry the './tmp/' prefix the
    # files were written under; kept as-is for backward compatibility.
    return [output_file_a, output_file_b]
def get_protein_descriptors(fasta_file: str, descriptors=None):
    '''
    Generates features from a protein sequence.

    Args:
        fasta_file (str): fasta file with amino acid sequences, handed to
            ``iFeatureOmegaCLI.iProtein``.
        descriptors (list of str, optional): descriptor names, each passed
            to ``get_descriptor`` in turn. ``None`` (the default) is
            treated as an empty list.

    Returns:
        dict: maps ``str(descriptor)`` to the value of ``protein.encodings``
        read right after that descriptor was requested.

    Raises:
        AssertionError: if the number of collected entries differs from
            ``len(descriptors)`` (e.g. a descriptor name is repeated).
    '''
    descriptors = [] if descriptors is None else descriptors

    # create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    # The parameter file is looked up relative to the working directory.
    # The call is kept for its effect on `protein`; its return value was
    # never used. NOTE(review): the original author was unsure why this
    # step is required - confirm against the iFeatureOmega docs.
    protein.import_parameters('protein_parameters.json')

    protein_descriptors = {}
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        # NOTE(review): assumes get_descriptor binds a fresh object to
        # `encodings` on every call; otherwise all entries would alias.
        protein_descriptors[f'{descriptor}'] = protein.encodings

    # make sure output is a dictionary of correct length
    assert isinstance(protein_descriptors, dict)
    assert len(protein_descriptors) == len(descriptors)
    return protein_descriptors
def clean_new_dataframe(dataframe):
    '''
    Removes artifact columns and columns holding non-finite values.

    Columns whose name contains "index" or "Unnamed" are dropped, +/-inf
    is converted to NaN, and every column that contains at least one NaN
    is dropped. The input dataframe itself is not modified.

    Args:
        dataframe (pandas dataframe)

    Returns:
        Pandas dataframe without artifact columns and without NaN/inf
        values. It may have zero columns if every column was affected.
    '''
    artifact_pattern = 'index|Unnamed'

    # drop indexing columns created by feature gen; labels are cast to str
    # so that non-string column names cannot break the .str accessor
    is_artifact = dataframe.columns.astype(str).str.contains(artifact_pattern)
    dataframe = dataframe.drop(columns=dataframe.columns[is_artifact])
    # `regex=` is required here: `like=` would look for the literal
    # substring 'index|Unnamed' and could never detect a leftover column.
    assert dataframe.filter(regex=artifact_pattern).shape[1] == 0

    # turn inf into NaN, then drop every column containing a NaN
    dataframe = dataframe.replace([np.inf, -np.inf], np.nan)
    dataframe = dataframe.dropna(axis=1, how='any')

    # assert NaN's are removed (also holds when no column is left)
    assert not dataframe.isna().any().any()
    return dataframe
def create_new_dataframe(dataframe, output_files: list, descriptors=None):
    '''
    Creates new dataframe with descriptors added.

    The two protein sequence columns are written to fasta files, the
    requested descriptors are generated for both files, combined per
    descriptor, merged onto the input rows and finally cleaned with
    ``clean_new_dataframe``.

    Args:
        dataframe (pandas dataframe): input rows, see
            ``get_fasta_from_dataframe`` for the required columns.
        output_files (list): the two fasta file names (first two items
            are used).
        descriptors (list of str, optional): descriptor names. ``None``
            (the default) is treated as an empty list.

    Returns:
        Dataframe including vector(s) of descriptors (pandas dataframe)
    '''
    descriptors = [] if descriptors is None else descriptors

    fasta_files = get_fasta_from_dataframe(
        dataframe, output_files[0], output_files[1])

    def compute_descriptor_ratio(fasta_files, descriptors):
        '''
        Generates dictionary of descriptors for each of the two input
        sequences and combines them per descriptor.

        Args:
            List of two fasta files (str) and list of descriptors (str).
        Returns:
            Dictionary keyed by descriptor name. Descriptors whose name
            contains 'AAC' hold the difference (first - second file), all
            others hold the ratio (first / second file).
        '''
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)
        feature_dict = {}
        for key in desc_a:
            # NOTE(review): this is a substring test, so besides 'AAC' it
            # also matches 'GAAC', 'APAAC' and every 'PseKRAAC ...' name.
            # The former separate 'GAAC' branch was unreachable and did
            # the same thing. Confirm the wider match is intended.
            if 'AAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            else:
                feature_dict[key] = desc_a[key] / desc_b[key]
        return feature_dict

    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)

    df = dataframe.reset_index()
    for desc in descriptors:
        # Rows are aligned by position. The old index is discarded instead
        # of being turned into yet another 'index' column: those columns
        # were removed by clean_new_dataframe anyway, and from the third
        # descriptor on their '_x'/'_y' suffixes collided with existing
        # columns, which recent pandas versions reject in merge.
        # NOTE(review): assumes the feature rows come back in the same
        # order as the rows of `dataframe` - confirm.
        features = feature_dict[desc].reset_index(drop=True)
        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    df = clean_new_dataframe(df)
    return df