Source code for pairpro.utils

"""
The following are importable, miscellaneous utilities.
You will find:
- logger function
- pairwise sequence builder
"""
import pandas as pd
import itertools

# import distributed
import logging
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def start_logger_if_necessary(
    logger_name: str,
    log_file: str,
    log_level,
    filemode: str = 'a',
    worker: bool = False,
) -> logging.Logger:
    """Quickly configure and return a logger that respects parallel processes.

    The named logger is retrieved (or created) and its level is set. Unless
    ``worker`` is true, a file handler writing to ``log_file`` is installed:
    it is added when the logger has no handlers, otherwise it takes the place
    of the logger's last handler. The displaced handler is closed so that
    repeated calls do not accumulate open file handles.

    Parameters
    ----------
    logger_name : str
        name of logger to start or retrieve
    log_file : str
        path to file to log to
    log_level
        log level to respect
    filemode : str
        mode to apply to log file eg "a" for append
    worker : bool
        whether the caller is a parallel worker. Worker-specific handler
        setup is currently disabled, so when true only the level is set and
        ``log_file`` is not opened.

    Returns
    -------
    logging.Logger
        the configured logger
    """
    named_logger = logging.getLogger(logger_name)
    named_logger.setLevel(log_level)

    if worker:
        # TODO: worker-aware logging (tagging each record with the name from
        # ``distributed.get_worker()``) is disabled. Until it is restored no
        # handler is attached here, so the log file is deliberately not
        # opened -- opening it would only leak a file handle.
        return named_logger

    file_handler = logging.FileHandler(log_file, mode=filemode)
    file_handler.setFormatter(logging.Formatter(
        '%(filename)s - %(asctime)s %(levelname)-8s %(message)s'))

    if named_logger.handlers:
        # Swap out the most recently added handler and release whatever
        # resource it holds before installing the new one.
        previous = named_logger.handlers[-1]
        named_logger.removeHandler(previous)
        previous.close()
    named_logger.addHandler(file_handler)
    return named_logger
def make_pairs(seq1_list, seq2_list, seq1_name='seq1', seq2_name='seq2',
               csv_path='./paired_seqs.csv', save=True):
    '''
    Function for building a combinatorial set of sequences from two lists.

    Every element of seq1_list is paired with every element of seq2_list
    (Cartesian product, with seq1_list varying slowest). Note that with the
    default save=True a CSV file is written to csv_path.

    Args:
        seq1_list (list): List of protein sequence strings
        seq2_list (list): List of protein sequence strings
        seq1_name (str): Column name for first sequence column
        seq2_name (str): Column name for second sequence column
        csv_path (str): Path for saved .csv file
        save (bool): Saves paired sequences as .csv (without the index
            column) when truthy

    Returns:
        combined_df (pd.DataFrame): A dataframe with rows as all possible
            sequence pairs.
    '''
    pairs = list(itertools.product(seq1_list, seq2_list))
    combined_df = pd.DataFrame(pairs, columns=[seq1_name, seq2_name])

    # Truthiness rather than ``is True`` so that flags such as 1 or
    # numpy.bool_(True) are honoured instead of silently skipping the write.
    if save:
        combined_df.to_csv(csv_path, index=False)

    return combined_df