Source code for pairpro.dev_tools
from pairpro.preprocessing import connect_db
import pandas as pd
[docs]def build_sample_l2t(db_in, db_out, size):
'''
Generates a sample l2t relational database of given size. Note that the final
size will about 30% of 'size' due to pair filtering.
Args:
db_in (str): Path to full size l2t database
db_out (int): Path to sample l2t database to be created
size (str): Number of pairs to sample for test database.
Final size will be about 30% of this.
Returns:
None. Database file is saved at db_out.
Raises:
None.
'''
con, _ = connect_db(db_in)
# Make sample protein pairs table
cmd1 = f"""CREATE TEMP TABLE samp_protein_pairs AS
SELECT * FROM protein_pairs
USING SAMPLE {size}"""
con.execute(cmd1)
# Make sample proteins table
cmd2 = """CREATE TEMP TABLE samp_proteins AS
SELECT * FROM proteins
WHERE proteins.pid IN
(SELECT DISTINCT meso_pid FROM samp_protein_pairs)
OR proteins.pid IN
(SELECT DISTINCT thermo_pid FROM samp_protein_pairs)"""
con.execute(cmd2)
# Make sample taxa table
cmd3 = """CREATE TEMP TABLE samp_taxa AS
SELECT * FROM taxa
WHERE taxa.taxid IN
(SELECT DISTINCT meso_taxid FROM samp_protein_pairs)
OR taxa.taxid IN
(SELECT DISTINCT thermo_taxid FROM samp_protein_pairs)"""
con.execute(cmd3)
# Make sample taxa_pairs table
cmd4 = """CREATE TEMP TABLE samp_taxa_pairs AS
SELECT * FROM taxa_pairs
WHERE taxa_pairs.query_id IN
(SELECT DISTINCT taxid FROM samp_taxa)
AND taxa_pairs.subject_id IN
(SELECT DISTINCT taxid FROM samp_taxa)"""
con.execute(cmd4)
# Make sample taxa_lab table
cmd5 = """CREATE TEMP TABLE samp_taxa_pairs_lab AS
SELECT * FROM taxa_pairs_lab
WHERE taxa_pairs_lab.__index_level_0__ IN
(SELECT __index_level_0__ FROM samp_taxa_pairs)
"""
con.execute(cmd5)
# Grab new tables as df and close connection to large database
samp_protein_pairs = con.execute("""SELECT * FROM samp_protein_pairs""").df()
samp_proteins = con.execute("""SELECT * FROM samp_proteins""").df()
samp_taxa = con.execute("""SELECT * FROM samp_taxa""").df()
samp_taxa_pairs = con.execute("""SELECT * FROM samp_taxa_pairs""").df()
samp_taxa_pairs_lab = con.execute("""SELECT * FROM samp_taxa_pairs_lab""").df()
con.close()
con2 = duckdb.connect(db_out)
con2.execute("""CREATE OR REPLACE TABLE protein_pairs AS SELECT * FROM samp_protein_pairs""")
con2.execute("""CREATE OR REPLACE TABLE proteins AS SELECT * FROM samp_proteins""")
con2.execute("""CREATE OR REPLACE TABLE taxa AS SELECT * FROM samp_taxa""")
con2.execute("""CREATE OR REPLACE TABLE taxa_pairs AS SELECT * FROM samp_taxa_pairs""")
con2.execute("""CREATE OR REPLACE TABLE taxa_pairs_lab AS SELECT * FROM samp_taxa_pairs_lab""")
con2.close()