Source code for polygraph.input
import os
import pandas as pd
# Absolute path of the directory that contains this module file.
script_dir = os.path.dirname(os.path.abspath(__file__))
# "resources" sub-directory next to this module; the download helpers in this
# module build their default download directories underneath it.
resources_dir = os.path.join(script_dir, "resources")
[docs]
def read_seqs(file, sep="\t", incl_ids=False):
    """
    Read sequences and group labels into a dataframe. This creates the input
    dataframe for all subsequent analyses.

    Args:
        file (str): path to a text file containing no header. If incl_ids=True,
            the first column should contain IDs and the next two columns should
            contain sequence and group label. If incl_ids=False, the first two
            columns should contain sequence and group label.
        sep (str): Column separator
        incl_ids (bool): Whether the first column corresponds to sequence IDs.

    Returns:
        df (pd.DataFrame): Pandas dataframe with columns Sequence, Group
            and a unique index.

    Raises:
        AssertionError: If incl_ids=True and the IDs in the first column are
            not unique.
    """
    if incl_ids:
        # All columns are read as strings; the ID column becomes the index.
        df = pd.read_csv(
            file,
            sep=sep,
            header=None,
            usecols=(0, 1, 2),
            names=["SeqID", "Sequence", "Group"],
            dtype="str",
        ).set_index("SeqID")
        # Raised explicitly instead of using an `assert` statement so that the
        # check is not stripped when Python runs with -O. The exception type
        # and message are kept so existing callers see the same error.
        if len(set(df.index)) != len(df):
            raise AssertionError("SeqIDs are not unique.")
    else:
        # Imported here rather than at module level, as in the original code.
        from polygraph.utils import make_ids

        df = pd.read_csv(
            file,
            sep=sep,
            header=None,
            usecols=(0, 1),
            names=["Sequence", "Group"],
            dtype="str",
        )
        # Add unique IDs
        df = make_ids(df)
    return df
[docs]
def read_meme_file(file):
    """
    Read a motif database in MEME format

    Args:
        file (str): path to MEME file

    Returns:
        motifs (list): List of pymemesuite.common.Motif objects
        bg (pymemesuite.common.Background): Background distribution
    """
    from pymemesuite.common import MotifFile

    # Open the file as a context manager so the reader is closed even if
    # parsing fails part-way through.
    with MotifFile(file) as motiffile:
        # Read motifs until file end; read() returning None ends the loop.
        motifs = []
        while True:
            motif = motiffile.read()
            if motif is None:
                break
            motifs.append(motif)
        # Fetch the background after all motifs are read (same order as
        # before) and while the reader is still open.
        background = motiffile.background

    print(f"Read {len(motifs)} motifs from file.")
    return motifs, background
[docs]
def download_jaspar(
    family="vertebrates", download_dir=os.path.join(resources_dir, "jaspar")
):
    """
    Download the JASPAR 2024 CORE non-redundant motif file (MEME format)
    for one family, unless it is already present locally.

    Args:
        family (str): JASPAR family. one of "fungi", "insects", "nematodes",
            "plants", "urochordates", "vertebrates"
        download_dir (str): Path to directory in which to download motifs

    Returns:
        (str): Path to downloaded local file
    """
    import shlex

    # Create download directory (no error if it already exists)
    os.makedirs(download_dir, exist_ok=True)

    # The remote file name is reused for the local path: `wget -P` stores the
    # file under its remote name, so both must agree for the existence check
    # and the returned path to point at the file that was actually written.
    filename = f"JASPAR2024_CORE_{family}_non-redundant_pfms_meme.txt"
    url = f"https://jaspar.elixir.no/download/data/2024/CORE/{filename}"
    local_path = os.path.join(download_dir, filename)

    if os.path.exists(local_path):
        print(f"File already exists at {local_path}")
    else:
        # Quote the arguments so directories containing spaces or shell
        # metacharacters are passed to wget intact.
        # NOTE(review): --no-check-certificate disables TLS verification; it is
        # kept from the original command - confirm it is still required.
        quoted_dir = shlex.quote(os.fspath(download_dir))
        os.system(
            f"wget --no-check-certificate -P {quoted_dir} {shlex.quote(url)}"
        )
    return str(local_path)
[docs]
def download_gtex_tpm(download_dir=os.path.join(resources_dir, "gtex")):
    """
    Download per-tissue TPM values from GTEX, unless the file is already
    present locally.

    Args:
        download_dir (str): Path to directory in which to download file

    Returns:
        (str): Path to downloaded local file
    """
    import shlex

    # Create download directory (no error if it already exists)
    os.makedirs(download_dir, exist_ok=True)

    # Single definition of the file name, shared by the URL and the local
    # path so the two cannot drift apart.
    filename = "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
    url = (
        "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/"
        + filename
    )
    local_path = os.path.join(download_dir, filename)

    if os.path.exists(local_path):
        print(f"File already exists at {local_path}")
    else:
        # Quote the arguments so directories containing spaces or shell
        # metacharacters are passed to wget intact.
        # NOTE(review): --no-check-certificate disables TLS verification; it is
        # kept from the original command - confirm it is still required.
        quoted_dir = shlex.quote(os.fspath(download_dir))
        os.system(
            f"wget --no-check-certificate -P {quoted_dir} {shlex.quote(url)}"
        )
    return str(local_path)
[docs]
def load_gtex_tpm(download_dir=os.path.join(resources_dir, "gtex")):
    """
    Fetch the GTEX per-tissue TPM file if needed and load it as a table.

    Args:
        download_dir (str): Path to directory in which to download file

    Returns:
        (pd.DataFrame): TPM matrix.
    """
    # Resolves to the local file, downloading it first when it is missing.
    gct_path = download_gtex_tpm(download_dir)
    # Tab-separated file; the first two lines are skipped before parsing.
    tpm = pd.read_csv(gct_path, sep="\t", skiprows=2)
    return tpm