# Source code for polygraph.input

import os

import pandas as pd

script_dir = os.path.dirname(os.path.abspath(__file__))
resources_dir = os.path.join(script_dir, "resources")



# [docs]
def read_seqs(file, sep="\t", incl_ids=False):
    """
    Read sequences and group labels into a dataframe. This creates the input
    dataframe for all subsequent analyses.

    Args:
        file (str): path to a text file containing no header. If incl_ids=True,
            the first column should contain IDs and the next two columns should
            contain sequence and group label. If incl_ids=False, the first two
            columns should contain sequence and group label.
        sep (str): Column separator
        incl_ids (bool): Whether the first column corresponds to sequence IDs.

    Returns:
        df (pd.DataFrame): Pandas dataframe with columns Sequence, Group
            and a unique index.

    Raises:
        AssertionError: If incl_ids=True and the IDs in the first column
            are not unique.
    """
    if incl_ids:
        df = pd.read_csv(
            file,
            sep=sep,
            header=None,
            usecols=(0, 1, 2),
            names=["SeqID", "Sequence", "Group"],
            dtype="str",
        ).set_index("SeqID")

        # Raise explicitly instead of using an `assert` statement: asserts are
        # removed when Python runs with -O, which would let duplicate IDs
        # through. The exception type and message are kept unchanged.
        if not df.index.is_unique:
            raise AssertionError("SeqIDs are not unique.")

    else:
        # Imported lazily; only needed when IDs have to be generated.
        from polygraph.utils import make_ids

        df = pd.read_csv(
            file,
            sep=sep,
            header=None,
            usecols=(0, 1),
            names=["Sequence", "Group"],
            dtype="str",
        )

        # Add unique IDs
        df = make_ids(df)

    return df




# [docs]
def read_meme_file(file):
    """
    Load every motif from a MEME-format motif database.

    Args:
        file (str): path to MEME file

    Returns:
        motifs (list): List of pymemesuite.common.Motif objects
        bg (pymemesuite.common.Background): Background distribution
    """
    from pymemesuite.common import MotifFile

    reader = MotifFile(file)

    # Keep pulling motifs; read() handing back None marks the end of the file.
    motifs = []
    current = reader.read()
    while current is not None:
        motifs.append(current)
        current = reader.read()

    print(f"Read {len(motifs)} motifs from file.")
    return motifs, reader.background




# [docs]
def download_jaspar(
    family="vertebrates", download_dir=os.path.join(resources_dir, "jaspar")
):
    """
    Download the JASPAR 2024 CORE non-redundant TF motifs in MEME format.

    The file is fetched with ``wget`` unless it is already present in
    ``download_dir``. The exit status of ``wget`` is not checked, so the
    returned path is not guaranteed to exist if the download failed.

    Args:
        family (str): JASPAR family. one of "fungi", "insects", "nematodes",
            "plants", "urochordates", "vertebrates"
        download_dir (str): Path to directory in which to download motifs

    Returns:
        (str): Path to downloaded local file
    """
    # Create download directory
    os.makedirs(download_dir, exist_ok=True)

    # `wget -P` stores the file under the basename of the URL, so the local
    # path has to use exactly that name. Deriving both from one filename keeps
    # them in sync; previously the local path used a "JASPAR2022" name while
    # the URL pointed at "JASPAR2024", so the returned path never existed and
    # the file was downloaded again on every call.
    filename = f"JASPAR2024_CORE_{family}_non-redundant_pfms_meme.txt"
    url = f"https://jaspar.elixir.no/download/data/2024/CORE/{filename}"
    local_path = os.path.join(download_dir, filename)

    if os.path.exists(local_path):
        print(f"File already exists at {local_path}")
    else:
        # NOTE: arguments are interpolated into a shell command unquoted, so
        # paths containing spaces or shell metacharacters are not supported.
        os.system(f"wget --no-check-certificate -P {download_dir} {url}")
    return str(local_path)




# [docs]
def download_gtex_tpm(download_dir=os.path.join(resources_dir, "gtex")):
    """
    Fetch the GTEx v8 gene-level median TPM table (one column per tissue).

    The archive is downloaded with ``wget`` only when it is not already
    present in ``download_dir``.

    Args:
        download_dir (str): Path to directory in which to download file

    Returns:
        (str): Path to downloaded local file
    """
    gct_name = "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
    url = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/" + gct_name

    # Make sure the target directory is there before anything is written to it
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    local_path = os.path.join(download_dir, gct_name)
    if not os.path.exists(local_path):
        os.system(f"wget --no-check-certificate -P {download_dir} {url}")
    else:
        print(f"File already exists at {local_path}")
    return str(local_path)




# [docs]
def load_gtex_tpm(download_dir=os.path.join(resources_dir, "gtex")):
    """
    Return the GTEx per-tissue TPM table as a dataframe, downloading the
    file first via download_gtex_tpm.

    Args:
        download_dir (str): Path to directory in which to download file

    Returns:
        (pd.DataFrame): TPM matrix.
    """
    gct_path = download_gtex_tpm(download_dir)

    # The first two lines of the file are skipped before parsing the table.
    tpm = pd.read_table(gct_path, skiprows=2)
    return tpm
# Source code for polygraph.input
#
# polygraph
#
# Navigation
#
# Related Topics