Source code for reglm.utils
import numpy as np
BASE_TO_IDX = {
"A": 0,
"C": 1,
"G": 2,
"T": 3,
}
[docs]def get_percentiles(values, n_bins=None, qlist=None):
"""
Return list of tokens for sequences by binning their associated values
Args:
values (list): Values for which to calculate percentiles
n_bins (int): Number of equal bins into which to split values
qlist (list): Quantiles to split values into
Returns:
List containing percentiles at which to split the values
"""
# If given a number of bins, split the given values into equal bins.
if n_bins is not None:
assert n_bins < len(values)
binwidth = 100 / n_bins
qlist = np.arange(binwidth, 100, binwidth)
# Find values that split the values by percentiles
return np.percentile(values, qlist)
[docs]def get_label_tokens(values, percentiles):
"""
Return labels for sequences given cutoff percentiles
Args:
values (list): Values for which to calculate percentiles
percentiles (list): Percentiles at which to split values
Returns:
list containing label token corresponding to each value
"""
return [str(x) for x in np.digitize(values, percentiles)]
[docs]def tokenize(df, cols, names, n_bins=None, qlist=None, percentiles=None):
"""
Create labels for sequences by dividing them into bins
Args:
df (pd.DataFrame): Dataframe containing label values
cols (list): Names of columns to tokenize
names (list): Names to use for the returned tokens
n_bins (int): Number of equal bins into which to split values
qlist (list): Quantiles to split values into
percentiles (dict): Dictionary containing columns from cols
as keys, and lists of percentile values.
Returns:
df (pd.DataFrame): Original dataframe with additional columns containing
tokenized labels
"""
# Get percentiles
if percentiles is None:
percentiles = dict()
for col in cols:
percentiles[col] = get_percentiles(df[col], n_bins=n_bins, qlist=qlist)
print(col, percentiles[col].tolist())
# Add a column to contain the label
df["label"] = [""] * len(df)
# Fill in tokens and labels
for name, col in zip(names, cols):
df[name + "_token"] = get_label_tokens(df[col], percentiles[col])
df["label"] = df["label"] + df[name + "_token"]
return df
[docs]def seqs_to_idxs(seqs):
"""
Convert DNA sequences to indices
Args:
seqs (list): List of sequences to convert into indices
Returns:
np.array of shape (len(seqs), seq_len) containing the sequences
as indices
"""
return np.array([[BASE_TO_IDX[base] for base in seq] for seq in seqs])
[docs]def scores_to_matrix(scores, seqs):
"""
Convert per-base scores to a N x seq_len x 4 numpy array
Args:
scores (torch.Tensor): tensor of shape N x seq_len
seqs (list): List of DNA sequences of length N
Returns:
matrix (np.array): An array of shape N x seq_len x 4, in
which the entries corresponding to each base in seqs
will be filled with the values in scores, and other
entries will be 0.
"""
# Check shapes
assert len(seqs) == scores.shape[0]
# Encode sequences
idxs = seqs_to_idxs(seqs) # N, seq_len
# Create empty array
matrix = np.zeros((idxs.shape[0], idxs.shape[1], 4)) # N, seq_len, 4
# Fill in empty matrix with scores
for seq_idx in range(idxs.shape[0]):
for pos in range(idxs.shape[1]):
true_base_idx = idxs[seq_idx, pos]
true_base_score = scores[seq_idx, pos].tolist()
matrix[seq_idx, pos, true_base_idx] = true_base_score
return matrix
[docs]def matrix_to_scores(matrix, seqs):
"""
Convert a tensor of shape N x seq_len 4 to a 2-D array of shape N, seq_len
containing scores for the actual bases in each sequence
Args:
matrix (torch.Tensor): An tensor of shape N x seq_len x 4
seqs (list): List of DNA sequences of length N
Returns:
scores (np.array): array of shape N x seq_len, which will contain
the values in matrix that correspond to the real bases in seqs.
"""
# Encode sequences
idxs = seqs_to_idxs(seqs)
# Create empty array
scores = np.zeros(idxs.shape)
# Fill the empty array with scores of the true base
for seq_idx in range(idxs.shape[0]):
for pos in range(idxs.shape[1]):
true_base_idx = idxs[seq_idx, pos]
true_base_score = matrix[seq_idx, pos, true_base_idx].tolist()
scores[seq_idx, pos] = true_base_score
return scores