Source code for reglm.evolve

import numpy as np
import pandas as pd

from reglm.interpret import ISM
from reglm.regression import SeqDataset


def evolve(
    start_seqs,
    regression_model,
    seq_len=None,
    language_model=None,
    label=None,
    tol=0.0,
    specific=None,
    max_iter=10,
    device=0,
    num_workers=1,
    batch_size=512,
):
    """
    Directed evolution optionally using a language model to filter sequences.

    Iteration 0 scores the starting sequences. Every later iteration
    generates the ``ISM`` mutants of the current sequences, optionally drops
    mutants whose language-model log-likelihood is not above
    ``parent likelihood - tol``, scores the remainder with the regression
    model, and carries the highest-scoring mutant of each lineage into the
    next iteration. Evolution stops early if no mutant survives the filter.

    Args:
        start_seqs (list): Starting sequences
        regression_model (pl.LightningModule): Regression model
        seq_len (int): Sequence length for regression model.
            NOTE(review): not used anywhere in this function; confirm
            whether it should be forwarded to ``SeqDataset``.
        language_model (pl.LightningModule): Language model
        label (str): Label for language model
        tol (float): Tolerance for likelihood filter
        specific (int or list): Task index if optimizing for task
            specificity. An int or a one-element list is accepted. The score
            is ``max(predictions of the other tasks) - prediction of this
            task``. NOTE(review): the mutant with the *largest* score is
            kept; confirm this sign matches the intended direction.
        max_iter (int): Maximum number of iterations for evolution
        device (int): GPU index
        num_workers (int): Number of workers for regression model
        batch_size (int): Batch size for regression model

    Returns:
        df (pd.DataFrame): Dataframe containing evolution results, one row
            per scored sequence, with columns ``Sequence``, ``iter``,
            ``start_seq`` (index of the originating start sequence),
            ``best_in_iter`` and ``pred``, plus ``likelihood`` and
            ``prev_likelihood`` when a language model is supplied.

    Raises:
        ValueError: If ``specific`` does not identify exactly one task.
    """
    # Normalise `specific` to a single integer index. A list would make the
    # `!=` comparison below always true and break the subtraction's shape.
    if specific is not None:
        specific_tasks = np.atleast_1d(specific).ravel()
        if specific_tasks.size != 1:
            raise ValueError(
                f"`specific` must identify exactly one task index; got {specific!r}"
            )
        specific = int(specific_tasks[0])

    results = []
    start_seqs = list(start_seqs)
    # Lineage ids travel with the sequences so that `start_seq` keeps
    # pointing at the original start sequence even if a lineage is dropped.
    start_ids = list(range(len(start_seqs)))
    start_likelihoods = {}

    for i in range(max_iter + 1):
        if not start_seqs:
            # Nothing to score or mutate.
            break

        if i == 0:
            # initial dataframe includes only starting sequences
            curr_df = pd.DataFrame(
                {
                    "Sequence": start_seqs,
                    "iter": i,
                    "start_seq": start_ids,
                    "best_in_iter": [True] * len(start_seqs),
                }
            )
        else:
            print(f"Iteration: {i}")

            # ISM: label each mutant with its parent's lineage id, using the
            # number of mutants ISM actually returned for that parent.
            mutants = [ISM(seq, drop_ref=True) for seq in start_seqs]
            curr_df = pd.DataFrame(
                {
                    "Sequence": np.concatenate(mutants),
                    "start_seq": np.repeat(start_ids, [len(m) for m in mutants]),
                    "iter": i,
                }
            )

        if language_model is not None:
            # Calculate likelihood in batches of at most 1000 sequences
            curr_df["likelihood"] = np.concatenate(
                [
                    language_model.P_seqs_given_labels(
                        batch,
                        [label] * len(batch),
                        add_stop=True,
                        log=True,
                        per_pos=False,
                    )
                    for batch in np.split(
                        curr_df.Sequence.tolist(),
                        list(range(1000, len(curr_df), 1000)),
                    )
                ]
            )

            # Filter sequences based on whether the likelihood has improved
            # relative to their previous sequence
            if i > 0:
                # Get likelihood of the respective start sequence
                curr_df["prev_likelihood"] = curr_df.start_seq.map(start_likelihoods)
                keep = curr_df.likelihood > (curr_df.prev_likelihood - tol)
                # Copy so later column assignments do not target a view.
                curr_df = curr_df[keep].copy()
                if curr_df.empty:
                    print(f"No sequences passed the likelihood filter in iteration {i}")
                    break

        # Predict function with regression model
        ds = SeqDataset(curr_df.Sequence.tolist())
        preds = regression_model.predict_on_dataset(
            ds, batch_size=batch_size, device=device, num_workers=num_workers
        )

        # Get mean prediction or task specificity
        if specific is None:
            # 1-D predictions (single task) are already one score per row.
            scores = preds.mean(1) if preds.ndim == 2 else preds
        else:
            non_specific = [t for t in range(preds.shape[1]) if t != specific]
            scores = preds[:, non_specific].max(1) - preds[:, specific]
        curr_df["pred"] = list(scores)

        if i > 0:
            # Get the best sequence from each starting sequence
            best_rows = curr_df.groupby("start_seq").pred.idxmax()
            curr_df["best_in_iter"] = curr_df.index.isin(best_rows)

        # Collect the sequences to start the next iteration,
        # along with their lineage ids and likelihoods
        best = curr_df[curr_df.best_in_iter]
        start_seqs = best["Sequence"].tolist()
        start_ids = best["start_seq"].tolist()
        if language_model is not None:
            start_likelihoods = dict(zip(start_ids, best["likelihood"].tolist()))

        results.append(curr_df)

    if not results:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a frame in the loop.
    return pd.concat(results).reset_index(drop=True)