Source code for squid.utils

import pandas as pd
import numpy as np



[docs]
def arr2pd(x, alphabet=['A','C','G','T']):
    """Wrap an (L,C) Numpy array in a Pandas dataframe with one named column per character.

    Parameters
    ----------
    x : numpy.ndarray
        One-hot encoding or attribution map (shape : (L,C)).
    alphabet : list
        Characters labelling the columns of `x`, given in column order such that
        each entry is a string; e.g., ['A','C','G','T'] for DNA.

    Returns
    -------
    x : pandas.dataframe
        Dataframe with one row per position of `x` and one column per
        character of `alphabet`.
    """
    # The k-th character of the alphabet labels the k-th column of x.
    column_by_char = {char: x[:, col] for col, char in enumerate(alphabet)}

    # Characters are loaded as rows (orient='index'), so transpose to turn
    # them into the column headings.
    char_rows = pd.DataFrame.from_dict(column_by_char, orient='index')
    return char_rows.transpose()




[docs]
def oh2seq(one_hot, alphabet=['A','C','G','T']):
    """Decode a one-hot encoding back into its sequence string.

    Parameters
    ----------
    one_hot : numpy.ndarray
        Input one-hot encoding of sequence (shape : (L,C))
    alphabet : list
        Characters corresponding to the C columns of `one_hot`, in column
        order; e.g., ['A','C','G','T'] for DNA.

    Returns
    -------
    seq : string
        Concatenation, in position order, of the character of every entry
        equal to 1. A position with no entry equal to 1 contributes nothing.
    """
    num_positions = np.shape(one_hot)[0]
    # Walk positions in order and, within each position, the alphabet in
    # order, emitting the character of every entry that equals 1.
    return ''.join(
        char
        for pos in range(num_positions)
        for col, char in enumerate(alphabet)
        if one_hot[pos][col] == 1
    )




[docs]
def seq2oh(seq, alphabet=['A','C','G','T']):
    """Encode a sequence as a one-hot matrix.

    Parameters
    ----------
    seq : string
        Input sequence with length L
    alphabet : list
        Characters assigned to the C columns of the encoding, in column
        order; e.g., ['A','C','G','T'] for DNA.

    Returns
    -------
    one_hot : numpy.ndarray
        One-hot encoding corresponding to input sequence (shape : (L,C)),
        with dtype float32. Rows for characters that are not in `alphabet`
        are left as all zeros.
    """
    encoding = np.zeros(shape=(len(seq), len(alphabet)), dtype=np.float32)

    # Every (position, column) pair where the sequence character equals the
    # alphabet character assigned to that column.
    matches = (
        (pos, col)
        for pos, residue in enumerate(seq)
        for col, char in enumerate(alphabet)
        if residue == char
    )
    for pos, col in matches:
        encoding[pos, col] = 1

    return encoding




[docs]
def fix_gauge(x, gauge, wt=None, r=0.1):
    """Function to fix the gauge for an attribution matrix.

    Parameters
    ----------
    x : numpy.ndarray
        Attribution scores for a sequence-of-interest (shape : (L,C)).
    gauge : str
        Gauge mode used to fix model parameters.
        See https://mavenn.readthedocs.io/en/latest/math.html for more info.
        'empirical'    :   at each position, subtracts the average score weighted
                           by a distribution that gives probability (1-r) to the
                           wild-type character and r/(C-1) to every other character.
        'wildtype'     :   at each position, subtracts the score of the wild-type
                           character, so that wild-type scores become zero.
        'hierarchical' :   at each position, subtracts the unweighted mean score
                           over the C characters.
        'default'      :   default gauge (no change).
        Any other value is treated like 'default'.
    wt : numpy.ndarray
        Wild-type sequence (one-hot encoding) required for the 'wildtype' and
        'empirical' gauges (shape : (L,C)); ignored otherwise.
    r : float
        For the 'empirical' gauge, the probability of mutation used during
        generation of in silico MAVE dataset (should match user-defined 'mut_rate').

    Returns
    -------
    x1 : numpy.ndarray
        Gauge-fixed copy of the attribution scores (shape : (L,C)). The input
        `x` is not modified.

    Raises
    ------
    ValueError
        If `gauge` is 'empirical' or 'wildtype' and `wt` is not provided.
    """
    x1 = x.copy()

    if gauge in ('empirical', 'wildtype'):
        if wt is None:
            raise ValueError(
                "gauge %r requires the wild-type one-hot encoding 'wt'" % (gauge,)
            )

        L = wt.shape[0] #length of sequence
        rows = np.arange(L)
        wt_argmax = np.argmax(wt, axis=1) #index of each wild-type in the one-hot encoding

        if gauge == 'empirical':
            # Spread the mutation probability evenly over the non-wild-type
            # characters (r/3 for a 4-letter alphabet); a 1-letter alphabet has
            # no alternative characters to receive it.
            num_chars = wt.shape[1]
            p_mut = r / (num_chars - 1) if num_chars > 1 else 0.0

            p_lc = np.full(wt.shape, p_mut, dtype=float) #empirical probability matrix
            p_lc[rows, wt_argmax] = 1 - r

            # One weighted average per position, broadcast across its characters.
            weighted_avg = np.average(x[:L], axis=1, weights=p_lc)
            x1[:L] -= weighted_avg[:, np.newaxis]
        else:
            # Shift every position so that its wild-type character scores zero.
            wt_val = x[rows, wt_argmax]
            x1[:L] -= wt_val[:, np.newaxis]

    elif gauge == 'hierarchical':
        # Centre every position on its unweighted mean over the characters.
        x1 -= np.mean(x, axis=1, keepdims=True)

    # 'default' (and any unrecognised gauge) leaves the scores unchanged.
    return x1