Source code for secomo.utils

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.cm as cm
import scipy
from weblogolib import LogoData, LogoFormat, LogoOptions, Alphabet
from weblogolib import classic, png_print_formatter, formatters
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
from Bio.motifs.jaspar import write as write_motif
from Bio.motifs.jaspar import Motif
import os

[docs]def saveMotifs(model, path, name = "mot", fformat = 'jaspar'):
    """Save weight-matrices as PFMs.

    This method converts the cRBM weight-matrices to position frequency matrices
    and stores each matrix in a single file with ending .pfm.

    Parameters
    -----------
    model : :class:`CRBM` object
        A cRBM object.

    path : str
        Directory in which the PFMs should be stored.

    name : str
        File prefix for the motif files. Default: 'mot'.
    fformat : str
        File format of the motifs. Either 'jaspar' or 'tab'. Default: 'jaspar'.
    """

    pfms = model.getPFMs()
    alphabet = ['A','C','G','T']
    for i in range(len(pfms)):
        cnts = {}
        for c in range(len(alphabet)):
            cnts[alphabet[c]] = pfms[i][c]
        mot = Motif(name+str(i+1), name+str(i+1), counts = cnts)
        if not os.path.exists(path):
            os.makedirs(path)
        with open(os.path.join(path, "{}{:d}.{}".format(name, i, "pfm")), "w") as f:
            f.write(write_motif([mot], fformat))
        f.close()

[docs]def createSeqLogos(model, path, fformat = 'eps'):
    """Create sequence logos for all cRBM motifs

    Parameters
    -----------
    model : :class:`CRBM` object
        A cRBM object.
    path : str
        Output folder.
    fformat : str
        File format for storing the sequence logos. Default: 'eps'.
    """
    pfms = model.getPFMs()
    if not os.path.exists(path):
        os.makedirs(path)

    for idx in range(len(model.getPFMs())):
        pfm = pfms[idx]
        createSeqLogo(pfm, path + "/logo{:d}.{}".format(idx+1, fformat),
                fformat)

[docs]def createSeqLogo(pfm, filename, fformat = 'eps'):
    """Create sequence logo for an individual cRBM motif.

    Parameters
    -----------
    pfm : numpy-array
        2D numpy array representing a PFM. See :meth:`CRBM.getPFMs`
    path : str
        Output folder.
    fformat : str
        File format for storing the sequence logos. Default: 'eps'.
    """
    alph = Alphabet('ACGT')
    weblogoData = LogoData.from_counts(alph, pfm.T)#, c)#, learner.c.get_value().reshape(-1))
    weblogoOptions = LogoOptions(color_scheme=classic)
    weblogoFormat = LogoFormat(weblogoData, weblogoOptions)
    content = formatters[fformat](weblogoData, weblogoFormat)
    f = open(filename, "wb")
    f.write(content)
    f.close()

[docs]def positionalDensityPlot(model, seqs, filename = None):
    """Positional enrichment of the motifs.

    This function creates a figure that illustrates a positional
    enrichment of all cRBM motifs in the given set of sequences.

    Parameters
    -----------
    model : :class:`CRBM` object
        A cRBM object
    seqs : numpy-array
        A set of DNA sequences in *one-hot* encoding.
        See :func:`crbm.sequences.seqToOneHot`
    filename : str
        Filename for storing the figure. If ``filename = None``,
        no figure will be stored.
    """

    # get motifs and hit observer

    pfms = model.getPFMs()

    h = model.motifHitProbs(seqs)

    #mean over sequences
    mh=h.mean(axis=(0,2))

    fig = plt.figure(figsize=(10,7))

    ax = fig.add_axes([.1,.1, .6,.75])
    colors = cm.rainbow(np.linspace(0,1,mh.shape[0]))
    for m_idx in range(mh.shape[0]):
        smh = np.convolve(mh[m_idx,:],
                scipy.stats.norm.pdf(np.linspace(-3,3,num=10)), mode='same')
        plt.plot(list(range(mh.shape[1])), smh, color=colors[m_idx],
                label='Motif {:d}'.format(m_idx +1))
    plt.xlabel("Position")
    plt.ylabel("Average motif match probability")
    plt.legend(bbox_to_anchor=(1.05,1), loc=2, borderaxespad=0.)

    if filename:
        plt.savefig(filename)
    else:
        plt.show()

[docs]def runTSNE(model, seqs):
    """Run t-SNE on the motif abundances.

    This function produces a clustering of the sequences
    using t-SNE based on the motif matches in the sequences.
    Accordingly, the sequences are projected onto a 2D hyper-plane
    in which similar sequences are located in close proximity.

    Parameters
    -----------
    model : :class:`CRBM` object
        A cRBM object
    seqs : numpy-array
        A set of DNA sequences in *one-hot* encoding.
        See :func:`crbm.sequences.seqToOneHot`
    """

    #hiddenprobs = model.fePerMotif(seqs)
    hiddenprobs = model.motifHitProbs(seqs)
    hiddenprobs = hiddenprobs.max(axis=(2,3))

    hreshaped = hiddenprobs.reshape((hiddenprobs.shape[0],
        np.prod(hiddenprobs.shape[1:])))
    model = TSNE()
    return model.fit_transform(hreshaped)

[docs]def tsneScatter(data, lims = None, colors = None, filename = None, legend = True):
    """Scatter plot of t-SNE clustering.

    Parameters
    -----------
    data : dict
        Dictionary containing the dataset name (keys) and data itself (values).
        The data is assumed to have been generated using :func:`runTSNE`.

    lims : tuple
        Optional parameter containing the x- and y-limits for the figure.
        If None, the limits are automatically determined.
        For example: ``lims = ([xmin, ymin], [xmax, ymax])``

    colors : matplotlib.cm
        Optional colormap to illustrate the datapoints.
        If None, a default colormap will be used.

    filename : str
        Filename for storing the figure. Default: None, means
        the figure stored but directly displayed.

    legend : bool
        Include the legend into the figure. Default: True
    """


    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_axes([.1,.1, .6,.75])

    if not colors:
        colors = cm.brg(np.linspace(0,1,len(data)))

    for name, color  in zip(data, colors):
        plt.scatter(x=data[name][:,0], y=data[name][:,1],
                c=color, label = name, alpha=.3)
    if lims:
        plt.xlim(lims[0][0], lims[1][0])
        plt.ylim(lims[0][1], lims[1][1])

    if legend:
        plt.legend(bbox_to_anchor=(1.05,1), loc=2, borderaxespad=0.)
    plt.axis('off')

    if filename:
        fig.savefig(filename, dpi = 700)
    else:
        plt.show()

[docs]def tsneScatterWithPies(model, seqs, tsne, lims = None, filename= None):
    """Scatter plot of t-SNE clustering.

    This function produces a figure in which sequences are represented
    as pie chart in a 2D t-SNE hyper-plane obtained with :func:`runTSNE`.
    Moreover, the pie pieces correspond to the individual :class:`CRBM`
    motifs and the sizes represent the enrichment/abundance of the
    motifs in the respective sequences.

    Parameters
    -----------
    model : :class:`CRBM` object
        A cRBM object

    seqs : numpy-array
        DNA sequences represented in *one-hot* encoding.
        See :func:`crbm.sequences.seqToOneHot`.
    tsne : numpy-array
        2D numpy array representing the sequences projected onto
        the t-SNE hyper-plane that was obtained with :func:`runTSNE`.

    lims : tuple
        Optional parameter containing the x- and y-limits for the figure.
        For example:

        ([xmin, ymin], [xmax, ymax])

    filename : str
        Filename for storing the figure.
    """

    hiddenprobs = model.motifHitProbs(seqs)
    probs = hiddenprobs

    pmax=probs.max(axis=(0,2,3))
    pmedian=np.median(probs.max(axis=(2,3)), axis=(0))
    pcurrent = probs.max(axis = (2,3))

    colors = cm.rainbow(np.linspace(0,1,probs.shape[1]))

    fig = plt.figure(figsize = (6, 6))
    ax = fig.add_axes([.1,.1, .6,.75])
    #given a probability matrix (N x K x 1 x 200)
    for idx, col in zip(list(range(probs.shape[1])), colors):

        markx = [0] + np.cos([idx*2*np.pi/probs.shape[1], 2*np.pi*(idx+1)/probs.shape[1]]).tolist()
        marky = [0] + np.sin([idx*2*np.pi/probs.shape[1], 2*np.pi*(idx+1)/probs.shape[1]]).tolist()

        markxy = list(zip(markx, marky))

        #X, Y = tsne[:,0], tsne[:,1]

        s =150*(pcurrent[:, idx]-pmedian[idx])/(pmax[idx]-pmedian[idx])
        s[s<=0]=0.
        plt.scatter(tsne[:,0], tsne[:,1], marker=(markxy, 0),
                s=s, color = col, label = "Motif " + str(idx +1), alpha = .6)

    if lims:
        plt.xlim(lims[0][0], lims[1][0])
        plt.ylim(lims[0][1], lims[1][1])
    plt.legend(bbox_to_anchor=(1.05,1), loc=2, borderaxespad=0.)
    plt.axis('off')
    if filename:
        fig.savefig(filename, dpi=700)
    else:
        plt.show()

[docs]def violinPlotMotifMatches(model, data, filename = None):
    """Violin plot of motif abundances.

    This function summarized the relative motif abundances of
    the :class:`CRBM` motifs
    in a given set of sequences (e.g. sequences with different functions).

    Parameters
    -----------
    model : :class:`CRBM` object
        A cRBM object
    data : dict
        Dictionary with keys representing dataset-names
        and values a set of DNA sequences in *one-hot* encoding.
        See :func:`crbm.sequences.seqToOneHot`.
    filename : str
        Filename to store the figure. Default: None,
        the figure will be dirctly disployed.
    """

    seqs = np.concatenate(list(data.values()), axis=0)
    # generate labels for the two sets
    labels = []
    for k in data:
        labels += [k] * data[k].shape[0]

    # compute hidden probabilities and normalize them
    hiddenprobs = model.motifHitProbs(seqs)
    probs = hiddenprobs.mean(axis=(2,3))
    probs = probs / np.amax(probs,axis=0, keepdims=True)

    # put everything in a DF and reduce it to fit with violinplot
    df = pd.DataFrame(data=probs, columns = [ "Motif "+str(i +1) \
            for i in range(probs.shape[1])])
    df["TF"] = pd.DataFrame(data = pd.Series(labels, name="TF"))
    dfm = pd.melt(df, id_vars = "TF")

    # create figure with violinplot
    fig = plt.figure(figsize = (8,5))
    g = sns.violinplot(x='variable', y='value', hue='TF', data=dfm,
            palette="Set2")
    g.set_xlabel("")
    g.set_ylabel("Normalized motif abundance")
    locs, labels = plt.xticks()
    #g.set_xticklabels(rotation=30)
    plt.setp(labels, rotation=30)
    #g.set_ylim(0,1)

    if filename:
        fig.savefig(filename, dpi=700)
    else:
        plt.show()