Source code for secomo.sequences

import Bio.SeqIO as sio
from Bio.Alphabet import IUPAC
from Bio import motifs

import numpy as np
import re
import os

L = {'A': 0,
     'a': 0,
     'C': 1,
     'c': 1,
     'G': 2,
     'g': 2,
     'T': 3,
     't': 3
     }


def _getOneHotSeq(seq):
    """Convert Biopython Seq to one-hot encoding.

    .. note:: Only internally used.
    """

    m = len(seq.alphabet.letters)
    n = len(seq)
    result = np.zeros((1,1, m, n), dtype=np.float32)
    for i in range(len(seq)):
        result[0, 0, L[seq[i]], i] = 1
    return result

[docs]def readSeqsFromFasta(filename):
    """Read sequences from multi-fasta file.

    Parameters
    -----------
    filename : str
        File containing DNA sequences in multi-fasta format.

    returns : list
        List of Biopython Seq objects.
    """

    seqs = []
    for faseq in sio.parse(open(filename), 'fasta', IUPAC.unambiguous_dna):
        match = re.search(r'N', str(faseq.seq), re.I)
        if match:
            print("skip sequence containing N")
            continue
        seqs.append(faseq)
    return seqs

[docs]def splitTrainingTest(filename, train_test_ratio, num_top_regions = None,\
        randomize =True):
    """Splits training and test set.

    The DNA sequences are read from filename.
    As output, two additional
    fasta files are generated 
    with prefixes being same as filename and suffixes
    *train.fa* and *test.fa*.
    
    Parameters
    -----------
    filename : str
        File containing DNA sequences in multi-fasta format.

    train_test_ratio : float
        Ratio between 0 and 1 for splitting the dataset into
        training and test. For instance, train_test_ratio=0.1 
        uses 10% and 90% of the dataset for test and training, respectively.
    num_top_regions : int
        Use only the first num_top_regions from 
        the fasta-file. Default: (None) all sequences are used.
    randomize : bool
        Randomize the dataset. Default: True.
    """

    seqs = readSeqsFromFasta(filename)

    # only extract the top N regions
    if num_top_regions:
        seqs = seqs[:num_top_regions]

    if randomize:
        idx_permut = list(np.random.permutation(len(seqs)))
    else:
        idx_permut = list(range(len(seqs)))

    itest = idx_permut[:int(len(seqs)*train_test_ratio)]
    itrain = idx_permut[int(len(seqs)*train_test_ratio):]
    trfilename = ".".join(filename.split(".")[:-1]) + "_train.fa"
    tefilename = ".".join(filename.split(".")[:-1]) + "_test.fa"
    trseq = [seqs[i] for i in itrain]
    teseq = [seqs[i] for i in itest]
    sio.write(trseq, trfilename, "fasta")
    sio.write(teseq, tefilename, "fasta")


[docs]def seqToOneHot(seqs):
    """Converts a set of Biopython DNA sequences to *one-hot* encoding.

    Parameters
    -----------
    seqs :list
        List of Biopython Sequence objects.

    returns : numpy-array
        4D Numpy array representing the 
        sequences in *one-hot* encoding.
    """

    onehots = []
    for seq in seqs:
        onehots.append(_getOneHotSeq(seq.seq))
    return np.concatenate(onehots, axis=0)


[docs]def load_sample():
    """
    Load sample sequences of Oct4 ChIP-seq peaks from H1hesc cell
    of ENCODE.
    The sequences are converted to one-hot encoding.

    Parameters
    -----------
    returns : numpy-array
        Sample DNA sequences in *one-hot* encoding.
    """
    
    
    fafile = os.path.join(os.path.dirname(__file__), 'data', 'oct4.fa')
    seqs = readSeqsFromFasta(fafile)
    onehot = seqToOneHot(seqs)

    return onehot