In my project, I need to make oneHotEncode for millions of DNA sequences for ~100 time(in total, billions of times of similar sequences). So an effiect way will be very improtant for me.
Bellow is my code, which takes 4.5s for 10K sequences.
import numpy as np
import os,sys,time
def dna2onehot(dnaSeq):
seqLen = len(dnaSeq)
dnaSeq = dnaSeq.upper()
# initialize the matrix to seqlen x 4
seqMatrix = np.zeros((seqLen,4))
# change the value to matrix
for i in range(0,seqLen):
if dnaSeq[i] == 'A':
seqMatrix[i,0] = 1
if dnaSeq[i] == 'C':
seqMatrix[i,1] = 1
if dnaSeq[i] == 'G':
seqMatrix[i,2] = 1
if dnaSeq[i] == 'T':
seqMatrix[i,3] = 1
ret = np.array(seqMatrix.flat)
return ret
#
sequence = "TCTGAGTCCCAATACACAAGAGGTTCCCTCACCTGTTCTGGTGTCAGACCCTCCCAGATGATCACCTCTCCTATGGCGGGGAAGGTGCCTGGATGTCTAAAGCCTGAAATGGGGATCTATCCCAGAAGCTGTGTAGCTTCTGCCTGTCCCAGAAGCTGTGTTGTTTCTGTATTCAGCTTGCTCACCCTCCGCAGTCCATTGATCTGCACAGACTGTTCTCAGATGGACTCGTGAGACAAGATGGCTCCTTCACCTGCTCTGGGGATCAGAACCCTCCCAGGTGGCCACCTCTCCTGTGGTGGGGAAGGTACCTGGAAGTCTTCAGCCCAAAACAGGGCCTGTCCCAGAAGCTGTGTCTCTTCTGCCTATCCCAGAAGCTGTATTGCTTCTGCTGTCCACTTGCTCACCCTCTGCAGTCTGCATGCTGATCTGCGCAGACTGTTCTCAGAGGGATCTGGCAGACAAGTTGGCTCCCTCACCTGCTCTGGGGCGGGGGGGGGGGGTTCAGAGCCCTCCTGGGCAGCCACCTCTCCTCTAGCAGAGAAGGTGCTGGGATGTCTTGAGCAGGAAACGGGGTATGTCCCAGAAGCTGTCTTGCTTCTGCAATCCACATGCTCAGCCTCTGCAGTCTGTGAGCTAATCTGGGCAGTCTGGTCTCAGGGGACTCTGGAGACAAGATGGCTCCCTCACCTGCTCTGGGGGTCAAAGCCCTCCTTGGCAGCCACCTTTTTCAGGCGGAGAAGGTGCCCGGATGTCTGGAGCCTGAAACAGGGGTATGTCCCAGACACTGTGTAGCTTCTGCCTGCCCCAGAAGATGTGTCACTTCCTCAGTCTGCTTGTTCACCCTCCACAGTCTGCAAGCTGATCTGCACAGACTGGTCTCAGAGGGACCTAGAAGACAAGATCAAGAAAAGTCTTATAGGTATAATGAATCAAGCAGAAAATGAAACATCAGAAGCTTAAGATAAAATACAGGATCTAGTCCAAATTAGCAAGAAGTA"
count = 10000
datalist = []
t1 = time.time()
for k in range(count):
datalist.append(dna2onehot(sequence))
#
t2 = time.time()
print("time cost:",t2-t1)
Do you have any suggestion to reduce the time with python (My whole project was based on python)?
You can use the OneHotEncoder from scikit-learn.
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# create the encoder object
encoder = OneHotEncoder()
sequence = 'TCTGAGTCCCAATACACAAGAGGTTCCCTCACCTGTTCTGGTGTCAGACCCTCCCAGATGATCACCTCTCCTATGGCG'
sequence += 'GGGAAGGTGCCTGGATGTCTAAAGCCTGAAATGGGGATCTATCCCAGAAGCTGTGTAGCTTCTGCCTGTCCCAGAAGC'
sequence += 'TGTGTTGTTTCTGTATTCAGCTTGCTCACCCTCCGCAGTCCATTGATCTGCACAGACTGTTCTCAGATGGACTCGTGA'
sequence += 'GACAAGATGGCTCCTTCACCTGCTCTGGGGATCAGAACCCTCCCAGGTGGCCACCTCTCCTGTGGTGGGGAAGGTACC'
sequence += 'TGGAAGTCTTCAGCCCAAAACAGGGCCTGTCCCAGAAGCTGTGTCTCTTCTGCCTATCCCAGAAGCTGTATTGCTTCT'
sequence += 'GCTGTCCACTTGCTCACCCTCTGCAGTCTGCATGCTGATCTGCGCAGACTGTTCTCAGAGGGATCTGGCAGACAAGTT'
sequence += 'GGCTCCCTCACCTGCTCTGGGGCGGGGGGGGGGGGTTCAGAGCCCTCCTGGGCAGCCACCTCTCCTCTAGCAGAGAAG'
sequence += 'GTGCTGGGATGTCTTGAGCAGGAAACGGGGTATGTCCCAGAAGCTGTCTTGCTTCTGCAATCCACATGCTCAGCCTCT'
sequence += 'GCAGTCTGTGAGCTAATCTGGGCAGTCTGGTCTCAGGGGACTCTGGAGACAAGATGGCTCCCTCACCTGCTCTGGGGG'
sequence += 'TCAAAGCCCTCCTTGGCAGCCACCTTTTTCAGGCGGAGAAGGTGCCCGGATGTCTGGAGCCTGAAACAGGGGTATGTC'
sequence += 'CCAGACACTGTGTAGCTTCTGCCTGCCCCAGAAGATGTGTCACTTCCTCAGTCTGCTTGTTCACCCTCCACAGTCTGC'
sequence += 'AAGCTGATCTGCACAGACTGGTCTCAGAGGGACCTAGAAGACAAGATCAAGAAAAGTCTTATAGGTATAATGAATCAA'
sequence += 'GCAGAAAATGAAACATCAGAAGCTTAAGATAAAATACAGGATCTAGTCCAAATTAGCAAGAAGTA'
# transform sequence to a Nx1 array, pass through fit/transform operation
seq_arr = np.array(list(sequence)).reshape(-1, 1)
seq_1hot = encoder.fit_transform(seq_arr).toarray()
seq_1hot
# returns:
array([[0., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 0., 0., 1.],
...,
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[1., 0., 0., 0.]])
You can see which letters correspond to which column by looking at:
encoder.categories_
# returns:
[array(['A', 'C', 'G', 'T'], dtype='<U1')]
So in this case they are in alphabetic order.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With