forked from pnpnpn/dna2vec
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
501 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,15 @@ | ||
# dna2vec | ||
|
||
<https://arxiv.org/abs/1701.06279> | ||
|
||
This implementation has only been tested on Python 3.5.3, but we welcome any | ||
contributions and bug reporting to make it more accessible. | ||
|
||
Contribute | ||
--- | ||
I would love for you to fork and send me pull request for this project. | ||
Please contribute. | ||
|
||
License | ||
--- | ||
This software is licensed under the [MIT license](http://en.wikipedia.org/wiki/MIT_License) |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import numpy as np | ||
import os | ||
from Bio.Seq import Seq | ||
|
||
NUCLEOTIDES = 'ACGT' | ||
|
||
class Tuple4: | ||
def __init__(self, pos1, pos2, neg1, neg2): | ||
self.pos1 = pos1 | ||
self.pos2 = pos2 | ||
self.neg1 = neg1 | ||
self.neg2 = neg2 | ||
|
||
def determine_out_filename(output_dir, fileroot, mode, extension='txt'): | ||
return os.path.join(output_dir, '{}.{}.{}'.format(fileroot, mode, extension)) | ||
|
||
def create_tuple4(kmer1, kmer2, kmer1_neg, kmer2_neg): | ||
""" | ||
all inputs are list of single nucleotides, e.g. ['A', 'A', 'C'] | ||
""" | ||
return Tuple4( | ||
''.join(kmer1), | ||
''.join(kmer2), | ||
''.join(kmer1_neg), | ||
''.join(kmer2_neg)) | ||
|
||
def insert_snippet(seq, snippet, idx): | ||
""" | ||
idx: 0 <= idx <= len(seq)] | ||
""" | ||
split1 = seq[:idx] | ||
split2 = seq[idx:] | ||
return split1 + snippet + split2 | ||
|
||
def pairwise_key(v1, v2): | ||
return '{}:{}'.format(v1, v2) | ||
|
||
def rand_kmer(rng, k_low, k_high=None): | ||
""" | ||
k_low and k_high are inclusive | ||
""" | ||
if k_high is None: | ||
k_high = k_low | ||
k_len = rng.randint(k_low, k_high + 1) | ||
return ''.join([NUCLEOTIDES[x] for x in rng.randint(4, size=k_len)]) | ||
|
||
def rand_nt(rng): | ||
return NUCLEOTIDES[rng.randint(4)] | ||
|
||
def generate_revcompl_pair(k_low, k_high=None, rng=None): | ||
# TODO make params k_high and rng be required | ||
if k_high is None: | ||
k_high = k_low | ||
if rng is None: | ||
rng = np.random | ||
kmer = rand_kmer(rng, k_low, k_high) | ||
return (kmer, revcompl(kmer)) | ||
|
||
def revcompl(kmer): | ||
return str(Seq(kmer).reverse_complement()) | ||
|
||
def generate_1nt_mutation_4tuple(rng, k_len): | ||
kmer1 = list(rand_kmer(rng, k_len, k_len)) | ||
kmer2 = list(rand_kmer(rng, k_len, k_len)) | ||
|
||
idx = rng.randint(len(kmer1)) | ||
original_nt = kmer1[idx] | ||
mutate_nt = rand_nt(rng) | ||
|
||
kmer1_neg = list(kmer1) | ||
kmer1_neg[idx] = mutate_nt | ||
|
||
kmer2[idx] = mutate_nt | ||
kmer2_neg = list(kmer2) | ||
kmer2_neg[idx] = original_nt | ||
|
||
return create_tuple4(kmer1, kmer2, kmer1_neg, kmer2_neg) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import sys | ||
|
||
class Tee(object): | ||
def __init__(self, fptr): | ||
self.file = fptr | ||
|
||
def __enter__(self): | ||
self.stdout = sys.stdout | ||
sys.stdout = self | ||
|
||
def __exit__(self, exception_type, exception_value, traceback): | ||
sys.stdout = self.stdout | ||
|
||
def write(self, data): | ||
self.file.write(data) | ||
self.stdout.write(data) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import time | ||
import logbook | ||
|
||
class Benchmark(): | ||
def __init__(self): | ||
self.time_wall_start = time.time() | ||
self.time_cpu_start = time.clock() | ||
self.logger = logbook.Logger(self.__class__.__name__) | ||
|
||
def diff_time_wall_secs(self): | ||
return (time.time() - self.time_wall_start) | ||
|
||
def print_time(self, label=''): | ||
self.logger.info("%s wall=%.3fm cpu=%.3fm" % ( | ||
label, | ||
self.diff_time_wall_secs() / 60.0, | ||
(time.clock() - self.time_cpu_start) / 60.0, | ||
)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import random | ||
import string | ||
import resource | ||
import logbook | ||
import arrow | ||
import numpy as np | ||
import os | ||
|
||
def split_Xy(df, y_colname='label'): | ||
X = df.drop([y_colname], axis=1) | ||
y = df[y_colname] | ||
return (X, y) | ||
|
||
def shuffle_tuple(tup, rng): | ||
lst = list(tup) | ||
rng.shuffle(lst) | ||
return tuple(lst) | ||
|
||
def shuffle_dataframe(df, rng): | ||
""" | ||
this does NOT do in-place shuffling | ||
""" | ||
return df.reindex(rng.permutation(df.index)) | ||
|
||
def random_str(N): | ||
return ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.ascii_uppercase + string.digits) for _ in range(N)) | ||
|
||
def memory_usage(): | ||
return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1E6 | ||
|
||
def estimate_bytes(filenames): | ||
return sum([os.stat(f).st_size for f in filenames]) | ||
|
||
def get_output_fileroot(dirpath, name, postfix): | ||
return '{}/{}-{}-{}-{}'.format( | ||
dirpath, | ||
name, | ||
arrow.utcnow().format('YYYYMMDD-HHmm'), | ||
postfix, | ||
random_str(3)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import logbook | ||
import re | ||
from Bio import SeqIO | ||
from attic_util import util | ||
from itertools import islice | ||
import numpy as np | ||
|
||
def remove_empty(str_list): | ||
return filter(bool, str_list) # fastest way to remove empty string | ||
|
||
class SeqFragmenter: | ||
""" | ||
Split a sequence into small sequences based on some criteria, e.g. 'N' characters | ||
""" | ||
def __init__(self): | ||
pass | ||
|
||
def get_acgt_seqs(self, seq): | ||
return remove_empty(re.split(r'[^ACGTacgt]+', str(seq))) | ||
|
||
class SlidingKmerFragmenter: | ||
""" | ||
Slide only a single nucleotide | ||
""" | ||
def __init__(self, k_low, k_high): | ||
self.k_low = k_low | ||
self.k_high = k_high | ||
|
||
def apply(self, rng, seq): | ||
return [seq[i: i + rng.randint(self.k_low, self.k_high + 1)] for i in range(len(seq) - self.k_high + 1)] | ||
|
||
class SeqMapper: | ||
def __init__(self, use_revcomp=True): | ||
self.use_revcomp = use_revcomp | ||
|
||
def apply(self, rng, seq): | ||
seq = seq.upper() | ||
if self.use_revcomp and rng.rand() < 0.5: | ||
return seq.reverse_complement() | ||
else: | ||
return seq | ||
|
||
class SeqGenerator: | ||
def __init__(self, filenames, nb_epochs, seqlen_ulim=5000): | ||
self.filenames = filenames | ||
self.nb_epochs = nb_epochs | ||
self.seqlen_ulim = seqlen_ulim | ||
self.logger = logbook.Logger(self.__class__.__name__) | ||
self.logger.info('Number of epochs: {}'.format(nb_epochs)) | ||
|
||
def filehandle_generator(self): | ||
for curr_epoch in range(self.nb_epochs): | ||
for filename in self.filenames: | ||
with open(filename) as file: | ||
self.logger.info('Opened file: {}'.format(filename)) | ||
self.logger.info('Memory usage: {} MB'.format(util.memory_usage())) | ||
self.logger.info('Current epoch: {} / {}'.format(curr_epoch + 1, self.nb_epochs)) | ||
yield file | ||
|
||
def generator(self, rng): | ||
for fh in self.filehandle_generator(): | ||
# SeqIO takes twice as much memory than even simple fh.readlines() | ||
for seq_record in SeqIO.parse(fh, "fasta"): | ||
whole_seq = seq_record.seq | ||
self.logger.info('Whole fasta seqlen: {}'.format(len(whole_seq))) | ||
curr_left = 0 | ||
while curr_left < len(whole_seq): | ||
seqlen = rng.randint(self.seqlen_ulim // 2, self.seqlen_ulim) | ||
segment = seq_record.seq[curr_left: seqlen + curr_left] | ||
curr_left += seqlen | ||
self.logger.debug('input seq len: {}'.format(len(segment))) | ||
yield segment | ||
|
||
class KmerSeqIterable: | ||
def __init__(self, rand_seed, seq_generator, mapper, seq_fragmenter, kmer_fragmenter, histogram): | ||
self.logger = logbook.Logger(self.__class__.__name__) | ||
self.seq_generator = seq_generator | ||
self.mapper = mapper | ||
self.kmer_fragmenter = kmer_fragmenter | ||
self.seq_fragmenter = seq_fragmenter | ||
self.histogram = histogram | ||
self.rand_seed = rand_seed | ||
self.iter_count = 0 | ||
|
||
def __iter__(self): | ||
self.iter_count += 1 | ||
rng = np.random.RandomState(self.rand_seed) | ||
for seq in self.seq_generator.generator(rng): | ||
seq = self.mapper.apply(rng, seq) | ||
acgt_seq_splits = list(self.seq_fragmenter.get_acgt_seqs(seq)) | ||
self.logger.debug('Splits of len={} to: {}'.format(len(seq), [len(f) for f in acgt_seq_splits])) | ||
|
||
for acgt_seq in acgt_seq_splits: | ||
kmer_seq = self.kmer_fragmenter.apply(rng, acgt_seq) # list of strings | ||
if len(kmer_seq) > 0: | ||
if self.iter_count == 1: | ||
# only collect stats on the first call | ||
self.histogram.add(kmer_seq) | ||
yield kmer_seq |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from collections import Counter | ||
import logbook | ||
|
||
class Histogram: | ||
def __init__(self): | ||
self.kmer_len_counter = Counter() | ||
self.nb_kmers = 0 | ||
self.logger = logbook.Logger(self.__class__.__name__) | ||
|
||
def add(self, seq): | ||
""" | ||
seq - array of k-mer string | ||
""" | ||
for kmer in seq: | ||
self.kmer_len_counter[len(kmer)] += 1 | ||
self.nb_kmers += 1 | ||
|
||
def print_stat(self, fptr): | ||
for kmer_len in sorted(self.kmer_len_counter.keys()): | ||
self.logger.info('Percent of {:2d}-mers: {:3.1f}% ({})'.format( | ||
kmer_len, | ||
100.0 * self.kmer_len_counter[kmer_len] / self.nb_kmers, | ||
self.kmer_len_counter[kmer_len], | ||
)) | ||
|
||
total_bps = sum([l * c for l, c in self.kmer_len_counter.items()]) | ||
self.logger.info('Number of base-pairs: {}'.format(total_bps)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import logbook | ||
import tempfile | ||
import numpy as np | ||
|
||
from gensim.models import word2vec | ||
from gensim import matutils | ||
|
||
class SingleKModel: | ||
def __init__(self, model): | ||
self.model = model | ||
self.vocab_lst = sorted(model.vocab.keys()) | ||
|
||
class MultiKModel: | ||
def __init__(self, filepath): | ||
self.aggregate = word2vec.Word2Vec.load_word2vec_format(filepath, binary=False) | ||
self.logger = logbook.Logger(self.__class__.__name__) | ||
|
||
vocab_lens = [len(vocab) for vocab in self.aggregate.vocab.keys()] | ||
self.k_low = min(vocab_lens) | ||
self.k_high = max(vocab_lens) | ||
self.vec_dim = self.aggregate.vector_size | ||
|
||
self.data = {} | ||
for k in range(self.k_low, self.k_high + 1): | ||
self.data[k] = self.separate_out_model(k) | ||
|
||
def model(self, k_len): | ||
""" | ||
Use vector('ACGTA') when possible | ||
""" | ||
return self.data[k_len].model | ||
|
||
def vector(self, vocab): | ||
return self.data[len(vocab)].model[vocab] | ||
|
||
def unitvec(self, vec): | ||
return matutils.unitvec(vec) | ||
|
||
def cosine_distance(self, vocab1, vocab2): | ||
return np.dot(self.unitvec(self.vector(vocab1)), self.unitvec(self.vector(vocab2))) | ||
|
||
def l2_norm(self, vocab): | ||
return np.linalg.norm(self.vector(vocab)) | ||
|
||
def separate_out_model(self, k_len): | ||
vocabs = [vocab for vocab in self.aggregate.vocab.keys() if len(vocab) == k_len] | ||
if len(vocabs) != 4 ** k_len: | ||
self.logger.warn('Missing {}-mers: {} / {}'.format(k_len, len(vocabs), 4 ** k_len)) | ||
|
||
header_str = '{} {}'.format(len(vocabs), self.vec_dim) | ||
with tempfile.NamedTemporaryFile(mode='w') as fptr: | ||
print(header_str, file=fptr) | ||
for vocab in vocabs: | ||
vec_str = ' '.join("%f" % val for val in self.aggregate[vocab]) | ||
print('{} {}'.format(vocab, vec_str), file=fptr) | ||
fptr.flush() | ||
return SingleKModel(word2vec.Word2Vec.load_word2vec_format(fptr.name, binary=False)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
arrow==0.8.0 | ||
biopython==1.68 | ||
boto==2.46.1 | ||
bz2file==0.98 | ||
gensim==0.13.2 | ||
Logbook==1.0.0 | ||
numpy==1.11.1 | ||
pep8==1.7.0 | ||
python-dateutil==2.6.0 | ||
requests==2.13.0 | ||
scipy==0.19.0 | ||
six==1.10.0 | ||
smart-open==1.5.1 |
Oops, something went wrong.