initial files
pnpnpn committed Mar 30, 2017
1 parent 7e964ce commit eec64aa
Showing 12 changed files with 501 additions and 0 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -1,3 +1,15 @@
# dna2vec

<https://arxiv.org/abs/1701.06279>

This implementation has only been tested on Python 3.5.3, but we welcome any
contributions and bug reports to make it more accessible.

Contribute
---
I would love for you to fork this project and send me a pull request.
Please contribute.

License
---
This software is licensed under the [MIT license](http://en.wikipedia.org/wiki/MIT_License)
Empty file added attic_util/__init__.py
Empty file.
77 changes: 77 additions & 0 deletions attic_util/bio_util.py
@@ -0,0 +1,77 @@
import numpy as np
import os
from Bio.Seq import Seq

NUCLEOTIDES = 'ACGT'

class Tuple4:
    def __init__(self, pos1, pos2, neg1, neg2):
        self.pos1 = pos1
        self.pos2 = pos2
        self.neg1 = neg1
        self.neg2 = neg2

def determine_out_filename(output_dir, fileroot, mode, extension='txt'):
    return os.path.join(output_dir, '{}.{}.{}'.format(fileroot, mode, extension))

def create_tuple4(kmer1, kmer2, kmer1_neg, kmer2_neg):
    """
    All inputs are lists of single nucleotides, e.g. ['A', 'A', 'C']
    """
    return Tuple4(
        ''.join(kmer1),
        ''.join(kmer2),
        ''.join(kmer1_neg),
        ''.join(kmer2_neg))

def insert_snippet(seq, snippet, idx):
    """
    idx: 0 <= idx <= len(seq)
    """
    split1 = seq[:idx]
    split2 = seq[idx:]
    return split1 + snippet + split2

def pairwise_key(v1, v2):
    return '{}:{}'.format(v1, v2)

def rand_kmer(rng, k_low, k_high=None):
    """
    k_low and k_high are inclusive
    """
    if k_high is None:
        k_high = k_low
    k_len = rng.randint(k_low, k_high + 1)
    return ''.join([NUCLEOTIDES[x] for x in rng.randint(4, size=k_len)])

def rand_nt(rng):
    return NUCLEOTIDES[rng.randint(4)]

def generate_revcompl_pair(k_low, k_high=None, rng=None):
    # TODO: make k_high and rng required params
    if k_high is None:
        k_high = k_low
    if rng is None:
        rng = np.random
    kmer = rand_kmer(rng, k_low, k_high)
    return (kmer, revcompl(kmer))

def revcompl(kmer):
    return str(Seq(kmer).reverse_complement())

def generate_1nt_mutation_4tuple(rng, k_len):
    kmer1 = list(rand_kmer(rng, k_len, k_len))
    kmer2 = list(rand_kmer(rng, k_len, k_len))

    idx = rng.randint(len(kmer1))
    original_nt = kmer1[idx]
    mutate_nt = rand_nt(rng)

    kmer1_neg = list(kmer1)
    kmer1_neg[idx] = mutate_nt

    kmer2[idx] = mutate_nt
    kmer2_neg = list(kmer2)
    kmer2_neg[idx] = original_nt

    return create_tuple4(kmer1, kmer2, kmer1_neg, kmer2_neg)
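
A minimal usage sketch for these helpers (illustrative only, not part of the committed files; the seed and the k-mer length range are assumptions):

# Usage sketch: draw random k-mers and a 1-nt-mutation 4-tuple with a seeded RandomState.
import numpy as np
from attic_util import bio_util

rng = np.random.RandomState(7)                      # hypothetical seed
kmer = bio_util.rand_kmer(rng, 3, 8)                # random k-mer with 3 <= k <= 8
pair = bio_util.generate_revcompl_pair(3, 8, rng)   # (kmer, reverse complement)
tup = bio_util.generate_1nt_mutation_4tuple(rng, 8)
print(kmer, pair, tup.pos1, tup.neg1)
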
16 changes: 16 additions & 0 deletions attic_util/tee.py
@@ -0,0 +1,16 @@
import sys

class Tee(object):
    def __init__(self, fptr):
        self.file = fptr

    def __enter__(self):
        self.stdout = sys.stdout
        sys.stdout = self

    def __exit__(self, exception_type, exception_value, traceback):
        sys.stdout = self.stdout

    def write(self, data):
        self.file.write(data)
        self.stdout.write(data)
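
A minimal usage sketch for Tee (illustrative only; the log-file path is an assumption):

# Usage sketch: mirror stdout into a log file for the duration of a with-block.
from attic_util.tee import Tee

with open('run.log', 'w') as fptr:   # hypothetical log path
    with Tee(fptr):
        print('this line goes to both stdout and run.log')
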
18 changes: 18 additions & 0 deletions attic_util/time_benchmark.py
@@ -0,0 +1,18 @@
import time
import logbook

class Benchmark():
    def __init__(self):
        self.time_wall_start = time.time()
        self.time_cpu_start = time.clock()
        self.logger = logbook.Logger(self.__class__.__name__)

    def diff_time_wall_secs(self):
        return (time.time() - self.time_wall_start)

    def print_time(self, label=''):
        self.logger.info("%s wall=%.3fm cpu=%.3fm" % (
            label,
            self.diff_time_wall_secs() / 60.0,
            (time.clock() - self.time_cpu_start) / 60.0,
        ))
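
A minimal usage sketch for Benchmark (illustrative only; the workload and label are stand-ins):

# Usage sketch: report wall and CPU time spent on a block of work, in minutes.
from attic_util.time_benchmark import Benchmark

benchmark = Benchmark()
_ = sum(x * x for x in range(10 ** 6))   # stand-in workload
benchmark.print_time('toy-workload')     # logs e.g. "toy-workload wall=0.001m cpu=0.001m"
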
40 changes: 40 additions & 0 deletions attic_util/util.py
@@ -0,0 +1,40 @@
import random
import string
import resource
import logbook
import arrow
import numpy as np
import os

def split_Xy(df, y_colname='label'):
    X = df.drop([y_colname], axis=1)
    y = df[y_colname]
    return (X, y)

def shuffle_tuple(tup, rng):
    lst = list(tup)
    rng.shuffle(lst)
    return tuple(lst)

def shuffle_dataframe(df, rng):
    """
    this does NOT do in-place shuffling
    """
    return df.reindex(rng.permutation(df.index))

def random_str(N):
    return ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.ascii_uppercase + string.digits) for _ in range(N))

def memory_usage():
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1E6

def estimate_bytes(filenames):
    return sum([os.stat(f).st_size for f in filenames])

def get_output_fileroot(dirpath, name, postfix):
    return '{}/{}-{}-{}-{}'.format(
        dirpath,
        name,
        arrow.utcnow().format('YYYYMMDD-HHmm'),
        postfix,
        random_str(3))
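
A minimal usage sketch for these utilities (illustrative only; the directory, name, and postfix are assumptions):

# Usage sketch: build a timestamped output fileroot and check process memory.
from attic_util import util

fileroot = util.get_output_fileroot('results', 'dna2vec', 'k3to8')   # hypothetical args
print(fileroot)             # e.g. results/dna2vec-20170330-1200-k3to8-abc
print(util.memory_usage())  # ru_maxrss / 1E6; units are platform-dependent
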
99 changes: 99 additions & 0 deletions dna2vec/generators.py
@@ -0,0 +1,99 @@
import logbook
import re
from Bio import SeqIO
from attic_util import util
from itertools import islice
import numpy as np

def remove_empty(str_list):
    return filter(bool, str_list)  # fastest way to remove empty strings

class SeqFragmenter:
    """
    Split a sequence into small sequences based on some criteria, e.g. 'N' characters
    """
    def __init__(self):
        pass

    def get_acgt_seqs(self, seq):
        return remove_empty(re.split(r'[^ACGTacgt]+', str(seq)))

class SlidingKmerFragmenter:
    """
    Slide only a single nucleotide
    """
    def __init__(self, k_low, k_high):
        self.k_low = k_low
        self.k_high = k_high

    def apply(self, rng, seq):
        return [seq[i: i + rng.randint(self.k_low, self.k_high + 1)] for i in range(len(seq) - self.k_high + 1)]

class SeqMapper:
    def __init__(self, use_revcomp=True):
        self.use_revcomp = use_revcomp

    def apply(self, rng, seq):
        seq = seq.upper()
        if self.use_revcomp and rng.rand() < 0.5:
            return seq.reverse_complement()
        else:
            return seq

class SeqGenerator:
    def __init__(self, filenames, nb_epochs, seqlen_ulim=5000):
        self.filenames = filenames
        self.nb_epochs = nb_epochs
        self.seqlen_ulim = seqlen_ulim
        self.logger = logbook.Logger(self.__class__.__name__)
        self.logger.info('Number of epochs: {}'.format(nb_epochs))

    def filehandle_generator(self):
        for curr_epoch in range(self.nb_epochs):
            for filename in self.filenames:
                with open(filename) as file:
                    self.logger.info('Opened file: {}'.format(filename))
                    self.logger.info('Memory usage: {} MB'.format(util.memory_usage()))
                    self.logger.info('Current epoch: {} / {}'.format(curr_epoch + 1, self.nb_epochs))
                    yield file

    def generator(self, rng):
        for fh in self.filehandle_generator():
            # SeqIO takes about twice as much memory as a simple fh.readlines()
            for seq_record in SeqIO.parse(fh, "fasta"):
                whole_seq = seq_record.seq
                self.logger.info('Whole fasta seqlen: {}'.format(len(whole_seq)))
                curr_left = 0
                while curr_left < len(whole_seq):
                    seqlen = rng.randint(self.seqlen_ulim // 2, self.seqlen_ulim)
                    segment = seq_record.seq[curr_left: seqlen + curr_left]
                    curr_left += seqlen
                    self.logger.debug('input seq len: {}'.format(len(segment)))
                    yield segment

class KmerSeqIterable:
    def __init__(self, rand_seed, seq_generator, mapper, seq_fragmenter, kmer_fragmenter, histogram):
        self.logger = logbook.Logger(self.__class__.__name__)
        self.seq_generator = seq_generator
        self.mapper = mapper
        self.kmer_fragmenter = kmer_fragmenter
        self.seq_fragmenter = seq_fragmenter
        self.histogram = histogram
        self.rand_seed = rand_seed
        self.iter_count = 0

    def __iter__(self):
        self.iter_count += 1
        rng = np.random.RandomState(self.rand_seed)
        for seq in self.seq_generator.generator(rng):
            seq = self.mapper.apply(rng, seq)
            acgt_seq_splits = list(self.seq_fragmenter.get_acgt_seqs(seq))
            self.logger.debug('Splits of len={} to: {}'.format(len(seq), [len(f) for f in acgt_seq_splits]))

            for acgt_seq in acgt_seq_splits:
                kmer_seq = self.kmer_fragmenter.apply(rng, acgt_seq)  # list of strings
                if len(kmer_seq) > 0:
                    if self.iter_count == 1:
                        # only collect stats on the first call
                        self.histogram.add(kmer_seq)
                    yield kmer_seq
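
A minimal sketch of how these pieces might be wired together (illustrative only; the FASTA path, seed, and k-mer range are assumptions; Histogram is defined in dna2vec/histogram.py below):

# Usage sketch: stream variable-length k-mer "sentences" out of a FASTA file.
from dna2vec.generators import (KmerSeqIterable, SeqFragmenter, SeqGenerator,
                                SeqMapper, SlidingKmerFragmenter)
from dna2vec.histogram import Histogram

kmer_seq_iterable = KmerSeqIterable(
    rand_seed=123,
    seq_generator=SeqGenerator(['inputs/sample.fa'], nb_epochs=1),  # hypothetical FASTA path
    mapper=SeqMapper(),
    seq_fragmenter=SeqFragmenter(),
    kmer_fragmenter=SlidingKmerFragmenter(3, 8),
    histogram=Histogram(),
)
for kmer_seq in kmer_seq_iterable:
    pass  # each kmer_seq is a list of k-mer strings, suitable as one word2vec sentence
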
27 changes: 27 additions & 0 deletions dna2vec/histogram.py
@@ -0,0 +1,27 @@
from collections import Counter
import logbook

class Histogram:
    def __init__(self):
        self.kmer_len_counter = Counter()
        self.nb_kmers = 0
        self.logger = logbook.Logger(self.__class__.__name__)

    def add(self, seq):
        """
        seq - array of k-mer strings
        """
        for kmer in seq:
            self.kmer_len_counter[len(kmer)] += 1
            self.nb_kmers += 1

    def print_stat(self, fptr):
        for kmer_len in sorted(self.kmer_len_counter.keys()):
            self.logger.info('Percent of {:2d}-mers: {:3.1f}% ({})'.format(
                kmer_len,
                100.0 * self.kmer_len_counter[kmer_len] / self.nb_kmers,
                self.kmer_len_counter[kmer_len],
            ))

        total_bps = sum([l * c for l, c in self.kmer_len_counter.items()])
        self.logger.info('Number of base-pairs: {}'.format(total_bps))
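
A minimal usage sketch for Histogram (illustrative only):

# Usage sketch: tally k-mer lengths and log the length distribution.
import sys
from dna2vec.histogram import Histogram

histogram = Histogram()
histogram.add(['ACG', 'ACGT', 'ACGTA'])
histogram.print_stat(sys.stdout)   # fptr is accepted, but output currently goes through the logger
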
57 changes: 57 additions & 0 deletions dna2vec/multi_k_model.py
@@ -0,0 +1,57 @@
import logbook
import tempfile
import numpy as np

from gensim.models import word2vec
from gensim import matutils

class SingleKModel:
    def __init__(self, model):
        self.model = model
        self.vocab_lst = sorted(model.vocab.keys())

class MultiKModel:
    def __init__(self, filepath):
        self.aggregate = word2vec.Word2Vec.load_word2vec_format(filepath, binary=False)
        self.logger = logbook.Logger(self.__class__.__name__)

        vocab_lens = [len(vocab) for vocab in self.aggregate.vocab.keys()]
        self.k_low = min(vocab_lens)
        self.k_high = max(vocab_lens)
        self.vec_dim = self.aggregate.vector_size

        self.data = {}
        for k in range(self.k_low, self.k_high + 1):
            self.data[k] = self.separate_out_model(k)

    def model(self, k_len):
        """
        Use vector('ACGTA') when possible
        """
        return self.data[k_len].model

    def vector(self, vocab):
        return self.data[len(vocab)].model[vocab]

    def unitvec(self, vec):
        return matutils.unitvec(vec)

    def cosine_distance(self, vocab1, vocab2):
        return np.dot(self.unitvec(self.vector(vocab1)), self.unitvec(self.vector(vocab2)))

    def l2_norm(self, vocab):
        return np.linalg.norm(self.vector(vocab))

    def separate_out_model(self, k_len):
        vocabs = [vocab for vocab in self.aggregate.vocab.keys() if len(vocab) == k_len]
        if len(vocabs) != 4 ** k_len:
            self.logger.warn('Missing {}-mers: {} / {}'.format(k_len, len(vocabs), 4 ** k_len))

        header_str = '{} {}'.format(len(vocabs), self.vec_dim)
        with tempfile.NamedTemporaryFile(mode='w') as fptr:
            print(header_str, file=fptr)
            for vocab in vocabs:
                vec_str = ' '.join("%f" % val for val in self.aggregate[vocab])
                print('{} {}'.format(vocab, vec_str), file=fptr)
            fptr.flush()
            return SingleKModel(word2vec.Word2Vec.load_word2vec_format(fptr.name, binary=False))
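
A minimal sketch of how a trained aggregate embedding file might be queried (illustrative only; the model path is an assumption):

# Usage sketch: load the aggregate k-mer embeddings and compare two 5-mers.
from dna2vec.multi_k_model import MultiKModel

mk_model = MultiKModel('results/dna2vec-k3to8.w2v')   # hypothetical path to a trained model
vec = mk_model.vector('ACGTA')                        # embedding vector for one 5-mer
sim = mk_model.cosine_distance('ACGTA', 'TACGT')      # dot product of the two unit vectors
print(vec.shape, sim)
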
13 changes: 13 additions & 0 deletions requirements.txt
@@ -0,0 +1,13 @@
arrow==0.8.0
biopython==1.68
boto==2.46.1
bz2file==0.98
gensim==0.13.2
Logbook==1.0.0
numpy==1.11.1
pep8==1.7.0
python-dateutil==2.6.0
requests==2.13.0
scipy==0.19.0
six==1.10.0
smart-open==1.5.1