From eec64aaea8b55cbc9315eeebd937801345279e3e Mon Sep 17 00:00:00 2001 From: Patrick Date: Fri, 31 Mar 2017 01:06:15 +0900 Subject: [PATCH] initial files --- README.md | 12 +++ attic_util/__init__.py | 0 attic_util/bio_util.py | 77 +++++++++++++++++++ attic_util/tee.py | 16 ++++ attic_util/time_benchmark.py | 18 +++++ attic_util/util.py | 40 ++++++++++ dna2vec/generators.py | 99 +++++++++++++++++++++++++ dna2vec/histogram.py | 27 +++++++ dna2vec/multi_k_model.py | 57 ++++++++++++++ requirements.txt | 13 ++++ scripts/train_dna2vec.py | 139 +++++++++++++++++++++++++++++++++++ setup.cfg | 3 + 12 files changed, 501 insertions(+) create mode 100644 attic_util/__init__.py create mode 100644 attic_util/bio_util.py create mode 100644 attic_util/tee.py create mode 100644 attic_util/time_benchmark.py create mode 100644 attic_util/util.py create mode 100644 dna2vec/generators.py create mode 100644 dna2vec/histogram.py create mode 100644 dna2vec/multi_k_model.py create mode 100644 requirements.txt create mode 100755 scripts/train_dna2vec.py create mode 100644 setup.cfg diff --git a/README.md b/README.md index 2a8ecec..8503151 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ # dna2vec + +This implementation has only been tested on Python 3.5.3, but we welcome any +contributions and bug reporting to make it more accessible. + +Contribute +--- +I would love for you to fork and send me pull request for this project. +Please contribute. + +License +--- +This software is licensed under the [MIT license](http://en.wikipedia.org/wiki/MIT_License) diff --git a/attic_util/__init__.py b/attic_util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/attic_util/bio_util.py b/attic_util/bio_util.py new file mode 100644 index 0000000..b6ef6b2 --- /dev/null +++ b/attic_util/bio_util.py @@ -0,0 +1,77 @@ +import numpy as np +import os +from Bio.Seq import Seq + +NUCLEOTIDES = 'ACGT' + +class Tuple4: + def __init__(self, pos1, pos2, neg1, neg2): + self.pos1 = pos1 + self.pos2 = pos2 + self.neg1 = neg1 + self.neg2 = neg2 + +def determine_out_filename(output_dir, fileroot, mode, extension='txt'): + return os.path.join(output_dir, '{}.{}.{}'.format(fileroot, mode, extension)) + +def create_tuple4(kmer1, kmer2, kmer1_neg, kmer2_neg): + """ + all inputs are list of single nucleotides, e.g. ['A', 'A', 'C'] + """ + return Tuple4( + ''.join(kmer1), + ''.join(kmer2), + ''.join(kmer1_neg), + ''.join(kmer2_neg)) + +def insert_snippet(seq, snippet, idx): + """ + idx: 0 <= idx <= len(seq)] + """ + split1 = seq[:idx] + split2 = seq[idx:] + return split1 + snippet + split2 + +def pairwise_key(v1, v2): + return '{}:{}'.format(v1, v2) + +def rand_kmer(rng, k_low, k_high=None): + """ + k_low and k_high are inclusive + """ + if k_high is None: + k_high = k_low + k_len = rng.randint(k_low, k_high + 1) + return ''.join([NUCLEOTIDES[x] for x in rng.randint(4, size=k_len)]) + +def rand_nt(rng): + return NUCLEOTIDES[rng.randint(4)] + +def generate_revcompl_pair(k_low, k_high=None, rng=None): + # TODO make params k_high and rng be required + if k_high is None: + k_high = k_low + if rng is None: + rng = np.random + kmer = rand_kmer(rng, k_low, k_high) + return (kmer, revcompl(kmer)) + +def revcompl(kmer): + return str(Seq(kmer).reverse_complement()) + +def generate_1nt_mutation_4tuple(rng, k_len): + kmer1 = list(rand_kmer(rng, k_len, k_len)) + kmer2 = list(rand_kmer(rng, k_len, k_len)) + + idx = rng.randint(len(kmer1)) + original_nt = kmer1[idx] + mutate_nt = rand_nt(rng) + + kmer1_neg = list(kmer1) + kmer1_neg[idx] = mutate_nt + + kmer2[idx] = mutate_nt + kmer2_neg = list(kmer2) + kmer2_neg[idx] = original_nt + + return create_tuple4(kmer1, kmer2, kmer1_neg, kmer2_neg) diff --git a/attic_util/tee.py b/attic_util/tee.py new file mode 100644 index 0000000..b9c9305 --- /dev/null +++ b/attic_util/tee.py @@ -0,0 +1,16 @@ +import sys + +class Tee(object): + def __init__(self, fptr): + self.file = fptr + + def __enter__(self): + self.stdout = sys.stdout + sys.stdout = self + + def __exit__(self, exception_type, exception_value, traceback): + sys.stdout = self.stdout + + def write(self, data): + self.file.write(data) + self.stdout.write(data) diff --git a/attic_util/time_benchmark.py b/attic_util/time_benchmark.py new file mode 100644 index 0000000..8122bd8 --- /dev/null +++ b/attic_util/time_benchmark.py @@ -0,0 +1,18 @@ +import time +import logbook + +class Benchmark(): + def __init__(self): + self.time_wall_start = time.time() + self.time_cpu_start = time.clock() + self.logger = logbook.Logger(self.__class__.__name__) + + def diff_time_wall_secs(self): + return (time.time() - self.time_wall_start) + + def print_time(self, label=''): + self.logger.info("%s wall=%.3fm cpu=%.3fm" % ( + label, + self.diff_time_wall_secs() / 60.0, + (time.clock() - self.time_cpu_start) / 60.0, + )) diff --git a/attic_util/util.py b/attic_util/util.py new file mode 100644 index 0000000..4deb572 --- /dev/null +++ b/attic_util/util.py @@ -0,0 +1,40 @@ +import random +import string +import resource +import logbook +import arrow +import numpy as np +import os + +def split_Xy(df, y_colname='label'): + X = df.drop([y_colname], axis=1) + y = df[y_colname] + return (X, y) + +def shuffle_tuple(tup, rng): + lst = list(tup) + rng.shuffle(lst) + return tuple(lst) + +def shuffle_dataframe(df, rng): + """ + this does NOT do in-place shuffling + """ + return df.reindex(rng.permutation(df.index)) + +def random_str(N): + return ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.ascii_uppercase + string.digits) for _ in range(N)) + +def memory_usage(): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1E6 + +def estimate_bytes(filenames): + return sum([os.stat(f).st_size for f in filenames]) + +def get_output_fileroot(dirpath, name, postfix): + return '{}/{}-{}-{}-{}'.format( + dirpath, + name, + arrow.utcnow().format('YYYYMMDD-HHmm'), + postfix, + random_str(3)) diff --git a/dna2vec/generators.py b/dna2vec/generators.py new file mode 100644 index 0000000..c990f74 --- /dev/null +++ b/dna2vec/generators.py @@ -0,0 +1,99 @@ +import logbook +import re +from Bio import SeqIO +from attic_util import util +from itertools import islice +import numpy as np + +def remove_empty(str_list): + return filter(bool, str_list) # fastest way to remove empty string + +class SeqFragmenter: + """ + Split a sequence into small sequences based on some criteria, e.g. 'N' characters + """ + def __init__(self): + pass + + def get_acgt_seqs(self, seq): + return remove_empty(re.split(r'[^ACGTacgt]+', str(seq))) + +class SlidingKmerFragmenter: + """ + Slide only a single nucleotide + """ + def __init__(self, k_low, k_high): + self.k_low = k_low + self.k_high = k_high + + def apply(self, rng, seq): + return [seq[i: i + rng.randint(self.k_low, self.k_high + 1)] for i in range(len(seq) - self.k_high + 1)] + +class SeqMapper: + def __init__(self, use_revcomp=True): + self.use_revcomp = use_revcomp + + def apply(self, rng, seq): + seq = seq.upper() + if self.use_revcomp and rng.rand() < 0.5: + return seq.reverse_complement() + else: + return seq + +class SeqGenerator: + def __init__(self, filenames, nb_epochs, seqlen_ulim=5000): + self.filenames = filenames + self.nb_epochs = nb_epochs + self.seqlen_ulim = seqlen_ulim + self.logger = logbook.Logger(self.__class__.__name__) + self.logger.info('Number of epochs: {}'.format(nb_epochs)) + + def filehandle_generator(self): + for curr_epoch in range(self.nb_epochs): + for filename in self.filenames: + with open(filename) as file: + self.logger.info('Opened file: {}'.format(filename)) + self.logger.info('Memory usage: {} MB'.format(util.memory_usage())) + self.logger.info('Current epoch: {} / {}'.format(curr_epoch + 1, self.nb_epochs)) + yield file + + def generator(self, rng): + for fh in self.filehandle_generator(): + # SeqIO takes twice as much memory than even simple fh.readlines() + for seq_record in SeqIO.parse(fh, "fasta"): + whole_seq = seq_record.seq + self.logger.info('Whole fasta seqlen: {}'.format(len(whole_seq))) + curr_left = 0 + while curr_left < len(whole_seq): + seqlen = rng.randint(self.seqlen_ulim // 2, self.seqlen_ulim) + segment = seq_record.seq[curr_left: seqlen + curr_left] + curr_left += seqlen + self.logger.debug('input seq len: {}'.format(len(segment))) + yield segment + +class KmerSeqIterable: + def __init__(self, rand_seed, seq_generator, mapper, seq_fragmenter, kmer_fragmenter, histogram): + self.logger = logbook.Logger(self.__class__.__name__) + self.seq_generator = seq_generator + self.mapper = mapper + self.kmer_fragmenter = kmer_fragmenter + self.seq_fragmenter = seq_fragmenter + self.histogram = histogram + self.rand_seed = rand_seed + self.iter_count = 0 + + def __iter__(self): + self.iter_count += 1 + rng = np.random.RandomState(self.rand_seed) + for seq in self.seq_generator.generator(rng): + seq = self.mapper.apply(rng, seq) + acgt_seq_splits = list(self.seq_fragmenter.get_acgt_seqs(seq)) + self.logger.debug('Splits of len={} to: {}'.format(len(seq), [len(f) for f in acgt_seq_splits])) + + for acgt_seq in acgt_seq_splits: + kmer_seq = self.kmer_fragmenter.apply(rng, acgt_seq) # list of strings + if len(kmer_seq) > 0: + if self.iter_count == 1: + # only collect stats on the first call + self.histogram.add(kmer_seq) + yield kmer_seq diff --git a/dna2vec/histogram.py b/dna2vec/histogram.py new file mode 100644 index 0000000..05e3afe --- /dev/null +++ b/dna2vec/histogram.py @@ -0,0 +1,27 @@ +from collections import Counter +import logbook + +class Histogram: + def __init__(self): + self.kmer_len_counter = Counter() + self.nb_kmers = 0 + self.logger = logbook.Logger(self.__class__.__name__) + + def add(self, seq): + """ + seq - array of k-mer string + """ + for kmer in seq: + self.kmer_len_counter[len(kmer)] += 1 + self.nb_kmers += 1 + + def print_stat(self, fptr): + for kmer_len in sorted(self.kmer_len_counter.keys()): + self.logger.info('Percent of {:2d}-mers: {:3.1f}% ({})'.format( + kmer_len, + 100.0 * self.kmer_len_counter[kmer_len] / self.nb_kmers, + self.kmer_len_counter[kmer_len], + )) + + total_bps = sum([l * c for l, c in self.kmer_len_counter.items()]) + self.logger.info('Number of base-pairs: {}'.format(total_bps)) diff --git a/dna2vec/multi_k_model.py b/dna2vec/multi_k_model.py new file mode 100644 index 0000000..1af9e66 --- /dev/null +++ b/dna2vec/multi_k_model.py @@ -0,0 +1,57 @@ +import logbook +import tempfile +import numpy as np + +from gensim.models import word2vec +from gensim import matutils + +class SingleKModel: + def __init__(self, model): + self.model = model + self.vocab_lst = sorted(model.vocab.keys()) + +class MultiKModel: + def __init__(self, filepath): + self.aggregate = word2vec.Word2Vec.load_word2vec_format(filepath, binary=False) + self.logger = logbook.Logger(self.__class__.__name__) + + vocab_lens = [len(vocab) for vocab in self.aggregate.vocab.keys()] + self.k_low = min(vocab_lens) + self.k_high = max(vocab_lens) + self.vec_dim = self.aggregate.vector_size + + self.data = {} + for k in range(self.k_low, self.k_high + 1): + self.data[k] = self.separate_out_model(k) + + def model(self, k_len): + """ + Use vector('ACGTA') when possible + """ + return self.data[k_len].model + + def vector(self, vocab): + return self.data[len(vocab)].model[vocab] + + def unitvec(self, vec): + return matutils.unitvec(vec) + + def cosine_distance(self, vocab1, vocab2): + return np.dot(self.unitvec(self.vector(vocab1)), self.unitvec(self.vector(vocab2))) + + def l2_norm(self, vocab): + return np.linalg.norm(self.vector(vocab)) + + def separate_out_model(self, k_len): + vocabs = [vocab for vocab in self.aggregate.vocab.keys() if len(vocab) == k_len] + if len(vocabs) != 4 ** k_len: + self.logger.warn('Missing {}-mers: {} / {}'.format(k_len, len(vocabs), 4 ** k_len)) + + header_str = '{} {}'.format(len(vocabs), self.vec_dim) + with tempfile.NamedTemporaryFile(mode='w') as fptr: + print(header_str, file=fptr) + for vocab in vocabs: + vec_str = ' '.join("%f" % val for val in self.aggregate[vocab]) + print('{} {}'.format(vocab, vec_str), file=fptr) + fptr.flush() + return SingleKModel(word2vec.Word2Vec.load_word2vec_format(fptr.name, binary=False)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3b94926 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +arrow==0.8.0 +biopython==1.68 +boto==2.46.1 +bz2file==0.98 +gensim==0.13.2 +Logbook==1.0.0 +numpy==1.11.1 +pep8==1.7.0 +python-dateutil==2.6.0 +requests==2.13.0 +scipy==0.19.0 +six==1.10.0 +smart-open==1.5.1 diff --git a/scripts/train_dna2vec.py b/scripts/train_dna2vec.py new file mode 100755 index 0000000..12a71df --- /dev/null +++ b/scripts/train_dna2vec.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 + +import sys +sys.path.append('../') + +import logbook +from logbook.compat import redirect_logging +import argparse +import numpy as np +from Bio import SeqIO +from attic_util.time_benchmark import Benchmark +from attic_util import util +from attic_util.tee import Tee +from dna2vec.histogram import Histogram +from dna2vec.generators import SeqGenerator, KmerSeqIterable, SeqMapper, SeqFragmenter +from dna2vec.generators import SlidingKmerFragmenter + +from gensim.models import word2vec + +""" +./scripts/train_dna2vec.py --inputs ../dataset/genomes2016/hg38/*.fa +""" + +class InvalidArgException(Exception): + pass + +class Learner: + def __init__(self, out_fileroot, context_halfsize, gensim_iters, vec_dim): + self.logger = logbook.Logger(self.__class__.__name__) + assert(word2vec.FAST_VERSION >= 0) + self.logger.info('word2vec.FAST_VERSION (should be >= 0): {}'.format(word2vec.FAST_VERSION)) + self.model = None + self.out_fileroot = out_fileroot + self.context_halfsize = context_halfsize + self.gensim_iters = gensim_iters + self.use_skipgram = 1 + self.vec_dim = vec_dim + + self.logger.info('Context window half size: {}'.format(self.context_halfsize)) + self.logger.info('Use skipgram: {}'.format(self.use_skipgram)) + self.logger.info('gensim_iters: {}'.format(self.gensim_iters)) + self.logger.info('vec_dim: {}'.format(self.vec_dim)) + + def train(self, kmer_seq_generator): + self.model = word2vec.Word2Vec( + sentences=kmer_seq_generator, + size=self.vec_dim, + window=self.context_halfsize, + min_count=5, + workers=4, + sg=self.use_skipgram, + iter=self.gensim_iters) + + # self.logger.info(model.vocab) + + def write_vec(self): + out_filename = '{}.w2v'.format(self.out_fileroot) + self.model.save_word2vec_format(out_filename, binary=False) + +def run_main(args, out_fileroot): + logbook.info(' '.join(sys.argv)) + if not args.debug: + import logging + logging.getLogger('gensim.models.word2vec').setLevel(logging.INFO) + + np.random.seed(args.rseed) + + benchmark = Benchmark() + + if args.kmer_fragmenter == 'disjoint': + kmer_fragmenter = DisjointKmerFragmenter(args.k_low, args.k_high) + elif args.kmer_fragmenter == 'sliding': + kmer_fragmenter = SlidingKmerFragmenter(args.k_low, args.k_high) + else: + raise InvalidArgException('Invalid kmer fragmenter: {}'.format(args.kmer_fragmenter)) + + logbook.info('kmer fragmenter: {}'.format(args.kmer_fragmenter)) + + histogram = Histogram() + kmer_seq_iterable = KmerSeqIterable( + args.rseed_trainset, + SeqGenerator(args.inputs, args.epochs), + SeqMapper(), + SeqFragmenter(), + kmer_fragmenter, + histogram, + ) + + learner = Learner(out_fileroot, args.context, args.gensim_iters, args.vec_dim) + learner.train(kmer_seq_iterable) + learner.write_vec() + + histogram.print_stat(sys.stdout) + + benchmark.print_time() + +def main(): + argp = argparse.ArgumentParser() + argp.add_argument('--kmer-fragmenter', help='disjoint or sliding', choices=['disjoint', 'sliding'], default='sliding') + argp.add_argument('--vec-dim', help='vector dimension', type=int, default=12) + argp.add_argument('--rseed', help='general np.random seed', type=int, default=7) + argp.add_argument('--rseed-trainset', help='random seed for generating training data', type=int, default=123) + argp.add_argument('--inputs', help='FASTA files', nargs='+', required=True) + argp.add_argument('--k-low', help='k-mer start range (inclusive)', type=int, default=5) + argp.add_argument('--k-high', help='k-mer end range (inclusive)', type=int, default=5) + argp.add_argument('--context', help='half size of context window (the total size is 2*c+1)', type=int, default=4) + argp.add_argument('--epochs', help='', type=int, default=1) + argp.add_argument('--gensim-iters', help="gensim's internal iterations", type=int, default=1) + argp.add_argument('--debug', help='', action='store_true') + args = argp.parse_args() + + if args.debug: + out_dir = '/tmp' + log_level = 'DEBUG' + else: + out_dir = '../dataset/dna2vec/results' + log_level = 'INFO' + + mbytes = util.estimate_bytes(args.inputs) // (10 ** 6) + out_fileroot = util.get_output_fileroot( + out_dir, + 'dna2vec', + 'k{}to{}-{}d-{}c-{}Mbp-{}'.format( + args.k_low, + args.k_high, + args.vec_dim, + args.context, + mbytes * args.epochs, # total Mb including epochs + args.kmer_fragmenter)) + + out_txt_filename = '{}.txt'.format(out_fileroot) + with open(out_txt_filename, 'w') as summary_fptr: + with Tee(summary_fptr): + logbook.StreamHandler(sys.stdout, level=log_level).push_application() + redirect_logging() + run_main(args, out_fileroot) + +if __name__ == '__main__': + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..652b22b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[pep8] +max-line-length = 140 +ignore = E302,E303,E226,E402