Skip to content

Commit

Permalink
more minor bugfixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jeicher committed Mar 8, 2017
1 parent b9ebe68 commit 3e5ac2e
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 14 deletions.
53 changes: 41 additions & 12 deletions seq2simulate/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def _simulate_deletions(sequences, freq=0.4, strip_deletions=True, max_length=12
seq_diffs.append([])
return sequences, seq_diffs

def _simulate_insertions(sequences, freq=0.2, max_length=100, min_length=15, no_frameshifts=True):
def _simulate_insertions(sequences, freq=0.2, max_length=200, min_length=30, no_frameshifts=True):
"""
Args:
sequences: list of DNA strings
Expand Down Expand Up @@ -238,17 +238,40 @@ def _simulate_insertions(sequences, freq=0.2, max_length=100, min_length=15, no_
max_length = len(seq)//2
if random.uniform(0, 1) <= freq:
if no_frameshifts:
ins_start = random.randrange(0, (len(seq)-min_length)//3)*3
ins_length = random.randint(0, min(max_length, len(seq)-ins_start)//3)*3
ins_start = (random.randrange(0, len(seq)-min_length)//3)*3
ins_length = (random.randint(0, min(max_length, len(seq)-ins_start))//3)*3
else:
ins_start = random.randrange(0, len(seq)-min_length)
ins_length = random.randint(0, min(max_length, len(seq)-ins_start))
sequences[i] = seq[0:ins_start] + ''.join([random.choice('ATGC') for insertion in range(ins_length)]) + seq[ins_start:]
seq_diffs.append([ins_start, ins_start+ins_length])
new_stops_introduced = True
while new_stops_introduced:
inserted_sequence = ''.join([random.choice('ATGC') for insertion in range(ins_length)])
new_sequence = seq[0:ins_start] + inserted_sequence + seq[ins_start:]
if _get_n_stop_codons(new_sequence) <= _get_n_stop_codons(seq):
new_stops_introduced = False
sequences[i] = new_sequence
seq_diffs.append([ins_start, ins_start+ins_length-1, inserted_sequence])
else:
seq_diffs.append([])
return sequences, seq_diffs


def _get_n_stop_codons(seq):
    """
    Count the stop codons (TAG, TAA, TGA) found in a sequence.

    Args:
        seq: DNA string; it is cut into codons by split_seq, so only
            triplets in the reading frame starting at index 0 are examined.

    Returns:
        Number of codons in seq that are one of TAG, TAA or TGA.
    """
    n_stops = 0
    for codon in split_seq(seq):
        # Comparison is exact and case-sensitive.
        if codon in ('TAG', 'TAA', 'TGA'):
            n_stops += 1
    return n_stops


def split_seq(seq):
    """
    Split a string sequence into a list of codons.

    The sequence is cut into consecutive 3-character pieces starting at
    index 0. If the length is not a multiple of 3, the trailing 1 or 2
    characters are returned as a final, shorter element.

    Args:
        seq: sequence string (any sliceable sequence works).

    Returns:
        List of pieces in order; an empty list for an empty sequence.
    """
    # Stepping the start index by 3 covers the trailing partial codon too,
    # and also works for sequences shorter than one codon (the previous
    # loop-based version referenced an unbound loop variable in that case).
    return [seq[start:start + 3] for start in range(0, len(seq), 3)]

def _simulate_frameshifts(sequences, freq=0.3, strip_deletions=True):
"""
Args:
Expand Down Expand Up @@ -324,13 +347,13 @@ def _closest_match(s1, s2):
def _sim_score(s1, s2):
return sum([i==j for i,j in zip(s1, s2)])

def _simulate_inversions(sequences, freq=0.3, max_length=100, min_length=5):
def _simulate_inversions(sequences, freq=0.3, max_length=200, min_length=15):
"""
Args:
sequences: list of DNA strings
freq: probability of insertion
max_length: of random insertion
min_length: of random insertion
freq: probability of inversion
max_length: of random inversion
min_length: of random inversion
Returns:
list of sequences
Expand All @@ -347,9 +370,15 @@ def _simulate_inversions(sequences, freq=0.3, max_length=100, min_length=5):
seq_diffs = []
for i, seq in enumerate(sequences):
if random.uniform(0, 1) <= freq:
ins_start = random.randrange(0, len(seq)-min_length)
ins_length = random.randint(0, min(max_length, len(seq)-ins_start))
sequences[i] = seq[0:ins_start] + seq[ins_start:ins_start + ins_length][::-1] + seq[ins_start + ins_length:]
new_stops_introduced = True
while new_stops_introduced:
ins_start = random.randrange(0, len(seq)-min_length)
ins_length = random.randint(min_length, min(max_length, len(seq)-ins_start))
new_sequence = seq[0:ins_start] + seq[ins_start:ins_start + ins_length][::-1] + seq[ins_start + ins_length:]
if _get_n_stop_codons(new_sequence) <= _get_n_stop_codons(seq):
new_stops_introduced = False
print('aweh', seq[ins_start:ins_start + ins_length][::-1])
sequences[i] = new_sequence
seq_diffs.append([ins_start, ins_start + ins_length])
else:
seq_diffs.append([])
Expand Down
4 changes: 2 additions & 2 deletions seq2simulate/run_make_proviral_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,8 @@ def _make_mutation_data_files(sequences, working_dir, hypermutation_rate=3):
'longdel': [diversity._simulate_deletions, {'freq': 0, 'no_frameshifts': True}],
'insertion': [diversity._simulate_insertions, {'freq': 0, 'no_frameshifts': True}],
'frameshift': [diversity._simulate_frameshifts, {'freq': 0}],
'stopcodon': [diversity._simulate_stop_codons, {'freq': 1}],
'inversion': [diversity._simulate_inversions, {'freq': 0}],
'stopcodon': [diversity._simulate_stop_codons, {'freq': 0}],
'inversion': [diversity._simulate_inversions, {'freq': 1}],
}

for i_mutation, mutation_type in enumerate(mutation_types):
Expand Down

0 comments on commit 3e5ac2e

Please sign in to comment.