Merge pull request #17 from kaitlinchaung/dev-z
updates to fast and slow modes
kaitlinchaung authored Apr 13, 2022
2 parents 1806cc8 + 9fdef32 commit 1e35c81
Showing 6 changed files with 258 additions and 203 deletions.
34 changes: 30 additions & 4 deletions bin/Anchors.py
@@ -337,28 +337,54 @@ def compute_target_distance(self, anchor, target, distance_type):
distance = min([utils.get_distance(target, t, distance_type) for t in topTargets])
return distance
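For context, compute_target_distance takes the minimum distance between a candidate target and the current top targets. A minimal sketch of that pattern, with a plain Hamming count standing in for utils.get_distance, whose implementation is not part of this diff:

```python
# Sketch only: utils.get_distance is not shown in this diff, so a plain
# Hamming count stands in for one possible distance_type.
def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))

topTargets = ["AAAA", "AATA", "GGGG"]  # hypothetical top targets
target = "AATT"                        # hypothetical candidate target

# Mirrors the min() over top targets used above.
distance = min(hamming(target, t) for t in topTargets)
print(distance)  # 1 (closest to "AATA")
```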

def get_blacklist_anchors(self):
def get_blacklist_anchors(self, use_std):
"""
Get sum of all scores and sort by sum descending
"""
df = pd.DataFrame(self).T
df['z'] = df.sum(axis=1)

if use_std:
df['z'] = (
df
.std(axis=1)
.abs()
)
else:
df['z'] = (
df
.sum(axis=1)
.abs()
)

anchors = (
df[df['z'] < df['z'].quantile(0.2)]
.index
.to_list()
)
return anchors
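To illustrate the new use_std branch with a toy score table (rows are anchors, columns are per-sample scores): std rewards spread across samples, sum rewards overall magnitude, and anchors in the bottom 20% of |z| are ignorelisted. A sketch, not part of the commit:

```python
import pandas as pd

# Hypothetical per-sample scores, keyed by anchor (mirrors pd.DataFrame(self).T).
scores = {
    "anchor_A": [1.0, 1.1, 0.9],   # flat across samples -> low std
    "anchor_B": [2.0, 2.5, 3.0],
    "anchor_C": [0.0, 1.0, 2.0],
    "anchor_D": [3.0, -3.0, 0.0],  # highly variable -> high std
}
df = pd.DataFrame(scores).T

use_std = True
if use_std:
    df["z"] = df.std(axis=1).abs()   # spread across samples
else:
    df["z"] = df.sum(axis=1).abs()   # overall magnitude

# Bottom 20% of |z| are the least interesting anchors.
blacklist = df[df["z"] < df["z"].quantile(0.2)].index.to_list()
print(blacklist)  # ['anchor_A'] with use_std=True
```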

def get_final_anchors(self, num_keep_anchors):
def get_final_anchors(self, num_keep_anchors, use_std):
"""
Get final anchors for parse_anchors
"""
print(self)
if len(self) < num_keep_anchors:
anchors = list(self.keys())
else:
df = pd.DataFrame(self).T
df['z'] = df.sum(axis=1)
if use_std:
df['z'] = (
df
.std(axis=1)
.abs()
)
else:
df['z'] = (
df
.sum(axis=1)
.abs()
)

anchors = (
df['z']
.sort_values(ascending=False)
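The diff folds the tail of get_final_anchors here; presumably the sorted series is then cut to the top num_keep_anchors. A self-contained sketch under that assumption:

```python
import pandas as pd

def get_final_anchors_sketch(scores_dict, num_keep_anchors, use_std):
    """Sketch of the full selection; the folded tail is assumed to keep
    the top num_keep_anchors entries."""
    if len(scores_dict) < num_keep_anchors:
        return list(scores_dict.keys())
    df = pd.DataFrame(scores_dict).T
    df["z"] = df.std(axis=1).abs() if use_std else df.sum(axis=1).abs()
    return (
        df["z"]
        .sort_values(ascending=False)
        .head(num_keep_anchors)   # assumption: the fold hides this cut
        .index
        .to_list()
    )
```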
222 changes: 46 additions & 176 deletions bin/get_anchors.py
@@ -112,8 +112,7 @@ def get_args():
def main():
args = get_args()

# read in samples from samplesheet
# [fastq_file, optional group_id]
# read in samples from samplesheet, formatted as [fastq_file, optional group_id]
sample_list = pd.read_csv(
args.samplesheet,
header=None
@@ -124,6 +123,7 @@ def main():
use_std = True
elif args.use_std == "false":
use_std = False
# if samplesheet only has 1 column, force use_std
if sample_list.shape[1] == 1:
use_std = True
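For reference, hypothetical samplesheet contents for the two accepted layouts; with one column there is no group_id, which is why use_std is forced:

```python
import io
import pandas as pd

# Hypothetical two-column samplesheet: [fastq_file, group_id].
two_col = "sample1.fastq.gz,1\nsample2.fastq.gz,2\n"

# Hypothetical one-column samplesheet: no group_id available.
one_col = "sample1.fastq.gz\nsample2.fastq.gz\n"

sample_list = pd.read_csv(io.StringIO(one_col), header=None)
assert sample_list.shape[1] == 1   # this shape forces use_std = True above
```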

@@ -136,54 +136,10 @@
datefmt='%Y-%m-%d %H:%M:%S'
)

"""logging"""
logging.info("--------------------------------------Parameters--------------------------------------")
logging.info(f'samplesheet = {args.samplesheet}')
logging.info(f'Number of targets per anchor required to calculate phase_1 score = {args.target_counts_threshold}')
logging.info(f'Number of total anchors required to calculate phase_1 score = {args.anchor_counts_threshold}')
logging.info(f'Keeplist condition: top {args.num_keep_anchors} anchor significance scores if more than {args.anchor_score_threshold} anchors with scores')
logging.info(f'Ignorelist condition: phase 1 scores with mean(S_i) < 3')
logging.info(f'Ignorelist condition: if more than {args.anchor_score_threshold} anchors with scores, scores in 40-60% percentile')
logging.info(f'Ignorelist condition: computing min_anchor_counts with c_type = {args.c_type}')
logging.info(f'Anchor freeze threshold = {args.anchor_freeze_threshold} anchors')
logging.info(f'Read freeze threshold = {args.read_freeze_threshold} anchors')
logging.info('')
# logging: print input parameters
utils.log_params(args, use_std)

logging.info(f'n_iterations = {args.n_iterations}')
logging.info(f'chunk_size = {args.max_reads}')
logging.info(f'kmer_size = {args.kmer_size}')
logging.info(f'target_counts_threshold = {args.target_counts_threshold}')
logging.info(f'anchor_counts_threshold = {args.anchor_counts_threshold}')
logging.info(f'anchor_freeze_threshold = {args.anchor_freeze_threshold}')
logging.info(f'read_freeze_threshold = {args.read_freeze_threshold}')
logging.info(f'anchor_score_threshold = {args.anchor_score_threshold}')
logging.info(f'anchor_mode = {args.anchor_mode}')
if args.anchor_mode == 'tile':
logging.info(f'window_slide = {args.window_slide}')
logging.info(f'c_type = {args.c_type}')
logging.info(f'lookahead = {args.lookahead}')
logging.info(f'num_keep_anchors = {args.num_keep_anchors}')
logging.info(f'use_std = {use_std}')
logging.info(f'compute_target_distance = {args.compute_target_distance}')
logging.info(f'distance_type = {args.distance_type}')

logging.info("--------------------------------------Parameters--------------------------------------")
logging.info('')

# initialise dataframe of logging values
stats = pd.DataFrame(
columns = [
'Run Time', 'Total Reads', 'Anchors with Scores',
'Phase 1 Total', 'Phase 1 Kept', 'Phase 1 Ignored',
'Phase 2 Total', 'Phase 2 topTargets', 'Phase 2 Other Targets',
'anchor_counts', 'anchor_targets_samples', 'anchor_topTargets_scores', 'anchor_topTargets_distances',
'ignorelist total', 'ignorelist abundance'
]
)
"""logging"""

# get list of samples from fastq_files
# if fastq_file = "file1.fastq.gz", sample = "file1"
# get list of samples from fastq_files (if fastq_file = "file1.fastq.gz", sample = "file1")
samples = (
sample_list
.iloc[:,0]
@@ -194,27 +150,19 @@
.tolist()
)
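The middle of this chain is folded out of the diff; per the comment, "file1.fastq.gz" becomes "file1". One plausible form of the hidden step, as a sketch:

```python
import pandas as pd

sample_list = pd.DataFrame({0: ["file1.fastq.gz", "file2.fastq.gz"]})

samples = (
    sample_list
    .iloc[:, 0]
    # assumption: the folded step strips the .fastq.gz suffix
    .str.replace(r"\.fastq\.gz$", "", regex=True)
    .tolist()
)
print(samples)  # ['file1', 'file2']
```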

# if not using standard deviation score, make dict of {sample : group_id}
group_ids_dict = {}
if not use_std or sample_list.shape[1] != 1:
group_ids = sample_list.iloc[:,1].tolist()
for i in range(0, len(samples)):
group_ids_dict[samples[i]] = group_ids[i]
else:
for i in range(0, len(samples)):
group_ids_dict[samples[i]] = 1

# create sample index dict
sample_index_dict = {}
for i in range(len(samples)):
sample_index_dict[samples[i]] = i
# get sample-related dicts
group_ids_dict, sample_index_dict = utils.get_sample_dict(
sample_list,
samples,
use_std
)
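The new utils.get_sample_dict helper is not shown in this diff, but the inline code it replaces (removed above) suggests its behavior. A sketch reconstructed from that removed code:

```python
def get_sample_dict(sample_list, samples, use_std):
    """Sketch reconstructed from the removed inline code; the real helper
    lives in utils and may differ."""
    group_ids_dict = {}
    if not use_std or sample_list.shape[1] != 1:
        # map each sample to its group_id from the second samplesheet column
        group_ids = sample_list.iloc[:, 1].tolist()
        for i, sample in enumerate(samples):
            group_ids_dict[sample] = group_ids[i]
    else:
        # std-based scoring: every sample gets the same dummy group
        for sample in samples:
            group_ids_dict[sample] = 1
    # map each sample to its column index
    sample_index_dict = {sample: i for i, sample in enumerate(samples)}
    return group_ids_dict, sample_index_dict
```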

# initialise objects
anchor_counts = Anchors.AnchorCounts(len(samples)) # {anchor : counts}
anchor_targets_samples = Anchors.AnchorTargetsSamples(sample_index_dict) # {anchor : {target : [sample_1_count, ...]}}
anchor_targets = Anchors.AnchorTargets() # {anchor : targets}
anchor_topTargets_scores = Anchors.AnchorTopTargetsScores() # {anchor : scores]}
anchor_topTargets_distances = Anchors.AnchorTopTargetsDistances() # {anchor : {targets : distance}}
anchor_status = Anchors.AnchorStatus() # [anchor1, anchor2]
status_checker = Anchors.StatusChecker(
anchor_counts,
@@ -229,22 +177,6 @@
# begin iterations
for iteration in range(1, args.n_iterations+1):

# initialise this for logging
ignorelist_anchors_score = []
# only continue accumulations if we have less than num_keep_anchors anchors with candidate scores
if len(anchor_topTargets_scores) >= args.anchor_score_threshold:

# ignorelist condition : ignorelist anchors that are in the bottom 20% of abs(sum(scores))
ignorelist_anchors_score = anchor_topTargets_scores.get_blacklist_anchors()
# update ignorelist
for anchor in ignorelist_anchors_score:
status_checker.update_ignorelist(anchor, read_counter_freeze)


# initialise loggging counters
ignore_summary_scores = 0
ignore_abundance = 0

# if we are at more than read_freeze_threshold reads, set the read_counter freeze so that we don't add any more anchors
# if we are at more than read_freeze_threshold reads, clear the ignorelist
num_reads = iteration * args.max_reads
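The freeze logic itself is folded out of the diff; the two comments above describe it. A sketch of what they imply, with all names treated as assumptions:

```python
def update_freeze_state(num_reads, read_freeze_threshold, ignorelist):
    """Sketch of the folded logic described by the comments above: past the
    threshold, stop admitting new anchors and clear the ignorelist. Names
    and details are assumptions, not the commit's actual code."""
    read_counter_freeze = num_reads >= read_freeze_threshold
    if read_counter_freeze:
        ignorelist.clear()
    return read_counter_freeze
```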
@@ -257,9 +189,8 @@
# get reads from this iteration's fastq files
read_chunk = utils.get_read_chunk(iteration, samples, args.n_iterations)

# accumulate for this iteration
start_time = time.time() # get step start time
phase_0, phase_1, phase_2, phase_1_compute_score, phase_1_ignore_score, phase_2_fetch_distance, phase_2_compute_distance, valid_anchor, invalid_anchor, ignorelisted_anchor, new_anchor = utils.get_iteration_summary_scores(
# accumulate anchors for this iteration
utils.get_iteration_summary_scores(
args.score_type,
iteration,
read_chunk,
@@ -283,105 +214,46 @@
group_ids_dict,
args.distance_type
)
run_time = time.time() - start_time # get step total run time

# ignorelist condition: ignorelist anchors that don't meet an abundance requirement
k = math.floor(num_reads / 100000)
if args.c_type == "num_samples":
c = len(samples)
elif args.c_type == "constant":
c = 1
# define min number of counts required per anchor to prevent ignorelisting
anchor_min_count = k * c
# get the anchors that do not meet the abundance requirement of anchor_min_count
ignorelist_anchors_abundance = anchor_counts.get_ignorelist_anchors(anchor_min_count)
# add those anchors to the ignorelist
for anchor in ignorelist_anchors_abundance:
status_checker.update_ignorelist(anchor, read_counter_freeze)
ignore_abundance += 1

# write out ignorelist
with open(f'ignorelist_iteration_{iteration}.tsv', 'w') as outfile:
outfile.write("\n".join(status_checker.ignorelist))


"""logging"""
try:
valid_anchor_percent = round((valid_anchor / (valid_anchor + invalid_anchor)) * 100, 2)
invalid_anchor_percent = round((invalid_anchor / (valid_anchor + invalid_anchor)) * 100, 2)
phase_0_percent = round((phase_0 / (phase_0+phase_1+phase_2)) * 100, 2)
phase_1_percent = round((phase_1 / (phase_0+phase_1+phase_2)) * 100, 2)
phase_2_percent = round((phase_2 / (phase_0+phase_1+phase_2)) * 100, 2)

except ZeroDivisionError:
valid_anchor_percent = 0
invalid_anchor_percent = 0
phase_0_percent = 0
phase_1_percent = 0
phase_2_percent = 0

logging.info(f'i = {iteration}')
logging.info(f'\tReads processed per file = {num_reads}')
logging.info(f'\tReads processed total = {num_reads * len(samples)}')
logging.info(f'\tRead_counter_freeze = {read_counter_freeze}')
logging.info("")
logging.info(f'\tIteration run time = {round(run_time, 2 )} seconds')
logging.info("")
logging.info(f'\tinvalid anchors = {invalid_anchor} ({invalid_anchor_percent}%)')
# logging.info(f'\t\tanchor was on the ignorelist = {ignorelisted_anchor}')
# logging.info(f'\t\tanchor is new and anchor_counter is full = {new_anchor}')
logging.info("")
logging.info(f'\tvalid anchors = {valid_anchor} ({valid_anchor_percent}%)')
logging.info(f'\t\tphase_0 anchors = {phase_0} ({phase_0_percent}%)')
logging.info(f'\t\tphase_1 anchors = {phase_1} ({phase_1_percent}%)')
logging.info(f'\t\t\tscore passed diversity condition = {phase_1_compute_score}')
logging.info(f'\t\t\tscore failed diversity condition = {phase_1_ignore_score}')
logging.info(f'\t\tphase_2 anchors = {phase_2} ({phase_2_percent}%)')
logging.info(f'\t\t\ttarget distance score is fetched = {phase_2_fetch_distance}')
logging.info(f'\t\t\ttarget distance score is computed = {phase_2_compute_distance}')
logging.info("")
logging.info(f'\tnumber of anchors with candidate scores = {len(anchor_topTargets_scores)}')
logging.info(f'\t\tsize of anchor_counts dict = {len(anchor_counts.total_counts)}')
logging.info(f'\t\tsize of all_target_counts dict = {len(anchor_counts.all_target_counts)}')
logging.info(f'\t\tsize of anchor_targets_samples dict = {len(anchor_targets_samples.counter)}')
logging.info(f'\t\tsize of anchor_topTargets_scores dict = {len(anchor_topTargets_scores)}')
logging.info(f'\t\tsize of anchor_topTargets_distances dict = {len(anchor_topTargets_distances)}')
logging.info("")
logging.info(f'\tignorelist size = {len(status_checker.ignorelist)}')
logging.info(f'\t\tanchors that have failed the abundance requirement = {len(ignorelist_anchors_abundance)}')
logging.info(f'\t\t\tabundance requirement = {anchor_min_count} minimum total anchors')
logging.info(f'\t\tanchors that have failed the diversity requirement = {phase_1_ignore_score}')
logging.info(f'\t\tanchors that have failed the score requirement = {len(ignorelist_anchors_score)}')
logging.info("")
logging.info("-----------------------------------------------------------------------------------------------------------------")
# scores threshold : only continue accumulations if we have less than num_keep_anchors anchors with candidate scores
# only do this iterative version if we are using the slow score
if args.score_type == "slow" and len(anchor_topTargets_scores) >= args.num_keep_anchors:
utils.ignorelist_scores(
anchor_topTargets_scores,
status_checker,
use_std,
read_counter_freeze
)
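utils.ignorelist_scores is new in this commit and its body is not part of this diff; the inline code it replaces (removed above) points to roughly this shape. A sketch:

```python
def ignorelist_scores(anchor_topTargets_scores, status_checker,
                      use_std, read_counter_freeze):
    """Sketch reconstructed from the removed inline code: ignorelist anchors
    in the bottom 20% of |z| scores. The real helper lives in utils."""
    ignorelist_anchors_score = anchor_topTargets_scores.get_blacklist_anchors(use_std)
    for anchor in ignorelist_anchors_score:
        status_checker.update_ignorelist(anchor, read_counter_freeze)
```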

# add row to stats df
stats = stats.append(
{
'Run Time' : run_time,
'Total Reads' : num_reads * len(samples),
'Anchors with Scores': len(anchor_topTargets_scores),
'Phase 1 Total': phase_1,
'Phase 1 Kept': phase_1_compute_score,
'Phase 1 Ignored': phase_1_ignore_score,
'Phase 2 Total': phase_2,
'Phase 2 topTargets': phase_2_fetch_distance,
'Phase 2 Other Targets': phase_2_compute_distance,
'anchor_counts': len(anchor_counts.total_counts),
'anchor_targets_samples': len(anchor_targets_samples.counter),
'anchor_topTargets_scores': len(anchor_topTargets_scores),
'anchor_topTargets_distances': len(anchor_topTargets_distances),
'ignorelist total': len(status_checker.ignorelist),
'ignorelist abundance': ignore_abundance
},
ignore_index=True
# abundance threshold: ignorelist anchors that don't meet an abundance requirement
utils.ignorelist_abundance(
anchor_counts,
status_checker,
num_reads,
args.c_type,
samples,
read_counter_freeze
)
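Similarly, utils.ignorelist_abundance replaces the removed inline abundance check; reconstructed from that removed code as a sketch:

```python
import math

def ignorelist_abundance(anchor_counts, status_checker, num_reads,
                         c_type, samples, read_counter_freeze):
    """Sketch reconstructed from the removed inline code; the real helper
    lives in utils."""
    # abundance floor scales with the number of reads seen so far
    k = math.floor(num_reads / 100000)
    c = len(samples) if c_type == "num_samples" else 1
    anchor_min_count = k * c
    # ignorelist anchors below the floor
    for anchor in anchor_counts.get_ignorelist_anchors(anchor_min_count):
        status_checker.update_ignorelist(anchor, read_counter_freeze)
```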
"""logging"""

## done with all iterations ##

if args.score_type == "fast":
for anchor in anchor_topTargets_scores:
# get scores for each anchor
scores, _, _ = utils.compute_phase_1_scores(
anchor_targets_samples.get_anchor_counts_df(anchor),
group_ids_dict,
args.kmer_size,
args.distance_type,
"force"
)

# updates scores for each anchor
anchor_topTargets_scores.initialise(anchor, scores)

# after all iterations are done accumulating, calculate the summary score
keep_anchors = anchor_topTargets_scores.get_final_anchors(args.num_keep_anchors)
keep_anchors = anchor_topTargets_scores.get_final_anchors(args.num_keep_anchors, use_std)

# filter for these anchors and drop any duplicates
final_anchors = (
@@ -392,8 +264,6 @@
## output this final anchors list
final_anchors.to_csv(args.outfile, index=False, sep='\t')
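The filter step above is partly folded; its comments say it keeps keep_anchors and drops duplicates before writing a TSV. A toy sketch of that final step, with hypothetical data:

```python
import pandas as pd

# Hypothetical anchor table and keep list.
counts_df = pd.DataFrame({"anchor": ["AAA", "AAA", "CCC", "GGG"]})
keep_anchors = ["AAA", "GGG"]

# Assumption: the folded chain filters to keep_anchors, then dedups.
final_anchors = (
    counts_df[counts_df["anchor"].isin(keep_anchors)]
    .drop_duplicates()
)
final_anchors.to_csv("final_anchors.tsv", index=False, sep="\t")
```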

# output run stats
stats.to_csv('run_stats.tsv', index=False, sep='\t')


main()