Merge pull request #17 from kaitlinchaung/dev-z
updates to fast and slow modes
kaitlinchaung authored Apr 13, 2022
2 parents 1806cc8 + 9fdef32 commit 1e35c81
Showing 6 changed files with 258 additions and 203 deletions.
34 changes: 30 additions & 4 deletions bin/Anchors.py
@@ -337,28 +337,54 @@ def compute_target_distance(self, anchor, target, distance_type):
distance = min([utils.get_distance(target, t, distance_type) for t in topTargets])
return distance
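For context, compute_target_distance takes the minimum distance between a candidate target and the current top targets. A minimal sketch of that pattern, with a plain Hamming count standing in for utils.get_distance, whose implementation is not part of this diff:

```python
# Sketch only: utils.get_distance is not shown in this diff, so a plain
# Hamming count stands in for one possible distance_type.
def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))

topTargets = ["AAAA", "AATA", "GGGG"]  # hypothetical top targets
target = "AATT"                        # hypothetical candidate target

# Mirrors the min() over top targets used above.
distance = min(hamming(target, t) for t in topTargets)
print(distance)  # 1 (closest to "AATA")
```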

def get_blacklist_anchors(self):
def get_blacklist_anchors(self, use_std):
"""
Get sum of all scores and sort by sum descending
"""
df = pd.DataFrame(self).T
df['z'] = df.sum(axis=1)

if use_std:
df['z'] = (
df
.std(axis=1)
.abs()
)
else:
df['z'] = (
df
.sum(axis=1)
.abs()
)

anchors = (
df[df['z'] < df['z'].quantile(0.2)]
.index
.to_list()
)
return anchors
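To illustrate the new use_std branch with a toy score table (rows are anchors, columns are per-sample scores): std rewards spread across samples, sum rewards overall magnitude, and anchors in the bottom 20% of |z| are ignorelisted. A sketch, not part of the commit:

```python
import pandas as pd

# Hypothetical per-sample scores, keyed by anchor (mirrors pd.DataFrame(self).T).
scores = {
    "anchor_A": [1.0, 1.1, 0.9],   # flat across samples -> low std
    "anchor_B": [2.0, 2.5, 3.0],
    "anchor_C": [0.0, 1.0, 2.0],
    "anchor_D": [3.0, -3.0, 0.0],  # highly variable -> high std
}
df = pd.DataFrame(scores).T

use_std = True
if use_std:
    df["z"] = df.std(axis=1).abs()   # spread across samples
else:
    df["z"] = df.sum(axis=1).abs()   # overall magnitude

# Bottom 20% of |z| are the least interesting anchors.
blacklist = df[df["z"] < df["z"].quantile(0.2)].index.to_list()
print(blacklist)  # ['anchor_A'] with use_std=True
```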

def get_final_anchors(self, num_keep_anchors):
def get_final_anchors(self, num_keep_anchors, use_std):
"""
Get final anchors for parse_anchors
"""
print(self)
if len(self) < num_keep_anchors:
anchors = list(self.keys())
else:
df = pd.DataFrame(self).T
df['z'] = df.sum(axis=1)
if use_std:
df['z'] = (
df
.std(axis=1)
.abs()
)
else:
df['z'] = (
df
.sum(axis=1)
.abs()
)

anchors = (
df['z']
.sort_values(ascending=False)
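The diff folds the tail of get_final_anchors here; presumably the sorted series is then cut to the top num_keep_anchors. A self-contained sketch under that assumption:

```python
import pandas as pd

def get_final_anchors_sketch(scores_dict, num_keep_anchors, use_std):
    """Sketch of the full selection; the folded tail is assumed to keep
    the top num_keep_anchors entries."""
    if len(scores_dict) < num_keep_anchors:
        return list(scores_dict.keys())
    df = pd.DataFrame(scores_dict).T
    df["z"] = df.std(axis=1).abs() if use_std else df.sum(axis=1).abs()
    return (
        df["z"]
        .sort_values(ascending=False)
        .head(num_keep_anchors)   # assumption: the fold hides this cut
        .index
        .to_list()
    )
```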
222 changes: 46 additions & 176 deletions bin/get_anchors.py
@@ -112,8 +112,7 @@ def get_args():
def main():
args = get_args()

# read in samples from samplesheet
# [fastq_file, optional group_id]
# read in samples from samplesheet, formatted as [fastq_file, optional group_id]
sample_list = pd.read_csv(
args.samplesheet,
header=None
@@ -124,6 +123,7 @@ def main():
use_std = True
elif args.use_std == "false":
use_std = False
# if samplesheet only has 1 column, force use_std
if sample_list.shape[1] == 1:
use_std = True
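For reference, hypothetical samplesheet contents for the two accepted layouts; with one column there is no group_id, which is why use_std is forced:

```python
import io
import pandas as pd

# Hypothetical two-column samplesheet: [fastq_file, group_id].
two_col = "sample1.fastq.gz,1\nsample2.fastq.gz,2\n"

# Hypothetical one-column samplesheet: no group_id available.
one_col = "sample1.fastq.gz\nsample2.fastq.gz\n"

sample_list = pd.read_csv(io.StringIO(one_col), header=None)
assert sample_list.shape[1] == 1   # this shape forces use_std = True above
```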

@@ -136,54 +136,10 @@
datefmt='%Y-%m-%d %H:%M:%S'
)

"""logging"""
logging.info("--------------------------------------Parameters--------------------------------------")
logging.info(f'samplesheet = {args.samplesheet}')
logging.info(f'Number of targets per anchor required to calculate phase_1 score = {args.target_counts_threshold}')
logging.info(f'Number of total anchors required to calculate phase_1 score = {args.anchor_counts_threshold}')
logging.info(f'Keeplist condition: top {args.num_keep_anchors} anchor significance scores if more than {args.anchor_score_threshold} anchors with scores')
logging.info(f'Ignorelist condition: phase 1 scores with mean(S_i) < 3')
logging.info(f'Ignorelist condition: if more than {args.anchor_score_threshold} anchors with scores, scores in 40-60% percentile')
logging.info(f'Ignorelist condition: computing min_anchor_counts with c_type = {args.c_type}')
logging.info(f'Anchor freeze threshold = {args.anchor_freeze_threshold} anchors')
logging.info(f'Read freeze threshold = {args.read_freeze_threshold} anchors')
logging.info('')
# logging: print input parameters
utils.log_params(args, use_std)

logging.info(f'n_iterations = {args.n_iterations}')
logging.info(f'chunk_size = {args.max_reads}')
logging.info(f'kmer_size = {args.kmer_size}')
logging.info(f'target_counts_threshold = {args.target_counts_threshold}')
logging.info(f'anchor_counts_threshold = {args.anchor_counts_threshold}')
logging.info(f'anchor_freeze_threshold = {args.anchor_freeze_threshold}')
logging.info(f'read_freeze_threshold = {args.read_freeze_threshold}')
logging.info(f'anchor_score_threshold = {args.anchor_score_threshold}')
logging.info(f'anchor_mode = {args.anchor_mode}')
if args.anchor_mode == 'tile':
logging.info(f'window_slide = {args.window_slide}')
logging.info(f'c_type = {args.c_type}')
logging.info(f'lookahead = {args.lookahead}')
logging.info(f'num_keep_anchors = {args.num_keep_anchors}')
logging.info(f'use_std = {use_std}')
logging.info(f'compute_target_distance = {args.compute_target_distance}')
logging.info(f'distance_type = {args.distance_type}')

logging.info("--------------------------------------Parameters--------------------------------------")
logging.info('')

# initialise dataframe of logging values
stats = pd.DataFrame(
columns = [
'Run Time', 'Total Reads', 'Anchors with Scores',
'Phase 1 Total', 'Phase 1 Kept', 'Phase 1 Ignored',
'Phase 2 Total', 'Phase 2 topTargets', 'Phase 2 Other Targets',
'anchor_counts', 'anchor_targets_samples', 'anchor_topTargets_scores', 'anchor_topTargets_distances',
'ignorelist total', 'ignorelist abundance'
]
)
"""logging"""

# get list of samples from fastq_files
# if fastq_file = "file1.fastq.gz", sample = "file1"
# get list of samples from fastq_files (if fastq_file = "file1.fastq.gz", sample = "file1")
samples = (
sample_list
.iloc[:,0]
@@ -194,27 +150,19 @@
.tolist()
)
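The middle of this chain is folded out of the diff; per the comment, "file1.fastq.gz" becomes "file1". One plausible form of the hidden step, as a sketch:

```python
import pandas as pd

sample_list = pd.DataFrame({0: ["file1.fastq.gz", "file2.fastq.gz"]})

samples = (
    sample_list
    .iloc[:, 0]
    # assumption: the folded step strips the .fastq.gz suffix
    .str.replace(r"\.fastq\.gz$", "", regex=True)
    .tolist()
)
print(samples)  # ['file1', 'file2']
```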

# if not using standard deviation score, make dict of {sample : group_id}
group_ids_dict = {}
if not use_std or sample_list.shape[1] != 1:
group_ids = sample_list.iloc[:,1].tolist()
for i in range(0, len(samples)):
group_ids_dict[samples[i]] = group_ids[i]
else:
for i in range(0, len(samples)):
group_ids_dict[samples[i]] = 1

# create sample index dict
sample_index_dict = {}
for i in range(len(samples)):
sample_index_dict[samples[i]] = i
# get sample-related dicts
group_ids_dict, sample_index_dict = utils.get_sample_dict(
sample_list,
samples,
use_std
)
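The new utils.get_sample_dict helper is not shown in this diff, but the inline code it replaces (removed above) suggests its behavior. A sketch reconstructed from that removed code:

```python
def get_sample_dict(sample_list, samples, use_std):
    """Sketch reconstructed from the removed inline code; the real helper
    lives in utils and may differ."""
    group_ids_dict = {}
    if not use_std or sample_list.shape[1] != 1:
        # map each sample to its group_id from the second samplesheet column
        group_ids = sample_list.iloc[:, 1].tolist()
        for i, sample in enumerate(samples):
            group_ids_dict[sample] = group_ids[i]
    else:
        # std-based scoring: every sample gets the same dummy group
        for sample in samples:
            group_ids_dict[sample] = 1
    # map each sample to its column index
    sample_index_dict = {sample: i for i, sample in enumerate(samples)}
    return group_ids_dict, sample_index_dict
```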

# initialise objects
anchor_counts = Anchors.AnchorCounts(len(samples)) # {anchor : counts}
anchor_targets_samples = Anchors.AnchorTargetsSamples(sample_index_dict) # {anchor : {target : [sample_1_count, ...]}}
anchor_targets = Anchors.AnchorTargets() # {anchor : targets}
anchor_topTargets_scores = Anchors.AnchorTopTargetsScores() # {anchor : scores]}
anchor_topTargets_distances = Anchors.AnchorTopTargetsDistances() # {anchor : {targets : distance}}
anchor_status = Anchors.AnchorStatus() # [anchor1, anchor2]
status_checker = Anchors.StatusChecker(
anchor_counts,
@@ -229,22 +177,6 @@
# begin iterations
for iteration in range(1, args.n_iterations+1):

# initialise this for logging
ignorelist_anchors_score = []
# only continue accumulations if we have less than num_keep_anchors anchors with candidate scores
if len(anchor_topTargets_scores) >= args.anchor_score_threshold:

# ignorelist condition : ignorelist anchors that are in the bottom 20% of abs(sum(scores))
ignorelist_anchors_score = anchor_topTargets_scores.get_blacklist_anchors()
# update ignorelist
for anchor in ignorelist_anchors_score:
status_checker.update_ignorelist(anchor, read_counter_freeze)


# initialise loggging counters
ignore_summary_scores = 0
ignore_abundance = 0

# if we are at more than read_freeze_threshold reads, set the read_counter freeze so that we don't add any more anchors
# if we are at more than read_freeze_threshold reads, clear the ignorelist
num_reads = iteration * args.max_reads
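The freeze logic itself is folded out of the diff; the two comments above describe it. A sketch of what they imply, with all names treated as assumptions:

```python
def update_freeze_state(num_reads, read_freeze_threshold, ignorelist):
    """Sketch of the folded logic described by the comments above: past the
    threshold, stop admitting new anchors and clear the ignorelist. Names
    and details are assumptions, not the commit's actual code."""
    read_counter_freeze = num_reads >= read_freeze_threshold
    if read_counter_freeze:
        ignorelist.clear()
    return read_counter_freeze
```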
@@ -257,9 +189,8 @@
# get reads from this iteration's fastq files
read_chunk = utils.get_read_chunk(iteration, samples, args.n_iterations)

# accumulate for this iteration
start_time = time.time() # get step start time
phase_0, phase_1, phase_2, phase_1_compute_score, phase_1_ignore_score, phase_2_fetch_distance, phase_2_compute_distance, valid_anchor, invalid_anchor, ignorelisted_anchor, new_anchor = utils.get_iteration_summary_scores(
# accumulate anchors for this iteration
utils.get_iteration_summary_scores(
args.score_type,
iteration,
read_chunk,
@@ -283,105 +214,46 @@
group_ids_dict,
args.distance_type
)
run_time = time.time() - start_time # get step total run time

# ignorelist condition: ignorelist anchors that don't meet an abundance requirement
k = math.floor(num_reads / 100000)
if args.c_type == "num_samples":
c = len(samples)
elif args.c_type == "constant":
c = 1
# define min number of counts required per anchor to prevent ignorelisting
anchor_min_count = k * c
# get the anchors that do not meet the abundance requirement of anchor_min_count
ignorelist_anchors_abundance = anchor_counts.get_ignorelist_anchors(anchor_min_count)
# add those anchors to the ignorelist
for anchor in ignorelist_anchors_abundance:
status_checker.update_ignorelist(anchor, read_counter_freeze)
ignore_abundance += 1

# write out ignorelist
with open(f'ignorelist_iteration_{iteration}.tsv', 'w') as outfile:
outfile.write("\n".join(status_checker.ignorelist))


"""logging"""
try:
valid_anchor_percent = round((valid_anchor / (valid_anchor + invalid_anchor)) * 100, 2)
invalid_anchor_percent = round((invalid_anchor / (valid_anchor + invalid_anchor)) * 100, 2)
phase_0_percent = round((phase_0 / (phase_0+phase_1+phase_2)) * 100, 2)
phase_1_percent = round((phase_1 / (phase_0+phase_1+phase_2)) * 100, 2)
phase_2_percent = round((phase_2 / (phase_0+phase_1+phase_2)) * 100, 2)

except ZeroDivisionError:
valid_anchor_percent = 0
invalid_anchor_percent = 0
phase_0_percent = 0
phase_1_percent = 0
phase_2_percent = 0

logging.info(f'i = {iteration}')
logging.info(f'\tReads processed per file = {num_reads}')
logging.info(f'\tReads processed total = {num_reads * len(samples)}')
logging.info(f'\tRead_counter_freeze = {read_counter_freeze}')
logging.info("")
logging.info(f'\tIteration run time = {round(run_time, 2 )} seconds')
logging.info("")
logging.info(f'\tinvalid anchors = {invalid_anchor} ({invalid_anchor_percent}%)')
# logging.info(f'\t\tanchor was on the ignorelist = {ignorelisted_anchor}')
# logging.info(f'\t\tanchor is new and anchor_counter is full = {new_anchor}')
logging.info("")
logging.info(f'\tvalid anchors = {valid_anchor} ({valid_anchor_percent}%)')
logging.info(f'\t\tphase_0 anchors = {phase_0} ({phase_0_percent}%)')
logging.info(f'\t\tphase_1 anchors = {phase_1} ({phase_1_percent}%)')
logging.info(f'\t\t\tscore passed diversity condition = {phase_1_compute_score}')
logging.info(f'\t\t\tscore failed diversity condition = {phase_1_ignore_score}')
logging.info(f'\t\tphase_2 anchors = {phase_2} ({phase_2_percent}%)')
logging.info(f'\t\t\ttarget distance score is fetched = {phase_2_fetch_distance}')
logging.info(f'\t\t\ttarget distance score is computed = {phase_2_compute_distance}')
logging.info("")
logging.info(f'\tnumber of anchors with candidate scores = {len(anchor_topTargets_scores)}')
logging.info(f'\t\tsize of anchor_counts dict = {len(anchor_counts.total_counts)}')
logging.info(f'\t\tsize of all_target_counts dict = {len(anchor_counts.all_target_counts)}')
logging.info(f'\t\tsize of anchor_targets_samples dict = {len(anchor_targets_samples.counter)}')
logging.info(f'\t\tsize of anchor_topTargets_scores dict = {len(anchor_topTargets_scores)}')
logging.info(f'\t\tsize of anchor_topTargets_distances dict = {len(anchor_topTargets_distances)}')
logging.info("")
logging.info(f'\tignorelist size = {len(status_checker.ignorelist)}')
logging.info(f'\t\tanchors that have failed the abundance requirement = {len(ignorelist_anchors_abundance)}')
logging.info(f'\t\t\tabundance requirement = {anchor_min_count} minimum total anchors')
logging.info(f'\t\tanchors that have failed the diversity requirement = {phase_1_ignore_score}')
logging.info(f'\t\tanchors that have failed the score requirement = {len(ignorelist_anchors_score)}')
logging.info("")
logging.info("-----------------------------------------------------------------------------------------------------------------")
# scores threshold : only continue accumulations if we have less than num_keep_anchors anchors with candidate scores
# only do this iterative version if we are using the slow score
if args.score_type == "slow" and len(anchor_topTargets_scores) >= args.num_keep_anchors:
utils.ignorelist_scores(
anchor_topTargets_scores,
status_checker,
use_std,
read_counter_freeze
)
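utils.ignorelist_scores is new in this commit and its body is not part of this diff; the inline code it replaces (removed above) points to roughly this shape. A sketch:

```python
def ignorelist_scores(anchor_topTargets_scores, status_checker,
                      use_std, read_counter_freeze):
    """Sketch reconstructed from the removed inline code: ignorelist anchors
    in the bottom 20% of |z| scores. The real helper lives in utils."""
    ignorelist_anchors_score = anchor_topTargets_scores.get_blacklist_anchors(use_std)
    for anchor in ignorelist_anchors_score:
        status_checker.update_ignorelist(anchor, read_counter_freeze)
```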

# add row to stats df
stats = stats.append(
{
'Run Time' : run_time,
'Total Reads' : num_reads * len(samples),
'Anchors with Scores': len(anchor_topTargets_scores),
'Phase 1 Total': phase_1,
'Phase 1 Kept': phase_1_compute_score,
'Phase 1 Ignored': phase_1_ignore_score,
'Phase 2 Total': phase_2,
'Phase 2 topTargets': phase_2_fetch_distance,
'Phase 2 Other Targets': phase_2_compute_distance,
'anchor_counts': len(anchor_counts.total_counts),
'anchor_targets_samples': len(anchor_targets_samples.counter),
'anchor_topTargets_scores': len(anchor_topTargets_scores),
'anchor_topTargets_distances': len(anchor_topTargets_distances),
'ignorelist total': len(status_checker.ignorelist),
'ignorelist abundance': ignore_abundance
},
ignore_index=True
# abundance threshold: ignorelist anchors that don't meet an abundance requirement
utils.ignorelist_abundance(
anchor_counts,
status_checker,
num_reads,
args.c_type,
samples,
read_counter_freeze
)
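Similarly, utils.ignorelist_abundance replaces the removed inline abundance check; reconstructed from that removed code as a sketch:

```python
import math

def ignorelist_abundance(anchor_counts, status_checker, num_reads,
                         c_type, samples, read_counter_freeze):
    """Sketch reconstructed from the removed inline code; the real helper
    lives in utils."""
    # abundance floor scales with the number of reads seen so far
    k = math.floor(num_reads / 100000)
    c = len(samples) if c_type == "num_samples" else 1
    anchor_min_count = k * c
    # ignorelist anchors below the floor
    for anchor in anchor_counts.get_ignorelist_anchors(anchor_min_count):
        status_checker.update_ignorelist(anchor, read_counter_freeze)
```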
"""logging"""

## done with all iterations ##

if args.score_type == "fast":
for anchor in anchor_topTargets_scores:
# get scores for each anchor
scores, _, _ = utils.compute_phase_1_scores(
anchor_targets_samples.get_anchor_counts_df(anchor),
group_ids_dict,
args.kmer_size,
args.distance_type,
"force"
)

# updates scores for each anchor
anchor_topTargets_scores.initialise(anchor, scores)

# after all iterations are done accumulating, calculate the summary score
keep_anchors = anchor_topTargets_scores.get_final_anchors(args.num_keep_anchors)
keep_anchors = anchor_topTargets_scores.get_final_anchors(args.num_keep_anchors, use_std)

# filter for these anchors and drop any duplicates
final_anchors = (
@@ -392,8 +264,6 @@
## output this final anchors list
final_anchors.to_csv(args.outfile, index=False, sep='\t')
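The filter step above is partly folded; its comments say it keeps keep_anchors and drops duplicates before writing a TSV. A toy sketch of that final step, with hypothetical data:

```python
import pandas as pd

# Hypothetical anchor table and keep list.
counts_df = pd.DataFrame({"anchor": ["AAA", "AAA", "CCC", "GGG"]})
keep_anchors = ["AAA", "GGG"]

# Assumption: the folded chain filters to keep_anchors, then dedups.
final_anchors = (
    counts_df[counts_df["anchor"].isin(keep_anchors)]
    .drop_duplicates()
)
final_anchors.to_csv("final_anchors.tsv", index=False, sep="\t")
```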

# output run stats
stats.to_csv('run_stats.tsv', index=False, sep='\t')


main()