Merge branch 'alt_tag' into dev

aryeelab · Apr 25, 2023 · 76ab9a6 · 76ab9a6
2 parents d7ec580 + 60d860b
commit 76ab9a6
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 53 deletions.
diff --git a/guideseq/alignReads.py b/guideseq/alignReads.py
@@ -10,7 +10,7 @@
 logger.propagate = False
 
 
-def alignReads(BWA_path, HG19_path, read1, read2, outfile):
+def alignReads(cores, BWA_path, genome_path, read1, read2, outfile):
 
     sample_name = os.path.basename(outfile).split('.')[0]
     output_folder = os.path.dirname(outfile)
@@ -24,14 +24,14 @@ def alignReads(BWA_path, HG19_path, read1, read2, outfile):
 
     genome_indexed = True
     for extension in index_files_extensions:
-        if not os.path.isfile(HG19_path + extension):
+        if not os.path.isfile(genome_path + extension):
             genome_indexed = False
             break
 
     # If the genome is not already indexed, index it
     if not genome_indexed:
         logger.info('Genome index files not detected. Running BWA to generate indices.')
-        bwa_index_command = '{0} index {1}'.format(BWA_path, HG19_path)
+        bwa_index_command = '{0} index {1}'.format(BWA_path, genome_path)
         logger.info('Running bwa command: %s', bwa_index_command)
         subprocess.call(bwa_index_command.split())
         logger.info('BWA genome index generated')
@@ -40,8 +40,9 @@ def alignReads(BWA_path, HG19_path, read1, read2, outfile):
 
     # Run paired end alignment against the genome
     logger.info('Running paired end mapping for {0}'.format(sample_name))
-    bwa_alignment_command = '{0} mem {1} {2} {3}'.format(BWA_path,
-                                                         HG19_path,
+    bwa_alignment_command = '{0} mem -t {1} {2} {3} {4}'.format(BWA_path,
+                                                         cores,
+                                                         genome_path,
                                                          read1,
                                                          read2)
 

diff --git a/guideseq/guideseq.py b/guideseq/guideseq.py
@@ -42,18 +42,30 @@ def parseManifest(self, manifest_path):
         logger.info('Loading manifest...')
 
         with open(manifest_path, 'r') as f:
-            manifest_data = yaml.load(f)
+            manifest_data = yaml.safe_load(f)
+
+        if not "cores" in manifest_data:
+            manifest_data['cores'] = 4
+
+        # Set default tag/primer sequences if not specified
+        if not "primer1" in manifest_data:
+            manifest_data['primer1'] = 'TTGAGTTGTCATATGTTAAT'
+        if not "primer2" in manifest_data:
+            manifest_data['primer2'] = 'ACATATGACAACTCAATTAA'
 
         try:
             # Validate manifest data
             validation.validateManifest(manifest_data)
 
+            self.cores = manifest_data['cores']
             self.BWA_path = manifest_data['bwa']
             self.bedtools = manifest_data['bedtools']
             self.reference_genome = manifest_data['reference_genome']
             self.output_folder = manifest_data['output_folder']
             self.undemultiplexed = manifest_data['undemultiplexed']
             self.samples = manifest_data['samples']
+            self.primer1 = manifest_data['primer1']
+            self.primer2 = manifest_data['primer2']
 
         except Exception as e:
             logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
@@ -199,7 +211,8 @@ def alignReads(self):
             self.aligned = {}
             for sample in self.samples:
                 sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
-                alignReads(self.BWA_path,
+                alignReads(self.cores,
+                           self.BWA_path,
                            self.reference_genome,
                            self.consolidated[sample]['read1'],
                            self.consolidated[sample]['read2'],
@@ -227,7 +240,7 @@ def identifyOfftargetSites(self):
                 annotations['Description'] = sample_data['description']
                 annotations['Targetsite'] = sample
 
-                if sample is 'control':
+                if sample == 'control':
                     annotations['Sequence'] = ''
                 else:
                     annotations['Sequence'] = sample_data['target']
@@ -237,7 +250,7 @@ def identifyOfftargetSites(self):
                 self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')
 
                 identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
-                                               self.search_radius, self.max_score)
+                                               self.search_radius, self.max_score, self.primer1, self.primer2)
 
             logger.info('Finished identifying offtarget sites.')
 

diff --git a/guideseq/identifyOfftargetSites.py b/guideseq/identifyOfftargetSites.py
@@ -291,33 +291,32 @@ def alignSequences(targetsite_sequence, window_sequence, max_score=7):
 """
 annotation is in the format:
 """
-def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, max_score):
-
+def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, max_score, primer1, primer2):
     output_folder = os.path.dirname(outfile)
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
-
-    logger.info("Processing SAM file %s", sam_filename)
-    file = open(sam_filename, 'rU')
-    __, filename_tail = os.path.split(sam_filename)
-    chromosome_position = chromosomePosition(reference_genome)
-    for line in file:
-        fields = line.split('\t')
-        if len(fields) >= 10:
-            # These are strings--need to be cast as ints for comparisons.
-            full_read_name, sam_flag, chromosome, position, mapq, cigar, name_of_mate, position_of_mate, template_length, read_sequence, read_quality = fields[:11]
-            if int(mapq) >= 50 and int(sam_flag) & 128 and not int(sam_flag) & 2048:
-                # Second read in pair
-                barcode, count = parseReadName(full_read_name)
-                primer = assignPrimerstoReads(read_sequence, sam_flag)
-                if int(template_length) < 0:  # Reverse read
-                    read_position = int(position_of_mate) + abs(int(template_length)) - 1
-                    strand = "-"
-                    chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
-                elif int(template_length) > 0:  # Forward read
-                    read_position = int(position)
-                    strand = "+"
-                    chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
+	
+	logger.info("Processing SAM file %s", sam_filename)
+	file = open(sam_filename, 'rU')
+	__, filename_tail = os.path.split(sam_filename)
+	chromosome_position = chromosomePosition(reference_genome)
+	for line in file:
+		fields = line.split('\t')
+		if len(fields) >= 10:
+			# These are strings--need to be cast as ints for comparisons.
+			full_read_name, sam_flag, chromosome, position, mapq, cigar, name_of_mate, position_of_mate, template_length, read_sequence, read_quality = fields[:11]
+			if int(mapq) >= 50 and int(sam_flag) & 128 and not int(sam_flag) & 2048:
+				# Second read in pair
+				barcode, count = parseReadName(full_read_name)
+				primer = assignPrimerstoReads(read_sequence, sam_flag, primer1, primer2)
+				if int(template_length) < 0:  # Reverse read
+					read_position = int(position_of_mate) + abs(int(template_length)) - 1
+					strand = "-"
+					chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
+				elif int(template_length) > 0:  # Forward read
+					read_position = int(position)
+					strand = "+"
+					chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
 
     # Generate barcode position summary
     stacked_summary = chromosome_position.SummarizeBarcodePositions()
@@ -394,18 +393,18 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius,
         for key in sorted(output_dict.keys()):
             print(*output_dict[key], sep='\t', file=f)
 
-def assignPrimerstoReads(read_sequence, sam_flag):
-    # Get 20-nucleotide sequence from beginning or end of sequence depending on orientation
-    if int(sam_flag) & 16:
-        readstart = reverseComplement(read_sequence[-20:])
-    else:
-        readstart = read_sequence[:20]
-    if readstart == "TTGAGTTGTCATATGTTAAT":
-        return "primer1"
-    elif readstart == "ACATATGACAACTCAATTAA":
-        return "primer2"
-    else:
-        return "nomatch"
+def assignPrimerstoReads(read_sequence, sam_flag, primer1, primer2):
+	len1 = len(primer1)
+	len2 = len(primer2)
+	# Get nucleotide sequence from beginning or end of sequence depending on orientation
+	if int(sam_flag) & 16:
+		read_sequence = reverseComplement(read_sequence)
+	if read_sequence[:len1] == primer1:
+		return "primer1"
+	elif read_sequence[:len2] == primer2:
+		return "primer2"
+	else:
+		return "nomatch"
 
 
 def loadFileIntoArray(filename):
@@ -441,18 +440,21 @@ def reverseComplement(sequence):
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Identify off-target candidates from Illumina short read sequencing data.')
-    parser.add_argument('--ref', help='Reference Genome Fasta', required=True)
-    parser.add_argument('--samfile', help='SAM file', nargs='*')
-    parser.add_argument('--outfile', help='File to output identified sites to.', required=True)
+	parser = argparse.ArgumentParser(description='Identify off-target candidates from Illumina short read sequencing data.')
+	parser.add_argument('--ref', help='Reference Genome Fasta', required=True)
+	parser.add_argument('--samfile', help='SAM file', nargs='*')
+	parser.add_argument('--outfile', help='File to output identified sites to.', required=True)
     parser.add_argument('--search_radius', help='Window around breakpoint to search for off-target', type=int, default=25)
-    parser.add_argument('--max_score', help='Score threshold', type=int, default=7)
-    parser.add_argument('--target', default='')
+	parser.add_argument('--max_score', help='Score threshold', type=int, default=7)
+	parser.add_argument('--primer1', help='Primer 1 sequence', default="TTGAGTTGTCATATGTTAAT")
+	parser.add_argument('--primer2', help='Primer 2 sequence', default="ACATATGACAACTCAATTAA")
+	# parser.add_argument('--demo')
+	parser.add_argument('--target', default='')
 
     args = parser.parse_args()
 
     annotations = {'Description': 'test description', 'Targetsite': 'dummy targetsite', 'Sequence': args.target}
-    analyze(args.samfile[0], args.ref, args.outfile, annotations, args.search_radius, args.max_score)
+    analyze(args.samfile[0], args.ref, args.outfile, annotations, args.search_radius, args.max_score, primer1, primer2)
 
 
 if __name__ == "__main__":

diff --git a/guideseq/umi b/guideseq/umi