Skip to content

Commit

Permalink
Merge branch 'alt_tag' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
martinaryee committed Apr 25, 2023
2 parents d7ec580 + 60d860b commit 76ab9a6
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 53 deletions.
11 changes: 6 additions & 5 deletions guideseq/alignReads.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger.propagate = False


def alignReads(BWA_path, HG19_path, read1, read2, outfile):
def alignReads(cores, BWA_path, genome_path, read1, read2, outfile):

sample_name = os.path.basename(outfile).split('.')[0]
output_folder = os.path.dirname(outfile)
Expand All @@ -24,14 +24,14 @@ def alignReads(BWA_path, HG19_path, read1, read2, outfile):

genome_indexed = True
for extension in index_files_extensions:
if not os.path.isfile(HG19_path + extension):
if not os.path.isfile(genome_path + extension):
genome_indexed = False
break

# If the genome is not already indexed, index it
if not genome_indexed:
logger.info('Genome index files not detected. Running BWA to generate indices.')
bwa_index_command = '{0} index {1}'.format(BWA_path, HG19_path)
bwa_index_command = '{0} index {1}'.format(BWA_path, genome_path)
logger.info('Running bwa command: %s', bwa_index_command)
subprocess.call(bwa_index_command.split())
logger.info('BWA genome index generated')
Expand All @@ -40,8 +40,9 @@ def alignReads(BWA_path, HG19_path, read1, read2, outfile):

# Run paired end alignment against the genome
logger.info('Running paired end mapping for {0}'.format(sample_name))
bwa_alignment_command = '{0} mem {1} {2} {3}'.format(BWA_path,
HG19_path,
bwa_alignment_command = '{0} mem -t {1} {2} {3} {4}'.format(BWA_path,
cores,
genome_path,
read1,
read2)

Expand Down
21 changes: 17 additions & 4 deletions guideseq/guideseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,30 @@ def parseManifest(self, manifest_path):
logger.info('Loading manifest...')

with open(manifest_path, 'r') as f:
manifest_data = yaml.load(f)
manifest_data = yaml.safe_load(f)

if not "cores" in manifest_data:
manifest_data['cores'] = 4

# Set default tag/primer sequences if not specified
if not "primer1" in manifest_data:
manifest_data['primer1'] = 'TTGAGTTGTCATATGTTAAT'
if not "primer2" in manifest_data:
manifest_data['primer2'] = 'ACATATGACAACTCAATTAA'

try:
# Validate manifest data
validation.validateManifest(manifest_data)

self.cores = manifest_data['cores']
self.BWA_path = manifest_data['bwa']
self.bedtools = manifest_data['bedtools']
self.reference_genome = manifest_data['reference_genome']
self.output_folder = manifest_data['output_folder']
self.undemultiplexed = manifest_data['undemultiplexed']
self.samples = manifest_data['samples']
self.primer1 = manifest_data['primer1']
self.primer2 = manifest_data['primer2']

except Exception as e:
logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
Expand Down Expand Up @@ -199,7 +211,8 @@ def alignReads(self):
self.aligned = {}
for sample in self.samples:
sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
alignReads(self.BWA_path,
alignReads(self.cores,
self.BWA_path,
self.reference_genome,
self.consolidated[sample]['read1'],
self.consolidated[sample]['read2'],
Expand Down Expand Up @@ -227,7 +240,7 @@ def identifyOfftargetSites(self):
annotations['Description'] = sample_data['description']
annotations['Targetsite'] = sample

if sample is 'control':
if sample == 'control':
annotations['Sequence'] = ''
else:
annotations['Sequence'] = sample_data['target']
Expand All @@ -237,7 +250,7 @@ def identifyOfftargetSites(self):
self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')

identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
self.search_radius, self.max_score)
self.search_radius, self.max_score, self.primer1, self.primer2)

logger.info('Finished identifying offtarget sites.')

Expand Down
88 changes: 45 additions & 43 deletions guideseq/identifyOfftargetSites.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,33 +291,32 @@ def alignSequences(targetsite_sequence, window_sequence, max_score=7):
"""
annotation is in the format:
"""
def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, max_score):

def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, max_score, primer1, primer2):
output_folder = os.path.dirname(outfile)
if not os.path.exists(output_folder):
os.makedirs(output_folder)

logger.info("Processing SAM file %s", sam_filename)
file = open(sam_filename, 'rU')
__, filename_tail = os.path.split(sam_filename)
chromosome_position = chromosomePosition(reference_genome)
for line in file:
fields = line.split('\t')
if len(fields) >= 10:
# These are strings--need to be cast as ints for comparisons.
full_read_name, sam_flag, chromosome, position, mapq, cigar, name_of_mate, position_of_mate, template_length, read_sequence, read_quality = fields[:11]
if int(mapq) >= 50 and int(sam_flag) & 128 and not int(sam_flag) & 2048:
# Second read in pair
barcode, count = parseReadName(full_read_name)
primer = assignPrimerstoReads(read_sequence, sam_flag)
if int(template_length) < 0: # Reverse read
read_position = int(position_of_mate) + abs(int(template_length)) - 1
strand = "-"
chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
elif int(template_length) > 0: # Forward read
read_position = int(position)
strand = "+"
chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
logger.info("Processing SAM file %s", sam_filename)
file = open(sam_filename, 'rU')
__, filename_tail = os.path.split(sam_filename)
chromosome_position = chromosomePosition(reference_genome)
for line in file:
fields = line.split('\t')
if len(fields) >= 10:
# These are strings--need to be cast as ints for comparisons.
full_read_name, sam_flag, chromosome, position, mapq, cigar, name_of_mate, position_of_mate, template_length, read_sequence, read_quality = fields[:11]
if int(mapq) >= 50 and int(sam_flag) & 128 and not int(sam_flag) & 2048:
# Second read in pair
barcode, count = parseReadName(full_read_name)
primer = assignPrimerstoReads(read_sequence, sam_flag, primer1, primer2)
if int(template_length) < 0: # Reverse read
read_position = int(position_of_mate) + abs(int(template_length)) - 1
strand = "-"
chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)
elif int(template_length) > 0: # Forward read
read_position = int(position)
strand = "+"
chromosome_position.addPositionBarcode(chromosome, read_position, strand, barcode, primer, count)

# Generate barcode position summary
stacked_summary = chromosome_position.SummarizeBarcodePositions()
Expand Down Expand Up @@ -394,18 +393,18 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius,
for key in sorted(output_dict.keys()):
print(*output_dict[key], sep='\t', file=f)

def assignPrimerstoReads(read_sequence, sam_flag):
# Get 20-nucleotide sequence from beginning or end of sequence depending on orientation
if int(sam_flag) & 16:
readstart = reverseComplement(read_sequence[-20:])
else:
readstart = read_sequence[:20]
if readstart == "TTGAGTTGTCATATGTTAAT":
return "primer1"
elif readstart == "ACATATGACAACTCAATTAA":
return "primer2"
else:
return "nomatch"
def assignPrimerstoReads(read_sequence, sam_flag, primer1, primer2):
len1 = len(primer1)
len2 = len(primer2)
# Get nucleotide sequence from beginning or end of sequence depending on orientation
if int(sam_flag) & 16:
read_sequence = reverseComplement(read_sequence)
if read_sequence[:len1] == primer1:
return "primer1"
elif read_sequence[:len2] == primer2:
return "primer2"
else:
return "nomatch"


def loadFileIntoArray(filename):
Expand Down Expand Up @@ -441,18 +440,21 @@ def reverseComplement(sequence):


def main():
parser = argparse.ArgumentParser(description='Identify off-target candidates from Illumina short read sequencing data.')
parser.add_argument('--ref', help='Reference Genome Fasta', required=True)
parser.add_argument('--samfile', help='SAM file', nargs='*')
parser.add_argument('--outfile', help='File to output identified sites to.', required=True)
parser = argparse.ArgumentParser(description='Identify off-target candidates from Illumina short read sequencing data.')
parser.add_argument('--ref', help='Reference Genome Fasta', required=True)
parser.add_argument('--samfile', help='SAM file', nargs='*')
parser.add_argument('--outfile', help='File to output identified sites to.', required=True)
parser.add_argument('--search_radius', help='Window around breakpoint to search for off-target', type=int, default=25)
parser.add_argument('--max_score', help='Score threshold', type=int, default=7)
parser.add_argument('--target', default='')
parser.add_argument('--max_score', help='Score threshold', type=int, default=7)
parser.add_argument('--primer1', help='Primer 1 sequence', default="TTGAGTTGTCATATGTTAAT")
parser.add_argument('--primer2', help='Primer 2 sequence', default="ACATATGACAACTCAATTAA")
# parser.add_argument('--demo')
parser.add_argument('--target', default='')

args = parser.parse_args()

annotations = {'Description': 'test description', 'Targetsite': 'dummy targetsite', 'Sequence': args.target}
analyze(args.samfile[0], args.ref, args.outfile, annotations, args.search_radius, args.max_score)
analyze(args.samfile[0], args.ref, args.outfile, annotations, args.search_radius, args.max_score, primer1, primer2)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion guideseq/umi
Submodule umi updated 3 files
+3 −3 consolidate.py
+7 −7 demultiplex.py
+1 −1 umitag.py

0 comments on commit 76ab9a6

Please sign in to comment.