Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
martinaryee committed Apr 25, 2023
2 parents e24cfb2 + c08a4a8 commit d7ec580
Show file tree
Hide file tree
Showing 12 changed files with 630 additions and 661 deletions.
Binary file added .DS_Store
Binary file not shown.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,15 @@

The guideseq package implements our data preprocessing and analysis pipeline for GUIDE-Seq data. It takes raw sequencing reads (FASTQ) and a parameter manifest file (.yaml) as input and produces a table of annotated off-target sites as output.

### References

##### The original paper describing the GUIDE-Seq method:

Tsai SQ, Zheng Z, Nguyen NT, Liebers M, Topkar VV, Thapar V, Wyvekens N, Khayter C, Iafrate AJ, Le LP, Aryee MJ, Joung JK. [GUIDE-seq enables genome-wide profiling of off-target cleavage by CRISPR-Cas nucleases](https://www.ncbi.nlm.nih.gov/pubmed/25513782). Nat Biotechnol. 2015 Feb;33(2):187-197

##### A description of this analysis package:
Tsai SQ, Topkar VV, Joung JK, Aryee MJ. [Open-source guideseq software for analysis of GUIDE-seq data](https://www.ncbi.nlm.nih.gov/pubmed/27153277). Nat Biotechnol. 2016 May 6;34(5):483

## Table of Contents
- [Features](#features)
- [Dependencies](#dependencies)
Expand Down Expand Up @@ -115,6 +124,11 @@ cd guideseq/test
guideseq.py all -m test_manifest.yaml
```xml
<appSettings>
... [LEAVE EXISTING LINES UNCHANGED] ...
<add key="CreateFastqForIndexReads" value="1"/>
</appSettings>
```

## Running the Full Analysis Pipeline<a name="full_pipeline"></a>
Expand Down Expand Up @@ -148,6 +162,8 @@ When running the end-to-end analysis functionality of the guideseq package, a nu
- `bwa`: The absolute path to the `bwa` executable
- `bedtools`: The absolute path to the `bedtools` executable
- `PAM`: PAM sequence (optional), default is NGG.
- `search_radius`: Search radius for search. Set to 10 for Cas9 and 75 for Cpf1.
- `max_mismatches`: The maximum number of mismatches allowed to report a sequence-matched off-target
- `undemultiplexed`: The absolute paths to the undemultiplexed paired end sequencing files. The required parameters are:
- `forward`: The absolute path to the FASTQ file containing the forward reads.
- `reverse`: The absolute path to the FASTQ file containing the reverse reads.
Expand Down Expand Up @@ -199,6 +215,8 @@ bwa: bwa
bedtools: bedtools
PAM: NGG
demultiplex_min_reads: 1000
window_size: 75
max_mismatches: 7
undemultiplexed:
forward: test/data/undemultiplexed/undemux.r1.fastq.gz
Expand Down
17 changes: 13 additions & 4 deletions guideseq/filterBackgroundSites.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
import subprocess
import os


def filterBackgroundSites(bedtools_path, sample_path, control_path, outfile):
output_folder = os.path.dirname(outfile)
if not os.path.exists(output_folder):
os.makedirs(output_folder)

bedtools_filter_command = '{0} intersect -a {1} -b {2}'.format(bedtools_path, sample_path, control_path)
sample_noHeader = os.path.join(os.path.dirname(sample_path), 'sample_noHeader.txt')
control_noHeader = os.path.join(os.path.dirname(control_path), 'control_noHeader.txt')

sample_noHeader_command = "sed '1d' {0} > {1}".format(sample_path, sample_noHeader)
control_noHeader_command = "sed '1d' {0} > {1}".format(control_path, control_noHeader)
clean_command = "rm {0} {1}".format(control_noHeader, sample_noHeader)
bedtools_filter_command = '{0} intersect -a {1} -b {2}'.format(bedtools_path, sample_noHeader, control_noHeader)

subprocess.check_call(sample_noHeader_command, shell=True, env=os.environ.copy())
subprocess.check_call(control_noHeader_command, shell=True, env=os.environ.copy())

with open(outfile, 'w') as outfile:
subprocess.call(bedtools_filter_command.split(), stdout=outfile)
with open(outfile, 'w') as output_file:
subprocess.check_call(bedtools_filter_command, shell=True, env=os.environ.copy(), stdout=output_file)
subprocess.check_call(clean_command, shell=True, env=os.environ.copy())
18 changes: 9 additions & 9 deletions guideseq/guideseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ def parseManifest(self, manifest_path):
else:
self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
# Allow the user to specify window size for off-target search
if 'window_size' in manifest_data:
self.window_size = manifest_data['window_size']
if 'search_radius' in manifest_data:
self.search_radius = manifest_data['search_radius']
else:
self.window_size = DEFAULT_WINDOW_SIZE
self.search_radius = DEFAULT_WINDOW_SIZE
# Allow the user to specify window size for off-target search
if 'max_score' in manifest_data:
self.max_score = manifest_data['max_score']
Expand Down Expand Up @@ -237,7 +237,7 @@ def identifyOfftargetSites(self):
self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')

identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
self.window_size, self.max_score)
self.search_radius, self.max_score)

logger.info('Finished identifying offtarget sites.')

Expand Down Expand Up @@ -339,7 +339,7 @@ def parse_args():
identify_parser.add_argument('--target_sequence', required=True)
identify_parser.add_argument('--description', required=False)
identify_parser.add_argument('--max_score', required=False, type=int, default=7)
identify_parser.add_argument('--window_size', required=False, type=int, default=25)
identify_parser.add_argument('--search_radius', required=False, type=int, default=25)

filter_parser = subparsers.add_parser('filter', help='Filter identified sites from control sites')
filter_parser.add_argument('--bedtools', required=True)
Expand Down Expand Up @@ -510,10 +510,10 @@ def main():
else:
max_score = 7

if 'window_size' in args:
window_size = args.window_size
if 'search_radius' in args:
search_radius = args.search_radius
else:
window_size = 25
search_radius = 25

g = GuideSeq()
g.output_folder = args.outfolder
Expand All @@ -522,7 +522,7 @@ def main():
g.samples = {sample: {'description': description, 'target': args.target_sequence}}
g.aligned = {sample: args.aligned}
g.max_score = max_score
g.window_size = window_size
g.search_radius = search_radius
g.identifyOfftargetSites()

elif args.command == 'filter':
Expand Down
Loading

0 comments on commit d7ec580

Please sign in to comment.