Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
ZijieJin authored Mar 18, 2022
1 parent 2f8cfc1 commit fc65fa5
Showing 1 changed file with 257 additions and 0 deletions.
257 changes: 257 additions & 0 deletions scFusion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
import os
import sys
import argparse
import subprocess

lastd = sys.argv[0].rfind('/')
if lastd == -1:
programdir = './'
else:
programdir = sys.argv[0][:lastd] + '/'
parser = argparse.ArgumentParser(description='Single-cell Gene Fusion Detection (c) Zjie Jin', add_help=False)
subparsers = parser.add_subparsers(help='Select the function', required=True)
parser1 = subparsers.add_parser('BuildSTARIndex', help='Build the STAR Index', add_help=False)
group11 = parser1.add_argument_group('Required parameters')
group12 = parser1.add_argument_group('Optional parameters')
group11.add_argument("-s", "--STARIndex", help='The STAR index output folder.')
group11.add_argument("-g", "--Genome", help='The reference file (*.fasta or *.fa).')
group11.add_argument("-a", "--Annotation", help='The gtf annotation file (*.gtf).')
group12.add_argument("-t", "--Thread", default='8', help='Number of threads can be used, default is 8.')
parser2 = subparsers.add_parser('Rename', help='Rename files using consecutive numbers', add_help=False)
group2 = parser2.add_argument_group('Required parameters')
group2.add_argument("-f", "--FileDir", help='The folder of input data.')
parser3 = subparsers.add_parser('RestoreName', help='Restore file names.', add_help=False)
group3 = parser3.add_argument_group('Required parameters')
group3.add_argument("-f", "--FileDir", help='The folder of input data.')
parser4 = subparsers.add_parser('Index', help='Produce some necessary files.', add_help=False)
group4 = parser4.add_argument_group('Required parameters')
group4.add_argument("-d", "--GenomeDir", help='The genome index folder.')
group4.add_argument("-g", "--Genome", help='The reference file (*.fasta or *.fa).')
group4.add_argument("-a", "--Annotation", help='The gtf annotation file (*.gtf).')
parser5 = subparsers.add_parser('ReadMapping', help='Mapping pair-end reads using STAR.', add_help=False)
group51 = parser5.add_argument_group('Required parameters')
group52 = parser5.add_argument_group('Optional parameters')
group51.add_argument("-f", "--FileDir", help='The folder of input data.')
group51.add_argument("-b", "--Begin", help='The first index of input files.')
group51.add_argument("-e", "--End", help='The last index of input files.')
group51.add_argument("-s", "--STARIndex", help='The STAR index folder.')
group52.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files, default is FileDir.')
group52.add_argument("-t", "--Thread", default='8', help='Number of threads can be used, default is 8')
parser6 = subparsers.add_parser('ReadProcessing', help='Process the chimeric reads.', add_help=False)
group61 = parser6.add_argument_group('Required parameters')
group62 = parser6.add_argument_group('Optional parameters')
group61.add_argument("-d", "--GenomeDir", help='The genome index folder.')
group61.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
mappabilitygroup = group62.add_mutually_exclusive_group()
mappabilitygroup.add_argument("-m", "--Mappability", default=programdir + '/data/hg19mappability75.txt',
help='The mappability file, default is hg19mappability75.txt in the data folder.')
mappabilitygroup.add_argument("-M", "--NoMappabilityFilter", help='Keep all reads and do not apply mappability filter.',
action='store_true')
group61.add_argument("-b", "--Begin", help='The first index of input files.')
group61.add_argument("-e", "--End", help='The last index of input files.')
parser7 = subparsers.add_parser('FusionCandidate', help='Identify the fusion candidates', add_help=False)
group71 = parser7.add_argument_group('Required parameters')
group72 = parser7.add_argument_group('Optional parameters')
group71.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
group71.add_argument("-b", "--Begin", help='The first index of input files.')
group71.add_argument("-e", "--End", help='The last index of input files.')
group71.add_argument("-d", "--GenomeDir", help='The genome index folder.')
group72.add_argument("-p", "--Prefix", default='.',
help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
parser8 = subparsers.add_parser('Retrain', help='Retrain the network using current data.', add_help=False)
group81 = parser8.add_argument_group('Required parameters')
group82 = parser8.add_argument_group('Optional parameters')
group81.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
group82.add_argument("-p", "--Prefix", default='.',
help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
group82.add_argument("-w", "--Weight", default=programdir + '/data/weight-V9-2.hdf5',
help='The initial weight file of the deep-learning network, default is "weight-V9-2.hdf5" in the data folder.')
group82.add_argument("-c", "--Epoch", default='10',
help='The number of epochs in the retraining step, default is 10')
parser9 = subparsers.add_parser('ArtifactScoring', help='Find the artifacts', add_help=False)
group91 = parser9.add_argument_group('Required parameters')
group92 = parser9.add_argument_group('Optional parameters')
group91.add_argument("-d", "--GenomeDir", help='The genome index folder.')
group91.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
group91.add_argument("-b", "--Begin", help='The first index of input files.')
group91.add_argument("-e", "--End", help='The last index of input files.')
group92.add_argument("-p", "--Prefix", default='.',
help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
group92.add_argument("-w", "--Weight", default=programdir + '/data/weight-V9-2.hdf5',
help='The weight file used in the deep-learning network, default is "weight-V9-2.hdf5" in the data folder.')
parser10 = subparsers.add_parser('FusionReport', help='Report gene fusions.', add_help=False)
group101 = parser10.add_argument_group('Required parameters')
group102 = parser10.add_argument_group('Optional parameters')
group101.add_argument("-f", "--FileDir", help='The folder of input data.')
group101.add_argument("-d", "--GenomeDir", help='The genome index folder.')
group102.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files, default is FileDir.')
group101.add_argument("-b", "--Begin", help='The first index of input files.')
group101.add_argument("-e", "--End", help='The last index of input files.')
group102.add_argument("-p", "--Prefix", default='.',
help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
group102.add_argument("-v", "--PvalueCutoff", default='0.05', help='Pvalue(FDR) cutoff of the statistical model, default is 0.05.')
group102.add_argument("-n", "--ArtifactScoreCutoff", default='0.75', help='Artifact score cutoff, default is 0.75')
group102.add_argument("--LncRNAFilterOff", help='turn off the LncRNA filter', action='store_true')
group102.add_argument("--NoApprovedSymbolFilterOff", help='turn off the no-approved-symbol filter', action='store_true')

if len(sys.argv) == 1:
parser.print_help()
exit()

args = parser.parse_args()

if sys.argv[1] == 'Rename':
if not args.FileDir:
parser2.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system('python ' + programdir + '/bin/RenameFastqFiles.py ' + args.FileDir)
if aa == 0:
print('Successfully rename files to consecutive numbers!')
print('Please keep RenameList.txt carefully, or the file names can not be restored.')
else:
print('ERROR!!!!!')

if sys.argv[1] == 'RestoreName':
if not args.FileDir:
parser3.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system(
'python ' + programdir + '/bin/RenameFastqFiles.py ' + args.FileDir + ' ' + args.FileDir + '/RenameList.txt')
if aa == 0:
print('Successfully restore file names!')
else:
print('ERROR!!!!!')

if sys.argv[1] == 'BuildSTARIndex':
if not args.STARIndex or not args.Genome or not args.Annotation:
parser1.print_help()
print('Please specify all required parameters!')
exit()
os.system('STAR --runMode genomeGenerate --genomeDir ' + args.STARIndex + ' --runThreadN ' +
args.Thread + ' --genomeFastaFiles ' + args.Genome + ' --sjdbGTFfile ' +
args.Annotation)

if sys.argv[1] == 'Index':
if not args.GenomeDir or not args.Genome or not args.Annotation:
parser4.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system('mkdir -p ' + args.GenomeDir + ' && cp ' + args.Genome + ' ' + args.GenomeDir + '/ref.fa && cp ' +
args.Annotation + ' ' + args.GenomeDir + '/ref_annot.gtf && python ' + programdir +
'/bin/Addchr2gtf.py ' + args.GenomeDir + '/ref_annot.gtf > ' + args.GenomeDir +
'/ref_annot.gtf.added && python ' + programdir + '/bin/GetExonPos.py ' + args.GenomeDir +
'/ref_annot.gtf.added > ' + args.GenomeDir + '/exon_probe.hg19.gene.new.bed && python ' + programdir +
'/bin/GetGenePos.py ' + args.GenomeDir + '/ref_annot.gtf.added > ' + args.GenomeDir +
'/GenePos.txt && pyensembl install --reference-name GRCH37 --annotation-name my_genome_features --gtf '
+ args.GenomeDir + '/ref_annot.gtf.added')
if aa == 0:
print('Finish Indexing!')
else:
print('ERROR!!!!!')

if sys.argv[1] == 'ReadMapping':
if not args.OutDir and args.FileDir:
args.OutDir = args.FileDir
if not args.FileDir or not args.OutDir or not args.Begin or not args.End or not args.STARIndex:
parser5.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system(
'sh ' + programdir + '/bin/StarMapping_Chimeric.sh ' + args.FileDir + ' ' + args.Begin + ' ' + args.End + ' ' + args.OutDir +
'STARMapping/ ' + args.STARIndex + ' ' + args.Thread)
if aa == 0:
print('Finish mapping! Index: ' + args.Begin + ' ~ ' + args.End)
else:
print('ERROR!!!!!')

if sys.argv[1] == 'ReadProcessing':
if not args.OutDir or not args.Begin or not args.End or not args.GenomeDir or not args.NoMappabilityFilter and not args.Mappability:
parser6.print_help()
print('Please specify all required parameters!')
exit()
if args.NoMappabilityFilter:
aa = os.system(
'sh ' + programdir + '/bin/CombinePipeline_before_FS_NoMappa.sh ' + args.OutDir + ' ' + args.Begin + ' ' + args.End + ' ' +
args.GenomeDir + '/ref_annot.gtf.added ' + args.GenomeDir + '/exon_probe.hg19.gene.new.bed ' + programdir + '/bin/')
else:
aa = os.system(
'sh ' + programdir + '/bin/CombinePipeline_before_FS.sh ' + args.OutDir + ' ' + args.Begin + ' ' + args.End + ' ' +
args.GenomeDir + '/ref_annot.gtf.added ' + args.Mappability + ' ' + args.GenomeDir + '/exon_probe.hg19.gene.new.bed ' + programdir + '/bin/')
if aa == 0:
print('Finish Read Processing! Index: ' + args.Begin + ' ~ ' + args.End)
else:
print('ERROR!!!!!')

if sys.argv[1] == 'FusionCandidate':
if not args.OutDir or not args.Begin or not args.End or not args.GenomeDir:
parser7.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system(
'sh ' + programdir + '/bin/CombinePipeline_startwith_FS.sh ' + args.OutDir + ' ' + args.Begin + ' ' + args.End + ' ' + args.Prefix + ' ' + args.GenomeDir + '/ref.fa ' + args.GenomeDir + '/ref_annot.gtf.added ' + programdir + '/bin/')
if aa == 0:
print('Finish identifying fusion candidates')
else:
print('ERROR!!!!!')

if sys.argv[1] == 'Retrain':
if not args.OutDir:
parser8.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system(
'sh ' + programdir + '/bin/CombinePipeline_Retrain.sh ' + args.OutDir + ' ' + args.Prefix + ' ' + args.Weight + ' ' + args.Epoch + ' ' + programdir + '/bin/')
if aa == 0:
print('Finish Retraining!')
print('New weight file was saved at ' + args.OutDir + '/weights/RetrainWeight.hdf5')
else:
print('ERROR!!!!!')

if sys.argv[1] == 'ArtifactScoring':
if not args.OutDir or not args.Begin or not args.End or not args.GenomeDir:
parser9.print_help()
print('Please specify all required parameters!')
exit()
aa = os.system(
'sh ' + programdir + '/bin/CombinePipeline_Predict.sh ' + args.OutDir + ' ' + args.Begin + ' ' + args.End + ' ' + args.Prefix + ' ' +
args.Weight + ' ' + args.GenomeDir + '/ref.fa ' + args.GenomeDir + '/ref_annot.gtf.added ' + programdir + '/bin/')
if aa == 0:
print('Finish ArtifactScoring Step!')
else:
print('ERROR!!!!!')

if sys.argv[1] == 'FusionReport':
if not args.OutDir and args.FileDir:
args.OutDir = args.FileDir
if not args.FileDir or not args.OutDir or not args.Begin or not args.End or not args.GenomeDir:
parser10.print_help()
print('Please specify all required parameters!')
exit()
numcell = 0
for i in range(int(args.Begin), int(args.End) + 1):
if os.path.exists(args.FileDir + str(i) + '_2.fastq') or os.path.exists(args.FileDir + str(i) + '_2.fastq.gz'):
numcell += 1
if args.LncRNAFilterOff:
lncfilter = '1'
else:
lncfilter = '0'
if args.NoApprovedSymbolFilterOff:
nasfilter = '1'
else:
nasfilter = '0'
aa = os.system(
'sh ' + programdir + '/bin/CombinePipeline_startwith_ChiDist.sh ' + args.OutDir + ' ' + args.Prefix + ' ' + str(numcell) +
' ' + args.PvalueCutoff + ' ' + args.ArtifactScoreCutoff + ' ' + args.GenomeDir + '/ref_annot.gtf.added ' + args.GenomeDir + '/GenePos.txt ' + programdir + '/bin/ ' + lncfilter + ' ' + nasfilter)
if aa == 0:
if args.Prefix == '.':
args.Prefix = ''
print('Final Results are in ' + args.OutDir + '/FinalResult/' + args.Prefix + 'FinalOutput.abridged.txt')
os.system(
'cat ' + args.OutDir + '/FinalResult/' + args.Prefix + 'FinalOutput.abridged.txt > ' + args.OutDir + '/Result.abridged.txt')
os.system(
'cat ' + args.OutDir + '/FinalResult/' + args.Prefix + 'FinalOutput.full.txt > ' + args.OutDir + '/Result.full.txt')
os.system('mv ' + args.OutDir + '/FinalResult/ ' + args.OutDir + '/Resulttemp/')
else:
print('ERROR!!!!!')

0 comments on commit fc65fa5

Please sign in to comment.