Skip to content

Commit 80d6f6b

Browse files
authored
Add weight model (#154)
* update guppy and software version
1 parent 99abdcd commit 80d6f6b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+2178
-242
lines changed

.gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,9 @@ testdemo.py
5656
/src/nanome/nanocompare/utils/na12878.filelist.txt
5757
/guppy_basecaller-core-dump-db/
5858
/locations/
59+
/test_data/NA19240_RRBS_ENCFF000LZT_chr22.txt.gz
60+
/test_data/two_split2/ecoli_ci_basecalled.tar.gz
61+
/test_data/two_split2/ecoli_ci_test_fast5.tar.gz
62+
/test_data/multi_fast5_demo.tar.gz
63+
/test_data/na12878_chr22_p3_100.tar.gz
64+
/test_data/NA19240_RRBS_ENCFF000LZS_chr22.txt.gz

Dockerfile

+4-3
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@ MAINTAINER Yang Liu <[email protected]>
1313
LABEL description="Nanome project in Li Lab at The Jackson Laboratory" \
1414
1515

16-
# Guppy version
17-
ARG GUPPY_VERSION=6.1.5
18-
ARG REMORA_VERSION=1.1.0
16+
# Guppy version 6.4.x is not supported, because it has no fast5_out option
17+
# ont-remora 2.x is not supported, because pod5 requires Python 3.7+
18+
ARG GUPPY_VERSION=6.3.9
19+
ARG REMORA_VERSION=1.1.1
1920
ARG MEGALODON_VERSION=2.5.0
2021
ARG BUILD_PACKAGES="wget apt-transport-https procps git curl libnvidia-compute-460-server"
2122
ARG DEBIAN_FRONTEND="noninteractive"

docs/nanome-paper-cover.jpg

122 KB
Loading

docs/work-benchmark.tree.txt.gz

-1.29 MB
Binary file not shown.

environment.yml

+5-8
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,11 @@ dependencies:
4040
- pybedtools>=0.8.2 # nanome needs a newer version of pybedtools to deal with NAs
4141
- tqdm>=4.60 # needed by Megalodon, ref: https://github.com/nanoporetech/megalodon/issues/105
4242
- ont-tombo>=1.5.1
43-
- nanopolish>=0.13.2
43+
- nanopolish>=0.14.0
4444
- pip:
45-
- xgboost
46-
- ont-pyguppy-client-lib>=6.1.3
47-
- deepsignal==0.1.10
45+
- xgboost<=1.5.2 # loading the nanome model needs xgboost <=1.5.x
46+
- ont-pyguppy-client-lib==6.3.9
47+
- deepsignal>=0.2.0
4848
- fast5mod==1.0.5
49-
- nanome-jax>=2.0.8
49+
- nanome-jax>=2.0.10
5050
- megalodon
51-
#The conflict is caused by:
52-
# fast5mod 1.0.5 depends on ont-fast5-api==3.0.0, but it can be use higher
53-
# megalodon 2.3.4+ depends on ont-fast5-api>=3.2, it must be higher, so later install it

main.nf

+114-79
Original file line numberDiff line numberDiff line change
@@ -35,48 +35,34 @@ if (! params.input) exit 1, "Missing --input option for input data, check comma
3535
//if ( !file(params.input.toString()).exists() ) exit 1, "input does not exist, check params: --input ${params.input}"
3636

3737
// Parse genome params
38-
genome_map = params.genome_map
38+
gbl_genome_map = params.genome_map
3939

40-
if (genome_map[params.genome]) { genome_path = genome_map[params.genome] }
41-
else { genome_path = params.genome }
40+
gbl_genome_path = gbl_genome_map[params.genome] ? gbl_genome_map[params.genome] : params.genome
4241

4342
// infer dataType, chrSet based on reference genome name, hg - human, ecoli - ecoli, otherwise is other reference genome
4443
humanChrSet = 'chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY'
45-
if (params.genome.contains('hg') || (params.dataType && params.dataType == 'human')) {
46-
dataType = "human"
47-
if (!params.chrSet) {
48-
// default for human, if false or 'false' (string), using ' '
49-
chrSet = humanChrSet
50-
} else {
51-
chrSet = params.chrSet
52-
}
53-
} else if (params.dataType && params.dataType == 'mouse') {
54-
dataType = "mouse"
55-
if (!params.chrSet) {
56-
// default for human, if false or 'false' (string), using ' '
57-
chrSet = humanChrSet
58-
} else {
59-
chrSet = params.chrSet
60-
}
61-
} else if (params.genome.contains('ecoli') || (params.dataType && params.dataType == 'ecoli')) {
62-
dataType = "ecoli"
63-
if (!params.chrSet) {
64-
// default for ecoli
65-
chrSet = 'NC_000913.3'
66-
} else {
67-
chrSet = params.chrSet
68-
}
44+
45+
genome_basefn = (new File(params.genome)).name
46+
if (genome_basefn.startsWith('hg') || (params.dataType && params.dataType == 'human')) {
47+
dataType = params.dataType ? params.dataType : "human"
48+
// default for human chr
49+
chrSet = params.chrSet ? params.chrSet : humanChrSet
50+
} else if (genome_basefn.startsWith('mm') || (params.dataType && params.dataType == 'mouse') ){
51+
dataType = params.dataType ? params.dataType : "mouse"
52+
// default for mouse chr
53+
chrSet = params.chrSet ? params.chrSet : humanChrSet
54+
} else if (genome_basefn.startsWith('ecoli') || (params.dataType && params.dataType == 'ecoli')) {
55+
dataType = params.dataType ? params.dataType : "ecoli"
56+
chrSet = params.chrSet ? params.chrSet : 'NC_000913.3'
6957
} else {
70-
// default will not found name, use other
71-
if (!params.dataType) { dataType = 'other' } else { dataType = params.dataType }
72-
if (!params.chrSet) {
73-
// No default value for other reference genome
74-
exit 1, "Missing --chrSet option for other reference genome, please specify chromosomes used in reference genome [${params.genome}]"
75-
}
76-
chrSet = params.chrSet
58+
// if the data type cannot be inferred, use "other"
59+
dataType = params.dataType ? params.dataType : "other"
60+
61+
if (params.chrSet) chrSet = params.chrSet
62+
else exit 1, "Missing --chrSet option for other reference genome, please specify chromosomes used in reference genome [${params.genome}]"
7763
}
7864

79-
// chrSet1 and dataType1 is the infered params, defined from chrSet and dataType (not in scope of params)
65+
// chrSet1 and dataType1 are the inferred params, defined from chrSet and dataType (not in the scope of params); they will be used in every module
8066
params.chrSet1 = chrSet
8167
params.dataType1 = dataType
8268

@@ -85,7 +71,7 @@ projectDir = workflow.projectDir
8571
ch_utils = Channel.fromPath("${projectDir}/utils", type: 'dir', followLinks: false)
8672
ch_src = Channel.fromPath("${projectDir}/src", type: 'dir', followLinks: false)
8773

88-
// Reference genome, chom size file
74+
// Reference genome and chrom size file names, which will be used in every module
8975
params.referenceGenome = "${params.GENOME_DIR}/${params.GENOME_FN}"
9076
params.chromSizesFile = "${params.GENOME_DIR}/${params.CHROM_SIZE_FN}"
9177

@@ -106,23 +92,23 @@ if (params.input.endsWith(".filelist.txt")) {
10692
return file(it[0])
10793
}
10894
}
109-
.set{ inputCh }
95+
.set{ ch_inputs }
11096
} else if (params.input.contains('*') || params.input.contains('?')) {
11197
// match all files in the folder, note: input must be a quoted string '', to prevent it from being expanded in advance
11298
// such as --input '/fastscratch/liuya/nanome/NA12878/NA12878_CHR22/input_chr22/*'
11399
Channel.fromPath(params.input, type: 'any', checkIfExists: true)
114-
.set{ inputCh }
100+
.set{ ch_inputs }
115101
} else {
116102
// For single file/wildcard matched files
117-
Channel.fromPath( params.input, checkIfExists: true ).set{ inputCh }
103+
Channel.fromPath( params.input, checkIfExists: true ).set{ ch_inputs }
118104
}
119105

120106
// Header log info
121107
def summary = [:]
122108
summary['dsname'] = params.dsname
123109
summary['input'] = params.input
124110

125-
if (genome_map[params.genome] != null) { summary['genome'] = "${params.genome} - [${genome_path}]" }
111+
if (gbl_genome_map[params.genome]) { summary['genome'] = "${params.genome} - [${gbl_genome_path}]" }
126112
else { summary['genome'] = params.genome }
127113

128114
summary['\nRunning settings'] = "--------"
@@ -196,6 +182,11 @@ if (params.runMethcall && params.runDeepMod) {
196182
summary['DEEPMOD_RNN_MODEL'] = "${params.DEEPMOD_RNN_MODEL}"
197183
}
198184
}
185+
if (params.runNANOME) {
186+
summary['NANOME_MODEL'] = "${params.NANOME_MODEL}"
187+
summary['CS_MODEL_FILE'] = "${params.CS_MODEL_FILE}"
188+
summary['CS_MODEL_SPEC'] = "${params.CS_MODEL_SPEC}"
189+
}
199190

200191
summary['\nPipeline settings'] = "--------"
201192
summary['Working dir'] = workflow.workDir
@@ -284,46 +275,45 @@ include { DEEPSIGNAL; DPSIGCOMB } from './modules/DEEPSIGNAL'
284275

285276
include { DEEPSIGNAL2; DEEPSIGNAL2COMB } from './modules/DEEPSIGNAL2'
286277

287-
include { REPORT } from './modules/REPORT'
288-
289278
include { Guppy; GuppyComb; Tombo; TomboComb; DeepMod; DpmodComb; METEORE } from './modules/OLDTOOLS'
290279

291280
include { NewTool; NewToolComb } from './modules/NEWTOOLS'
292281

293282
include { CLAIR3; PHASING } from './modules/PHASING'
294283

284+
include { CONSENSUS } from './modules/CONSENSUS'
285+
286+
include { EVAL } from './modules/EVAL'
287+
288+
include { REPORT } from './modules/REPORT'
289+
290+
// placeholder channels, used when a channel needs an empty file
291+
null1 = Channel.fromPath("${projectDir}/utils/null1")
292+
null2 = Channel.fromPath("${projectDir}/utils/null2")
293+
null3 = Channel.fromPath("${projectDir}/utils/null3")
295294

296295
workflow {
297-
if ( !file(genome_path.toString()).exists() )
296+
if ( !file(gbl_genome_path.toString()).exists() )
298297
exit 1, "genome reference path does not exist, check params: --genome ${params.genome}"
299298

300-
genome_ch = Channel.fromPath(genome_path, type: 'any', checkIfExists: true)
299+
ch_genome = Channel.fromPath(gbl_genome_path, type: 'any', checkIfExists: true)
301300

302-
if (!params.rerioDir) { // default if null, will online downloading
303-
// This is only a place holder for input
304-
rerioDir = Channel.fromPath("${projectDir}/utils/null1", type: 'any', checkIfExists: false)
305-
} else {
306-
// User provide the dir
307-
if ( !file(params.rerioDir.toString()).exists() )
308-
exit 1, "rerioDir does not exist, check params: --rerioDir ${params.rerioDir}"
309-
rerioDir = Channel.fromPath(params.rerioDir, type: 'any', checkIfExists: true)
310-
}
301+
// rerio model dir will be downloaded in ENVCHECK if needed
302+
ch_rerio_dir = (params.rerio && params.rerioDir) ? Channel.fromPath(params.rerioDir, type: 'any', checkIfExists: true) :
303+
null1
311304

312-
if (! params.runDeepSignal) {
313-
// use null placeholder
314-
deepsignalDir = Channel.fromPath("${projectDir}/utils/null2", type: 'any', checkIfExists: true)
315-
} else if (!params.deepsignalDir) {
316-
// default if null, will online staging
317-
deepsignalDir = Channel.fromPath(params.DEEPSIGNAL_MODEL_ONLINE, type: 'any', checkIfExists: true)
305+
// deepsignal model dir will be downloaded in ENVCHECK if needed
306+
if (params.runDeepSignal) {
307+
ch_deepsignal_dir = params.deepsignalDir ?
308+
Channel.fromPath(params.deepsignalDir, type: 'any', checkIfExists: true) :
309+
Channel.fromPath(params.DEEPSIGNAL_MODEL_ONLINE, type: 'any', checkIfExists: true)
318310
} else {
319-
// User provide the dir
320-
if ( !file(params.deepsignalDir.toString()).exists() )
321-
exit 1, "deepsignalDir does not exist, check params: --deepsignalDir ${params.deepsignalDir}"
322-
deepsignalDir = Channel.fromPath(params.deepsignalDir, type: 'any', checkIfExists: true)
311+
// use null placeholder
312+
ch_deepsignal_dir = null2
323313
}
324314

325-
ENVCHECK(genome_ch, ch_utils, rerioDir, deepsignalDir)
326-
UNTAR(inputCh)
315+
ENVCHECK(ch_genome, ch_utils, ch_rerio_dir, ch_deepsignal_dir)
316+
UNTAR(ch_inputs)
327317

328318
if (params.runBasecall) {
329319
BASECALL(UNTAR.out.untar)
@@ -334,9 +324,12 @@ workflow {
334324
}
335325

336326
// Run resquiggle if Tombo or DeepSignal is used
337-
if (((params.runDeepSignal || params.runTombo || params.runDeepSignal2) && params.runMethcall) || params.runResquiggle) {
338-
// BASECALL.out.basecall.subscribe({ println("BASECALL.out.basecall: $it") })
339-
RESQUIGGLE(BASECALL.out.basecall, ENVCHECK.out.reference_genome)
327+
if (((params.runDeepSignal || params.runTombo || params.runDeepSignal2) && params.runMethcall)
328+
|| params.runResquiggle) {
329+
resquiggle = RESQUIGGLE(BASECALL.out.basecall, ENVCHECK.out.reference_genome)
330+
f1 = params.feature_extract ? resquiggle.feature_extract : Channel.empty()
331+
} else {
332+
f1 = Channel.empty()
340333
}
341334

342335
if (params.runNanopolish && params.runMethcall) {
@@ -373,12 +366,19 @@ workflow {
373366
}
374367

375368
if (params.runDeepSignal2 && params.runMethcall) {
376-
DEEPSIGNAL2(RESQUIGGLE.out.resquiggle.collect(),
369+
deepsignal2 = DEEPSIGNAL2(RESQUIGGLE.out.resquiggle.collect(),
377370
ENVCHECK.out.reference_genome,
378371
ch_src, ch_utils)
379-
DEEPSIGNAL2COMB(DEEPSIGNAL2.out.deepsignal2_combine_out,
372+
comb_deepsignal2 = DEEPSIGNAL2COMB(DEEPSIGNAL2.out.deepsignal2_combine_out,
380373
ch_src, ch_utils
381374
)
375+
f2 = deepsignal2.deepsignal2_feature_out
376+
s3_1 = comb_deepsignal2.site_unify
377+
r3_1 = comb_deepsignal2.read_unify
378+
} else {
379+
f2 = Channel.empty()
380+
s3_1 = Channel.empty()
381+
r3_1 = Channel.empty()
382382
}
383383

384384
if (params.runGuppy && params.runMethcall) {
@@ -453,23 +453,58 @@ workflow {
453453
r_new = Channel.empty()
454454
}
455455

456-
// Site level combine a list
457-
Channel.fromPath("${projectDir}/utils/null1").concat(
458-
s1, s2, s3, s4, s5, s6, s7, s_new
459-
).toList().set { tools_site_unify }
456+
null2.concat(
457+
r1, r2, r3, f1, f2
458+
).toList().set { top3_tools_read_unify }
460459

461-
Channel.fromPath("${projectDir}/utils/null2").concat(
462-
r1, r2, r3
460+
if (params.runNANOME) {
461+
consensus = CONSENSUS(top3_tools_read_unify, ch_src, ch_utils)
462+
s8 = consensus.site_unify
463+
r8 = consensus.read_unify
464+
} else {
465+
s8 = Channel.empty()
466+
r8 = Channel.empty()
467+
}
468+
469+
null2.concat(
470+
r1, r2, r3, r8, f1, f2
463471
).toList().set { tools_read_unify }
464472

465-
REPORT(tools_site_unify, tools_read_unify,
473+
// perform evaluation of tools' methylation results
474+
if (params.runEval) {
475+
bg1 = params.bg1 ? Channel.fromPath(params.bg1) : Channel.empty()
476+
bg2 = params.bg2 ? Channel.fromPath(params.bg2) : Channel.empty()
477+
478+
null1.concat(
479+
bg1, bg2
480+
).toList().set { bg_list }
481+
482+
if (params.genome_annotation_dir) {
483+
genome_annotation_ch = Channel.fromPath(params.genome_annotation_dir)
484+
} else {
485+
genome_annotation_ch = null3
486+
}
487+
488+
EVAL(tools_read_unify, bg_list, ch_src, ch_utils, genome_annotation_ch)
489+
}
490+
491+
// Site level combine a list
492+
null1.concat(
493+
s1, s2, s3, s4, s5, s6, s7, s_new, s8
494+
).toList().set { tools_site_unify }
495+
496+
REPORT(tools_site_unify, top3_tools_read_unify,
466497
ENVCHECK.out.tools_version_tsv, QCEXPORT.out.qc_report,
467498
ENVCHECK.out.reference_genome, ch_src, ch_utils)
468499

469500
if (params.phasing) {
470501
CLAIR3(QCEXPORT.out.bam_data, ENVCHECK.out.reference_genome)
471-
Channel.fromPath("${projectDir}/utils/null1").concat(
472-
MGLDNCOMB.out.megalodon_combine, REPORT.out.nanome_combine_out
502+
null1.concat(
503+
MGLDNCOMB.out.megalodon_combine,
504+
MGLDNCOMB.out.read_unify,
505+
CONSENSUS.out.nanome_combine_out,
506+
CONSENSUS.out.read_unify,
507+
NPLSHCOMB.out.nanopolish_combine_out_ch
473508
).toList().set { mega_and_nanome_ch }
474509
PHASING(mega_and_nanome_ch, CLAIR3.out.clair3_out_ch,
475510
ch_src, QCEXPORT.out.bam_data, ENVCHECK.out.reference_genome)

0 commit comments

Comments
 (0)