-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
executable file
·153 lines (121 loc) · 4.61 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import pandas as pd
import os
import re
import sys
from glob import glob
from inspect import cleandoc
from snakemake.utils import validate, min_version
min_version("7.9.0")
# the global config file
configfile: 'config.yaml'
# reproducible container image for docker/singularity
containerized: 'oras://ghcr.io/asgeissler/rnabrowser:0.1'
################################################################################
################################################################################
# Load and check samples configuration
sample_files = 'samples.csv'
# Check input file settings
if not os.path.isfile(sample_files):
raise Exception("Please provide a samples.csv file.")
sample_files = pd.read_csv(sample_files)
# found read files
found_files = glob('data/*.fastq.gz')
# the expected columns, 'pair' and 'reverse' are optional
expect = ['file', 'sample', 'condition', 'dataset']
if not all( i in sample_files.columns \
for i in expect ):
raise Exception(cleandoc("""
The sample.csv must contain the columns file, sample, condition,
and dataset.
The columns 'pair' and 'reverse' are optional.
"""))
# are all files described?
xs = ( 'data/' + i for i in sample_files['file'] )
xs = set(xs).symmetric_difference(found_files)
if len(xs) > 0:
raise Exception(cleandoc("""
The samples.csv and the *.fastq.gz file found in the folder data do
not match. These entries either lack a file or do not have a sample
entry:
{}
""").format(', '.join(xs)))
# Finally, make sure the string content won't make problems
ptn = re.compile('^[-A-Za-z0-9]*$')
for i in ['sample', 'condition', 'dataset']:
if any( ptn.match(j) is None \
for j in sample_files[i]):
raise Exception(cleandoc("""
Please use only alpha numeric [A-Za-z0-9] and a '-' symbol in
the dataset, sample, and condition values.
(No underscore or other special characters!)
"""))
# pairs correctly noted
if 'pair' in sample_files.columns:
if set(sample_files['pair']) != {'R1', 'R2'}:
raise Exception(cleandoc("""
Please specify the read pairs in R1 and R2 values.
"""))
x = sample_files.groupby('sample').size()
if any(x != 2):
raise Exception(cleandoc("""
Please specify both read pairs for all samples.
Missing for: {}
""").format(', '.join(x.index[x != 2])))
if 'reverse' in sample_files.columns:
if len(set(sample_files['reverse']) - {'yes', 'no'}) > 0:
raise Exception(cleandoc("""
Please use only yes/no in column reverse.
"""))
# convert to boolean
sample_files['reverse'] = sample_files['reverse'] == 'yes'
else:
# Per default do revers.
# Most common Illumina case seems that the read (paired-end read1) is
# reverse complementary mapping
sample_files['reverse'] = True
# assert that genome sequence and annotation exist
for i in ['genome.fna.gz', 'genome.gff.gz']:
if not os.path.isfile(i):
raise Exception('Please provide both genome.fna.gz and genome.gff.gz')
# select pair end mode if needed
pair_end = 'pair' in sample_files.columns
################################################################################
################################################################################
# Helper to write output/input of rules
def sample_wise(x):
"Expand wildcards in x as a generator"
return list(sample_files.apply(
lambda row: x.format(**row),
axis = 1
))
# pretty name for samples (excl. potential read-pair indication)
sample_files['name'] = sample_wise('{condition}_{sample}_{dataset}')
################################################################################
################################################################################
tasks = []
include: 'rules/00_symlink.smk'
include: 'rules/10_mapping.smk'
include: 'rules/20_samba.smk'
include: 'rules/30_coverage.smk'
include: 'rules/40_quant.smk'
include: 'rules/50_browser.smk'
################################################################################
rule multiqc:
input:
sample_wise("logs/bowtie2_map/{name}.log"),
sample_wise("analysis/40_subread/{name}.featureCounts.summary"),
output:
"multiqc/multiqc.html",
directory("multiqc/multiqc_data")
params:
""
log:
"logs/multiqc.log"
wrapper:
"v3.3.1/bio/multiqc"
################################################################################
# Target rules
rule all:
input:
*tasks,
"multiqc/multiqc.html"