; docs/rna/rna_pipeline.cfg

# Pipeline configuration documentation

# Note:
; This configuration file has 9 sections. Options in the `global` section can be used by all other sections,
; so you need to configure them. Some of these parameters have a default assignment mechanism,
; so you can leave them empty, such as [global.outdir | global.sample | global.thread | global.genomeDir].
; Some parameters carry a [required] annotation,
; which means they must be configured, such as [global.fq1 | global.fq2].
; There is one thing you need to know:
; the program uses the `global.genomeDir` parameter to determine whether to run the star index step.
; For a more detailed description, please see the annotation on each option.
; Another thing you need to know is how the barcode section operates:
; the barcode split step has two modes. One of them is the parametric mode,
; which requires `barcode.chemistry_config`; in this mode you
; can configure the validation of `barcode`, `linker`, `polyt` and `qual`. The other
; mode is the nonparametric mode, in which you can only validate `polyt` and `qual`. For more detailed
; information, see the annotation on each option.

[global]
; Output directory. It must already exist; if not provided, the current directory is used.
outdir=
; Sample name. If not provided, it is extracted from the input fq1 file name: split on `_` and take the first part.
sample=
; Number of threads to run with. If not provided, the default of 4 is used.
thread=
; Whether to gzip the output files. To output plain text files instead, change the value from `True` to `False`.
gzip=True
;[required] For 10X scRNA-seq; can take multiple comma-separated values. Doing this will treat all reads from the library,
; across flow cells, as one sample.
fq1=
;[required] RNA sequencing reads, same format as fq1.
fq2=
; Genome directory:
; if not provided
;  -> the current directory is used, and the star index step will run and write its output there.
;    In this case, you must provide the `[mkref]` options below.
; if provided,
;  -> if a `genome.config` file exists in the provided path, the star index has already been built
;    and the star index step will not run.
;  -> if no `genome.config` file exists in the provided path, you must provide
;    `mkref.fasta`, `mkref.gtf` and `mkref.genome_name` below.
genomeDir=

[mkref]
;[takes effect when the star index step needs to run] Genome fasta file.
fasta=
;[takes effect when the star index step needs to run] Genome gtf file.
gtf=
; Parameter for the star index step.
genomeSAindexNbases=14
; Controls the `-geneNameAsName2` option of the gtfToGenePred command:
;   - true: pass this option, and the output file will use the gene name as its first column.
;   - false: omit this option, and the output file will use the gene id as its first column.
; NOTE(review): other switches in this file are written `True`/`False`; confirm the parser accepts lowercase `true` here.
gene_name_as_name2=true
; Mitochondrial gene list file: a plain text file with one gene per line.
; If not provided, `mt-` and `MT-` are used to determine mitochondrial genes.
; If provided, use an absolute path or a path relative to `global.genomeDir`.
mt_gene_list=
;[takes effect when the star index step needs to run] Genome name.
genome_name=

[sample]
; Chemistry version.
; NOTE(review): how this option relates to `barcode.chemistry_name` is not described here; confirm before relying on it.
chemistry=

[barcode]
; The chemistry library can be specified in two ways:
; 1. Use the `chemistry_name` param. One library is currently preset, named
;    KitVersion1 (naming it `default` also takes effect).
;    In that case the `pattern` and `chemistry_config` parameters
;    do not take effect.
; 2. Use `pattern` and `chemistry_config` to specify the chemistry library.
;    Read the instructions below to learn how to use them.
chemistry_name=
;[required if chemistry_name is not provided] The pattern of the sequences, e.g. C8L6C8L6C8U8T30.
; The number after each letter is the number of bases;
; `C`, `L`, `U` and `T` represent `barcode`, `linker`, `UMI`
; and `poly T` respectively.
pattern=
;[takes effect only when running barcode split in parametric mode]
; If provided, use an absolute file path. The file content is similar to the example below:
;-------------------------------------------------------
;[chemistry]
;barcode=Barcode1.list,Barcode2.list,Barcode3.list
;link=Link1.list,Link2.list
;-------------------------------------------------------
; The barcode and link validation files are all plain text files with one sequence per line.
; Separate multiple files with commas. Finally, note that these files must be in
; the same directory as the chemistry_config file.
chemistry_config=
;[takes effect only when chemistry_config is provided] Linker validation switch; by default linkers are not validated.
; Change from `False` to `True` only when running in parametric mode; in this mode you must provide
; chemistry_config first!
; NOTE(review): the value set below is `True`, although the text above describes `False` as the default; confirm which is intended.
use_link_valid_reads=True
;[takes effect only when linker validation is on] The number of linker differences must be less than the provided value.
allow_link_diff_num=2
;[takes effect only when chemistry_config is provided] Barcode validation switch; by default barcodes are not validated.
; Change from `False` to `True` only when running in parametric mode; in this mode you must provide
; chemistry_config first!
; NOTE(review): the value set below is `True`, although the text above describes `False` as the default; confirm which is intended.
use_barcode_valid_reads=True
;[takes effect only when barcode validation is on] The number of barcode differences must be less than the provided value.
allow_barcode_diff_num=1
; Poly T validation switch, select from [False,True]; by default poly T is not validated.
; NOTE(review): the value set below is `True`, although the text above says validation is off by default; confirm which is intended.
use_polyt_valid_reads=True
;[takes effect only when poly T validation is on] The number of T bases in the polyT must be greater than the provided value times the polyT length.
polyt_rate=0.7
; Quality (QUAL) validation threshold. 0 means QUAL is not validated. If you provide a number greater than 0,
; each base of the barcode extracted from the sequences is checked against it:
; a base whose quality is lower than the provided value is regarded as a low-quality base.
low_qual=0
;[takes effect only when low_qual is not 0] Maximum allowed number of low-quality bases.
low_num=2
; Whether to output the r1 file, select from [`False`, `True`]; the default is False, which means the r1 file is not output.
output_r1=False

[cutadapt]
;[Optional] Additional adapter fasta file.
adapter_fasta=
; Discard processed reads that are shorter than this length.
minimum_length=20
; Quality trimming of reads produced with two-color chemistry (NextSeq).
; Some Illumina instruments use a two-color chemistry to encode the four bases;
; this includes the NextSeq and the NovaSeq.
; On those instruments, a `dark cycle` (with no detected color) encodes a G.
; However, dark cycles also occur when sequencing `falls off` the end of the fragment.
; The read then contains a run of high-quality but incorrect `G` calls at its 3' end.
; NOTE(review): presumably the value is the quality cutoff used for this NextSeq-specific trimming; confirm.
nextseq_trim=20
; Since cutadapt allows partial matches between the read and the adapter sequence,
; short matches can occur by chance, leading to erroneously trimmed bases.
; For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter.
; To reduce the number of falsely trimmed bases, the alignment algorithm requires that
; at least {overlap} bases match between adapter and read.
overlap=10
; Read2 insert length.
insert=150
;[Optional] Other cutadapt parameters, e.g. `-g XXX`.
cutadapt_param=

[star]
; Whether to output unmapped reads, select from [`True`,`False`]; the default is False, which means unmapped reads are not output.
out_unmapped=False
; An alignment is output only if its number of matched bases is higher than or equal to this value.
out_filter_match_n_min=0
; Maximum number of places a read is allowed to match.
out_filter_multimap_n_max=1
; Maximum memory that picard can use.
; NOTE(review): the unit is not stated here (presumably GB); confirm.
picard_mem=40
; Maximum memory that STAR can use.
; NOTE(review): the unit is not stated here (presumably GB); confirm.
star_mem=40
;[Optional] Additional parameters for the star software, e.g. --param1 value1 --param2 value2.
; You should be familiar with the star software before using this.
star_param=

[featureCounts]
; Specifies the attribute type in the GTF annotation. Meta-features used for read counting are
; extracted from the annotation using the provided value (gene or exon).
gtf_type=gene
;[Optional] Additional parameters for the featureCounts software.
feature_counts_param=

[count]
; Expected number of cells.
expected_cell_num=3000
; When correcting barcodes and UMIs, those whose UMI count is less than this value are
; discarded. Setting this param speeds up the run, but some data will be discarded.
n_umi_filter=5
; When correcting barcodes, low_umis_count/high_umis_count must be less than this value.
; If set to 1, low is merged into high for every matching case.
barcode_correct_limit=0.01
; When correcting UMIs, low count/high count must be less than this value.
; If set to 1, low is merged into high for every matching case.
umi_correct_limit=0.1
;[Optional] Force the cell number to be this number. The default is None; you can provide an integer value for this param.
force_cell_num=
; Choose from [`auto`, `EmptyDrops_CR`].
cell_calling_method=EmptyDrops_CR

[analysis]