PaulaAlessio
diff --git a/‎CMakeLists.txt
+20-1 b/‎CMakeLists.txt
+20-1
diff --git a/‎README_trimFilter.md
+81-9 b/‎README_trimFilter.md
+81-9
diff --git a/‎doxygen_sqlite3.db
1.19 MB b/‎doxygen_sqlite3.db
1.19 MB
diff --git a/‎examples/bloomROC/create_ROC_png.R
+14 b/‎examples/bloomROC/create_ROC_png.R
+14
diff --git a/‎examples/bloomROC/example_ROC_0p005_bloom.csv
+15-15 b/‎examples/bloomROC/example_ROC_0p005_bloom.csv
+15-15
diff --git a/‎examples/bloomROC/example_ROC_0p005_bloom.pdf
-25 Bytes b/‎examples/bloomROC/example_ROC_0p005_bloom.pdf
-25 Bytes
diff --git a/‎examples/bloomROC/example_ROC_0p0075_bloom.csv
+15-15 b/‎examples/bloomROC/example_ROC_0p0075_bloom.csv
+15-15
diff --git a/‎examples/bloomROC/example_ROC_0p0075_bloom.pdf
-17 Bytes b/‎examples/bloomROC/example_ROC_0p0075_bloom.pdf
-17 Bytes
diff --git a/‎examples/bloomROC/example_ROC_0p01_bloom.csv
+16-16 b/‎examples/bloomROC/example_ROC_0p01_bloom.csv
+16-16
diff --git a/‎examples/bloomROC/example_ROC_0p01_bloom.pdf
-14 Bytes b/‎examples/bloomROC/example_ROC_0p01_bloom.pdf
-14 Bytes
diff --git a/‎examples/bloomROC/example_ROC_0p02_bloom.csv
+16-16 b/‎examples/bloomROC/example_ROC_0p02_bloom.csv
+16-16
diff --git a/‎examples/bloomROC/example_ROC_0p02_bloom.pdf
-5 Bytes b/‎examples/bloomROC/example_ROC_0p02_bloom.pdf
-5 Bytes
diff --git a/‎examples/fa_fq_files/adapter_even_long.fa
+2 b/‎examples/fa_fq_files/adapter_even_long.fa
+2
diff --git a/‎examples/fa_fq_files/adapter_even_short.fa
+2 b/‎examples/fa_fq_files/adapter_even_short.fa
+2
diff --git a/‎examples/fa_fq_files/adapter_odd_long.fa
+2 b/‎examples/fa_fq_files/adapter_odd_long.fa
+2
diff --git a/‎examples/fa_fq_files/adapter_odd_short.fa
+3 b/‎examples/fa_fq_files/adapter_odd_short.fa
+3
@@ -94,7 +94,7 @@ CONFIGURE_FILE(${CMAKE_SOURCE_DIR}/config.h.in
                ${CMAKE_SOURCE_DIR}/config.h [ESCAPE_QUOTES])
 
 # Set verbose 
-set(CMAKE_VERBOSE_MAKE ON)
+#set(CMAKE_VERBOSE_MAKE ON)
 
 # Set compiler flags
 set(CMAKE_C_FLAGS  "-Wall -O3 -march=native -std=c11") 
@@ -132,6 +132,7 @@ add_executable(trimFilter ${PROJECT_SOURCE_DIR}/trimFilter.c
             ${PROJECT_SOURCE_DIR}/io_trimFilter.c 
             ${PROJECT_SOURCE_DIR}/fa_read.c 
             ${PROJECT_SOURCE_DIR}/fq_read.c 
+            ${PROJECT_SOURCE_DIR}/adapters.c 
             ${PROJECT_SOURCE_DIR}/tree.c 
             ${PROJECT_SOURCE_DIR}/bloom.c 
             ${PROJECT_SOURCE_DIR}/city.c 
@@ -140,6 +141,24 @@ add_executable(trimFilter ${PROJECT_SOURCE_DIR}/trimFilter.c
             ${PROJECT_SOURCE_DIR}/Lmer.c
             ${PROJECT_SOURCE_DIR}/str_manip.c )
 
+# trimFilterDS executable: built from trimFilterDS.c and the sources listed explicitly below (no globbing, so new .c files must be added by hand).
+add_executable(trimFilterDS ${PROJECT_SOURCE_DIR}/trimFilterDS.c 
+            ${PROJECT_SOURCE_DIR}/init_trimFilter.c 
+            ${PROJECT_SOURCE_DIR}/io_trimFilter.c 
+            ${PROJECT_SOURCE_DIR}/fa_read.c 
+            ${PROJECT_SOURCE_DIR}/ds_read.c 
+            ${PROJECT_SOURCE_DIR}/fq_read.c 
+            ${PROJECT_SOURCE_DIR}/tree.c 
+            ${PROJECT_SOURCE_DIR}/bloom.c 
+            ${PROJECT_SOURCE_DIR}/city.c 
+            ${PROJECT_SOURCE_DIR}/adapters.c 
+            ${PROJECT_SOURCE_DIR}/trim.c 
+            ${PROJECT_SOURCE_DIR}/fopen_gen.c
+            ${PROJECT_SOURCE_DIR}/Lmer.c
+            ${PROJECT_SOURCE_DIR}/str_manip.c )
+
+
+         
 # Set linker flags
 set(CMAKE_C_LINK_FLAGS  "-lm " )
 add_executable(makeBloom ${PROJECT_SOURCE_DIR}/makeBloom.c 
 
@@ -92,7 +92,7 @@ Options:
 ## Output description
 
 - `O_PREFIX_good.fq.gz`: contains reads that passed all filters (maybe trimmed).
-- `O_PREFIX_adap.fq.gz`: contains discarded due to the presence of adapters.
+- `O_PREFIX_adap.fq.gz`: contains reads discarded due to the presence of adapters.
 - `O_PREFIX_cont.fq.gz`: contains contamination reads.
 - `O_PREFIX_lowQ.fq.gz`: contains reads discarded due to low quality issues.
 - `O_PREFIX_NNNN.fq.gz`: contains reads discarded due to *N*'s issues.
@@ -118,7 +118,64 @@ Options:
 
 #### Adapters
 
-TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO
+Technical sequences within the reads are detected if the option
+`--adapters <ADAPTERS.fa>:<mismatches>:<score>` is given. The
+adapter sequence(s) are read from the fasta file, and the 
+search is done using a 'seed and extend' approach. It starts by looking for
+16-nucleotide-long seeds, for which a user-defined number of mismatches is
+allowed (`mismatches`). If found, a score is computed. If the score is larger 
+than the user-defined threshold (`score`) and the number of matched 
+nucleotides exceeds 12, then the read is trimmed if the remaining part is 
+longer than `MINL` (user-defined) and discarded otherwise. If no 
+16-nucleotide-long seeds are found, we proceed with 8-nucleotide-long seeds 
+and apply the same criteria to trim/discard a read. A list of possible
+situations follows, to illustrate how it works (`MINL=25`, `mismatches=2`):
+
+```
+ADAPTER: CAAGCAGAAGACGGCATACGAG
+REV_COM: AGATCGGAAGAGCTCGTATGCC
+
+CASE1A:  CACAGTCGATCAGCGAGCAGGCATTCATGCTGAGATCGGAAGAGATCGTATG
+                                         ||||||||||||X|||----
+                                         AGATCGGAAGAGCTCGTATG
+         - Seed: 16 Nucleotides
+         - Return: trimmed, TRIMA:0:31
+CASE1B:  CACATCATCGCTAGCTATCGATCGATCGATGCTATGCAAGATCGGAAGAGCT
+                                               ||||||||------
+                                               AGATCGGAAGAGCT
+         - Seed: 8 Nucleotides
+         - Return: trimmed, TRIMA:0:37
+CASE1C:  CACATCATCGCTAGCTATCGATCGATCGATGCTATGCACGAAGATCGGAAGA
+                                                  ||||||||---
+                                                  AGATCGGAAGA
+         - Seed: 8 Nucleotides
+         - Return: nothing done, reason: Match length < 12
+CASE2A:  CATACATCACGAGCTAGCTAGAGATCGGAAGAGCTCGTATGCCCAGCATCGA
+                               ||||||||||||||||------
+                               AGATCGGAAGAGCTCGTATGCC
+         - Seed: 16 Nucleotides
+         - Return: discarded, reason: remaining read too short.
+CASE2B:  CCACAGTACAATACATCACGAGCTAGCTAGAGATCGGAAGAGCTCGTATGCA
+                                     ||||||||||||||||||||||
+                                     AGATCGGAAGAGCTCGTATGCC
+         - Seed: 16 Nucleotides
+         - Return: trimmed, TRIMA:0:28
+CASE3A:  TATGCCGTCTTCTGCTTGCAGTGCATGCTGATGCATGCTGCATGCTAGCTGC
+         ||||||||||||||||--
+         TATGCCGTCTTCTGCTTG
+         - Seed: 16 Nucleotides
+         - Return: discarded, reason: remaining read too short
+CASE3B:  CGTCTTCTGCTTGCCGATCGATGCTAGCTACGATCGTCGAGCTAGCTACGTG
+         ||||||||-----
+         CGTCTTCTGCTTG
+         - Seed: 8 Nucleotides
+         - Return: discarded, reason: remaining read too short
+CASE3C:  TCTTCTGCTTGCCGATCGATGCTAGCTACGATCGTCGAGCTAGCTACGTGCG
+         ||||||||---
+         TCTTCTGCTTG
+         - Seed: 8 Nucleotides
+         - Return: nothing done, reason: Match length < 12
+```
 
 #### Impurities
 
@@ -307,17 +364,32 @@ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
 
 ## Test/examples
 
- The examples in folder `examples/trimFilter_SReport/` works in the following
+ The examples in folder `examples/trimFilter_SReport/` work in the following
  way:
 
-1. See folder `fa_fq_files`. The file `EColi_rRNA.fq` was created with
+1. See folder `examples/fa_fq_files`. The file `EColi_rRNA.fq` was created with
  `create_fq.sh` and contains:                                               
    * 2e5 reads of length 50 from `EColi_genome.fa` with NO errors. 
    * 5e4 reads of length 50 from `rRNA_modified.fa` with NO errors 
      (rRNA contaminations).                                                  
    * Artificially generated reads with low quality score (see `create_fq.sh`)
-   * Artificially generated reads with Ns (see `create_fq.sh`).              
-2. `run_example_TREE.sh`: the code was tested with flags:                     
+   * Artificially generated reads with Ns (see `create_fq.sh`).
+   * Adapter files: `adapter_even_long.fa`, `adapter_odd_long.fa`, 
+   `adapter_even_short.fa`, `adapter_odd_short.fa`. Fasta files containing 
+    one adapter sequence each, longer/shorter than 16 nucleotides and with 
+    an even/odd length. 
+   * Example files to test the adapter contamination searches:
+   `human_[even/odd]_wad_[even/odd]_[long/short].fq`. Short fastq files where
+    adapter contaminations have been inserted in all possible ways:
+    even/odd positions, at the beginning/middle/end of the reads. Read 
+    lengths are even or odd as the first suffix indicates. The adapter 
+    contaminations included are suggested by the second even/odd suffix, 
+    and the long/short suffix. 
+2. `adapters/run_example.sh`: runs examples of reads containing adapter
+   contaminations. A set of different possibilities is covered. 
+   See the README file inside the folder `adapters`.
+
+3. `run_example_TREE.sh`: the code was tested with flags:                     
    ```
     $ ../../bin/trimFilter -l 50 --ifq PATH/TO/EColi_rRNA.fq.gz 
     --method TREE --ifa PATH/TO/rRNA_modified.fa:0.9:50 
@@ -326,13 +398,13 @@ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
    i.e., we check for contaminations from rRNA, trim reads with lowQ at
    the ends and less than 5% in the remaining part, and strip reads
    containing N's. The output should coincide with the files `example_TREE*`                 
-3. `run_example_BLOOM.sh`:                                                    
+4. `run_example_BLOOM.sh`:                                                    
   * bloom filter is generated for `rRNA_modified.fa` with FPR = 0.0075
     and `kmersize=25`. The output should coincide with `rRNA_example.bf*`.
   * trimFilter is run like in 3. but passing a bloom filter to look for
     contaminations with `score=0.4`. 
-4. `run_example_SA.sh`: TODO                                      
-5. With this set up, it is possible to run further customized tests.         
+5. `run_example_SA.sh`: TODO                                      
+6. With this set up, it is possible to run further customized tests.         
 
 **NOTE:** `rRNA_modified.fa` is the `rRNA_CRUnit.fa` sequence, where we have     
         removed the lines containing N's for testing purposes.                
 
@@ -0,0 +1,14 @@
+# Generates one ROC plot (ROC_<tag>_bloom.png) per tag from example_ROC_<tag>_bloom.csv in the working directory.
+tags <-  c( "0p0075")
+for (FPR_text in tags) {
+   bloom <- read.csv(paste0("example_ROC_",FPR_text,"_bloom.csv"))
+   FPR = bloom[,4]   # x axis; presumably the FP column once read.csv takes the unnamed first field as row names - confirm
+   TPR = bloom[,2]   # y axis; presumably the TP column - confirm
+   # Use the png() device so the file content matches its .png name (pdf() wrote PDF data into it).
+   png(paste0("ROC_",FPR_text,"_bloom.png"))
+   plot(FPR,TPR, main=paste("ROC curves with option -p", sub("p",".",FPR_text,fixed=TRUE)), 
+     xlab="False positive rate",ylab="sensitivity", 
+     xlim = range(FPR), ylim = range(TPR),
+     type="o", col="blue")
+   dev.off()
+}
@@ -1,17 +1,17 @@
 FN,TP,TN,FP
-0.050000,0.050470,0.949530,0.999420,0.000580
-0.060000,0.050670,0.949330,0.999600,0.000400
-0.070000,0.051010,0.948990,0.999690,0.000310
-0.080000,0.051850,0.948150,0.999820,0.000180
-0.090000,0.052440,0.947560,0.999850,0.000150
-0.100000,0.052960,0.947040,0.999870,0.000130
-0.110000,0.053850,0.946150,0.999880,0.000120
-0.120000,0.055700,0.944300,0.999950,0.000050
-0.130000,0.056650,0.943350,0.999950,0.000050
-0.140000,0.057770,0.942230,0.999950,0.000050
+0.050000,0.050420,0.949580,0.999530,0.000470
+0.060000,0.050640,0.949360,0.999650,0.000350
+0.070000,0.050950,0.949050,0.999710,0.000290
+0.080000,0.051880,0.948120,0.999830,0.000170
+0.090000,0.052460,0.947540,0.999860,0.000140
+0.100000,0.053030,0.946970,0.999870,0.000130
+0.110000,0.053820,0.946180,0.999880,0.000120
+0.120000,0.055580,0.944420,0.999930,0.000070
+0.130000,0.056590,0.943410,0.999950,0.000050
+0.140000,0.057740,0.942260,0.999950,0.000050
 0.150000,0.059110,0.940890,0.999960,0.000040
-0.160000,0.062000,0.938000,0.999990,0.000010
-0.170000,0.063510,0.936490,0.999990,0.000010
-0.180000,0.065400,0.934600,0.999990,0.000010
-0.190000,0.067290,0.932710,0.999990,0.000010
-0.200000,0.071730,0.928270,0.999990,0.000010
+0.160000,0.062080,0.937920,0.999990,0.000010
+0.170000,0.063800,0.936200,0.999990,0.000010
+0.180000,0.065520,0.934480,0.999990,0.000010
+0.190000,0.067360,0.932640,0.999990,0.000010
+0.200000,0.071640,0.928360,0.999990,0.000010
@@ -1,17 +1,17 @@
 FN,TP,TN,FP
-0.050000,0.050400,0.949600,0.999500,0.000500
-0.060000,0.050640,0.949360,0.999650,0.000350
-0.070000,0.050920,0.949080,0.999740,0.000260
-0.080000,0.051780,0.948220,0.999820,0.000180
-0.090000,0.052350,0.947650,0.999840,0.000160
-0.100000,0.052960,0.947040,0.999880,0.000120
-0.110000,0.053680,0.946320,0.999890,0.000110
-0.120000,0.055480,0.944520,0.999920,0.000080
-0.130000,0.056330,0.943670,0.999950,0.000050
+0.050000,0.050370,0.949630,0.999530,0.000470
+0.060000,0.050630,0.949370,0.999680,0.000320
+0.070000,0.050880,0.949120,0.999720,0.000280
+0.080000,0.051690,0.948310,0.999850,0.000150
+0.090000,0.052250,0.947750,0.999880,0.000120
+0.100000,0.052850,0.947150,0.999880,0.000120
+0.110000,0.053530,0.946470,0.999890,0.000110
+0.120000,0.055390,0.944610,0.999930,0.000070
+0.130000,0.056350,0.943650,0.999950,0.000050
 0.140000,0.057500,0.942500,0.999960,0.000040
-0.150000,0.058730,0.941270,0.999960,0.000040
-0.160000,0.061640,0.938360,0.999990,0.000010
-0.170000,0.063350,0.936650,0.999990,0.000010
-0.180000,0.064990,0.935010,0.999990,0.000010
-0.190000,0.066880,0.933120,0.999990,0.000010
-0.200000,0.071190,0.928810,0.999990,0.000010
+0.150000,0.058820,0.941180,0.999970,0.000030
+0.160000,0.061730,0.938270,0.999980,0.000020
+0.170000,0.063180,0.936820,0.999990,0.000010
+0.180000,0.064860,0.935140,0.999990,0.000010
+0.190000,0.066970,0.933030,0.999990,0.000010
+0.200000,0.071110,0.928890,0.999990,0.000010
@@ -1,17 +1,17 @@
 FN,TP,TN,FP
-0.050000,0.050290,0.949710,0.999290,0.000710
-0.060000,0.050500,0.949500,0.999580,0.000420
-0.070000,0.050810,0.949190,0.999670,0.000330
-0.080000,0.051600,0.948400,0.999800,0.000200
-0.090000,0.052120,0.947880,0.999830,0.000170
-0.100000,0.052730,0.947270,0.999880,0.000120
-0.110000,0.053480,0.946520,0.999880,0.000120
-0.120000,0.055190,0.944810,0.999910,0.000090
-0.130000,0.056210,0.943790,0.999940,0.000060
-0.140000,0.057160,0.942840,0.999950,0.000050
-0.150000,0.058430,0.941570,0.999970,0.000030
-0.160000,0.061380,0.938620,0.999980,0.000020
-0.170000,0.063010,0.936990,0.999980,0.000020
-0.180000,0.064640,0.935360,0.999980,0.000020
-0.190000,0.066360,0.933640,0.999980,0.000020
-0.200000,0.070740,0.929260,0.999990,0.000010
+0.050000,0.050330,0.949670,0.999340,0.000660
+0.060000,0.050520,0.949480,0.999580,0.000420
+0.070000,0.050810,0.949190,0.999680,0.000320
+0.080000,0.051580,0.948420,0.999800,0.000200
+0.090000,0.052140,0.947860,0.999830,0.000170
+0.100000,0.052700,0.947300,0.999870,0.000130
+0.110000,0.053340,0.946660,0.999890,0.000110
+0.120000,0.055150,0.944850,0.999930,0.000070
+0.130000,0.056110,0.943890,0.999940,0.000060
+0.140000,0.057160,0.942840,0.999960,0.000040
+0.150000,0.058550,0.941450,0.999970,0.000030
+0.160000,0.061200,0.938800,0.999990,0.000010
+0.170000,0.062930,0.937070,0.999990,0.000010
+0.180000,0.064730,0.935270,0.999990,0.000010
+0.190000,0.066600,0.933400,0.999990,0.000010
+0.200000,0.070600,0.929400,0.999990,0.000010
@@ -1,17 +1,17 @@
 FN,TP,TN,FP
-0.050000,0.049710,0.950290,0.990740,0.009260
-0.060000,0.050130,0.949870,0.997260,0.002740
-0.070000,0.050520,0.949480,0.999020,0.000980
-0.080000,0.051170,0.948830,0.999690,0.000310
-0.090000,0.051530,0.948470,0.999790,0.000210
-0.100000,0.052120,0.947880,0.999830,0.000170
-0.110000,0.052650,0.947350,0.999850,0.000150
-0.120000,0.054310,0.945690,0.999910,0.000090
-0.130000,0.055230,0.944770,0.999920,0.000080
-0.140000,0.056230,0.943770,0.999930,0.000070
-0.150000,0.057350,0.942650,0.999950,0.000050
-0.160000,0.059930,0.940070,0.999950,0.000050
-0.170000,0.061310,0.938690,0.999970,0.000030
-0.180000,0.062910,0.937090,0.999980,0.000020
-0.190000,0.064800,0.935200,0.999990,0.000010
-0.200000,0.068810,0.931190,0.999990,0.000010
+0.050000,0.049780,0.950220,0.990760,0.009240
+0.060000,0.050210,0.949790,0.996800,0.003200
+0.070000,0.050570,0.949430,0.999030,0.000970
+0.080000,0.051180,0.948820,0.999750,0.000250
+0.090000,0.051650,0.948350,0.999830,0.000170
+0.100000,0.052080,0.947920,0.999850,0.000150
+0.110000,0.052640,0.947360,0.999880,0.000120
+0.120000,0.054190,0.945810,0.999900,0.000100
+0.130000,0.055090,0.944910,0.999910,0.000090
+0.140000,0.056150,0.943850,0.999930,0.000070
+0.150000,0.057200,0.942800,0.999950,0.000050
+0.160000,0.059800,0.940200,0.999980,0.000020
+0.170000,0.061320,0.938680,0.999980,0.000020
+0.180000,0.062900,0.937100,0.999990,0.000010
+0.190000,0.064660,0.935340,0.999990,0.000010
+0.200000,0.068520,0.931480,0.999990,0.000010
@@ -0,0 +1,2 @@
+>Illumina Single End Adapter 2
+CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
@@ -0,0 +1,2 @@
+>Artificial adapter (trimmed from Illumina Single End Adapter 2)
+CAAGCAGAAGACGG
@@ -0,0 +1,2 @@
+>Illumina Single End Adapter 1
+GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG
@@ -0,0 +1,3 @@
+>Artificial adapter (trimmed from Illumina Single End Adapter 1)
+GATCGGAAGAGCTCG
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+>Illumina Single End Adapter 2`
	`2`	`+CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+>Artificial adapter (trimmed from Illumina Single End Adapter 2)`
	`2`	`+CAAGCAGAAGACGG`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+>Illumina Single End Adapter 1`
	`2`	`+GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+>Artificial adapter (trimmed from Illumina Single End Adapter 1)`
	`2`	`+GATCGGAAGAGCTCG`
	`3`	`+`