Merge branch 'develop' into cell-grouping

# Conflicts: # build.gradle.kts # changelogs/v4.4.3.md # regression/cli-help/align.txt # regression/cli-help/analyze.txt # regression/presets/analyze/test-gf-intersection.yaml # src/main/resources/presets/protocols/generic-single-cell.yaml # src/test/kotlin/com/milaboratory/mixcr/PresetsTest.kt
aglucaci · Sep 18, 2023 · 1023141 · 1023141
2 parents 0e1d7bf + 43185d7
commit 1023141
Show file tree

Hide file tree

Showing 275 changed files with 3,601 additions and 1,330 deletions.
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -133,14 +133,14 @@ val toObfuscate: Configuration by configurations.creating {
 val obfuscationLibs: Configuration by configurations.creating
 
 
-val mixcrAlgoVersion = "4.4.2-31-cell-grouping"
+val mixcrAlgoVersion = "4.4.2-45-different-things"
 val milibVersion = ""
 val mitoolVersion = ""
 val repseqioVersion = ""
 
 val picocliVersion = "4.6.3"
 val jacksonBomVersion = "2.15.2"
-val milmVersion = "3.8.0"
+val milmVersion = "4.0.0"
 
 val cliktVersion = "3.5.0"
 val jcommanderVersion = "1.72"

diff --git a/changelogs/v4.4.3.md b/changelogs/v4.4.3.md
@@ -1,16 +1,54 @@
 # New features
-- Export biochemical properties of gene regions with `-biochemicalProperty <property> <geneFeature>` export option.
-  Available properties: Hydropathy, Charge, Polarity, Volume, Strength, MjEnergy, Kf1, Kf2, Kf3, Kf4, Kf5, Kf6, Kf7,
-  Kf8, Kf9, Kf10, Rim, Surface, Turn, Alpha, Beta, Core, Disorder, N2Strength, N2Hydrophobicity, N2Volume, N2Surface.
+
+- Export biochemical properties of gene regions with `-biochemicalProperty <geneFeature> <property>`
+  or `-baseBiochemicalProperties <geneFeature>` export options. Available in export for alignments, clones and SHM tree
+  nodes. Available properties: Hydropathy, Charge, Polarity, Volume, Strength, MjEnergy, Kf1, Kf2, Kf3, Kf4, Kf5, Kf6,
+  Kf7, Kf8, Kf9, Kf10, Rim, Surface, Turn, Alpha, Beta, Core, Disorder, N2Strength, N2Hydrophobicity, N2Volume,
+  N2Surface.
+- Export isotype with `-isotype [<(primary|subclass|auto)>]`
+- Added `isotype` field to `exportClones` as default for presets that support isotype identification.
+- Export `-mutationRate [<gene_feature>]` in the `exportShmTreesWithNodes`, `exportClones` and `exportCloneGroups`
+  commands: the number of mutations from germline divided by the target sequence size. For `exportClones` and
+  `exportCloneGroups`, CDR3 is not included in the calculation.
+- Support `cram` files as input for `analyze` and `align` commands. Optionally a reference to the genome can be
+  specified by `--reference-for-cram`
 - Added two commands for processing single cell data: `groupClones` and `exportCloneGroups`. First command will
   calculate `cellGroup` for each clone, second will export info about each group.
+
+# Algorithm enhancement
+
+- Global consensus assembly algorithm used in `assemble` to collapse UMI/Cell groups into contigs now has a better
+  seed selection heuristic for multi-consensus assembly scenarios. This increases sensitivity during assembly of
+  secondary consensuses from the same group of sequences.
+- Consensus assembly parameters `maxNormalizedAlignmentPenalty` and `altSeedPenaltyTolerance` are adjusted to increase
+  sensitivity.
+
 # Minor fixes
-- Corrected the help message that appears when using a deprecated preset and incorrectly suggests using `--assemble-contigs-by` instead of `--assemble-clonotypes-by`.
-- The `--split-by-sample` option is now set to `true` by default for all `align` presets, as well as all presets that inherit from it. This new default behavior applies unless it is directly overridden in the preset or with `--dont-split-by-sample` mix-in.
+
+- Corrected the help message that appears when using a deprecated preset and incorrectly suggests
+  using `--assemble-contigs-by` instead of `--assemble-clonotypes-by`.
+- The `--split-by-sample` option is now set to `true` by default for all `align` presets, as well as all presets that
+  inherit from it. This new default behavior applies unless it is directly overridden in the preset or
+  with `--dont-split-by-sample` mix-in.
 - Fixed possible crash with `--dry-run` option in `analyze`
 - On `exportClone` and `exportShmTreesWithNodes` in case of splitting by tags, reads count of clones will be changed to
   the sum of reads for given tags selection
-- Fixed assembling feature for `irepertoire-human-rna-xcr-repseq-plus` preset. Now {CDR2Begin:FR4End}
+- Fixed assembling feature for `irepertoire-human-rna-xcr-repseq-plus` preset. Now `{CDR2Begin:FR4End}`.
+- `exportAlignments` function now reports UMI and/or Cell barcodes by default for presets with barcodes.
+- `exportAlignments` by default now includes the column `topChains`. `exportClones` function reports `topChains` for single cell presets.
+- Changed tag pattern for `cellecta-human-rna-xcr-umi-drivermap-air`. Now UMI includes a part of the C-gene primer to increase diversity, and R2 is also used for payload.
+- Fixed calculation of `geneFamilyName` for genes like `IGHA*00` (without number before `*` symbol)
+- Added a new constraint in the low-quality read mapping procedure that prevents cross-cell read mapping
+- Added split by C gene feature to `thermofisher-human-rna-igh-oncomine-lr` preset.
+- Fixed usage of BAM input for `analyze` and `align` when the file contains both paired and single reads
+
+# Minor features
+
+- Better formatting in `listPresets` command. Added grouping by vendor, labels and optional filtering
+- Validation of input types in `align` or `analyze` by given tag pattern
 
 # New Presets
-- `bd-sc-xcr-rhapsody-full-length-enhanced-bead-v2` new preset for BD full-length protocol with enhanced beads V2 featuring B384 whitelists.
+
+- `bd-sc-xcr-rhapsody-full-length-enhanced-bead-v2` new preset for BD full-length protocol with enhanced beads V2
+  featuring B384 whitelists.
+- `takara-mouse-rna-tcr-umi-smarseq` preset for Takara Bio SMART-Seq Mouse TCR (with UMIs).
diff --git a/ensure-test-data.sh b/ensure-test-data.sh
@@ -61,6 +61,19 @@ elif [ "${1}" == "int" ] || [ "${1}" == "reg"  ]; then
         curl -sS -O https://s3.amazonaws.com/files.milaboratory.com/test-data/CD4M1_test_R2.fastq.gz
     fi
 
+    if [[ ! -f subset_B004-7_S247_L001_I1_001.fastq.gz ]]; then
+        curl -sS -O https://s3.amazonaws.com/files.milaboratory.com/test-data/subset_B004-7_S247_L001_I1_001.fastq.gz
+    fi
+    if [[ ! -f subset_B004-7_S247_L001_I2_001.fastq.gz ]]; then
+        curl -sS -O https://s3.amazonaws.com/files.milaboratory.com/test-data/subset_B004-7_S247_L001_I2_001.fastq.gz
+    fi
+    if [[ ! -f subset_B004-7_S247_L001_R1_001.fastq.gz ]]; then
+        curl -sS -O https://s3.amazonaws.com/files.milaboratory.com/test-data/subset_B004-7_S247_L001_R1_001.fastq.gz
+    fi
+    if [[ ! -f subset_B004-7_S247_L001_R2_001.fastq.gz ]]; then
+        curl -sS -O https://s3.amazonaws.com/files.milaboratory.com/test-data/subset_B004-7_S247_L001_R2_001.fastq.gz
+    fi
+
     mkdir -p regression
     cd regression
     files=$(cat $dir/regression/list)

diff --git a/itests.sh b/itests.sh
@@ -139,13 +139,18 @@ if [[ $run_tests == true ]]; then
 
   ln -s -f ../src/test/resources/sequences/big/CD4M1_test_R1.fastq.gz ${dir}/test_target/CD4M1_test_R1.fastq.gz
   ln -s -f ../src/test/resources/sequences/big/CD4M1_test_R2.fastq.gz ${dir}/test_target/CD4M1_test_R2.fastq.gz
+  ln -s -f ../src/test/resources/sequences/big/subset_B004-7_S247_L001_R1_001.fastq.gz ${dir}/test_target/subset_B004-7_S247_L001_R1_001.fastq.gz
+  ln -s -f ../src/test/resources/sequences/big/subset_B004-7_S247_L001_R2_001.fastq.gz ${dir}/test_target/subset_B004-7_S247_L001_R2_001.fastq.gz
+  ln -s -f ../src/test/resources/sequences/big/subset_B004-7_S247_L001_I1_001.fastq.gz ${dir}/test_target/subset_B004-7_S247_L001_I1_001.fastq.gz
+  ln -s -f ../src/test/resources/sequences/big/subset_B004-7_S247_L001_I2_001.fastq.gz ${dir}/test_target/subset_B004-7_S247_L001_I2_001.fastq.gz
   ln -s -f ../src/test/resources/sequences/big/single_cell_vdj_t_subset_R1.fastq.gz ${dir}/test_target/single_cell_vdj_t_subset_R1.fastq.gz
   ln -s -f ../src/test/resources/sequences/big/single_cell_vdj_t_subset_R2.fastq.gz ${dir}/test_target/single_cell_vdj_t_subset_R2.fastq.gz
   ln -s -f ../src/test/resources/sequences/big/trees_samples ${dir}/test_target/trees_samples
   ln -s -f ../src/test/resources/sequences/big/regression ${dir}/test_target/regression
   ln -s -f ../src/test/resources/sequences/umi_ig_data_2_subset_R1.fastq.gz ${dir}/test_target/umi_ig_data_2_subset_R1.fastq.gz
   ln -s -f ../src/test/resources/sequences/umi_ig_data_2_subset_R2.fastq.gz ${dir}/test_target/umi_ig_data_2_subset_R2.fastq.gz
   ln -s -f ../src/test/resources/bam/unsorted.bam ${dir}/test_target/unsorted.bam
+  ln -s -f ../src/test/resources/bam/unpairedSortedByCoord.bam ${dir}/test_target/unpaired.bam
   ln -s -f ../src/test/resources/library_for_alleles_test.json ${dir}/test_target/library_for_alleles_test.json
   ln -s -f ../src/test/resources/sligtly_broken_library_for_alleles_test.json ${dir}/test_target/sligtly_broken_library_for_alleles_test.json
   cd ${dir}

diff --git a/itests/case-IR.sh b/itests/case-IR.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+# Single-cell integration test: index reads (I1/I2) supplied as separate input files
+
+assert() {
+  expected=$(echo -ne "${2:-}")
+  result="$(eval 2>/dev/null $1)" || true
+  result="$(sed -e 's/ *$//' -e 's/^ *//' <<<"$result")"
+  if [[ "$result" == "$expected" ]]; then
+    return
+  fi
+  result="$(sed -e :a -e '$!N;s/\n/\\n/;ta' <<<"$result")"
+  [[ -z "$result" ]] && result="nothing" || result="\"$result\""
+  [[ -z "$2" ]] && expected="nothing" || expected="\"$2\""
+  echo "expected $expected got $result for" "$1"
+  exit 1
+}
+
+set -eux
+
+mixcr analyze generic-lt-single-cell-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)\^(CELL1:*)\^(CELL2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    subset_B004-7_S247_L001_I1_001.fastq.gz \
+    subset_B004-7_S247_L001_I2_001.fastq.gz \
+    output_normal
+
+mixcr analyze generic-lt-single-cell-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)\^(CELL1:*)\^(CELL2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_{{IR}}_001.fastq.gz \
+    output_with_template
+
+# R2 as UMI
+mixcr analyze generic-lt-single-cell-amplicon-with-umi \
+    --tag-pattern "^(R1:*)\^(UMI:*)\^(CELL1:*)\^(CELL2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    subset_B004-7_S247_L001_I1_001.fastq.gz \
+    subset_B004-7_S247_L001_I2_001.fastq.gz \
+    output_UMI1
+
+# R1 as UMI and payload
+mixcr analyze generic-lt-single-cell-amplicon-with-umi \
+    --tag-pattern "^N{16}(UMI:N{10})(R1:*)\^(R2:*)\^(CELL1:*)\^(CELL2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    subset_B004-7_S247_L001_I1_001.fastq.gz \
+    subset_B004-7_S247_L001_I2_001.fastq.gz \
+    output_UMI2
+
+# R1+R2+I1
+mixcr analyze generic-lt-single-cell-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)\^(CELL1:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    subset_B004-7_S247_L001_I1_001.fastq.gz \
+    output_R1_R2_I1
diff --git a/itests/case014.sh → itests/case-bam.sh b/itests/case014.sh → itests/case-bam.sh
@@ -35,3 +35,9 @@ mixcr align --preset test-generic -s hs --rna --floating-left-alignment-boundary
 mixcr alignmentsDiff bam.vdjca fq.vdjca > diff
 
 assert "cat diff | grep 'Total number of different reads'" "Total number of different reads: 0"
+
+# Can parse a BAM file that contains both single and paired reads
+mixcr align -f --preset test-generic -s hs --rna \
+  --floating-left-alignment-boundary \
+  --rigid-right-alignment-boundary C \
+  unpaired.bam bam.vdjca
diff --git a/itests/case-tag_validation.sh b/itests/case-tag_validation.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+
+#
+# Copyright (c) 2014-2022, MiLaboratories Inc. All Rights Reserved
+#
+# Before downloading or accessing the software, please read carefully the
+# License Agreement available at:
+# https://github.com/milaboratory/mixcr/blob/develop/LICENSE
+#
+# By downloading or accessing the software, you accept and agree to be bound
+# by the terms of the License Agreement. If you do not want to agree to the terms
+# of the Licensing Agreement, you must not download or access the software.
+#
+# Tag pattern input validation integration test
+
+assert() {
+  expected=$(echo -ne "${2:-}")
+  result="$(eval 2>/dev/null $1)" || true
+  result="$(sed -e 's/ *$//' -e 's/^ *//' <<<"$result")"
+  if [[ "$result" == "$expected" ]]; then
+    return
+  fi
+  result="$(sed -e :a -e '$!N;s/\n/\\n/;ta' <<<"$result")"
+  [[ -z "$result" ]] && result="nothing" || result="\"$result\""
+  [[ -z "$2" ]] && expected="nothing" || expected="\"$2\""
+  echo "expected $expected got $result for" "$1"
+  exit 1
+}
+
+mixcr align -f --preset test-generic -s hs --rna \
+  --tag-pattern "^(R1:*)\^(R2:*)" \
+  --floating-left-alignment-boundary \
+  --rigid-right-alignment-boundary C \
+  unpaired.bam bam.vdjca 2>err
+
+cat err
+assert "grep -c 'Tag pattern require BAM file to contain only paired reads' err" "1"
+
+mixcr align -f --preset test-generic -s hs --rna \
+  --tag-pattern "^(R1:*)" \
+  --floating-left-alignment-boundary \
+  --rigid-right-alignment-boundary C \
+  unpaired.bam bam.vdjca 2>err
+
+cat err
+assert "grep -c 'Tag pattern require BAM file to contain only single reads' err" "1"
+
+mixcr analyze generic-lt-single-cell-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)\^(CELL1:*)\^(CELL2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    subset_B004-7_S247_L001_I1_001.fastq.gz \
+    output 2>err
+
+cat err
+assert "grep -c 'Tag pattern require 4 input files, got 3' err" "1"
+
+mixcr analyze generic-lt-single-cell-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)\^(CELL1:*)\^(CELL2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    output 2>err
+
+cat err
+assert "grep -c 'Tag pattern require 4 input files, got 2' err" "1"
+
+mixcr analyze generic-lt-single-cell-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)\^(CELL1:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    output 2>err
+
+cat err
+assert "grep -c 'Tag pattern require 3 input files, got 2' err" "1"
+
+mixcr analyze generic-amplicon \
+    --tag-pattern "^(R1:*)\^(R2:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    output 2>err
+
+cat err
+assert "grep -c 'Tag pattern require 2 input files, got 1' err" "1"
+
+mixcr analyze generic-amplicon \
+    --tag-pattern "^(R1:*)" \
+    --species hsa \
+    --rna \
+    --floating-left-alignment-boundary \
+    --floating-right-alignment-boundary C \
+    subset_B004-7_S247_L001_R1_001.fastq.gz \
+    subset_B004-7_S247_L001_R2_001.fastq.gz \
+    output 2>err
+
+cat err
+assert "grep -c 'Tag pattern require 1 input file, got 2' err" "1"
diff --git a/itests/case019_multiple_cell_barcodes.sh b/itests/case019_multiple_cell_barcodes.sh
@@ -36,18 +36,18 @@ mixcr exportReports --yaml case19.vdjcontigs.contigs.clns
 mixcr exportReports case19.vdjcontigs.contigs.clns
 
 #doesn't split by cell
-assert "mixcr exportClones --no-header --drop-default-fields -cloneId case19.vdjcontigs.contigs.clns | wc -l" "7"
+assert "mixcr exportClones --no-header --drop-default-fields -cloneId case19.vdjcontigs.contigs.clns | wc -l" "8"
 #split by cell (cell tags are exported)
-assert "mixcr exportClones --no-header case19.vdjcontigs.contigs.clns | wc -l" "10"
+assert "mixcr exportClones --no-header case19.vdjcontigs.contigs.clns | wc -l" "11"
 #cellId also split by cell
-assert "mixcr exportClones --no-header --drop-default-fields -cellId -cloneId case19.vdjcontigs.contigs.clns | wc -l" "10"
+assert "mixcr exportClones --no-header --drop-default-fields -cellId -cloneId case19.vdjcontigs.contigs.clns | wc -l" "11"
 #all cells tags found
 assert "mixcr exportClones --no-header --drop-default-fields -cellId case19.vdjcontigs.contigs.clns | grep 'cant_get_tag_need_to_be_split' | wc -l" "0"
 #there are three cells
 assert "mixcr exportClones --no-header --drop-default-fields -cellId case19.vdjcontigs.contigs.clns | sort | uniq | wc -l" "3"
 
 ## `tail +2` - skip first line with column names
-assert "mixcr exportAirr case19.vdjcontigs.contigs.clns | tail +2 | wc -l" "10" #splitted by Cell
+assert "mixcr exportAirr case19.vdjcontigs.contigs.clns | tail +2 | wc -l" "11" #splitted by Cell
 assert "mixcr exportAirr case19.vdjcontigs.contigs.clns | head -n 1 | grep cell_id | wc -l" "1"
 assert "mixcr exportAirr case19.vdjcontigs.contigs.clns | head -n 1 | grep umi_count | wc -l" "1"