diff --git a/MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs b/MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs
new file mode 100644
index 00000000..f81bbfd7
--- /dev/null
+++ b/MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs
@@ -0,0 +1,9 @@
+using Genome;
+
+namespace MitoHeteroplasmy
+{
+ public interface IMitoHeteroplasmyProvider
+ {
+ double?[] GetVrfPercentiles(string genotype, IChromosome chrome, int position, string[] altAllele, double[] vrfs);
+ }
+}
\ No newline at end of file
diff --git a/MitoHeteroplasmy/MitoHeteroplasmy.csproj b/MitoHeteroplasmy/MitoHeteroplasmy.csproj
new file mode 100644
index 00000000..2b894a25
--- /dev/null
+++ b/MitoHeteroplasmy/MitoHeteroplasmy.csproj
@@ -0,0 +1,15 @@
+
+
+
+ netcoreapp2.1
+ latest
+
+
+
+
+
+
+
+
+
+
diff --git a/MitoHeteroplasmy/MitoHeteroplasmyProvider.cs b/MitoHeteroplasmy/MitoHeteroplasmyProvider.cs
new file mode 100644
index 00000000..7f951812
--- /dev/null
+++ b/MitoHeteroplasmy/MitoHeteroplasmyProvider.cs
@@ -0,0 +1,66 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Genome;
+
+namespace MitoHeteroplasmy
+{
+ public sealed class MitoHeteroplasmyProvider : IMitoHeteroplasmyProvider
+ {
+ private const string MitoChromUcscName = "chrM";
+ private static readonly Dictionary AlleleToInt = new Dictionary { { "A", 0 }, { "C", 1 }, { "G", 2 }, { "T", 3 } };
+ private const int SequenceLengthMax = int.MaxValue / 4;
+
+ private readonly Dictionary _alleleToVrf = new Dictionary();
+
+ public void Add(int position, string altAllele, IEnumerable vrfs, int[] alleleDepths)
+ {
+ var vrfsInt = vrfs.Select(ToIntVrfForm).ToArray();
+ _alleleToVrf[EncodeMitoPositionAndAltAllele(position, altAllele)] = (vrfsInt, alleleDepths);
+ }
+
+ public double?[] GetVrfPercentiles(string genotypes, IChromosome chrom, int position, string[] altAlleles, double[] vrfs)
+ {
+ if (vrfs == null) return null;
+ if (chrom.UcscName != MitoChromUcscName) return null;
+
+ var sampleAlleles = genotypes.Split('|', '/').Where(x => x != "0")
+ .Select(x => GetAlleleByGenotype(x, altAlleles));
+
+ var percentiles = vrfs.Zip(sampleAlleles, (vrf, allele) => GetVrfPercentile(position, allele, vrf)).ToArray();
+ return percentiles.All(x => x == null) ? null : percentiles;
+ }
+
+ private static string GetAlleleByGenotype(string genotypeIndex, string[] altAlleles) => altAlleles[int.Parse(genotypeIndex) - 1];
+
+ private double? GetVrfPercentile(int position, string altAllele, double vrf)
+ {
+ if (string.IsNullOrEmpty(altAllele)) return null;
+
+ var positionAndAltAlleleIntForm = EncodeMitoPositionAndAltAllele(position, altAllele);
+
+ if (!_alleleToVrf.TryGetValue(positionAndAltAlleleIntForm, out (int[] Vrfs, int[] AlleleDepths) data)) return null;
+
+ var scaledVrf = vrf * 1000;
+ int nearestBiggerVrfIndex;
+ for (nearestBiggerVrfIndex = 0; nearestBiggerVrfIndex < data.Vrfs.Length; nearestBiggerVrfIndex++)
+ {
+ if (data.Vrfs[nearestBiggerVrfIndex] > scaledVrf) break;
+ }
+
+ var numSmallerOrEqualAlleles = 0.0;
+ var numAllAlleles = 0;
+ for (var i = 0; i < data.AlleleDepths.Length; i++)
+ {
+ if (i < nearestBiggerVrfIndex) numSmallerOrEqualAlleles += data.AlleleDepths[i];
+ numAllAlleles += data.AlleleDepths[i];
+ }
+
+ return numSmallerOrEqualAlleles / numAllAlleles;
+ }
+
+ private static int ToIntVrfForm(double vrf) => Convert.ToInt32(vrf * 1000);
+
+ private static int EncodeMitoPositionAndAltAllele(int position, string altAllele) => SequenceLengthMax * AlleleToInt[altAllele] + position;
+ }
+}
\ No newline at end of file
diff --git a/MitoHeteroplasmy/MitoHeteroplasmyReader.cs b/MitoHeteroplasmy/MitoHeteroplasmyReader.cs
new file mode 100644
index 00000000..80b7f315
--- /dev/null
+++ b/MitoHeteroplasmy/MitoHeteroplasmyReader.cs
@@ -0,0 +1,44 @@
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Linq;
+using System.Reflection;
+using OptimizedCore;
+
+namespace MitoHeteroplasmy
+{
+ public static class MitoHeteroplasmyReader
+ {
+ private const int PositionIndex = 0;
+ private const int AltIndex = 2;
+ private const int VrfIndex = 3;
+ private const int AlleleDepthIndex = 4;
+
+ private const string ResourceName = "MitoHeteroplasmy.Resources.MitoHeteroplasmy.tsv.gz";
+ public static MitoHeteroplasmyProvider GetData()
+ {
+ var assembly = Assembly.GetExecutingAssembly();
+ using var stream = assembly.GetManifestResourceStream(ResourceName);
+ if (stream == null) throw new NullReferenceException("Unable to read from the Mitochondrial Heteroplasmy file");
+
+ using var gzStream = new GZipStream(stream, CompressionMode.Decompress);
+ using var reader = new StreamReader(gzStream);
+ reader.ReadLine();
+
+ var mitoHeteroplasmyData = new MitoHeteroplasmyProvider();
+ while (true)
+ {
+ string line = reader.ReadLine();
+ if (line == null) break;
+
+ var fields = line.OptimizedSplit('\t');
+ var position = int.Parse(fields[PositionIndex]);
+ var vrfs = fields[VrfIndex].Split(',').Select(double.Parse);
+ var alleleDepths = fields[AlleleDepthIndex].Split(',').Select(int.Parse).ToArray();
+ mitoHeteroplasmyData.Add(position, fields[AltIndex], vrfs, alleleDepths);
+ }
+
+ return mitoHeteroplasmyData;
+ }
+ }
+}
\ No newline at end of file
diff --git a/MitoHeteroplasmy/Resources/MitoHeteroplasmy.tsv.gz b/MitoHeteroplasmy/Resources/MitoHeteroplasmy.tsv.gz
new file mode 100644
index 00000000..577b5c7b
Binary files /dev/null and b/MitoHeteroplasmy/Resources/MitoHeteroplasmy.tsv.gz differ
diff --git a/Nirvana.sln b/Nirvana.sln
index 1c73bb8b..e9d8adbc 100644
--- a/Nirvana.sln
+++ b/Nirvana.sln
@@ -44,7 +44,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tabix", "Tabix\Tabix.csproj
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cloud", "Cloud\Cloud.csproj", "{E93914C8-2599-46BE-BE18-6229E53F581B}"
EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Downloader", "Downloader\Downloader.csproj", "{5B81B762-8A86-466A-A947-AC2CA53EE40D}"
EndProject
@@ -58,12 +58,14 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NirvanaLambda", "NirvanaLam
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SingleAnnotationLambda", "SingleAnnotationLambda\SingleAnnotationLambda.csproj", "{C9B4E16E-FF30-4CE0-A617-F833696FBE10}"
EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}"
EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CustomStrValidationLambda", "CustomStrValidationLambda\CustomStrValidationLambda.csproj", "{F3E60E51-EE07-4768-8EC3-E3A323DFA547}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MitoHeteroplasmy", "MitoHeteroplasmy\MitoHeteroplasmy.csproj", "{387E4C8D-6A27-40DE-A305-F3F047B8D865}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -186,6 +188,10 @@ Global
{F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.Build.0 = Release|Any CPU
+ {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/Nirvana.sln.DotSettings b/Nirvana.sln.DotSettings
index 2e9c7637..67cdd502 100644
--- a/Nirvana.sln.DotSettings
+++ b/Nirvana.sln.DotSettings
@@ -174,4 +174,5 @@
True
True
True
- True
\ No newline at end of file
+ True
+ True
\ No newline at end of file
diff --git a/Nirvana/Properties/launchSettings.json b/Nirvana/Properties/launchSettings.json
index bbf33196..37935723 100644
--- a/Nirvana/Properties/launchSettings.json
+++ b/Nirvana/Properties/launchSettings.json
@@ -209,6 +209,11 @@
"commandName": "Project",
"commandLineArgs": "--cache /Users/rroy1/development/Cache\\26\\GRCh38\\Both --sd /Users/rroy1/development/SupplementaryDatabase\\56\\GRCh38 --ref /Users/rroy1/development/References/6/Homo_sapiens.GRCh38.Nirvana.dat --in DQ-Strelka-Germline-chr22-hg38.vcf.gz --out dq",
"workingDirectory": "/Users/rroy1/development/TestDatasets"
+ },
+ "SK MitoHeteroplasmy": {
+ "commandName": "Project",
+ "commandLineArgs": "-i debug.vcf -c D:\\Nirvana\\Development\\Cache\\26\\GRCh38\\Both -r D:\\Nirvana\\Development\\References\\Homo_sapiens.GRCh38.Nirvana.dat -o debug",
+ "workingDirectory": "D:\\Nirvana\\test_runs\\run_Nirvana"
}
}
}
\ No newline at end of file
diff --git a/Nirvana/StreamAnnotation.cs b/Nirvana/StreamAnnotation.cs
index 2699c9c0..643975bd 100644
--- a/Nirvana/StreamAnnotation.cs
+++ b/Nirvana/StreamAnnotation.cs
@@ -4,6 +4,7 @@
using ErrorHandling.Exceptions;
using Genome;
using IO;
+using MitoHeteroplasmy;
using VariantAnnotation;
using VariantAnnotation.Interface;
using VariantAnnotation.Interface.IO;
@@ -23,10 +24,10 @@ public static ExitCodes Annotate(Stream headerStream, Stream inputVcfStream, Str
var metrics = annotationResources.Metrics;
PerformanceMetrics.ShowAnnotationHeader();
- IChromosome currentChromosome = new EmptyChromosome("dummy");
- int numVariants = 0;
-
- using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter))
+ IChromosome currentChromosome = new EmptyChromosome("dummy");
+ int numVariants = 0;
+ IMitoHeteroplasmyProvider mitoHeteroplasmyProvider = MitoHeteroplasmyReader.GetData();
+ using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter, mitoHeteroplasmyProvider))
using (var jsonWriter = new JsonWriter(outputJsonStream, outputJsonIndexStream, annotationResources, Date.CurrentTimeStamp, vcfReader.GetSampleNames(), false))
{
try
@@ -97,7 +98,7 @@ private static void SetMitochondrialAnnotationBehavior(IAnnotationResources anno
}
private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAnnotationResources annotationResources,
- IVcfFilter vcfFilter)
+ IVcfFilter vcfFilter, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
var vcfReader = FileUtilities.GetStreamReader(vcfStream);
@@ -111,7 +112,7 @@ private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAn
}
return VcfReader.Create(headerReader, vcfReader, annotationResources.SequenceProvider,
- annotationResources.RefMinorProvider, annotationResources.Recomposer, vcfFilter, annotationResources.VidCreator);
+ annotationResources.RefMinorProvider, annotationResources.Recomposer, vcfFilter, annotationResources.VidCreator, mitoHeteroplasmyProvider);
}
}
}
\ No newline at end of file
diff --git a/RepeatExpansions/RepeatExpansionProvider.cs b/RepeatExpansions/RepeatExpansionProvider.cs
index 7944949c..1461362d 100644
--- a/RepeatExpansions/RepeatExpansionProvider.cs
+++ b/RepeatExpansions/RepeatExpansionProvider.cs
@@ -6,6 +6,7 @@
using Intervals;
using IO;
using RepeatExpansions.IO;
+using VariantAnnotation.Interface;
using VariantAnnotation.Interface.AnnotatedPositions;
using Variants;
diff --git a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs
index 87e5f607..c7d2beea 100644
--- a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs
+++ b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs
@@ -5,27 +5,21 @@
using ErrorHandling;
using IO;
using SAUtils.InputFileParsers;
-using VariantAnnotation.Providers;
-using VariantAnnotation.SA;
namespace SAUtils.MitoHeteroplasmy
{
public static class MitoHeteroplasmyDb
{
private static string _inputFile;
- private static string _compressedReference;
private static string _outputDirectory;
+ private const string OutFileName = "MitoHeteroplasmy.tsv";
+ private const string HeaderLine = "#POS\tREF\tALT\tVRFs\tAlleleDepths";
public static ExitCodes Run(string command, string[] commandArgs)
{
var ops = new OptionSet
{
- {
- "ref|r=",
- "compressed reference sequence file",
- v => _compressedReference = v
- },
{
"in|i=",
"input BED file path",
@@ -42,11 +36,10 @@ public static ExitCodes Run(string command, string[] commandArgs)
var exitCode = new ConsoleAppBuilder(commandArgs, ops)
.Parse()
- .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref")
.CheckInputFilenameExists(_inputFile, "Mitochondrial Heteroplasmy BED file", "--in")
.CheckDirectoryExists(_outputDirectory, "output directory", "--out")
.SkipBanner()
- .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample)
+ .ShowHelpMenu("Creates a TSV file with mitochondrial heteroplasmy information", commandLineExample)
.ShowErrors()
.Execute(ProgramExecution);
@@ -55,17 +48,12 @@ public static ExitCodes Run(string command, string[] commandArgs)
private static ExitCodes ProgramExecution()
{
- var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference));
- var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version");
- string outFileName = $"{version.Name}_{version.Version}";
-
- using (var primateAiParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider))
- using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix)))
- using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSufix)))
- using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.MitoHeteroplasmyTag, true, false, SaCommon.SchemaVersion, false))
- {
- nsaWriter.Write(primateAiParser.GetItems());
- }
+ using var mitoHeteroplasmyParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile));
+ using var tsvStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, OutFileName));
+ using var tsvWriter = new StreamWriter(tsvStream);
+ tsvWriter.WriteLine(HeaderLine);
+ foreach(var line in mitoHeteroplasmyParser.GetOutputLines())
+ tsvWriter.WriteLine(line);
return ExitCodes.Success;
}
diff --git a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs
deleted file mode 100644
index b731b817..00000000
--- a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs
+++ /dev/null
@@ -1,33 +0,0 @@
-using Genome;
-using OptimizedCore;
-using VariantAnnotation.Interface.SA;
-using VariantAnnotation.IO;
-
-namespace SAUtils.MitoHeteroplasmy
-{
- public sealed class MitoHeteroplasmyItem:ISupplementaryDataItem
- {
- private readonly AlleleStats _stats;
- public MitoHeteroplasmyItem(IChromosome chromosome, int position, string refAllele, string altAllele, AlleleStats stats)
- {
- Chromosome = chromosome;
- Position = position;
- RefAllele = refAllele;
- AltAllele = altAllele;
- _stats = stats;
- }
-
- public IChromosome Chromosome { get; }
- public int Position { get; set; }
- public string RefAllele { get; set; }
- public string AltAllele { get; set; }
- public string GetJsonString()
- {
- var sb = StringBuilderCache.Acquire();
- var jsonObject = new JsonObject(sb);
- jsonObject.AddDoubleValue("vrfMean", _stats.vrf_stats.mean, "0.######");
- jsonObject.AddDoubleValue("vrfStdev", _stats.vrf_stats.stdev, "0.######");
- return StringBuilderCache.GetStringAndRelease(sb);
- }
- }
-}
\ No newline at end of file
diff --git a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs
index 4c8191ba..66dadcad 100644
--- a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs
+++ b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs
@@ -1,72 +1,83 @@
using System;
using System.Collections.Generic;
using System.IO;
+using System.Linq;
using IO;
using Newtonsoft.Json;
using OptimizedCore;
-using VariantAnnotation.Interface.Providers;
namespace SAUtils.MitoHeteroplasmy
{
- public sealed class MitoHeteroplasmyParser: IDisposable
+ public sealed class MitoHeteroplasmyParser : IDisposable
{
private readonly Stream _stream;
- private readonly ISequenceProvider _referenceProvider;
- public MitoHeteroplasmyParser(Stream stream, ISequenceProvider referenceProvider)
+ public MitoHeteroplasmyParser(Stream stream)
{
_stream = stream;
- _referenceProvider = referenceProvider;
}
public void Dispose()
{
_stream?.Dispose();
- _referenceProvider?.Dispose();
}
- public IEnumerable GetItems()
+ public IEnumerable GetOutputLines()
{
-
- using (var reader = FileUtilities.GetStreamReader(_stream))
+ using var reader = FileUtilities.GetStreamReader(_stream);
+ string line;
+ while ((line = reader.ReadLine()) != null)
{
- string line;
- while ((line = reader.ReadLine()) != null)
- {
- // Skip empty lines.
- if (string.IsNullOrWhiteSpace(line)) continue;
-
- // Skip comments, headers
- if (line.OptimizedStartsWith('#')) continue;
-
- var items = ExtractItems(line);
- foreach (var item in items)
- {
- if (item == null) continue;
- yield return item;
- }
- }
- }
+ // Skip empty lines.
+ if (string.IsNullOrWhiteSpace(line)) continue;
+ // Skip comments, headers
+ if (line.OptimizedStartsWith('#')) continue;
+
+ foreach (string item in ExtractItems(line))
+ yield return item;
+ }
}
+
//MT 5 6 {"C:A":{"ad":[1],"allele_type":"alt","vrf":[0.006329113924050633],"vrf_stats":{"kurtosis":241.00408163265314,"max":0.0063291139240506328,"mean":2.5728105382319646e-05,"min":0.0,"nobs":246,"skewness":15.588588185998534,"stdev":0.00040352956522996095,"variance":1.6283611001468132e-07}}}
- private IEnumerable ExtractItems(string line)
+ private static IEnumerable ExtractItems(string line)
{
var splits = line.Split('\t');
- if(splits.Length < 4) yield break;
- var chromosomeName = splits[0];
- if (!_referenceProvider.RefNameToChromosome.ContainsKey(chromosomeName)) yield break;
+ if (splits.Length < 4) yield break;
- var chromosome = _referenceProvider.RefNameToChromosome[chromosomeName];
- var position = int.Parse(splits[1])+1; // since this is a bed file
- var info = splits[3];
- var stats = DeserializeStats(info);
+ var position = int.Parse(splits[1]) + 1; // since this is a bed file
+ var info = splits[3];
+ var stats = DeserializeStats(info);
foreach ((string refAllele, string altAllele, AlleleStats alleleStats) in GetAlleleStats(stats))
{
- yield return new MitoHeteroplasmyItem(chromosome, position, refAllele, altAllele, alleleStats);
+ (string formattedVrfs, string alleleDepths) = MergeAndSortByVrf(alleleStats);
+ yield return string.Join('\t', position, refAllele, altAllele, formattedVrfs, alleleDepths);
}
-
+
+ }
+
+ private static (string formattedVrfs, string alleleDepths) MergeAndSortByVrf(AlleleStats alleleStats)
+ {
+ var vrfToAd = new Dictionary();
+ foreach ((string vrf, int ad) in alleleStats.vrf.Select(x => x.ToString("0.###"))
+ .Zip(alleleStats.ad, (a, b) => (a, b)))
+ {
+ if (vrfToAd.ContainsKey(vrf)) vrfToAd[vrf] += ad;
+ else vrfToAd[vrf] = ad;
+ }
+
+ var formattedVrfs = new string[vrfToAd.Count];
+ var alleleDepths = new int[vrfToAd.Count];
+ var i = 0;
+ foreach (var vrf in vrfToAd.Keys.OrderBy(x => double.Parse(x)))
+ {
+ formattedVrfs[i] = vrf;
+ alleleDepths[i] = vrfToAd[vrf];
+ i++;
+ }
+
+ return (string.Join(',',formattedVrfs), string.Join(',', alleleDepths));
}
private static IEnumerable<(string, string, AlleleStats)> GetAlleleStats(PositionStats stats)
@@ -91,7 +102,7 @@ private IEnumerable ExtractItems(string line)
public static PositionStats DeserializeStats(string s)
{
var charArray = s.ToCharArray();
- for (int i = 0; i < charArray.Length - 3; i++)
+ for (var i = 0; i < charArray.Length - 3; i++)
{
if (IsNucleotide(charArray[i])
&& charArray[i + 1] == ':'
diff --git a/SAUtils/MitoHeteroplasmy/StatClasses.cs b/SAUtils/MitoHeteroplasmy/StatClasses.cs
index a5a26452..e895b786 100644
--- a/SAUtils/MitoHeteroplasmy/StatClasses.cs
+++ b/SAUtils/MitoHeteroplasmy/StatClasses.cs
@@ -40,12 +40,5 @@ public class AlleleStats
{
public int[] ad;
public double[] vrf;
- public VrfStats vrf_stats;
- }
-
- public class VrfStats
- {
- public double mean;
- public double stdev;
}
}
\ No newline at end of file
diff --git a/SAUtils/Properties/launchSettings.json b/SAUtils/Properties/launchSettings.json
index 442f061d..fbaa58c5 100644
--- a/SAUtils/Properties/launchSettings.json
+++ b/SAUtils/Properties/launchSettings.json
@@ -153,6 +153,11 @@
"commandName": "Project",
"commandLineArgs": "AaCon --ref c:\\Development\\References\\5\\Homo_sapiens.GRCh37.Nirvana.dat --cache c:\\Development\\Cache\\26\\GRCh37\\Both --scr conservationScores.tsv.gz --out c:\\Development\\SupplementaryDatabase\\test",
"workingDirectory": "C:\\development\\ExternalDataSources\\MultiZ46Way\\GRCh37"
+ },
+ "MitoHeteroplasmy_SK": {
+ "commandName": "Project",
+ "commandLineArgs": "MitoHet --in coriell_GRCh37.MT.json.bed.gz --out .",
+ "workingDirectory": "D:\\Nirvana\\gen_data\\MitoHeteroplasmy"
}
}
}
\ No newline at end of file
diff --git a/SingleAnnotationLambda/SingleConfigExtensions.cs b/SingleAnnotationLambda/SingleConfigExtensions.cs
index 6b8c854b..6033e18f 100644
--- a/SingleAnnotationLambda/SingleConfigExtensions.cs
+++ b/SingleAnnotationLambda/SingleConfigExtensions.cs
@@ -2,6 +2,7 @@
using Cloud.Messages.Single;
using ErrorHandling.Exceptions;
using Genome;
+using MitoHeteroplasmy;
using OptimizedCore;
using VariantAnnotation.Interface.IO;
using VariantAnnotation.Interface.Positions;
@@ -51,7 +52,8 @@ private static IPosition ToPosition(string[] vcfFields, ISequenceProvider sequen
SimplePosition simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter());
var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId());
- return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, variantFactory);
+ var mitoHeteroplasmyProvider = new MitoHeteroplasmyProvider();
+ return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory);
}
}
}
diff --git a/UnitTests/EndToEndTests.cs b/UnitTests/EndToEndTests.cs
index 72db46c7..18f93061 100644
--- a/UnitTests/EndToEndTests.cs
+++ b/UnitTests/EndToEndTests.cs
@@ -17,7 +17,7 @@ public sealed class EndToEndTests
public void Annotation_RefMinor_not_annotated_when_no_SA()
{
const string vcfLine = "chr12 7054859 . G . 100 PASS . . .";
- var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, vcfLine);
+ var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, null, vcfLine);
string output = annotatedPosition.GetJsonString();
Assert.Null(output);
}
@@ -36,7 +36,7 @@ public void Annotation_RefMinor_not_annotated_when_no_SA()
[InlineData("chr12 7043410 . CT GATG 50 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7043410,\"refAllele\":\"CT\",\"altAlleles\":[\"GATG\"],\"quality\":50,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7043410:7043411:GATG\",\"chromosome\":\"chr12\",\"begin\":7043410,\"end\":7043411,\"refAllele\":\"CT\",\"altAllele\":\"GATG\",\"variantType\":\"indel\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_001007026.1:c.99_100delCTinsGATG\",\"hgvsp\":\"NP_001007027.1:p.(Ser34MetfsTer27)\",\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"429-430\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"XM_005253672.1:c.99_100delCTinsGATG\",\"hgvsp\":\"XP_005253729.1:p.(Ser34MetfsTer27)\",\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"329-330\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_001940.3:c.99_100delCTinsGATG\",\"hgvsp\":\"NP_001931.2:p.(Ser34MetfsTer27)\",\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"ENST00000356654.4:c.99_100delCTinsGATG\",\"hgvsp\":\"ENSP00000349076.3:p.(Ser34MetfsTer27)\",\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"333-334\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"ENST00000396684.2:c.99_100delCTinsGATG\",\"hgvsp\":\"ENSP00000379915.2:p.(Ser34MetfsTer27)\",\"proteinId\":\"ENSP00000379915.2\"},{\"transcript\":\"ENST00000541029.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]}]}}]}")]
public void Annotate_with_SA(string vcfLine, string expectedResults)
{
- var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, vcfLine);
+ var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, null, vcfLine);
string observedResults = annotatedPosition.GetJsonString();
Assert.Equal(expectedResults, observedResults);
}
diff --git a/UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs b/UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs
new file mode 100644
index 00000000..813ad338
--- /dev/null
+++ b/UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs
@@ -0,0 +1,27 @@
+using MitoHeteroplasmy;
+using UnitTests.TestUtilities;
+using Xunit;
+
+namespace UnitTests.MitoHeteroplasmy
+{
+ public sealed class MitoHeteroplasmyProviderTests
+ {
+ [Fact]
+ public void GetVrfPercentiles_AsExpected()
+ {
+ var provider = new MitoHeteroplasmyProvider();
+ provider.Add(1, "C", new []{0.123, 0.200, 0.301}, new []{1, 3, 4});
+ provider.Add(1, "G", new[] { 0.101, 0.201}, new[] { 1, 2});
+
+ var chrom = ChromosomeUtilities.ChrM;
+ var position = 1;
+ var altAlleles = new[] {"C", "T",};
+
+ var percentilesSample1 = provider.GetVrfPercentiles("0|1", chrom, position, altAlleles, new[] { 0.2});
+ var percentilesSample2 = provider.GetVrfPercentiles("0/2", chrom, position, altAlleles, new[] { 0.12, 0.421});
+
+ Assert.Equal(new double?[]{0.5}, percentilesSample1);
+ Assert.Null(percentilesSample2);
+ }
+ }
+}
diff --git a/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs b/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs
index 6bd6a39c..49f729d7 100644
--- a/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs
+++ b/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs
@@ -36,15 +36,13 @@ private static Stream GetStream()
[Fact]
public void ParseItems()
{
- using (var parser = new MitoHeteroplasmyParser(GetStream(), GetSequenceProvider()))
- {
- var items = parser.GetItems().ToList();
+ using var parser = new MitoHeteroplasmyParser(GetStream());
+ var items = parser.GetOutputLines().ToList();
- Assert.Equal(4, items.Count);
+ Assert.Equal(4, items.Count);
- Assert.Equal("\"vrfMean\":0.000026,\"vrfStdev\":0.000404", items[0].GetJsonString());
-
- }
+ Assert.Equal("6\tC\tA\t0.006\t1", items[0]);
+ Assert.Equal("8\tG\tA\t0.002,0.003,0.004\t1,2,1", items[1]);
}
[Fact]
diff --git a/UnitTests/TestUtilities/AnnotationUtilities.cs b/UnitTests/TestUtilities/AnnotationUtilities.cs
index fe50391b..b5767859 100644
--- a/UnitTests/TestUtilities/AnnotationUtilities.cs
+++ b/UnitTests/TestUtilities/AnnotationUtilities.cs
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using Genome;
+using MitoHeteroplasmy;
using Nirvana;
using OptimizedCore;
using VariantAnnotation;
@@ -14,7 +15,7 @@ namespace UnitTests.TestUtilities
{
public static class AnnotationUtilities
{
- internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, List saPaths,
+ internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, List saPaths, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider,
string vcfLine)
{
var annotationFiles = new AnnotationFiles();
@@ -24,16 +25,16 @@ internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix,
var (annotator, sequenceProvider) = GetAnnotatorAndSequenceProvider(cacheFilePrefix, saPaths);
var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId());
- var position = ParseVcfLine(vcfLine, refMinorProvider, sequenceProvider, variantFactory);
+ var position = ParseVcfLine(vcfLine, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory);
var annotatedPosition = annotator.Annotate(position);
return annotatedPosition;
}
- internal static IPosition ParseVcfLine(string vcfLine, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, VariantFactory variantFactory)
+ internal static IPosition ParseVcfLine(string vcfLine, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory)
{
var simplePosition = GetSimplePosition(vcfLine, sequenceProvider.RefNameToChromosome);
- return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, variantFactory);
+ return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory);
}
internal static SimplePosition GetSimplePosition(string vcfLine,
diff --git a/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs b/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs
index 14ee5801..d5ca51d3 100644
--- a/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs
+++ b/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs
@@ -56,7 +56,7 @@ public void GetJsonString_StrelkaSomatic()
var seqProvider = ParserTestUtils.GetSequenceProvider(13813, "T", 'C', ChromosomeUtilities.RefNameToChromosome);
var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId());
- var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, variantFactory);
+ var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory);
IVariant[] variants = GetVariants();
IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants);
diff --git a/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs b/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs
index 9a2ff245..d8ef958b 100644
--- a/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs
+++ b/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs
@@ -10,7 +10,7 @@ public sealed class SampleExtensionsTests
public void GetJsonString_Nominal()
{
var sample = new Sample(new[] {23, 34}, 12.345f, 3, new[] {"-", "+"}, true, "1/2", 98, true, 56.67f,
- new[] {8, 14}, new[] {7, 4}, new[] {10, 15}, 34, new[] {0.34, 0.56}, 1, 2.3, null);
+ new[] {8, 14}, new[] {7, 4}, new[] {10, 15}, 34, new[] {0.34, 0.56}, 1, 2.3, null, null);
string observedResult = sample.GetJsonString();
diff --git a/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs b/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs
index 1ee74435..4e785d41 100644
--- a/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs
+++ b/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs
@@ -1,11 +1,13 @@
using System.Linq;
using Vcf.Sample;
using Xunit;
+using static UnitTests.Vcf.Samples.TestUtilities;
namespace UnitTests.Vcf.Samples.Legacy
{
public sealed class LegacySampleFieldExtractorTests
{
+
[Fact]
public void FormatIndicesTest()
{
@@ -54,7 +56,7 @@ public void AlleleDepths(string formatCol, string sampleCol, int[] expectedAllel
string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Single(samples);
@@ -72,7 +74,7 @@ public void AlleleDepthsMultiAllelic(string formatCol, string sampleCol, int[] e
string vcfLine = $"chr1\t5592503\t.\tC\tT,A\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 2);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(2), null);
Assert.Single(samples);
@@ -91,7 +93,7 @@ public void FailedFilter(string sampleCol, bool? expectedFailedFilter)
string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\tGT:GQ:GQX:DP:DPF:FT\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(2), null);
Assert.Single(samples);
@@ -111,7 +113,7 @@ public void Genotype(string sampleCol, string expectedGenotype)
string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\tGT:GQ:GQX:DP:DPF:AD\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null);
Assert.Single(samples);
@@ -133,7 +135,7 @@ public void GenotypeQuality(string formatCol, string sampleCol, int? expectedGen
string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Single(samples);
@@ -157,7 +159,7 @@ public void TotalDepth(string formatCol, string sampleCol, int? expectedTotalDep
string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Single(samples);
@@ -173,7 +175,7 @@ public void PiscesTotalDepth()
"chr1\t115251293\t.\tGA\tG\t100\tSB;LowVariantFreq\tDP=7882\tGT:GQ:AD:VF:NL:SB:GQX\t0/1:100:7588,294:0:20:-100.0000:100";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null);
var sample = samples[0];
var observedTotalDepth = sample.TotalDepth;
@@ -196,7 +198,7 @@ public void VariantFrequency_Nominal(string altAllele, string formatCol, string
string vcfLine = $"chr1\t5592503\t.\tC\t{altAllele}\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Single(samples);
@@ -220,7 +222,7 @@ public void VariantFrequency_ReturnNull(string refAllele, string altAllele, stri
var vcfLine = $"chr1\t5592503\t.\t{refAllele}\t{altAllele}\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(altAllele.Split(',').Length), null);
Assert.Single(samples);
var sample = samples[0];
@@ -238,7 +240,7 @@ public void Leftout_fields_return_null(string formatCol, string sampleCol, strin
var vcfLine = $"chr1\t5592503\t.\tA\tC\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null);
Assert.Single(samples);
var sample = samples[0];
@@ -271,7 +273,7 @@ public void MajorChromosomeCopyTest()
const string vcfLine = "1 9314202 Canvas:GAIN:1:9314202:9404148 N 36 PASS SVTYPE=CNV;END=9404148;ensembl_gene_id=ENSG00000049239,ENSG00000252841,ENSG00000171621 RC:BC:CN:MCC . 151:108:6:4";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null);
Assert.Equal(2, samples.Length);
@@ -288,7 +290,7 @@ public void EmptySamples()
const string vcfLine = "chrX 2735147 . G A 38.25 VQSRTrancheSNP99.90to100.00 AC=3;AF=0.500;AN=6;BaseQRankSum=-0.602;DP=56;Dels=0.00;FS=30.019;HaplotypeScore=7.7259;MLEAC=3;MLEAF=0.500;MQ=41.18;MQ0=0;MQRankSum=0.098;QD=1.06;ReadPosRankSum=0.266;SB=-8.681e-03;VQSLOD=-6.0901;culprit=QD GT:AD:DP:GQ:PL 0:7,0:7:3:0,3,39 ./. 0/1:14,3:17:35:35,0,35 1/1:9,10:19:3:41,3,0";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Equal(4, samples.Length);
@@ -314,7 +316,7 @@ public void VariantFrequencyNan(string formatCol, string sampleCol, string expec
var vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}";
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Single(samples);
@@ -337,7 +339,7 @@ public void SplitReadCounts()
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Equal(2, samples.Length);
var sample1 = samples[0];
@@ -356,7 +358,7 @@ public void EmptySample()
var vcfColumns = vcfLine.Split('\t');
- var samples = vcfColumns.ToSamples(new FormatIndices(), 1);
+ var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null);
Assert.Single(samples);
var sample = samples[0];
diff --git a/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs b/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs
index 183bdd97..4174f394 100644
--- a/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs
+++ b/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs
@@ -1,6 +1,10 @@
-using VariantAnnotation.Interface.Positions;
+using MitoHeteroplasmy;
+using UnitTests.TestUtilities;
+using VariantAnnotation.Interface.Positions;
+using Vcf;
using Vcf.Sample;
using Xunit;
+using static UnitTests.Vcf.Samples.TestUtilities;
namespace UnitTests.Vcf.Samples
{
@@ -24,15 +28,15 @@ public void ExtractSample_PEPE()
{
var formatIndices = new FormatIndices();
formatIndices.Set("GT:GQ:AD:DP:VF:NL:SB:NC:US:AQ:LQ");
- var sample = SampleFieldExtractor.ExtractSample("0/1:5:338,1:339:0.00295:30:-7.3191:0.0314:0,0,0,1,0,0,17,1,129,21,148,22:3.366:0.000", formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample("0/1:5:338,1:339:0.00295:30:-7.3191:0.0314:0,0,0,1,0,0,17,1,129,21,148,22:3.366:0.000", formatIndices, GetSimplePositionUsingAlleleNum(1), null);
- Assert.Equal("0/1", sample.Genotype);
- Assert.Equal(5, sample.GenotypeQuality);
- Assert.Equal(new[] { 338, 1 }, sample.AlleleDepths);
- Assert.Equal(339, sample.TotalDepth);
+ Assert.Equal("0/1", sample.Genotype);
+ Assert.Equal(5, sample.GenotypeQuality);
+ Assert.Equal(new[] { 338, 1 }, sample.AlleleDepths);
+ Assert.Equal(339, sample.TotalDepth);
Assert.Equal(new[] { 0.00295 }, sample.VariantFrequencies);
- Assert.Equal(3.366f, sample.ArtifactAdjustedQualityScore);
- Assert.Equal(0.000f, sample.LikelihoodRatioQualityScore);
+ Assert.Equal(3.366f, sample.ArtifactAdjustedQualityScore);
+ Assert.Equal(0.000f, sample.LikelihoodRatioQualityScore);
}
[Fact]
@@ -40,13 +44,13 @@ public void ExtractSample_DragenSomatic_AsExpected()
{
var formatIndices = new FormatIndices();
formatIndices.Set("GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB:PS");
- var sample = SampleFieldExtractor.ExtractSample("0|1:3.96:33,8:0.195:13,6:20,2:41:17,16,4,4:13,20,4,4:534234", formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample("0|1:3.96:33,8:0.195:13,6:20,2:41:17,16,4,4:13,20,4,4:534234", formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.Equal("0|1", sample.Genotype);
Assert.Equal(3.96, sample.SomaticQuality);
Assert.Equal(new[] { 33, 8 }, sample.AlleleDepths);
Assert.Equal(41, sample.TotalDepth);
- Assert.Equal(new[] { 8/41.0 }, sample.VariantFrequencies);
+ Assert.Equal(new[] { 8 / 41.0 }, sample.VariantFrequencies);
}
[Fact]
@@ -54,7 +58,7 @@ public void ExtractSample_DragenCNV_AsExpected()
{
var formatIndices = new FormatIndices();
formatIndices.Set("GT:CN:MCN");
- var sample = SampleFieldExtractor.ExtractSample("0|1:3:1", formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample("0|1:3:1", formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.Equal("0|1", sample.Genotype);
Assert.Equal(3, sample.CopyNumber);
@@ -68,7 +72,7 @@ public void ExtractSample_DragenCNV_MCN_LOH(string formatField, string sampleFie
{
var formatIndices = new FormatIndices();
formatIndices.Set(formatField);
- var sample = SampleFieldExtractor.ExtractSample(sampleField, formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample(sampleField, formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.True(sample.IsLossOfHeterozygosity);
}
@@ -78,7 +82,7 @@ public void ExtractSample_ExpansionHunter()
{
var formatIndices = new FormatIndices();
formatIndices.Set("GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC");
- var sample = SampleFieldExtractor.ExtractSample("1/1:SPANNING/SPANNING:15/15:15-15/15-15:22/22:23/23:0/0:38.270270", formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample("1/1:SPANNING/SPANNING:15/15:15-15/15-15:22/22:23/23:0/0:38.270270", formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.Equal("1/1", sample.Genotype);
Assert.Equal(new[] { 15, 15 }, sample.RepeatUnitCounts);
@@ -88,7 +92,7 @@ public void ExtractSample_ExpansionHunter()
public void ExtractSample_EmptySampleColumn_ReturnEmptySample()
{
var formatIndices = new FormatIndices();
- var sample = SampleFieldExtractor.ExtractSample(null, formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample(null, formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.True(sample.IsEmpty);
}
@@ -96,7 +100,7 @@ public void ExtractSample_EmptySampleColumn_ReturnEmptySample()
public void ExtractSample_DotInSampleColumn_ReturnEmptySample()
{
var formatIndices = new FormatIndices();
- var sample = SampleFieldExtractor.ExtractSample(".", formatIndices, 1);
+ var sample = SampleFieldExtractor.ExtractSample(".", formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.True(sample.IsEmpty);
}
@@ -129,7 +133,7 @@ public void ToSamples_SMN1_CNV()
"./1:.:.:.:.:1.26335:3:4:6:cnvLength:Inherited"
};
- ISample[] samples = cols.ToSamples(formatIndices, 1);
+ ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.Equal(4, samples.Length);
@@ -166,8 +170,25 @@ public void ToSamples_TooFewVcfColumns_ReturnNull()
"SVTYPE=CNV;END=125075279;REFLEN=6510"
};
- ISample[] samples = cols.ToSamples(formatIndices, 1);
+ ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null);
Assert.Null(samples);
}
+
+ [Fact]
+ public void ExtractSample_MitoHeteroplasmy_AsExpected()
+ {
+ var provider = new MitoHeteroplasmyProvider();
+ provider.Add(1, "C", new[] { 0.123, 0.200, 0.301 }, new[] { 1, 3, 4 });
+ provider.Add(1, "G", new[] { 0.101, 0.201 }, new[] { 1, 2 });
+
+ var simplePosition = new SimplePosition(ChromosomeUtilities.ChrM, 1, "A", new[] { "C", "T"});
+
+ var formatIndices = new FormatIndices();
+ formatIndices.Set("GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB:PS");
+ var sample = SampleFieldExtractor.ExtractSample("1|2:3.96:0,15,85:0.195:13,6:20,2:100:17,16,4,4:13,20,4,4:534234", formatIndices, simplePosition, provider);
+
+ Assert.Equal(new[] { 15 / 100.0, 85 / 100.0 }, sample.VariantFrequencies);
+ Assert.Equal(new[] { "0.13", null }, sample.HeteroplasmyPercentile);
+ }
}
}
\ No newline at end of file
diff --git a/UnitTests/Vcf/Samples/SampleTests.cs b/UnitTests/Vcf/Samples/SampleTests.cs
index 9ffdc164..e5316eca 100644
--- a/UnitTests/Vcf/Samples/SampleTests.cs
+++ b/UnitTests/Vcf/Samples/SampleTests.cs
@@ -9,7 +9,7 @@ public sealed class SampleTests
public void Sample_ReturnEmpty()
{
var emptySample = new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null,
- null, null, null, null);
+ null, null, null, null, null);
Assert.True(emptySample.IsEmpty);
Assert.True(Sample.EmptySample.IsEmpty);
diff --git a/UnitTests/Vcf/Samples/TestUtilities.cs b/UnitTests/Vcf/Samples/TestUtilities.cs
new file mode 100644
index 00000000..c1928f29
--- /dev/null
+++ b/UnitTests/Vcf/Samples/TestUtilities.cs
@@ -0,0 +1,18 @@
+using Moq;
+using VariantAnnotation.Interface.Positions;
+
+namespace UnitTests.Vcf.Samples
+{
+ public static class TestUtilities
+ {
+ public static ISimplePosition GetSimplePositionUsingAlleleNum(int numAlleles)
+ {
+ var mock = new Mock();
+ mock.SetupGet(x => x.AltAlleles).Returns(new string[numAlleles]);
+ mock.SetupGet(x => x.Start).Returns(-1);
+
+ return mock.Object;
+ }
+
+ }
+}
\ No newline at end of file
diff --git a/UnitTests/Vcf/VcfReaderTests.cs b/UnitTests/Vcf/VcfReaderTests.cs
index de069152..abef124c 100644
--- a/UnitTests/Vcf/VcfReaderTests.cs
+++ b/UnitTests/Vcf/VcfReaderTests.cs
@@ -39,7 +39,7 @@ public void ValidateVcfHeader_ExceptionThrown_NoFileFormat()
AddLines(headers);
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
var reader = FileUtilities.GetStreamReader(_ms);
- Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator));
+ Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null));
}
[Fact]
@@ -49,7 +49,7 @@ public void ValidateVcfHeader_ExceptionThrown_NoChromHeaderLine()
AddLines(headers);
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
var reader = FileUtilities.GetStreamReader(_ms);
- Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator));
+ Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null));
}
[Fact]
@@ -61,7 +61,7 @@ public void Sample_names_are_reported()
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null))
{
samples = vcfReader.GetSampleNames();
}
@@ -97,7 +97,7 @@ public void CheckContigId_IncorrectAutoAndSexChromLength_ThrowException(string c
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator));
+ Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null));
}
[Theory]
@@ -110,7 +110,7 @@ public void CheckContigId_InferredAssemblyIsUnknown_GivenIrregularChrom(string c
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null))
{
Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly);
}
@@ -125,7 +125,7 @@ public void CheckContigId_IsRcrsMitochondrionTrue_InferredAssemblyIsUnknown_Give
AddLines(headers);
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null))
{
Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly);
Assert.True(vcfReader.IsRcrsMitochondrion);
@@ -142,7 +142,7 @@ public void CheckContigId_IsRcrsMitochondrionFalse_InferredAssemblyIsUnknown_Giv
var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null))
{
Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly);
Assert.False(vcfReader.IsRcrsMitochondrion);
@@ -176,7 +176,7 @@ public void GetNextPosition()
IPosition observedResult;
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator, null))
{
observedResult = vcfReader.GetNextPosition();
}
@@ -209,7 +209,7 @@ public void CheckSampleConsistency_oneSample()
var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator, null))
{
//first line is valid. So, no exception
Assert.NotNull(vcfReader.GetNextPosition());
@@ -236,7 +236,7 @@ public void CheckSampleConsistency_noSample()
var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome);
using (var reader = FileUtilities.GetStreamReader(_ms))
- using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator))
+ using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator, null))
{
//first line is valid. So, no exception
Assert.NotNull(vcfReader.GetNextPosition());
diff --git a/UnitTests/Vcf/VcfReaderUtilsTests.cs b/UnitTests/Vcf/VcfReaderUtilsTests.cs
index f88eed8e..ffde5ec4 100644
--- a/UnitTests/Vcf/VcfReaderUtilsTests.cs
+++ b/UnitTests/Vcf/VcfReaderUtilsTests.cs
@@ -92,7 +92,7 @@ public void Test_crash_caused_by_variant_trimming ()
var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator);
- var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, seqProvider, variantFactory);
+ var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, seqProvider, null, variantFactory);
var annotatedVariants1 = Annotator.GetAnnotatedVariants(position1.Variants);
@@ -115,7 +115,7 @@ public void ParseVcfLine_line_with_only_NonRef_is_refMinor()
ParserTestUtils.GetSequenceProvider(10628385, "C", 'A', ChromosomeUtilities.RefNameToChromosome);
var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator);
- var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, variantFactory);
+ var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory);
var annotatedVariants = Annotator.GetAnnotatedVariants(position.Variants);
Assert.Equal("C", position.RefAllele);
@@ -137,7 +137,7 @@ public void ParseVcfLine_line_with_only_NonRef_is_not_refMinor()
ParserTestUtils.GetSequenceProvider(10005, "C", 'A', ChromosomeUtilities.RefNameToChromosome);
var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator);
- var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, variantFactory);
+ var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory);
var annotatedVariants = Annotator.GetAnnotatedVariants(position.Variants);
Assert.Equal("C", position.RefAllele);
diff --git a/VariantAnnotation.Interface/Positions/ISample.cs b/VariantAnnotation.Interface/Positions/ISample.cs
index 746d6597..adde96d1 100644
--- a/VariantAnnotation.Interface/Positions/ISample.cs
+++ b/VariantAnnotation.Interface/Positions/ISample.cs
@@ -20,6 +20,6 @@ public interface ISample
int? MinorHaplotypeCopyNumber { get; }
double? SomaticQuality { get; }
bool? IsLossOfHeterozygosity { get; }
-
+ string[] HeteroplasmyPercentile { get; }
}
}
\ No newline at end of file
diff --git a/VariantAnnotation/IO/SampleExtensions.cs b/VariantAnnotation/IO/SampleExtensions.cs
index c77b5ff6..f17942d0 100644
--- a/VariantAnnotation/IO/SampleExtensions.cs
+++ b/VariantAnnotation/IO/SampleExtensions.cs
@@ -32,7 +32,8 @@ public static string GetJsonString(this ISample sample)
if (sample.IsLossOfHeterozygosity.HasValue)
jsonObject.AddBoolValue("lossOfHeterozygosity", sample.IsLossOfHeterozygosity.Value);
jsonObject.AddDoubleValue("somaticQuality", sample.SomaticQuality, "0.#");
-
+ jsonObject.AddStringValues("heteroplasmyPercentile", sample.HeteroplasmyPercentile);
+
sb.Append(JsonObject.CloseBrace);
return StringBuilderCache.GetStringAndRelease(sb);
}
diff --git a/Vcf/Position.cs b/Vcf/Position.cs
index 87634fbb..b41fd8a3 100644
--- a/Vcf/Position.cs
+++ b/Vcf/Position.cs
@@ -8,6 +8,7 @@
using Vcf.Info;
using Vcf.Sample;
using Vcf.VariantCreator;
+using MitoHeteroplasmy;
namespace Vcf
{
@@ -69,7 +70,7 @@ private static (bool HasStructuralVariant, bool HasShortTandemRepeat) CheckVaria
return (hasStructuralVariant, hasShortTandemRepeat);
}
- public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, VariantFactory variantFactory)
+ public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory)
{
if (simplePosition == null) return null;
@@ -91,7 +92,7 @@ public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProv
int end = ExtractEnd(infoData, simplePosition.Start, simplePosition.RefAllele.Length);
double? quality = vcfFields[VcfCommon.QualIndex].GetNullableValue(double.TryParse);
string[] filters = vcfFields[VcfCommon.FilterIndex].OptimizedSplit(';');
- ISample[] samples = vcfFields.ToSamples(variantFactory.FormatIndices, altAlleles.Length);
+ ISample[] samples = vcfFields.ToSamples(variantFactory.FormatIndices, simplePosition, mitoHeteroplasmyProvider);
IVariant[] variants = variantFactory.CreateVariants(simplePosition.Chromosome, simplePosition.Start, end,
simplePosition.RefAllele, altAlleles, infoData, simplePosition.IsDecomposed,
diff --git a/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs b/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs
index eac71bdd..1343570d 100644
--- a/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs
+++ b/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs
@@ -59,7 +59,7 @@ internal ISample ExtractSample(string sampleColumn)
var sample = new Sample(alleleDepths, sampleFields.AQ, sampleFields.CopyNumber, sampleFields.DST,
failedFilter, genotype, genotypeQuality, false, sampleFields.LQ, pairEndReadCounts, null, splitReadCounts,
- totalDepth, variantFrequencies, null, null, isLossOfHeterozygosity);
+ totalDepth, variantFrequencies, null, null, isLossOfHeterozygosity, null);
return sample;
}
diff --git a/Vcf/Sample/Sample.cs b/Vcf/Sample/Sample.cs
index af211908..e2405e07 100644
--- a/Vcf/Sample/Sample.cs
+++ b/Vcf/Sample/Sample.cs
@@ -22,14 +22,15 @@ public sealed class Sample : ISample
public int? MinorHaplotypeCopyNumber { get; }
public double? SomaticQuality { get; }
public bool? IsLossOfHeterozygosity { get; }
+ public string[] HeteroplasmyPercentile { get; }
public static readonly Sample EmptySample =
- new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, null, null, null, null);
+ new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, null, null, null, null, null);
public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copyNumber,
string[] diseaseAffectedStatuses, bool failedFilter, string genotype, int? genotypeQuality, bool isDeNovo,
float? likelihoodRatioQualityScore, int[] pairedEndReadCounts, int[] repeatUnitCounts,
- int[] splitReadCounts, int? totalDepth, double[] variantFrequencies, int? minorHaplotypeCopyNumber, double? somaticQuality, bool? isLossOfHeterozygosity)
+ int[] splitReadCounts, int? totalDepth, double[] variantFrequencies, int? minorHaplotypeCopyNumber, double? somaticQuality, bool? isLossOfHeterozygosity, string[] heteroplasmyPercentile)
{
AlleleDepths = alleleDepths;
ArtifactAdjustedQualityScore = artifactAdjustedQualityScore;
@@ -46,6 +47,7 @@ public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copy
TotalDepth = totalDepth;
VariantFrequencies = variantFrequencies;
IsLossOfHeterozygosity = isLossOfHeterozygosity;
+ HeteroplasmyPercentile = heteroplasmyPercentile;
MinorHaplotypeCopyNumber = minorHaplotypeCopyNumber;
SomaticQuality = somaticQuality;
@@ -64,6 +66,7 @@ public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copy
IsLossOfHeterozygosity == null &&
MinorHaplotypeCopyNumber == null &&
SomaticQuality == null &&
+ HeteroplasmyPercentile == null &&
!FailedFilter &&
!IsDeNovo;
}
diff --git a/Vcf/Sample/SampleFieldExtractor.cs b/Vcf/Sample/SampleFieldExtractor.cs
index ca9dd74f..08264f6b 100644
--- a/Vcf/Sample/SampleFieldExtractor.cs
+++ b/Vcf/Sample/SampleFieldExtractor.cs
@@ -1,4 +1,6 @@
-using OptimizedCore;
+using System.Linq;
+using MitoHeteroplasmy;
+using OptimizedCore;
using VariantAnnotation.Interface.IO;
using VariantAnnotation.Interface.Positions;
using Vcf.Sample.Legacy;
@@ -7,7 +9,7 @@ namespace Vcf.Sample
{
internal static class SampleFieldExtractor
{
- internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices formatIndices, int numAltAlleles)
+ internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices formatIndices, ISimplePosition simplePosition, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
if (vcfColumns.Length < VcfCommon.MinNumColumnsSampleGenotypes) return null;
@@ -20,14 +22,14 @@ internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices form
for (int index = VcfCommon.GenotypeIndex; index < vcfColumns.Length; index++)
{
- samples[index - VcfCommon.GenotypeIndex] = ExtractSample(vcfColumns[index], formatIndices, numAltAlleles, legacySampleExtractor);
+ samples[index - VcfCommon.GenotypeIndex] = ExtractSample(vcfColumns[index], formatIndices, simplePosition, mitoHeteroplasmyProvider, legacySampleExtractor);
}
return samples;
}
- internal static ISample ExtractSample(string sampleColumn, FormatIndices formatIndices, int numAltAlleles,
- LegacySampleFieldExtractor legacyExtractor = null)
+ internal static ISample ExtractSample(string sampleColumn, FormatIndices formatIndices, ISimplePosition simplePosition,
+ IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, LegacySampleFieldExtractor legacyExtractor = null)
{
// sanity check: make sure we have a format column
if (string.IsNullOrEmpty(sampleColumn)) return Sample.EmptySample;
@@ -59,13 +61,15 @@ internal static ISample ExtractSample(string sampleColumn, FormatIndices formatI
int? minorHaplotypeCopyNumber = sampleColumns.GetString(formatIndices.MCN).GetInteger();
double? somaticQuality = sampleColumns.GetString(formatIndices.SQ).GetDouble();
- double[] variantFrequencies = VariantFrequency.GetVariantFrequencies(variantFrequency, alleleDepths, numAltAlleles);
+ double[] variantFrequencies = VariantFrequency.GetVariantFrequencies(variantFrequency, alleleDepths, simplePosition.AltAlleles.Length);
+ string[] mitoHeteroplasmyPercentiles = mitoHeteroplasmyProvider?.GetVrfPercentiles(genotype, simplePosition.Chromosome, simplePosition.Start,
+ simplePosition.AltAlleles, variantFrequencies)?.Select(x => x?.ToString("0.##")).ToArray();
var isLoh = GetLoh(copyNumber, minorHaplotypeCopyNumber, genotype);
var sample = new Sample(alleleDepths, artifactAdjustedQualityScore, copyNumber, diseaseAffectedStatuses,
failedFilter, genotype, genotypeQuality, isDeNovo, likelihoodRatioQualityScore, pairedEndReadCounts,
- repeatUnitCounts, splitReadCounts, totalDepth, variantFrequencies, minorHaplotypeCopyNumber, somaticQuality, isLoh);
+ repeatUnitCounts, splitReadCounts, totalDepth, variantFrequencies, minorHaplotypeCopyNumber, somaticQuality, isLoh, mitoHeteroplasmyPercentiles);
return sample;
}
diff --git a/Vcf/SimplePosition.cs b/Vcf/SimplePosition.cs
index 02833935..e5b6641b 100644
--- a/Vcf/SimplePosition.cs
+++ b/Vcf/SimplePosition.cs
@@ -19,7 +19,7 @@ public sealed class SimplePosition : ISimplePosition
public string[] Vids { get; private set; }
public List[] LinkedVids { get; private set; }
- private SimplePosition(IChromosome chromosome, int start, string refAllele, string[] altAlleles)
+ internal SimplePosition(IChromosome chromosome, int start, string refAllele, string[] altAlleles)
{
Chromosome = chromosome;
Start = start;
diff --git a/Vcf/Vcf.csproj b/Vcf/Vcf.csproj
index bddce04b..8a165267 100644
--- a/Vcf/Vcf.csproj
+++ b/Vcf/Vcf.csproj
@@ -9,5 +9,6 @@
+
\ No newline at end of file
diff --git a/Vcf/VcfReader.cs b/Vcf/VcfReader.cs
index dafcefb4..c71e438e 100644
--- a/Vcf/VcfReader.cs
+++ b/Vcf/VcfReader.cs
@@ -3,6 +3,7 @@
using System.IO;
using ErrorHandling.Exceptions;
using Genome;
+using MitoHeteroplasmy;
using OptimizedCore;
using VariantAnnotation.Interface;
using VariantAnnotation.Interface.IO;
@@ -23,6 +24,7 @@ public sealed class VcfReader : IVcfReader
private readonly ISequenceProvider _sequenceProvider;
private readonly IDictionary _refNameToChromosome;
private readonly IVcfFilter _vcfFilter;
+ private readonly IMitoHeteroplasmyProvider _mitoHeteroplasmyProvider;
public bool IsRcrsMitochondrion { get; private set; }
public string VcfLine { get; private set; }
public GenomeAssembly InferredGenomeAssembly { get; private set; } = GenomeAssembly.Unknown;
@@ -37,21 +39,22 @@ public sealed class VcfReader : IVcfReader
public string[] GetSampleNames() => _sampleNames;
private VcfReader(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider,
- IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator)
+ IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
- _headerReader = headerReader;
- _reader = vcfLineReader;
- _variantFactory = new VariantFactory(sequenceProvider.Sequence, vidCreator);
- _sequenceProvider = sequenceProvider;
- _refMinorProvider = refMinorProvider;
- _vcfFilter = vcfFilter;
- _refNameToChromosome = sequenceProvider.RefNameToChromosome;
+ _headerReader = headerReader;
+ _reader = vcfLineReader;
+ _variantFactory = new VariantFactory(sequenceProvider.Sequence, vidCreator);
+ _sequenceProvider = sequenceProvider;
+ _refMinorProvider = refMinorProvider;
+ _vcfFilter = vcfFilter;
+ _refNameToChromosome = sequenceProvider.RefNameToChromosome;
+ _mitoHeteroplasmyProvider = mitoHeteroplasmyProvider;
}
public static VcfReader Create(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider,
- IRefMinorProvider refMinorProvider, IRecomposer recomposer, IVcfFilter vcfFilter, IVariantIdCreator vidCreator)
+ IRefMinorProvider refMinorProvider, IRecomposer recomposer, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
- var vcfReader = new VcfReader(headerReader, vcfLineReader, sequenceProvider, refMinorProvider, vcfFilter, vidCreator);
+ var vcfReader = new VcfReader(headerReader, vcfLineReader, sequenceProvider, refMinorProvider, vcfFilter, vidCreator, mitoHeteroplasmyProvider);
vcfReader.ParseHeader();
vcfReader.SetRecomposer(recomposer);
return vcfReader;
@@ -187,7 +190,7 @@ private void CheckVcfOrder(string referenceName)
_currentReferenceName = referenceName;
}
- public IPosition GetNextPosition() => Position.ToPosition(GetNextSimplePosition(), _refMinorProvider, _sequenceProvider, _variantFactory);
+ public IPosition GetNextPosition() => Position.ToPosition(GetNextSimplePosition(), _refMinorProvider, _sequenceProvider, _mitoHeteroplasmyProvider, _variantFactory);
public void Dispose() => _reader?.Dispose();
}