diff --git a/MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs b/MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs new file mode 100644 index 00000000..f81bbfd7 --- /dev/null +++ b/MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs @@ -0,0 +1,9 @@ +using Genome; + +namespace MitoHeteroplasmy +{ + public interface IMitoHeteroplasmyProvider + { + double?[] GetVrfPercentiles(string genotype, IChromosome chrome, int position, string[] altAllele, double[] vrfs); + } +} \ No newline at end of file diff --git a/MitoHeteroplasmy/MitoHeteroplasmy.csproj b/MitoHeteroplasmy/MitoHeteroplasmy.csproj new file mode 100644 index 00000000..2b894a25 --- /dev/null +++ b/MitoHeteroplasmy/MitoHeteroplasmy.csproj @@ -0,0 +1,15 @@ + + + + netcoreapp2.1 + latest + + + + + + + + + + diff --git a/MitoHeteroplasmy/MitoHeteroplasmyProvider.cs b/MitoHeteroplasmy/MitoHeteroplasmyProvider.cs new file mode 100644 index 00000000..7f951812 --- /dev/null +++ b/MitoHeteroplasmy/MitoHeteroplasmyProvider.cs @@ -0,0 +1,66 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Genome; + +namespace MitoHeteroplasmy +{ + public sealed class MitoHeteroplasmyProvider : IMitoHeteroplasmyProvider + { + private const string MitoChromUcscName = "chrM"; + private static readonly Dictionary AlleleToInt = new Dictionary { { "A", 0 }, { "C", 1 }, { "G", 2 }, { "T", 3 } }; + private const int SequenceLengthMax = int.MaxValue / 4; + + private readonly Dictionary _alleleToVrf = new Dictionary(); + + public void Add(int position, string altAllele, IEnumerable vrfs, int[] alleleDepths) + { + var vrfsInt = vrfs.Select(ToIntVrfForm).ToArray(); + _alleleToVrf[EncodeMitoPositionAndAltAllele(position, altAllele)] = (vrfsInt, alleleDepths); + } + + public double?[] GetVrfPercentiles(string genotypes, IChromosome chrom, int position, string[] altAlleles, double[] vrfs) + { + if (vrfs == null) return null; + if (chrom.UcscName != MitoChromUcscName) return null; + + var sampleAlleles = genotypes.Split('|', '/').Where(x => x != "0") + .Select(x => GetAlleleByGenotype(x, altAlleles)); + + var percentiles = vrfs.Zip(sampleAlleles, (vrf, allele) => GetVrfPercentile(position, allele, vrf)).ToArray(); + return percentiles.All(x => x == null) ? null : percentiles; + } + + private static string GetAlleleByGenotype(string genotypeIndex, string[] altAlleles) => altAlleles[int.Parse(genotypeIndex) - 1]; + + private double? GetVrfPercentile(int position, string altAllele, double vrf) + { + if (string.IsNullOrEmpty(altAllele)) return null; + + var positionAndAltAlleleIntForm = EncodeMitoPositionAndAltAllele(position, altAllele); + + if (!_alleleToVrf.TryGetValue(positionAndAltAlleleIntForm, out (int[] Vrfs, int[] AlleleDepths) data)) return null; + + var scaledVrf = vrf * 1000; + int nearestBiggerVrfIndex; + for (nearestBiggerVrfIndex = 0; nearestBiggerVrfIndex < data.Vrfs.Length; nearestBiggerVrfIndex++) + { + if (data.Vrfs[nearestBiggerVrfIndex] > scaledVrf) break; + } + + var numSmallerOrEqualAlleles = 0.0; + var numAllAlleles = 0; + for (var i = 0; i < data.AlleleDepths.Length; i++) + { + if (i < nearestBiggerVrfIndex) numSmallerOrEqualAlleles += data.AlleleDepths[i]; + numAllAlleles += data.AlleleDepths[i]; + } + + return numSmallerOrEqualAlleles / numAllAlleles; + } + + private static int ToIntVrfForm(double vrf) => Convert.ToInt32(vrf * 1000); + + private static int EncodeMitoPositionAndAltAllele(int position, string altAllele) => SequenceLengthMax * AlleleToInt[altAllele] + position; + } +} \ No newline at end of file diff --git a/MitoHeteroplasmy/MitoHeteroplasmyReader.cs b/MitoHeteroplasmy/MitoHeteroplasmyReader.cs new file mode 100644 index 00000000..80b7f315 --- /dev/null +++ b/MitoHeteroplasmy/MitoHeteroplasmyReader.cs @@ -0,0 +1,44 @@ +using System; +using System.IO; +using System.IO.Compression; +using System.Linq; +using System.Reflection; +using OptimizedCore; + +namespace MitoHeteroplasmy +{ + public static class MitoHeteroplasmyReader + { + private const int PositionIndex = 0; + private const int AltIndex = 2; + private const int VrfIndex = 3; + private const int AlleleDepthIndex = 4; + + private const string ResourceName = "MitoHeteroplasmy.Resources.MitoHeteroplasmy.tsv.gz"; + public static MitoHeteroplasmyProvider GetData() + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream(ResourceName); + if (stream == null) throw new NullReferenceException("Unable to read from the Mitochondrial Heteroplasmy file"); + + using var gzStream = new GZipStream(stream, CompressionMode.Decompress); + using var reader = new StreamReader(gzStream); + reader.ReadLine(); + + var mitoHeteroplasmyData = new MitoHeteroplasmyProvider(); + while (true) + { + string line = reader.ReadLine(); + if (line == null) break; + + var fields = line.OptimizedSplit('\t'); + var position = int.Parse(fields[PositionIndex]); + var vrfs = fields[VrfIndex].Split(',').Select(double.Parse); + var alleleDepths = fields[AlleleDepthIndex].Split(',').Select(int.Parse).ToArray(); + mitoHeteroplasmyData.Add(position, fields[AltIndex], vrfs, alleleDepths); + } + + return mitoHeteroplasmyData; + } + } +} \ No newline at end of file diff --git a/MitoHeteroplasmy/Resources/MitoHeteroplasmy.tsv.gz b/MitoHeteroplasmy/Resources/MitoHeteroplasmy.tsv.gz new file mode 100644 index 00000000..577b5c7b Binary files /dev/null and b/MitoHeteroplasmy/Resources/MitoHeteroplasmy.tsv.gz differ diff --git a/Nirvana.sln b/Nirvana.sln index 1c73bb8b..e9d8adbc 100644 --- a/Nirvana.sln +++ b/Nirvana.sln @@ -44,7 +44,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tabix", "Tabix\Tabix.csproj EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cloud", "Cloud\Cloud.csproj", "{E93914C8-2599-46BE-BE18-6229E53F581B}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Downloader", "Downloader\Downloader.csproj", "{5B81B762-8A86-466A-A947-AC2CA53EE40D}" EndProject @@ -58,12 +58,14 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NirvanaLambda", "NirvanaLam EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SingleAnnotationLambda", "SingleAnnotationLambda\SingleAnnotationLambda.csproj", "{C9B4E16E-FF30-4CE0-A617-F833696FBE10}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CustomStrValidationLambda", "CustomStrValidationLambda\CustomStrValidationLambda.csproj", "{F3E60E51-EE07-4768-8EC3-E3A323DFA547}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MitoHeteroplasmy", "MitoHeteroplasmy\MitoHeteroplasmy.csproj", "{387E4C8D-6A27-40DE-A305-F3F047B8D865}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -186,6 +188,10 @@ Global {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Debug|Any CPU.Build.0 = Debug|Any CPU {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.ActiveCfg = Release|Any CPU {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.Build.0 = Release|Any CPU + {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.Build.0 = Debug|Any CPU + {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.ActiveCfg = Release|Any CPU + {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/Nirvana.sln.DotSettings b/Nirvana.sln.DotSettings index 2e9c7637..67cdd502 100644 --- a/Nirvana.sln.DotSettings +++ b/Nirvana.sln.DotSettings @@ -174,4 +174,5 @@ True True True - True \ No newline at end of file + True + True \ No newline at end of file diff --git a/Nirvana/Properties/launchSettings.json b/Nirvana/Properties/launchSettings.json index bbf33196..37935723 100644 --- a/Nirvana/Properties/launchSettings.json +++ b/Nirvana/Properties/launchSettings.json @@ -209,6 +209,11 @@ "commandName": "Project", "commandLineArgs": "--cache /Users/rroy1/development/Cache\\26\\GRCh38\\Both --sd /Users/rroy1/development/SupplementaryDatabase\\56\\GRCh38 --ref /Users/rroy1/development/References/6/Homo_sapiens.GRCh38.Nirvana.dat --in DQ-Strelka-Germline-chr22-hg38.vcf.gz --out dq", "workingDirectory": "/Users/rroy1/development/TestDatasets" + }, + "SK MitoHeteroplasmy": { + "commandName": "Project", + "commandLineArgs": "-i debug.vcf -c D:\\Nirvana\\Development\\Cache\\26\\GRCh38\\Both -r D:\\Nirvana\\Development\\References\\Homo_sapiens.GRCh38.Nirvana.dat -o debug", + "workingDirectory": "D:\\Nirvana\\test_runs\\run_Nirvana" } } } \ No newline at end of file diff --git a/Nirvana/StreamAnnotation.cs b/Nirvana/StreamAnnotation.cs index 2699c9c0..643975bd 100644 --- a/Nirvana/StreamAnnotation.cs +++ b/Nirvana/StreamAnnotation.cs @@ -4,6 +4,7 @@ using ErrorHandling.Exceptions; using Genome; using IO; +using MitoHeteroplasmy; using VariantAnnotation; using VariantAnnotation.Interface; using VariantAnnotation.Interface.IO; @@ -23,10 +24,10 @@ public static ExitCodes Annotate(Stream headerStream, Stream inputVcfStream, Str var metrics = annotationResources.Metrics; PerformanceMetrics.ShowAnnotationHeader(); - IChromosome currentChromosome = new EmptyChromosome("dummy"); - int numVariants = 0; - - using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter)) + IChromosome currentChromosome = new EmptyChromosome("dummy"); + int numVariants = 0; + IMitoHeteroplasmyProvider mitoHeteroplasmyProvider = MitoHeteroplasmyReader.GetData(); + using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter, mitoHeteroplasmyProvider)) using (var jsonWriter = new JsonWriter(outputJsonStream, outputJsonIndexStream, annotationResources, Date.CurrentTimeStamp, vcfReader.GetSampleNames(), false)) { try @@ -97,7 +98,7 @@ private static void SetMitochondrialAnnotationBehavior(IAnnotationResources anno } private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAnnotationResources annotationResources, - IVcfFilter vcfFilter) + IVcfFilter vcfFilter, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider) { var vcfReader = FileUtilities.GetStreamReader(vcfStream); @@ -111,7 +112,7 @@ private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAn } return VcfReader.Create(headerReader, vcfReader, annotationResources.SequenceProvider, - annotationResources.RefMinorProvider, annotationResources.Recomposer, vcfFilter, annotationResources.VidCreator); + annotationResources.RefMinorProvider, annotationResources.Recomposer, vcfFilter, annotationResources.VidCreator, mitoHeteroplasmyProvider); } } } \ No newline at end of file diff --git a/RepeatExpansions/RepeatExpansionProvider.cs b/RepeatExpansions/RepeatExpansionProvider.cs index 7944949c..1461362d 100644 --- a/RepeatExpansions/RepeatExpansionProvider.cs +++ b/RepeatExpansions/RepeatExpansionProvider.cs @@ -6,6 +6,7 @@ using Intervals; using IO; using RepeatExpansions.IO; +using VariantAnnotation.Interface; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; diff --git a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs index 87e5f607..c7d2beea 100644 --- a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs +++ b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs @@ -5,27 +5,21 @@ using ErrorHandling; using IO; using SAUtils.InputFileParsers; -using VariantAnnotation.Providers; -using VariantAnnotation.SA; namespace SAUtils.MitoHeteroplasmy { public static class MitoHeteroplasmyDb { private static string _inputFile; - private static string _compressedReference; private static string _outputDirectory; + private const string OutFileName = "MitoHeteroplasmy.tsv"; + private const string HeaderLine = "#POS\tREF\tALT\tVRFs\tAlleleDepths"; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { - { - "ref|r=", - "compressed reference sequence file", - v => _compressedReference = v - }, { "in|i=", "input BED file path", @@ -42,11 +36,10 @@ public static ExitCodes Run(string command, string[] commandArgs) var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() - .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_inputFile, "Mitochondrial Heteroplasmy BED file", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() - .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) + .ShowHelpMenu("Creates a TSV file with mitochondrial heteroplasmy information", commandLineExample) .ShowErrors() .Execute(ProgramExecution); @@ -55,17 +48,12 @@ public static ExitCodes Run(string command, string[] commandArgs) private static ExitCodes ProgramExecution() { - var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); - var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); - string outFileName = $"{version.Name}_{version.Version}"; - - using (var primateAiParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider)) - using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) - using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSufix))) - using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.MitoHeteroplasmyTag, true, false, SaCommon.SchemaVersion, false)) - { - nsaWriter.Write(primateAiParser.GetItems()); - } + using var mitoHeteroplasmyParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile)); + using var tsvStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, OutFileName)); + using var tsvWriter = new StreamWriter(tsvStream); + tsvWriter.WriteLine(HeaderLine); + foreach(var line in mitoHeteroplasmyParser.GetOutputLines()) + tsvWriter.WriteLine(line); return ExitCodes.Success; } diff --git a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs deleted file mode 100644 index b731b817..00000000 --- a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs +++ /dev/null @@ -1,33 +0,0 @@ -using Genome; -using OptimizedCore; -using VariantAnnotation.Interface.SA; -using VariantAnnotation.IO; - -namespace SAUtils.MitoHeteroplasmy -{ - public sealed class MitoHeteroplasmyItem:ISupplementaryDataItem - { - private readonly AlleleStats _stats; - public MitoHeteroplasmyItem(IChromosome chromosome, int position, string refAllele, string altAllele, AlleleStats stats) - { - Chromosome = chromosome; - Position = position; - RefAllele = refAllele; - AltAllele = altAllele; - _stats = stats; - } - - public IChromosome Chromosome { get; } - public int Position { get; set; } - public string RefAllele { get; set; } - public string AltAllele { get; set; } - public string GetJsonString() - { - var sb = StringBuilderCache.Acquire(); - var jsonObject = new JsonObject(sb); - jsonObject.AddDoubleValue("vrfMean", _stats.vrf_stats.mean, "0.######"); - jsonObject.AddDoubleValue("vrfStdev", _stats.vrf_stats.stdev, "0.######"); - return StringBuilderCache.GetStringAndRelease(sb); - } - } -} \ No newline at end of file diff --git a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs index 4c8191ba..66dadcad 100644 --- a/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs +++ b/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs @@ -1,72 +1,83 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using IO; using Newtonsoft.Json; using OptimizedCore; -using VariantAnnotation.Interface.Providers; namespace SAUtils.MitoHeteroplasmy { - public sealed class MitoHeteroplasmyParser: IDisposable + public sealed class MitoHeteroplasmyParser : IDisposable { private readonly Stream _stream; - private readonly ISequenceProvider _referenceProvider; - public MitoHeteroplasmyParser(Stream stream, ISequenceProvider referenceProvider) + public MitoHeteroplasmyParser(Stream stream) { _stream = stream; - _referenceProvider = referenceProvider; } public void Dispose() { _stream?.Dispose(); - _referenceProvider?.Dispose(); } - public IEnumerable GetItems() + public IEnumerable GetOutputLines() { - - using (var reader = FileUtilities.GetStreamReader(_stream)) + using var reader = FileUtilities.GetStreamReader(_stream); + string line; + while ((line = reader.ReadLine()) != null) { - string line; - while ((line = reader.ReadLine()) != null) - { - // Skip empty lines. - if (string.IsNullOrWhiteSpace(line)) continue; - - // Skip comments, headers - if (line.OptimizedStartsWith('#')) continue; - - var items = ExtractItems(line); - foreach (var item in items) - { - if (item == null) continue; - yield return item; - } - } - } + // Skip empty lines. + if (string.IsNullOrWhiteSpace(line)) continue; + // Skip comments, headers + if (line.OptimizedStartsWith('#')) continue; + + foreach (string item in ExtractItems(line)) + yield return item; + } } + //MT 5 6 {"C:A":{"ad":[1],"allele_type":"alt","vrf":[0.006329113924050633],"vrf_stats":{"kurtosis":241.00408163265314,"max":0.0063291139240506328,"mean":2.5728105382319646e-05,"min":0.0,"nobs":246,"skewness":15.588588185998534,"stdev":0.00040352956522996095,"variance":1.6283611001468132e-07}}} - private IEnumerable ExtractItems(string line) + private static IEnumerable ExtractItems(string line) { var splits = line.Split('\t'); - if(splits.Length < 4) yield break; - var chromosomeName = splits[0]; - if (!_referenceProvider.RefNameToChromosome.ContainsKey(chromosomeName)) yield break; + if (splits.Length < 4) yield break; - var chromosome = _referenceProvider.RefNameToChromosome[chromosomeName]; - var position = int.Parse(splits[1])+1; // since this is a bed file - var info = splits[3]; - var stats = DeserializeStats(info); + var position = int.Parse(splits[1]) + 1; // since this is a bed file + var info = splits[3]; + var stats = DeserializeStats(info); foreach ((string refAllele, string altAllele, AlleleStats alleleStats) in GetAlleleStats(stats)) { - yield return new MitoHeteroplasmyItem(chromosome, position, refAllele, altAllele, alleleStats); + (string formattedVrfs, string alleleDepths) = MergeAndSortByVrf(alleleStats); + yield return string.Join('\t', position, refAllele, altAllele, formattedVrfs, alleleDepths); } - + + } + + private static (string formattedVrfs, string alleleDepths) MergeAndSortByVrf(AlleleStats alleleStats) + { + var vrfToAd = new Dictionary(); + foreach ((string vrf, int ad) in alleleStats.vrf.Select(x => x.ToString("0.###")) + .Zip(alleleStats.ad, (a, b) => (a, b))) + { + if (vrfToAd.ContainsKey(vrf)) vrfToAd[vrf] += ad; + else vrfToAd[vrf] = ad; + } + + var formattedVrfs = new string[vrfToAd.Count]; + var alleleDepths = new int[vrfToAd.Count]; + var i = 0; + foreach (var vrf in vrfToAd.Keys.OrderBy(x => double.Parse(x))) + { + formattedVrfs[i] = vrf; + alleleDepths[i] = vrfToAd[vrf]; + i++; + } + + return (string.Join(',',formattedVrfs), string.Join(',', alleleDepths)); } private static IEnumerable<(string, string, AlleleStats)> GetAlleleStats(PositionStats stats) @@ -91,7 +102,7 @@ private IEnumerable ExtractItems(string line) public static PositionStats DeserializeStats(string s) { var charArray = s.ToCharArray(); - for (int i = 0; i < charArray.Length - 3; i++) + for (var i = 0; i < charArray.Length - 3; i++) { if (IsNucleotide(charArray[i]) && charArray[i + 1] == ':' diff --git a/SAUtils/MitoHeteroplasmy/StatClasses.cs b/SAUtils/MitoHeteroplasmy/StatClasses.cs index a5a26452..e895b786 100644 --- a/SAUtils/MitoHeteroplasmy/StatClasses.cs +++ b/SAUtils/MitoHeteroplasmy/StatClasses.cs @@ -40,12 +40,5 @@ public class AlleleStats { public int[] ad; public double[] vrf; - public VrfStats vrf_stats; - } - - public class VrfStats - { - public double mean; - public double stdev; } } \ No newline at end of file diff --git a/SAUtils/Properties/launchSettings.json b/SAUtils/Properties/launchSettings.json index 442f061d..fbaa58c5 100644 --- a/SAUtils/Properties/launchSettings.json +++ b/SAUtils/Properties/launchSettings.json @@ -153,6 +153,11 @@ "commandName": "Project", "commandLineArgs": "AaCon --ref c:\\Development\\References\\5\\Homo_sapiens.GRCh37.Nirvana.dat --cache c:\\Development\\Cache\\26\\GRCh37\\Both --scr conservationScores.tsv.gz --out c:\\Development\\SupplementaryDatabase\\test", "workingDirectory": "C:\\development\\ExternalDataSources\\MultiZ46Way\\GRCh37" + }, + "MitoHeteroplasmy_SK": { + "commandName": "Project", + "commandLineArgs": "MitoHet --in coriell_GRCh37.MT.json.bed.gz --out .", + "workingDirectory": "D:\\Nirvana\\gen_data\\MitoHeteroplasmy" } } } \ No newline at end of file diff --git a/SingleAnnotationLambda/SingleConfigExtensions.cs b/SingleAnnotationLambda/SingleConfigExtensions.cs index 6b8c854b..6033e18f 100644 --- a/SingleAnnotationLambda/SingleConfigExtensions.cs +++ b/SingleAnnotationLambda/SingleConfigExtensions.cs @@ -2,6 +2,7 @@ using Cloud.Messages.Single; using ErrorHandling.Exceptions; using Genome; +using MitoHeteroplasmy; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; @@ -51,7 +52,8 @@ private static IPosition ToPosition(string[] vcfFields, ISequenceProvider sequen SimplePosition simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter()); var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId()); - return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, variantFactory); + var mitoHeteroplasmyProvider = new MitoHeteroplasmyProvider(); + return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory); } } } diff --git a/UnitTests/EndToEndTests.cs b/UnitTests/EndToEndTests.cs index 72db46c7..18f93061 100644 --- a/UnitTests/EndToEndTests.cs +++ b/UnitTests/EndToEndTests.cs @@ -17,7 +17,7 @@ public sealed class EndToEndTests public void Annotation_RefMinor_not_annotated_when_no_SA() { const string vcfLine = "chr12 7054859 . G . 100 PASS . . ."; - var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, vcfLine); + var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, null, vcfLine); string output = annotatedPosition.GetJsonString(); Assert.Null(output); } @@ -36,7 +36,7 @@ public void Annotation_RefMinor_not_annotated_when_no_SA() [InlineData("chr12 7043410 . CT GATG 50 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7043410,\"refAllele\":\"CT\",\"altAlleles\":[\"GATG\"],\"quality\":50,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7043410:7043411:GATG\",\"chromosome\":\"chr12\",\"begin\":7043410,\"end\":7043411,\"refAllele\":\"CT\",\"altAllele\":\"GATG\",\"variantType\":\"indel\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_001007026.1:c.99_100delCTinsGATG\",\"hgvsp\":\"NP_001007027.1:p.(Ser34MetfsTer27)\",\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"429-430\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"XM_005253672.1:c.99_100delCTinsGATG\",\"hgvsp\":\"XP_005253729.1:p.(Ser34MetfsTer27)\",\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"329-330\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_001940.3:c.99_100delCTinsGATG\",\"hgvsp\":\"NP_001931.2:p.(Ser34MetfsTer27)\",\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"ENST00000356654.4:c.99_100delCTinsGATG\",\"hgvsp\":\"ENSP00000349076.3:p.(Ser34MetfsTer27)\",\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"333-334\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"ENST00000396684.2:c.99_100delCTinsGATG\",\"hgvsp\":\"ENSP00000379915.2:p.(Ser34MetfsTer27)\",\"proteinId\":\"ENSP00000379915.2\"},{\"transcript\":\"ENST00000541029.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]}]}}]}")] public void Annotate_with_SA(string vcfLine, string expectedResults) { - var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, vcfLine); + var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, null, vcfLine); string observedResults = annotatedPosition.GetJsonString(); Assert.Equal(expectedResults, observedResults); } diff --git a/UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs b/UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs new file mode 100644 index 00000000..813ad338 --- /dev/null +++ b/UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs @@ -0,0 +1,27 @@ +using MitoHeteroplasmy; +using UnitTests.TestUtilities; +using Xunit; + +namespace UnitTests.MitoHeteroplasmy +{ + public sealed class MitoHeteroplasmyProviderTests + { + [Fact] + public void GetVrfPercentiles_AsExpected() + { + var provider = new MitoHeteroplasmyProvider(); + provider.Add(1, "C", new []{0.123, 0.200, 0.301}, new []{1, 3, 4}); + provider.Add(1, "G", new[] { 0.101, 0.201}, new[] { 1, 2}); + + var chrom = ChromosomeUtilities.ChrM; + var position = 1; + var altAlleles = new[] {"C", "T",}; + + var percentilesSample1 = provider.GetVrfPercentiles("0|1", chrom, position, altAlleles, new[] { 0.2}); + var percentilesSample2 = provider.GetVrfPercentiles("0/2", chrom, position, altAlleles, new[] { 0.12, 0.421}); + + Assert.Equal(new double?[]{0.5}, percentilesSample1); + Assert.Null(percentilesSample2); + } + } +} diff --git a/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs b/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs index 6bd6a39c..49f729d7 100644 --- a/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs +++ b/UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs @@ -36,15 +36,13 @@ private static Stream GetStream() [Fact] public void ParseItems() { - using (var parser = new MitoHeteroplasmyParser(GetStream(), GetSequenceProvider())) - { - var items = parser.GetItems().ToList(); + using var parser = new MitoHeteroplasmyParser(GetStream()); + var items = parser.GetOutputLines().ToList(); - Assert.Equal(4, items.Count); + Assert.Equal(4, items.Count); - Assert.Equal("\"vrfMean\":0.000026,\"vrfStdev\":0.000404", items[0].GetJsonString()); - - } + Assert.Equal("6\tC\tA\t0.006\t1", items[0]); + Assert.Equal("8\tG\tA\t0.002,0.003,0.004\t1,2,1", items[1]); } [Fact] diff --git a/UnitTests/TestUtilities/AnnotationUtilities.cs b/UnitTests/TestUtilities/AnnotationUtilities.cs index fe50391b..b5767859 100644 --- a/UnitTests/TestUtilities/AnnotationUtilities.cs +++ b/UnitTests/TestUtilities/AnnotationUtilities.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using Genome; +using MitoHeteroplasmy; using Nirvana; using OptimizedCore; using VariantAnnotation; @@ -14,7 +15,7 @@ namespace UnitTests.TestUtilities { public static class AnnotationUtilities { - internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, List saPaths, + internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, List saPaths, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, string vcfLine) { var annotationFiles = new AnnotationFiles(); @@ -24,16 +25,16 @@ internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, var (annotator, sequenceProvider) = GetAnnotatorAndSequenceProvider(cacheFilePrefix, saPaths); var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId()); - var position = ParseVcfLine(vcfLine, refMinorProvider, sequenceProvider, variantFactory); + var position = ParseVcfLine(vcfLine, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory); var annotatedPosition = annotator.Annotate(position); return annotatedPosition; } - internal static IPosition ParseVcfLine(string vcfLine, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, VariantFactory variantFactory) + internal static IPosition ParseVcfLine(string vcfLine, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory) { var simplePosition = GetSimplePosition(vcfLine, sequenceProvider.RefNameToChromosome); - return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, variantFactory); + return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory); } internal static SimplePosition GetSimplePosition(string vcfLine, diff --git a/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs b/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs index 14ee5801..d5ca51d3 100644 --- a/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs +++ b/UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs @@ -56,7 +56,7 @@ public void GetJsonString_StrelkaSomatic() var seqProvider = ParserTestUtils.GetSequenceProvider(13813, "T", 'C', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId()); - var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, variantFactory); + var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); IVariant[] variants = GetVariants(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); diff --git a/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs b/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs index 9a2ff245..d8ef958b 100644 --- a/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs +++ b/UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs @@ -10,7 +10,7 @@ public sealed class SampleExtensionsTests public void GetJsonString_Nominal() { var sample = new Sample(new[] {23, 34}, 12.345f, 3, new[] {"-", "+"}, true, "1/2", 98, true, 56.67f, - new[] {8, 14}, new[] {7, 4}, new[] {10, 15}, 34, new[] {0.34, 0.56}, 1, 2.3, null); + new[] {8, 14}, new[] {7, 4}, new[] {10, 15}, 34, new[] {0.34, 0.56}, 1, 2.3, null, null); string observedResult = sample.GetJsonString(); diff --git a/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs b/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs index 1ee74435..4e785d41 100644 --- a/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs +++ b/UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs @@ -1,11 +1,13 @@ using System.Linq; using Vcf.Sample; using Xunit; +using static UnitTests.Vcf.Samples.TestUtilities; namespace UnitTests.Vcf.Samples.Legacy { public sealed class LegacySampleFieldExtractorTests { + [Fact] public void FormatIndicesTest() { @@ -54,7 +56,7 @@ public void AlleleDepths(string formatCol, string sampleCol, int[] expectedAllel string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Single(samples); @@ -72,7 +74,7 @@ public void AlleleDepthsMultiAllelic(string formatCol, string sampleCol, int[] e string vcfLine = $"chr1\t5592503\t.\tC\tT,A\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 2); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(2), null); Assert.Single(samples); @@ -91,7 +93,7 @@ public void FailedFilter(string sampleCol, bool? expectedFailedFilter) string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\tGT:GQ:GQX:DP:DPF:FT\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(2), null); Assert.Single(samples); @@ -111,7 +113,7 @@ public void Genotype(string sampleCol, string expectedGenotype) string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\tGT:GQ:GQX:DP:DPF:AD\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null); Assert.Single(samples); @@ -133,7 +135,7 @@ public void GenotypeQuality(string formatCol, string sampleCol, int? expectedGen string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Single(samples); @@ -157,7 +159,7 @@ public void TotalDepth(string formatCol, string sampleCol, int? expectedTotalDep string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Single(samples); @@ -173,7 +175,7 @@ public void PiscesTotalDepth() "chr1\t115251293\t.\tGA\tG\t100\tSB;LowVariantFreq\tDP=7882\tGT:GQ:AD:VF:NL:SB:GQX\t0/1:100:7588,294:0:20:-100.0000:100"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null); var sample = samples[0]; var observedTotalDepth = sample.TotalDepth; @@ -196,7 +198,7 @@ public void VariantFrequency_Nominal(string altAllele, string formatCol, string string vcfLine = $"chr1\t5592503\t.\tC\t{altAllele}\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Single(samples); @@ -220,7 +222,7 @@ public void VariantFrequency_ReturnNull(string refAllele, string altAllele, stri var vcfLine = $"chr1\t5592503\t.\t{refAllele}\t{altAllele}\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(altAllele.Split(',').Length), null); Assert.Single(samples); var sample = samples[0]; @@ -238,7 +240,7 @@ public void Leftout_fields_return_null(string formatCol, string sampleCol, strin var vcfLine = $"chr1\t5592503\t.\tA\tC\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null); Assert.Single(samples); var sample = samples[0]; @@ -271,7 +273,7 @@ public void MajorChromosomeCopyTest() const string vcfLine = "1 9314202 Canvas:GAIN:1:9314202:9404148 N 36 PASS SVTYPE=CNV;END=9404148;ensembl_gene_id=ENSG00000049239,ENSG00000252841,ENSG00000171621 RC:BC:CN:MCC . 151:108:6:4"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null); Assert.Equal(2, samples.Length); @@ -288,7 +290,7 @@ public void EmptySamples() const string vcfLine = "chrX 2735147 . G A 38.25 VQSRTrancheSNP99.90to100.00 AC=3;AF=0.500;AN=6;BaseQRankSum=-0.602;DP=56;Dels=0.00;FS=30.019;HaplotypeScore=7.7259;MLEAC=3;MLEAF=0.500;MQ=41.18;MQ0=0;MQRankSum=0.098;QD=1.06;ReadPosRankSum=0.266;SB=-8.681e-03;VQSLOD=-6.0901;culprit=QD GT:AD:DP:GQ:PL 0:7,0:7:3:0,3,39 ./. 0/1:14,3:17:35:35,0,35 1/1:9,10:19:3:41,3,0"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Equal(4, samples.Length); @@ -314,7 +316,7 @@ public void VariantFrequencyNan(string formatCol, string sampleCol, string expec var vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Single(samples); @@ -337,7 +339,7 @@ public void SplitReadCounts() var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Equal(2, samples.Length); var sample1 = samples[0]; @@ -356,7 +358,7 @@ public void EmptySample() var vcfColumns = vcfLine.Split('\t'); - var samples = vcfColumns.ToSamples(new FormatIndices(), 1); + var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null); Assert.Single(samples); var sample = samples[0]; diff --git a/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs b/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs index 183bdd97..4174f394 100644 --- a/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs +++ b/UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs @@ -1,6 +1,10 @@ -using VariantAnnotation.Interface.Positions; +using MitoHeteroplasmy; +using UnitTests.TestUtilities; +using VariantAnnotation.Interface.Positions; +using Vcf; using Vcf.Sample; using Xunit; +using static UnitTests.Vcf.Samples.TestUtilities; namespace UnitTests.Vcf.Samples { @@ -24,15 +28,15 @@ public void ExtractSample_PEPE() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:GQ:AD:DP:VF:NL:SB:NC:US:AQ:LQ"); - var sample = SampleFieldExtractor.ExtractSample("0/1:5:338,1:339:0.00295:30:-7.3191:0.0314:0,0,0,1,0,0,17,1,129,21,148,22:3.366:0.000", formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample("0/1:5:338,1:339:0.00295:30:-7.3191:0.0314:0,0,0,1,0,0,17,1,129,21,148,22:3.366:0.000", formatIndices, GetSimplePositionUsingAlleleNum(1), null); - Assert.Equal("0/1", sample.Genotype); - Assert.Equal(5, sample.GenotypeQuality); - Assert.Equal(new[] { 338, 1 }, sample.AlleleDepths); - Assert.Equal(339, sample.TotalDepth); + Assert.Equal("0/1", sample.Genotype); + Assert.Equal(5, sample.GenotypeQuality); + Assert.Equal(new[] { 338, 1 }, sample.AlleleDepths); + Assert.Equal(339, sample.TotalDepth); Assert.Equal(new[] { 0.00295 }, sample.VariantFrequencies); - Assert.Equal(3.366f, sample.ArtifactAdjustedQualityScore); - Assert.Equal(0.000f, sample.LikelihoodRatioQualityScore); + Assert.Equal(3.366f, sample.ArtifactAdjustedQualityScore); + Assert.Equal(0.000f, sample.LikelihoodRatioQualityScore); } [Fact] @@ -40,13 +44,13 @@ public void ExtractSample_DragenSomatic_AsExpected() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB:PS"); - var sample = SampleFieldExtractor.ExtractSample("0|1:3.96:33,8:0.195:13,6:20,2:41:17,16,4,4:13,20,4,4:534234", formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample("0|1:3.96:33,8:0.195:13,6:20,2:41:17,16,4,4:13,20,4,4:534234", formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.Equal("0|1", sample.Genotype); Assert.Equal(3.96, sample.SomaticQuality); Assert.Equal(new[] { 33, 8 }, sample.AlleleDepths); Assert.Equal(41, sample.TotalDepth); - Assert.Equal(new[] { 8/41.0 }, sample.VariantFrequencies); + Assert.Equal(new[] { 8 / 41.0 }, sample.VariantFrequencies); } [Fact] @@ -54,7 +58,7 @@ public void ExtractSample_DragenCNV_AsExpected() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:CN:MCN"); - var sample = SampleFieldExtractor.ExtractSample("0|1:3:1", formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample("0|1:3:1", formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.Equal("0|1", sample.Genotype); Assert.Equal(3, sample.CopyNumber); @@ -68,7 +72,7 @@ public void ExtractSample_DragenCNV_MCN_LOH(string formatField, string sampleFie { var formatIndices = new FormatIndices(); formatIndices.Set(formatField); - var sample = SampleFieldExtractor.ExtractSample(sampleField, formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample(sampleField, formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.True(sample.IsLossOfHeterozygosity); } @@ -78,7 +82,7 @@ public void ExtractSample_ExpansionHunter() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC"); - var sample = SampleFieldExtractor.ExtractSample("1/1:SPANNING/SPANNING:15/15:15-15/15-15:22/22:23/23:0/0:38.270270", formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample("1/1:SPANNING/SPANNING:15/15:15-15/15-15:22/22:23/23:0/0:38.270270", formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.Equal("1/1", sample.Genotype); Assert.Equal(new[] { 15, 15 }, sample.RepeatUnitCounts); @@ -88,7 +92,7 @@ public void ExtractSample_ExpansionHunter() public void ExtractSample_EmptySampleColumn_ReturnEmptySample() { var formatIndices = new FormatIndices(); - var sample = SampleFieldExtractor.ExtractSample(null, formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample(null, formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.True(sample.IsEmpty); } @@ -96,7 +100,7 @@ public void ExtractSample_EmptySampleColumn_ReturnEmptySample() public void ExtractSample_DotInSampleColumn_ReturnEmptySample() { var formatIndices = new FormatIndices(); - var sample = SampleFieldExtractor.ExtractSample(".", formatIndices, 1); + var sample = SampleFieldExtractor.ExtractSample(".", formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.True(sample.IsEmpty); } @@ -129,7 +133,7 @@ public void ToSamples_SMN1_CNV() "./1:.:.:.:.:1.26335:3:4:6:cnvLength:Inherited" }; - ISample[] samples = cols.ToSamples(formatIndices, 1); + ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.Equal(4, samples.Length); @@ -166,8 +170,25 @@ public void ToSamples_TooFewVcfColumns_ReturnNull() "SVTYPE=CNV;END=125075279;REFLEN=6510" }; - ISample[] samples = cols.ToSamples(formatIndices, 1); + ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null); Assert.Null(samples); } + + [Fact] + public void ExtractSample_MitoHeteroplasmy_AsExpected() + { + var provider = new MitoHeteroplasmyProvider(); + provider.Add(1, "C", new[] { 0.123, 0.200, 0.301 }, new[] { 1, 3, 4 }); + provider.Add(1, "G", new[] { 0.101, 0.201 }, new[] { 1, 2 }); + + var simplePosition = new SimplePosition(ChromosomeUtilities.ChrM, 1, "A", new[] { "C", "T"}); + + var formatIndices = new FormatIndices(); + formatIndices.Set("GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB:PS"); + var sample = SampleFieldExtractor.ExtractSample("1|2:3.96:0,15,85:0.195:13,6:20,2:100:17,16,4,4:13,20,4,4:534234", formatIndices, simplePosition, provider); + + Assert.Equal(new[] { 15 / 100.0, 85 / 100.0 }, sample.VariantFrequencies); + Assert.Equal(new[] { "0.13", null }, sample.HeteroplasmyPercentile); + } } } \ No newline at end of file diff --git a/UnitTests/Vcf/Samples/SampleTests.cs b/UnitTests/Vcf/Samples/SampleTests.cs index 9ffdc164..e5316eca 100644 --- a/UnitTests/Vcf/Samples/SampleTests.cs +++ b/UnitTests/Vcf/Samples/SampleTests.cs @@ -9,7 +9,7 @@ public sealed class SampleTests public void Sample_ReturnEmpty() { var emptySample = new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, - null, null, null, null); + null, null, null, null, null); Assert.True(emptySample.IsEmpty); Assert.True(Sample.EmptySample.IsEmpty); diff --git a/UnitTests/Vcf/Samples/TestUtilities.cs b/UnitTests/Vcf/Samples/TestUtilities.cs new file mode 100644 index 00000000..c1928f29 --- /dev/null +++ b/UnitTests/Vcf/Samples/TestUtilities.cs @@ -0,0 +1,18 @@ +using Moq; +using VariantAnnotation.Interface.Positions; + +namespace UnitTests.Vcf.Samples +{ + public static class TestUtilities + { + public static ISimplePosition GetSimplePositionUsingAlleleNum(int numAlleles) + { + var mock = new Mock(); + mock.SetupGet(x => x.AltAlleles).Returns(new string[numAlleles]); + mock.SetupGet(x => x.Start).Returns(-1); + + return mock.Object; + } + + } +} \ No newline at end of file diff --git a/UnitTests/Vcf/VcfReaderTests.cs b/UnitTests/Vcf/VcfReaderTests.cs index de069152..abef124c 100644 --- a/UnitTests/Vcf/VcfReaderTests.cs +++ b/UnitTests/Vcf/VcfReaderTests.cs @@ -39,7 +39,7 @@ public void ValidateVcfHeader_ExceptionThrown_NoFileFormat() AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); var reader = FileUtilities.GetStreamReader(_ms); - Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)); + Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)); } [Fact] @@ -49,7 +49,7 @@ public void ValidateVcfHeader_ExceptionThrown_NoChromHeaderLine() AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); var reader = FileUtilities.GetStreamReader(_ms); - Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)); + Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)); } [Fact] @@ -61,7 +61,7 @@ public void Sample_names_are_reported() var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)) { samples = vcfReader.GetSampleNames(); } @@ -97,7 +97,7 @@ public void CheckContigId_IncorrectAutoAndSexChromLength_ThrowException(string c var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)); + Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)); } [Theory] @@ -110,7 +110,7 @@ public void CheckContigId_InferredAssemblyIsUnknown_GivenIrregularChrom(string c var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)) { Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly); } @@ -125,7 +125,7 @@ public void CheckContigId_IsRcrsMitochondrionTrue_InferredAssemblyIsUnknown_Give AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)) { Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly); Assert.True(vcfReader.IsRcrsMitochondrion); @@ -142,7 +142,7 @@ public void CheckContigId_IsRcrsMitochondrionFalse_InferredAssemblyIsUnknown_Giv var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, null, new NullVcfFilter(), _vidCreator, null)) { Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly); Assert.False(vcfReader.IsRcrsMitochondrion); @@ -176,7 +176,7 @@ public void GetNextPosition() IPosition observedResult; using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator, null)) { observedResult = vcfReader.GetNextPosition(); } @@ -209,7 +209,7 @@ public void CheckSampleConsistency_oneSample() var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator, null)) { //first line is valid. So, no exception Assert.NotNull(vcfReader.GetNextPosition()); @@ -236,7 +236,7 @@ public void CheckSampleConsistency_noSample() var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) - using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator)) + using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullRecomposer(), new NullVcfFilter(), _vidCreator, null)) { //first line is valid. So, no exception Assert.NotNull(vcfReader.GetNextPosition()); diff --git a/UnitTests/Vcf/VcfReaderUtilsTests.cs b/UnitTests/Vcf/VcfReaderUtilsTests.cs index f88eed8e..ffde5ec4 100644 --- a/UnitTests/Vcf/VcfReaderUtilsTests.cs +++ b/UnitTests/Vcf/VcfReaderUtilsTests.cs @@ -92,7 +92,7 @@ public void Test_crash_caused_by_variant_trimming () var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator); - var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, seqProvider, variantFactory); + var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, seqProvider, null, variantFactory); var annotatedVariants1 = Annotator.GetAnnotatedVariants(position1.Variants); @@ -115,7 +115,7 @@ public void ParseVcfLine_line_with_only_NonRef_is_refMinor() ParserTestUtils.GetSequenceProvider(10628385, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator); - var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, variantFactory); + var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); var annotatedVariants = Annotator.GetAnnotatedVariants(position.Variants); Assert.Equal("C", position.RefAllele); @@ -137,7 +137,7 @@ public void ParseVcfLine_line_with_only_NonRef_is_not_refMinor() ParserTestUtils.GetSequenceProvider(10005, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator); - var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, variantFactory); + var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); var annotatedVariants = Annotator.GetAnnotatedVariants(position.Variants); Assert.Equal("C", position.RefAllele); diff --git a/VariantAnnotation.Interface/Positions/ISample.cs b/VariantAnnotation.Interface/Positions/ISample.cs index 746d6597..adde96d1 100644 --- a/VariantAnnotation.Interface/Positions/ISample.cs +++ b/VariantAnnotation.Interface/Positions/ISample.cs @@ -20,6 +20,6 @@ public interface ISample int? MinorHaplotypeCopyNumber { get; } double? SomaticQuality { get; } bool? IsLossOfHeterozygosity { get; } - + string[] HeteroplasmyPercentile { get; } } } \ No newline at end of file diff --git a/VariantAnnotation/IO/SampleExtensions.cs b/VariantAnnotation/IO/SampleExtensions.cs index c77b5ff6..f17942d0 100644 --- a/VariantAnnotation/IO/SampleExtensions.cs +++ b/VariantAnnotation/IO/SampleExtensions.cs @@ -32,7 +32,8 @@ public static string GetJsonString(this ISample sample) if (sample.IsLossOfHeterozygosity.HasValue) jsonObject.AddBoolValue("lossOfHeterozygosity", sample.IsLossOfHeterozygosity.Value); jsonObject.AddDoubleValue("somaticQuality", sample.SomaticQuality, "0.#"); - + jsonObject.AddStringValues("heteroplasmyPercentile", sample.HeteroplasmyPercentile); + sb.Append(JsonObject.CloseBrace); return StringBuilderCache.GetStringAndRelease(sb); } diff --git a/Vcf/Position.cs b/Vcf/Position.cs index 87634fbb..b41fd8a3 100644 --- a/Vcf/Position.cs +++ b/Vcf/Position.cs @@ -8,6 +8,7 @@ using Vcf.Info; using Vcf.Sample; using Vcf.VariantCreator; +using MitoHeteroplasmy; namespace Vcf { @@ -69,7 +70,7 @@ private static (bool HasStructuralVariant, bool HasShortTandemRepeat) CheckVaria return (hasStructuralVariant, hasShortTandemRepeat); } - public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, VariantFactory variantFactory) + public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory) { if (simplePosition == null) return null; @@ -91,7 +92,7 @@ public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProv int end = ExtractEnd(infoData, simplePosition.Start, simplePosition.RefAllele.Length); double? quality = vcfFields[VcfCommon.QualIndex].GetNullableValue(double.TryParse); string[] filters = vcfFields[VcfCommon.FilterIndex].OptimizedSplit(';'); - ISample[] samples = vcfFields.ToSamples(variantFactory.FormatIndices, altAlleles.Length); + ISample[] samples = vcfFields.ToSamples(variantFactory.FormatIndices, simplePosition, mitoHeteroplasmyProvider); IVariant[] variants = variantFactory.CreateVariants(simplePosition.Chromosome, simplePosition.Start, end, simplePosition.RefAllele, altAlleles, infoData, simplePosition.IsDecomposed, diff --git a/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs b/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs index eac71bdd..1343570d 100644 --- a/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs +++ b/Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs @@ -59,7 +59,7 @@ internal ISample ExtractSample(string sampleColumn) var sample = new Sample(alleleDepths, sampleFields.AQ, sampleFields.CopyNumber, sampleFields.DST, failedFilter, genotype, genotypeQuality, false, sampleFields.LQ, pairEndReadCounts, null, splitReadCounts, - totalDepth, variantFrequencies, null, null, isLossOfHeterozygosity); + totalDepth, variantFrequencies, null, null, isLossOfHeterozygosity, null); return sample; } diff --git a/Vcf/Sample/Sample.cs b/Vcf/Sample/Sample.cs index af211908..e2405e07 100644 --- a/Vcf/Sample/Sample.cs +++ b/Vcf/Sample/Sample.cs @@ -22,14 +22,15 @@ public sealed class Sample : ISample public int? MinorHaplotypeCopyNumber { get; } public double? SomaticQuality { get; } public bool? IsLossOfHeterozygosity { get; } + public string[] HeteroplasmyPercentile { get; } public static readonly Sample EmptySample = - new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, null, null, null, null); + new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, null, null, null, null, null); public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copyNumber, string[] diseaseAffectedStatuses, bool failedFilter, string genotype, int? genotypeQuality, bool isDeNovo, float? likelihoodRatioQualityScore, int[] pairedEndReadCounts, int[] repeatUnitCounts, - int[] splitReadCounts, int? totalDepth, double[] variantFrequencies, int? minorHaplotypeCopyNumber, double? somaticQuality, bool? isLossOfHeterozygosity) + int[] splitReadCounts, int? totalDepth, double[] variantFrequencies, int? minorHaplotypeCopyNumber, double? somaticQuality, bool? isLossOfHeterozygosity, string[] heteroplasmyPercentile) { AlleleDepths = alleleDepths; ArtifactAdjustedQualityScore = artifactAdjustedQualityScore; @@ -46,6 +47,7 @@ public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copy TotalDepth = totalDepth; VariantFrequencies = variantFrequencies; IsLossOfHeterozygosity = isLossOfHeterozygosity; + HeteroplasmyPercentile = heteroplasmyPercentile; MinorHaplotypeCopyNumber = minorHaplotypeCopyNumber; SomaticQuality = somaticQuality; @@ -64,6 +66,7 @@ public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copy IsLossOfHeterozygosity == null && MinorHaplotypeCopyNumber == null && SomaticQuality == null && + HeteroplasmyPercentile == null && !FailedFilter && !IsDeNovo; } diff --git a/Vcf/Sample/SampleFieldExtractor.cs b/Vcf/Sample/SampleFieldExtractor.cs index ca9dd74f..08264f6b 100644 --- a/Vcf/Sample/SampleFieldExtractor.cs +++ b/Vcf/Sample/SampleFieldExtractor.cs @@ -1,4 +1,6 @@ -using OptimizedCore; +using System.Linq; +using MitoHeteroplasmy; +using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using Vcf.Sample.Legacy; @@ -7,7 +9,7 @@ namespace Vcf.Sample { internal static class SampleFieldExtractor { - internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices formatIndices, int numAltAlleles) + internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices formatIndices, ISimplePosition simplePosition, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider) { if (vcfColumns.Length < VcfCommon.MinNumColumnsSampleGenotypes) return null; @@ -20,14 +22,14 @@ internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices form for (int index = VcfCommon.GenotypeIndex; index < vcfColumns.Length; index++) { - samples[index - VcfCommon.GenotypeIndex] = ExtractSample(vcfColumns[index], formatIndices, numAltAlleles, legacySampleExtractor); + samples[index - VcfCommon.GenotypeIndex] = ExtractSample(vcfColumns[index], formatIndices, simplePosition, mitoHeteroplasmyProvider, legacySampleExtractor); } return samples; } - internal static ISample ExtractSample(string sampleColumn, FormatIndices formatIndices, int numAltAlleles, - LegacySampleFieldExtractor legacyExtractor = null) + internal static ISample ExtractSample(string sampleColumn, FormatIndices formatIndices, ISimplePosition simplePosition, + IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, LegacySampleFieldExtractor legacyExtractor = null) { // sanity check: make sure we have a format column if (string.IsNullOrEmpty(sampleColumn)) return Sample.EmptySample; @@ -59,13 +61,15 @@ internal static ISample ExtractSample(string sampleColumn, FormatIndices formatI int? minorHaplotypeCopyNumber = sampleColumns.GetString(formatIndices.MCN).GetInteger(); double? somaticQuality = sampleColumns.GetString(formatIndices.SQ).GetDouble(); - double[] variantFrequencies = VariantFrequency.GetVariantFrequencies(variantFrequency, alleleDepths, numAltAlleles); + double[] variantFrequencies = VariantFrequency.GetVariantFrequencies(variantFrequency, alleleDepths, simplePosition.AltAlleles.Length); + string[] mitoHeteroplasmyPercentiles = mitoHeteroplasmyProvider?.GetVrfPercentiles(genotype, simplePosition.Chromosome, simplePosition.Start, + simplePosition.AltAlleles, variantFrequencies)?.Select(x => x?.ToString("0.##")).ToArray(); var isLoh = GetLoh(copyNumber, minorHaplotypeCopyNumber, genotype); var sample = new Sample(alleleDepths, artifactAdjustedQualityScore, copyNumber, diseaseAffectedStatuses, failedFilter, genotype, genotypeQuality, isDeNovo, likelihoodRatioQualityScore, pairedEndReadCounts, - repeatUnitCounts, splitReadCounts, totalDepth, variantFrequencies, minorHaplotypeCopyNumber, somaticQuality, isLoh); + repeatUnitCounts, splitReadCounts, totalDepth, variantFrequencies, minorHaplotypeCopyNumber, somaticQuality, isLoh, mitoHeteroplasmyPercentiles); return sample; } diff --git a/Vcf/SimplePosition.cs b/Vcf/SimplePosition.cs index 02833935..e5b6641b 100644 --- a/Vcf/SimplePosition.cs +++ b/Vcf/SimplePosition.cs @@ -19,7 +19,7 @@ public sealed class SimplePosition : ISimplePosition public string[] Vids { get; private set; } public List[] LinkedVids { get; private set; } - private SimplePosition(IChromosome chromosome, int start, string refAllele, string[] altAlleles) + internal SimplePosition(IChromosome chromosome, int start, string refAllele, string[] altAlleles) { Chromosome = chromosome; Start = start; diff --git a/Vcf/Vcf.csproj b/Vcf/Vcf.csproj index bddce04b..8a165267 100644 --- a/Vcf/Vcf.csproj +++ b/Vcf/Vcf.csproj @@ -9,5 +9,6 @@ + \ No newline at end of file diff --git a/Vcf/VcfReader.cs b/Vcf/VcfReader.cs index dafcefb4..c71e438e 100644 --- a/Vcf/VcfReader.cs +++ b/Vcf/VcfReader.cs @@ -3,6 +3,7 @@ using System.IO; using ErrorHandling.Exceptions; using Genome; +using MitoHeteroplasmy; using OptimizedCore; using VariantAnnotation.Interface; using VariantAnnotation.Interface.IO; @@ -23,6 +24,7 @@ public sealed class VcfReader : IVcfReader private readonly ISequenceProvider _sequenceProvider; private readonly IDictionary _refNameToChromosome; private readonly IVcfFilter _vcfFilter; + private readonly IMitoHeteroplasmyProvider _mitoHeteroplasmyProvider; public bool IsRcrsMitochondrion { get; private set; } public string VcfLine { get; private set; } public GenomeAssembly InferredGenomeAssembly { get; private set; } = GenomeAssembly.Unknown; @@ -37,21 +39,22 @@ public sealed class VcfReader : IVcfReader public string[] GetSampleNames() => _sampleNames; private VcfReader(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider, - IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator) + IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider) { - _headerReader = headerReader; - _reader = vcfLineReader; - _variantFactory = new VariantFactory(sequenceProvider.Sequence, vidCreator); - _sequenceProvider = sequenceProvider; - _refMinorProvider = refMinorProvider; - _vcfFilter = vcfFilter; - _refNameToChromosome = sequenceProvider.RefNameToChromosome; + _headerReader = headerReader; + _reader = vcfLineReader; + _variantFactory = new VariantFactory(sequenceProvider.Sequence, vidCreator); + _sequenceProvider = sequenceProvider; + _refMinorProvider = refMinorProvider; + _vcfFilter = vcfFilter; + _refNameToChromosome = sequenceProvider.RefNameToChromosome; + _mitoHeteroplasmyProvider = mitoHeteroplasmyProvider; } public static VcfReader Create(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider, - IRefMinorProvider refMinorProvider, IRecomposer recomposer, IVcfFilter vcfFilter, IVariantIdCreator vidCreator) + IRefMinorProvider refMinorProvider, IRecomposer recomposer, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider) { - var vcfReader = new VcfReader(headerReader, vcfLineReader, sequenceProvider, refMinorProvider, vcfFilter, vidCreator); + var vcfReader = new VcfReader(headerReader, vcfLineReader, sequenceProvider, refMinorProvider, vcfFilter, vidCreator, mitoHeteroplasmyProvider); vcfReader.ParseHeader(); vcfReader.SetRecomposer(recomposer); return vcfReader; @@ -187,7 +190,7 @@ private void CheckVcfOrder(string referenceName) _currentReferenceName = referenceName; } - public IPosition GetNextPosition() => Position.ToPosition(GetNextSimplePosition(), _refMinorProvider, _sequenceProvider, _variantFactory); + public IPosition GetNextPosition() => Position.ToPosition(GetNextSimplePosition(), _refMinorProvider, _sequenceProvider, _mitoHeteroplasmyProvider, _variantFactory); public void Dispose() => _reader?.Dispose(); }