Skip to content

Commit

Permalink
Feature/heteroplasmy tsv 4696 (#524)
Browse files Browse the repository at this point in the history
* Make the TSV output

* Initial implementation of the sample level heteroplasmy provider

* output null in the JSON output if heteroplasmyPercentile is not available

* Fix broken unit tests

* Add proper unit tests

* Bugfix

* Use gzipped file to reduce the size of compiled file

* format change

* Fix broken unit tests
  • Loading branch information
shulik7 authored and rajatshuvro committed May 5, 2020
1 parent c48c1e8 commit a5a16f7
Show file tree
Hide file tree
Showing 37 changed files with 390 additions and 196 deletions.
9 changes: 9 additions & 0 deletions MitoHeteroplasmy/IMitoHeteroplasmyProvider.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
using Genome;

namespace MitoHeteroplasmy
{
public interface IMitoHeteroplasmyProvider
{
double?[] GetVrfPercentiles(string genotype, IChromosome chrome, int position, string[] altAllele, double[] vrfs);
}
}
15 changes: 15 additions & 0 deletions MitoHeteroplasmy/MitoHeteroplasmy.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>
<LangVersion>latest</LangVersion>
</PropertyGroup>
<ItemGroup>
<EmbeddedResource Include="Resources\MitoHeteroplasmy.tsv.gz" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\OptimizedCore\OptimizedCore.csproj" />
<ProjectReference Include="..\VariantAnnotation.Interface\VariantAnnotation.Interface.csproj" />
</ItemGroup>

</Project>
66 changes: 66 additions & 0 deletions MitoHeteroplasmy/MitoHeteroplasmyProvider.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Genome;

namespace MitoHeteroplasmy
{
public sealed class MitoHeteroplasmyProvider : IMitoHeteroplasmyProvider
{
private const string MitoChromUcscName = "chrM";
private static readonly Dictionary<string, int> AlleleToInt = new Dictionary<string, int> { { "A", 0 }, { "C", 1 }, { "G", 2 }, { "T", 3 } };
private const int SequenceLengthMax = int.MaxValue / 4;

private readonly Dictionary<int, (int[] Vrfs, int[] AlleleDepths)> _alleleToVrf = new Dictionary<int, (int[], int[])>();

public void Add(int position, string altAllele, IEnumerable<double> vrfs, int[] alleleDepths)
{
var vrfsInt = vrfs.Select(ToIntVrfForm).ToArray();
_alleleToVrf[EncodeMitoPositionAndAltAllele(position, altAllele)] = (vrfsInt, alleleDepths);
}

public double?[] GetVrfPercentiles(string genotypes, IChromosome chrom, int position, string[] altAlleles, double[] vrfs)
{
if (vrfs == null) return null;
if (chrom.UcscName != MitoChromUcscName) return null;

var sampleAlleles = genotypes.Split('|', '/').Where(x => x != "0")
.Select(x => GetAlleleByGenotype(x, altAlleles));

var percentiles = vrfs.Zip(sampleAlleles, (vrf, allele) => GetVrfPercentile(position, allele, vrf)).ToArray();
return percentiles.All(x => x == null) ? null : percentiles;
}

private static string GetAlleleByGenotype(string genotypeIndex, string[] altAlleles) => altAlleles[int.Parse(genotypeIndex) - 1];

private double? GetVrfPercentile(int position, string altAllele, double vrf)
{
if (string.IsNullOrEmpty(altAllele)) return null;

var positionAndAltAlleleIntForm = EncodeMitoPositionAndAltAllele(position, altAllele);

if (!_alleleToVrf.TryGetValue(positionAndAltAlleleIntForm, out (int[] Vrfs, int[] AlleleDepths) data)) return null;

var scaledVrf = vrf * 1000;
int nearestBiggerVrfIndex;
for (nearestBiggerVrfIndex = 0; nearestBiggerVrfIndex < data.Vrfs.Length; nearestBiggerVrfIndex++)
{
if (data.Vrfs[nearestBiggerVrfIndex] > scaledVrf) break;
}

var numSmallerOrEqualAlleles = 0.0;
var numAllAlleles = 0;
for (var i = 0; i < data.AlleleDepths.Length; i++)
{
if (i < nearestBiggerVrfIndex) numSmallerOrEqualAlleles += data.AlleleDepths[i];
numAllAlleles += data.AlleleDepths[i];
}

return numSmallerOrEqualAlleles / numAllAlleles;
}

private static int ToIntVrfForm(double vrf) => Convert.ToInt32(vrf * 1000);

private static int EncodeMitoPositionAndAltAllele(int position, string altAllele) => SequenceLengthMax * AlleleToInt[altAllele] + position;
}
}
44 changes: 44 additions & 0 deletions MitoHeteroplasmy/MitoHeteroplasmyReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Reflection;
using OptimizedCore;

namespace MitoHeteroplasmy
{
public static class MitoHeteroplasmyReader
{
private const int PositionIndex = 0;
private const int AltIndex = 2;
private const int VrfIndex = 3;
private const int AlleleDepthIndex = 4;

private const string ResourceName = "MitoHeteroplasmy.Resources.MitoHeteroplasmy.tsv.gz";
public static MitoHeteroplasmyProvider GetData()
{
var assembly = Assembly.GetExecutingAssembly();
using var stream = assembly.GetManifestResourceStream(ResourceName);
if (stream == null) throw new NullReferenceException("Unable to read from the Mitochondrial Heteroplasmy file");

using var gzStream = new GZipStream(stream, CompressionMode.Decompress);
using var reader = new StreamReader(gzStream);
reader.ReadLine();

var mitoHeteroplasmyData = new MitoHeteroplasmyProvider();
while (true)
{
string line = reader.ReadLine();
if (line == null) break;

var fields = line.OptimizedSplit('\t');
var position = int.Parse(fields[PositionIndex]);
var vrfs = fields[VrfIndex].Split(',').Select(double.Parse);
var alleleDepths = fields[AlleleDepthIndex].Split(',').Select(int.Parse).ToArray();
mitoHeteroplasmyData.Add(position, fields[AltIndex], vrfs, alleleDepths);
}

return mitoHeteroplasmyData;
}
}
}
Binary file not shown.
12 changes: 9 additions & 3 deletions Nirvana.sln
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tabix", "Tabix\Tabix.csproj
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cloud", "Cloud\Cloud.csproj", "{E93914C8-2599-46BE-BE18-6229E53F581B}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Downloader", "Downloader\Downloader.csproj", "{5B81B762-8A86-466A-A947-AC2CA53EE40D}"
EndProject
Expand All @@ -58,12 +58,14 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NirvanaLambda", "NirvanaLam
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SingleAnnotationLambda", "SingleAnnotationLambda\SingleAnnotationLambda.csproj", "{C9B4E16E-FF30-4CE0-A617-F833696FBE10}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CustomStrValidationLambda", "CustomStrValidationLambda\CustomStrValidationLambda.csproj", "{F3E60E51-EE07-4768-8EC3-E3A323DFA547}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MitoHeteroplasmy", "MitoHeteroplasmy\MitoHeteroplasmy.csproj", "{387E4C8D-6A27-40DE-A305-F3F047B8D865}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -186,6 +188,10 @@ Global
{F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.Build.0 = Release|Any CPU
{387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.Build.0 = Debug|Any CPU
{387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.ActiveCfg = Release|Any CPU
{387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
3 changes: 2 additions & 1 deletion Nirvana.sln.DotSettings
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,5 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=Translocations/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=ucsc/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=uncompressing/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Vids/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Vids/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Vrfs/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
5 changes: 5 additions & 0 deletions Nirvana/Properties/launchSettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,11 @@
"commandName": "Project",
"commandLineArgs": "--cache /Users/rroy1/development/Cache\\26\\GRCh38\\Both --sd /Users/rroy1/development/SupplementaryDatabase\\56\\GRCh38 --ref /Users/rroy1/development/References/6/Homo_sapiens.GRCh38.Nirvana.dat --in DQ-Strelka-Germline-chr22-hg38.vcf.gz --out dq",
"workingDirectory": "/Users/rroy1/development/TestDatasets"
},
"SK MitoHeteroplasmy": {
"commandName": "Project",
"commandLineArgs": "-i debug.vcf -c D:\\Nirvana\\Development\\Cache\\26\\GRCh38\\Both -r D:\\Nirvana\\Development\\References\\Homo_sapiens.GRCh38.Nirvana.dat -o debug",
"workingDirectory": "D:\\Nirvana\\test_runs\\run_Nirvana"
}
}
}
13 changes: 7 additions & 6 deletions Nirvana/StreamAnnotation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using ErrorHandling.Exceptions;
using Genome;
using IO;
using MitoHeteroplasmy;
using VariantAnnotation;
using VariantAnnotation.Interface;
using VariantAnnotation.Interface.IO;
Expand All @@ -23,10 +24,10 @@ public static ExitCodes Annotate(Stream headerStream, Stream inputVcfStream, Str
var metrics = annotationResources.Metrics;
PerformanceMetrics.ShowAnnotationHeader();

IChromosome currentChromosome = new EmptyChromosome("dummy");
int numVariants = 0;

using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter))
IChromosome currentChromosome = new EmptyChromosome("dummy");
int numVariants = 0;
IMitoHeteroplasmyProvider mitoHeteroplasmyProvider = MitoHeteroplasmyReader.GetData();
using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter, mitoHeteroplasmyProvider))
using (var jsonWriter = new JsonWriter(outputJsonStream, outputJsonIndexStream, annotationResources, Date.CurrentTimeStamp, vcfReader.GetSampleNames(), false))
{
try
Expand Down Expand Up @@ -97,7 +98,7 @@ private static void SetMitochondrialAnnotationBehavior(IAnnotationResources anno
}

private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAnnotationResources annotationResources,
IVcfFilter vcfFilter)
IVcfFilter vcfFilter, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
var vcfReader = FileUtilities.GetStreamReader(vcfStream);

Expand All @@ -111,7 +112,7 @@ private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAn
}

return VcfReader.Create(headerReader, vcfReader, annotationResources.SequenceProvider,
annotationResources.RefMinorProvider, annotationResources.Recomposer, vcfFilter, annotationResources.VidCreator);
annotationResources.RefMinorProvider, annotationResources.Recomposer, vcfFilter, annotationResources.VidCreator, mitoHeteroplasmyProvider);
}
}
}
1 change: 1 addition & 0 deletions RepeatExpansions/RepeatExpansionProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using Intervals;
using IO;
using RepeatExpansions.IO;
using VariantAnnotation.Interface;
using VariantAnnotation.Interface.AnnotatedPositions;
using Variants;

Expand Down
30 changes: 9 additions & 21 deletions SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,21 @@
using ErrorHandling;
using IO;
using SAUtils.InputFileParsers;
using VariantAnnotation.Providers;
using VariantAnnotation.SA;

namespace SAUtils.MitoHeteroplasmy
{
public static class MitoHeteroplasmyDb
{
private static string _inputFile;
private static string _compressedReference;
private static string _outputDirectory;
private const string OutFileName = "MitoHeteroplasmy.tsv";
private const string HeaderLine = "#POS\tREF\tALT\tVRFs\tAlleleDepths";

public static ExitCodes Run(string command, string[] commandArgs)

{
var ops = new OptionSet
{
{
"ref|r=",
"compressed reference sequence file",
v => _compressedReference = v
},
{
"in|i=",
"input BED file path",
Expand All @@ -42,11 +36,10 @@ public static ExitCodes Run(string command, string[] commandArgs)

var exitCode = new ConsoleAppBuilder(commandArgs, ops)
.Parse()
.CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref")
.CheckInputFilenameExists(_inputFile, "Mitochondrial Heteroplasmy BED file", "--in")
.CheckDirectoryExists(_outputDirectory, "output directory", "--out")
.SkipBanner()
.ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample)
.ShowHelpMenu("Creates a TSV file with mitochondrial heteroplasmy information", commandLineExample)
.ShowErrors()
.Execute(ProgramExecution);

Expand All @@ -55,17 +48,12 @@ public static ExitCodes Run(string command, string[] commandArgs)

private static ExitCodes ProgramExecution()
{
var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference));
var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version");
string outFileName = $"{version.Name}_{version.Version}";

using (var primateAiParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider))
using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix)))
using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSufix)))
using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.MitoHeteroplasmyTag, true, false, SaCommon.SchemaVersion, false))
{
nsaWriter.Write(primateAiParser.GetItems());
}
using var mitoHeteroplasmyParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile));
using var tsvStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, OutFileName));
using var tsvWriter = new StreamWriter(tsvStream);
tsvWriter.WriteLine(HeaderLine);
foreach(var line in mitoHeteroplasmyParser.GetOutputLines())
tsvWriter.WriteLine(line);

return ExitCodes.Success;
}
Expand Down
33 changes: 0 additions & 33 deletions SAUtils/MitoHeteroplasmy/MitoHeteroplasmyItem.cs

This file was deleted.

Loading

0 comments on commit a5a16f7

Please sign in to comment.