Skip to content

Commit

Permalink
add omim reader
Browse files Browse the repository at this point in the history
  • Loading branch information
yujiang02 committed Aug 18, 2017
1 parent da0b722 commit aa564df
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 3 deletions.
12 changes: 9 additions & 3 deletions Nirvana.sln
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26430.15
VisualStudioVersion = 15.0.26430.16
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VariantAnnotation.Interface", "VariantAnnotation.Interface\VariantAnnotation.Interface.csproj", "{248C8736-3A76-4F45-A131-A776BD3257C9}"
EndProject
Expand All @@ -15,13 +15,15 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VariantAnnotation", "Varian
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ErrorHandling", "ErrorHandling\ErrorHandling.csproj", "{A65F4919-CDB8-49C5-ADA4-66055A3F4923}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UnitTests", "UnitTests\UnitTests.csproj", "{0CB1644A-BEA1-4CF6-AD5F-E544512769C2}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UnitTests", "UnitTests\UnitTests.csproj", "{0CB1644A-BEA1-4CF6-AD5F-E544512769C2}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CommandLine", "CommandLine\CommandLine.csproj", "{147C336A-6A6E-43F4-BDDC-8C8B72199C5D}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CommonUtilities", "CommonUtilities\CommonUtilities.csproj", "{6688B01D-5352-4747-B411-456CBA6A4B8A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CacheUtils", "CacheUtils\CacheUtils.csproj", "{986CF15B-DFAE-4C39-98D0-75A15271B34A}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CacheUtils", "CacheUtils\CacheUtils.csproj", "{986CF15B-DFAE-4C39-98D0-75A15271B34A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SAUtils", "SAUtils\SAUtils.csproj", "{F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -69,6 +71,10 @@ Global
{986CF15B-DFAE-4C39-98D0-75A15271B34A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{986CF15B-DFAE-4C39-98D0-75A15271B34A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{986CF15B-DFAE-4C39-98D0-75A15271B34A}.Release|Any CPU.Build.0 = Release|Any CPU
{F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
177 changes: 177 additions & 0 deletions SAUtils/InputFileParsers/Omim/OmimReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using Compression.Utilities;
using VariantAnnotation.GeneAnnotation;

namespace SAUtils.InputFileParsers.Omim
{
public class OmimReader : IEnumerable<OmimEntry>
{
#region members

// Input file; it is opened through GZipUtilities each time the reader is enumerated.
private readonly FileInfo _omimFileInfo;

// Zero-based column positions, assigned by ParseHeader from the column-header line.
private int _mimNumberCol;       // "Mim Number" column
private int _hgncCol;            // "Approved Symbol" column
private int _geneDescriptionCol; // "Gene Name" column
private int _phenotypeCol;       // "Phenotypes" column

#endregion

/// <summary>
/// Creates a reader over the given OMIM file. The file is not opened here;
/// it is read lazily when the reader is enumerated.
/// </summary>
/// <param name="omimFileInfo">The OMIM input file.</param>
/// <exception cref="ArgumentNullException"><paramref name="omimFileInfo"/> is null.</exception>
public OmimReader(FileInfo omimFileInfo)
{
    // Fail at construction instead of with a NullReferenceException on first enumeration.
    if (omimFileInfo == null) throw new ArgumentNullException(nameof(omimFileInfo));

    _omimFileInfo = omimFileInfo;
}
/// <summary>
/// Returns an enumerator over the entries parsed from the OMIM file.
/// </summary>
public IEnumerator<OmimEntry> GetEnumerator()
{
    IEnumerable<OmimEntry> entries = GetOmimItems();
    return entries.GetEnumerator();
}


/// <summary>
/// Non-generic enumeration entry point; forwards to the generic enumerator.
/// </summary>
IEnumerator IEnumerable.GetEnumerator()
{
    IEnumerator<OmimEntry> typedEnumerator = GetEnumerator();
    return typedEnumerator;
}



/// <summary>
/// Lazily reads the OMIM file line by line and yields one entry for every
/// data row that carries a gene symbol.
/// </summary>
/// <remarks>
/// The column-header line (see IsHeader) is handed to ParseHeader so the
/// column positions are known before data rows are split. All other lines
/// starting with '#' and all blank lines are skipped. The underlying reader
/// is disposed when the enumeration finishes or the enumerator is disposed.
/// </remarks>
/// <returns>A deferred sequence of parsed entries.</returns>
private IEnumerable<OmimEntry> GetOmimItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_omimFileInfo.FullName))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // A blank line has no fields; treating it as a data row would fail
            // while converting or indexing the (missing) columns.
            if (string.IsNullOrWhiteSpace(line)) continue;

            if (IsHeader(line))
            {
                ParseHeader(line);
                continue;
            }

            if (!IsContentLine(line)) continue;

            var contents = line.Split('\t');

            // Rows without a gene symbol are discarded, so check that first
            // and avoid parsing fields that would be thrown away.
            var geneSymbol = contents[_hgncCol];
            if (string.IsNullOrEmpty(geneSymbol)) continue;

            var mimNumber = Convert.ToInt32(contents[_mimNumberCol]);
            var description = contents[_geneDescriptionCol].Replace(@"\\'", @"'");
            var phenotypeInfo = contents[_phenotypeCol].Replace(@",,", @",");
            var phenotypes = ParsePhenotypes(phenotypeInfo);

            yield return new OmimEntry(geneSymbol, description, mimNumber, phenotypes);
        }
    }
}

/// <summary>
/// Locates the columns this reader needs in the tab-separated header line
/// and stores their zero-based positions in the column-index fields.
/// </summary>
/// <param name="line">The header line, including its leading '#'.</param>
/// <exception cref="InvalidDataException">
/// One or more of the required columns ("Mim Number", "Gene Name",
/// "Approved Symbol", "Phenotypes") is not present in the header.
/// </exception>
private void ParseHeader(string line)
{
    const int notFound = -1;
    var mimNumberCol = notFound;
    var geneDescriptionCol = notFound;
    var hgncCol = notFound;
    var phenotypeCol = notFound;

    line = line.Trim('#').Trim(' ');
    var colNames = line.Split('\t').Select(x => x.Trim(' ')).ToList();
    for (var index = 0; index < colNames.Count; index++)
    {
        switch (colNames[index])
        {
            case "Mim Number":
                mimNumberCol = index;
                break;
            case "Gene Name":
                geneDescriptionCol = index;
                break;
            case "Approved Symbol":
                hgncCol = index;
                break;
            case "Phenotypes":
                phenotypeCol = index;
                break;
        }
    }

    // Without this check a missing or renamed column would keep index 0 and
    // the reader would silently take its values from the first column.
    var missing = new List<string>();
    if (mimNumberCol == notFound) missing.Add("Mim Number");
    if (geneDescriptionCol == notFound) missing.Add("Gene Name");
    if (hgncCol == notFound) missing.Add("Approved Symbol");
    if (phenotypeCol == notFound) missing.Add("Phenotypes");

    if (missing.Count > 0)
    {
        throw new InvalidDataException(
            "OMIM header is missing required column(s): " + string.Join(", ", missing));
    }

    _mimNumberCol = mimNumberCol;
    _geneDescriptionCol = geneDescriptionCol;
    _hgncCol = hgncCol;
    _phenotypeCol = phenotypeCol;
}

/// <summary>
/// Splits the phenotype field on ';' and parses each non-blank segment.
/// </summary>
/// <param name="line">Content of the phenotype column; may be null or empty.</param>
/// <returns>
/// The parsed phenotypes; an empty list when <paramref name="line"/> is null or empty.
/// The list never contains null elements.
/// </returns>
private static List<OmimEntry.Phenotype> ParsePhenotypes(string line)
{
    var phenotypes = new List<OmimEntry.Phenotype>();

    if (string.IsNullOrEmpty(line)) return phenotypes;

    foreach (var info in line.Split(';'))
    {
        var phenotype = ExtractPhenotype(info);

        // ExtractPhenotype returns null for blank segments (for example the
        // empty piece after a trailing ';'); keep those out of the result.
        if (phenotype != null) phenotypes.Add(phenotype);
    }

    return phenotypes;
}

// Shape of one phenotype segment:
//   <description>[, <6-digit number>] (<single digit>)[, <comma-separated inheritance list>]
// Built once and reused; a Regex instance is safe to share for matching.
private static readonly Regex PhenotypeRegex =
    new Regex(@"^(.+?)(?:,\s(\d{6}))?\s\((\d)\)(?:,\s)?(.*)?$");

/// <summary>
/// Parses a single phenotype segment into a phenotype object.
/// </summary>
/// <param name="info">One ';'-separated piece of the phenotype column.</param>
/// <returns>The parsed phenotype, or null when the segment is blank.</returns>
/// <exception cref="FormatException">
/// The segment is not blank but does not have the expected shape.
/// </exception>
private static OmimEntry.Phenotype ExtractPhenotype(string info)
{
    info = info.Trim(' ').Replace(@"\\'","'");

    if (string.IsNullOrWhiteSpace(info)) return null;

    var match = PhenotypeRegex.Match(info);
    if (!match.Success)
    {
        // Report the offending text instead of failing later on empty capture groups.
        throw new FormatException("Unrecognized OMIM phenotype entry: " + info);
    }

    string phenotype;
    OmimEntry.Comments comments;
    ParsePhenotypeMapping(match.Groups[1].Value, out phenotype, out comments);

    // The 6-digit number is optional; 0 is used when it is absent.
    var mimNumberText = match.Groups[2].Value;
    var mimNumber = string.IsNullOrEmpty(mimNumberText) ? 0 : Convert.ToInt32(mimNumberText);

    var mapping = (OmimEntry.Mapping) Convert.ToInt16(match.Groups[3].Value);

    // An empty capture yields an empty set of inheritances.
    var inheritances = ExtractInheritances(match.Groups[4].Value);

    return new OmimEntry.Phenotype(mimNumber, phenotype, mapping, comments, inheritances);
}

/// <summary>
/// Turns a comma-separated inheritance list into a set of distinct values,
/// with the spaces around each value removed.
/// </summary>
/// <param name="inheritance">Comma-separated list; may be null or empty.</param>
/// <returns>The distinct values; an empty set for null or empty input.</returns>
private static HashSet<string> ExtractInheritances(string inheritance)
{
    if (string.IsNullOrEmpty(inheritance)) return new HashSet<string>();

    var modes = inheritance.Split(',').Select(part => part.Trim(' '));
    return new HashSet<string>(modes);
}

/// <summary>
/// Separates the phenotype description from the marker characters that wrap
/// it and derives the comment category from those markers.
/// </summary>
/// <param name="phenotypeGroup">Raw description, possibly prefixed with '?', '{' or '['.</param>
/// <param name="phenotype">The description with leading '?', '{', '[' and trailing '}', ']' removed.</param>
/// <param name="comments">
/// Category derived from the markers: a '?' within the first two characters
/// takes precedence, then a leading '{', then a leading '['; otherwise unknown.
/// </param>
private static void ParsePhenotypeMapping(string phenotypeGroup, out string phenotype, out OmimEntry.Comments comments)
{
    phenotypeGroup = phenotypeGroup.Trim(' ');
    phenotype = phenotypeGroup.TrimStart('?', '{', '[').TrimEnd('}', ']');

    // Only the first two characters are inspected for '?', so both "?x" and
    // "{?x}" qualify. The length is clamped so descriptions shorter than two
    // characters no longer make Substring throw.
    var prefix = phenotypeGroup.Substring(0, Math.Min(2, phenotypeGroup.Length));

    if (prefix.Contains("?"))
    {
        comments = OmimEntry.Comments.unconfirmed_or_possibly_spurious_mapping;
    }
    else if (phenotypeGroup.StartsWith("{", StringComparison.Ordinal))
    {
        comments = OmimEntry.Comments.contribute_to_susceptibility_to_multifactorial_disorders_or_to_susceptibility_to_infection;
    }
    else if (phenotypeGroup.StartsWith("[", StringComparison.Ordinal))
    {
        comments = OmimEntry.Comments.nondiseases;
    }
    else
    {
        comments = OmimEntry.Comments.unknown;
    }
}

/// <summary>
/// Tells whether a line is the column-header line, i.e. starts with "# Chromosome".
/// </summary>
/// <param name="line">A line read from the input file; must not be null.</param>
/// <returns>true when the line begins with the header prefix.</returns>
private static bool IsHeader(string line)
{
    // Ordinal: this is a file-format marker, not linguistic text, so the
    // result must not depend on the current culture.
    return line.StartsWith("# Chromosome", StringComparison.Ordinal);
}

/// <summary>
/// Tells whether a line is a data row, i.e. does not start with '#'.
/// </summary>
/// <param name="line">A line read from the input file; must not be null.</param>
/// <returns>true when the line does not begin with '#'.</returns>
private static bool IsContentLine(string line)
{
    // Ordinal: '#' is a file-format marker, so the check must be culture-independent.
    return !line.StartsWith("#", StringComparison.Ordinal);
}
}
}
12 changes: 12 additions & 0 deletions SAUtils/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
using System;

namespace SAUtils
{
/// <summary>
/// Holds the entry point of the SAUtils executable.
/// </summary>
internal class Program
{
    /// <summary>
    /// Writes a greeting to standard output. The arguments are not used.
    /// </summary>
    /// <param name="args">Command-line arguments.</param>
    private static void Main(string[] args)
    {
        const string greeting = "Hello World!";
        Console.WriteLine(greeting);
    }
}
}
12 changes: 12 additions & 0 deletions SAUtils/SAUtils.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!-- SDK-style project file for the SAUtils console application. -->
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- Build an executable rather than a library. -->
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp1.1</TargetFramework>
</PropertyGroup>

<ItemGroup>
<!-- Project-to-project dependency on VariantAnnotation in the sibling directory. -->
<ProjectReference Include="..\VariantAnnotation\VariantAnnotation.csproj" />
</ItemGroup>

</Project>

0 comments on commit aa564df

Please sign in to comment.