Skip to content

Commit

Permalink
outputting transcript ids by grouping them under a protein sequence (…
Browse files Browse the repository at this point in the history
…#484)
  • Loading branch information
rajatshuvro authored and GitHub Enterprise committed Feb 11, 2020
1 parent 834c35d commit 82adcf0
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 2 deletions.
3 changes: 2 additions & 1 deletion SAUtils/AAConservation/AaConservationMain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ private static ExitCodes ProgramExecution()
using (var stream = GZipUtilities.GetAppropriateReadStream(_scoresFile))
using(var parser = new ProteinConservationParser(stream))
using(var outStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName+ProteinConservationCommon.FileSuffix)))
using(var writer = new ProteinConservationWriter(outStream, transcriptData, version))
using(var groupStream = FileUtilities.GetCreateStream("transcriptGroups.txt"))
using(var writer = new ProteinConservationWriter(outStream, groupStream, transcriptData, version))
{
writer.Write(parser.GetItems());
}
Expand Down
36 changes: 35 additions & 1 deletion SAUtils/AAConservation/ProteinConservationWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ namespace SAUtils.AAConservation
public sealed class ProteinConservationWriter:IDisposable
{
private readonly Stream _stream;
private readonly Stream _transcriptGroupStream;
private readonly GenomeAssembly _assembly;
private readonly ExtendedBinaryWriter _writer;
private readonly DataSourceVersion _version;
Expand All @@ -22,9 +23,10 @@ public sealed class ProteinConservationWriter:IDisposable
// so, we need to load them up and check for duplicates and resolve them.


public ProteinConservationWriter(Stream stream, TranscriptCacheData transcriptData, DataSourceVersion version)
public ProteinConservationWriter(Stream stream, Stream groupStream, TranscriptCacheData transcriptData, DataSourceVersion version)
{
_stream = stream;
_transcriptGroupStream = groupStream;
_writer = new ExtendedBinaryWriter(_stream);
_transcriptCacheData = transcriptData;
_version = version;
Expand All @@ -43,16 +45,24 @@ public void Write(IEnumerable<ProteinConservationItem> items)
CheckProteinSetOverlap(alignedProteinsAndScores, nirvanaProteins);

var transcriptScores = new Dictionary<string, byte[]>();
//protein sequence -> transcript ids mapping
var transcriptGroupsByProtein = new Dictionary<string, List<string>>(alignedProteinsAndScores.Count);
foreach (var protein in alignedProteinsAndScores.Keys)
{
transcriptGroupsByProtein.Add(protein, new List<string>());
}
foreach (var transcriptIntervalArray in _transcriptCacheData.TranscriptIntervalArrays)
{
if (transcriptIntervalArray == null) continue;//may happen since for GRCh38 decoy contigs, there may be none
foreach (var transcriptInterval in transcriptIntervalArray.Array)
{
var transcript = transcriptInterval.Value;
if(transcript.Translation == null) continue;
var peptideSeq = transcript.Translation.PeptideSeq;
if(!alignedProteinsAndScores.TryGetValue(transcript.Translation.PeptideSeq, out var scores)) continue;

transcriptScores.TryAdd(transcript.Id.WithVersion, scores);
transcriptGroupsByProtein[peptideSeq].Add(transcript.Id.WithVersion);
}
}

Expand All @@ -62,12 +72,36 @@ public void Write(IEnumerable<ProteinConservationItem> items)
transcriptScore.Write(_writer);
}

WriteTranscriptGroups(transcriptGroupsByProtein);

Console.WriteLine($"Recorded conservation scores for {transcriptScores.Count} transcripts.");
//writing an empty item to indicate end of records
var endOfRecordItem = TranscriptConservationScores.GetEmptyItem();
endOfRecordItem.Write(_writer);
}

private void WriteTranscriptGroups(Dictionary<string, List<string>> transcriptGroupsByProtein)
{
using (var writer = new StreamWriter(_transcriptGroupStream))
{
var ensemblIds = new List<string>();
var refseqIds = new List<string>();
writer.WriteLine("#EnsemblIds\tRefSeqIds\tPeptide sequence");
foreach (var (protein,ids) in transcriptGroupsByProtein)
{
if(ids.Count == 0) continue;
ensemblIds.Clear();
refseqIds.Clear();
foreach (var id in ids)
{
if(id.StartsWith("ENST")) ensemblIds.Add(id);
else refseqIds.Add(id);
}
writer.WriteLine($"{string.Join(',',ensemblIds)}\t{string.Join(',',refseqIds)}\t{protein}");
}
}
}

private void CheckProteinSetOverlap(Dictionary<string, byte[]> proteinAndScores, HashSet<string> nirvanaProteins)
{
var count = 0;
Expand Down

0 comments on commit 82adcf0

Please sign in to comment.