Skip to content

Commit

Permalink
code for hal2synteny feature
Browse files Browse the repository at this point in the history
  • Loading branch information
Ksenia Krasheninnikova committed Sep 20, 2018
1 parent 276eb8c commit 9b639fd
Show file tree
Hide file tree
Showing 10 changed files with 842 additions and 1 deletion.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# order is important, libraries first
modules = api stats randgen validate mutations fasta alignmentDepth liftover lod maf chain extract analysis phyloP modify assemblyHub
modules = api stats randgen validate mutations fasta alignmentDepth liftover lod maf chain extract analysis phyloP modify assemblyHub synteny


.PHONY: all %.all clean %.clean doxy %.doxy

Expand Down
16 changes: 16 additions & 0 deletions synteny/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
rootPath = ../
include ../include.mk

libHeaders = $(wildcard inc/*)
libSourcesAll = $(wildcard impl/*.cpp)
libSources=$(subst impl/main.cpp,,${libSourcesAll})

cpp = h5c++ ${h5prefix}

all : ${binPath}/hal2synteny

${binPath}/hal2synteny : impl/main.cpp ${libPath}/halLib.a ${libPath}/halLiftover.a ${basicLibsDependencies}
${cpp} ${cppflags} -std=c++11 -I${sonLibPath} -I${rootPath}/liftover/inc -I${rootPath}/api/inc -Iinc -Iimpl impl/main.cpp ${libPath}/halLib.a ${libPath}/halLiftover.a ${sonLibPath}/sonLib.a -o ${binPath}/hal2synteny

clean:
rm -f ${binPath}/hal2synteny
108 changes: 108 additions & 0 deletions synteny/impl/hal2psl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/


#include "hal2psl.h"
#include "psl.h"


void hal::Hal2Psl::storePslResults(std::vector<PslBlock>& pslBlocks){

BedList::iterator i = _outBedLines.begin();
for (; i != _outBedLines.end(); ++i)
{
if (_addExtraColumns == false)
{
i->_extra.clear();
}
makeUpPsl(i->_psl, i->_blocks, i->_strand, i->_start, i->_chrName,
pslBlocks);
}

}
std::vector<PslBlock> hal::Hal2Psl::convert2psl(hal::AlignmentConstPtr alignment,
const hal::Genome* srcGenome,
const hal::Genome* tgtGenome,
const std::string srcChrom){

std::vector<PslBlock> pslBlocks;
if (srcGenome->getNumSequences() > 0){
_srcGenome = srcGenome;
_tgtGenome = tgtGenome;
_coalescenceLimit = NULL;
_traverseDupes = true;
_addExtraColumns = false;
_missedSet.clear();
_tgtSet.clear();
_tgtSet.insert(tgtGenome);
hal::SequenceIteratorConstPtr seqIt = srcGenome->getSequenceIterator();
hal::SequenceIteratorConstPtr seqEnd = srcGenome->getSequenceEndIterator();
for (; !seqIt->equals(seqEnd); seqIt->toNext()) {
_outBedLines.clear();
_srcSequence = seqIt->getSequence();
auto chrName = _srcSequence->getName();
if (srcChrom != chrName) {continue;}
//else {std::cout << chrName << std::endl;}
_bedLine._start = 0;
_bedLine._end = _srcSequence->getSequenceLength();
_mappedBlocks.clear();
_outPSL = true;
visitBegin();
liftInterval(_mappedBlocks);
if (_mappedBlocks.size()) {
assignBlocksToIntervals();
}
//_outBedLines.sort(BedLineSrcLess());
storePslResults(pslBlocks);
cleanResults();
}
}
return pslBlocks;
}


void hal::Hal2Psl::makeUpPsl(const std::vector<hal::PSLInfo>& vpsl,
const std::vector<hal::BedBlock>& blocks,
const char strand,
const hal_index_t start,
const std::string chrName,
std::vector<PslBlock>& pslBlocks){
assert(vpsl.size() == 1);
const hal::PSLInfo& psl = vpsl[0];

for (size_t i = 0; i < blocks.size(); ++i) {
PslBlock b = PslBlock();
b.size = blocks[i]._length;
b.qName = psl._qSeqName;
b.qSize = psl._qSeqSize;

b.qStart = psl._qBlockStarts[i] - psl._qChromOffset;
b.qEnd = b.qStart + b.size;
if (psl._qStrand == '-'){
auto posStart = b.qStart;
b.qStart = psl._qSeqSize - posStart - b.size;
b.qEnd = psl._qSeqSize - posStart;
}

b.tStart = blocks[i]._start + start;
b.tEnd = b.tStart + b.size;
if (strand == '-')
{
auto posStart = b.tStart;
b.tStart = psl._tSeqSize - posStart - b.size;
b.tEnd = psl._tSeqSize - posStart;
}
std::stringstream ss;
ss << psl._qStrand << strand;
ss >> b.strand;
//b.strand = psl._qStrand+'/'+strand;
b.tSize = psl._tSeqSize;
b.qSize = psl._qSeqSize;
b.tName = chrName;
pslBlocks.push_back(b);
}
}

122 changes: 122 additions & 0 deletions synteny/impl/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/

#include "hal.h"
#include "halCLParserInstance.h"

#include "psl_io.cpp"
#include "psl_merger.cpp"
#include "hal2psl.cpp"


static hal::CLParserPtr initParser() {
auto optionsParser = hal::hdf5CLParserInstance();
optionsParser->addArgument("halFile", "input psl file from liftover");
optionsParser->addOption("queryGenome", "source genome", "\"\"");
optionsParser->addOption("targetGenome", "reference genome name", "\"\"");
optionsParser->addArgument("outPslPath", "output psl file ffor synteny blocks");
optionsParser->addOption("minBlockSize",
"lower bound on synteny block length",
5000);
optionsParser->addOption("maxAnchorDistance",
"upper bound on distance for syntenic psl blocks",
5000);
optionsParser->addOption("queryChromosome",
"chromosome to infer synteny",
"\"\"");
return optionsParser;

}

const hal::Genome* openGenomeOrThrow(const hal::AlignmentConstPtr& alignment,
const std::string& genomeName) {
const hal::Genome* genome = alignment->openGenome(genomeName);
if (genome == NULL){
throw hal_exception(std::string("Reference genome, ") + genomeName +
", not found in alignment");
}
return genome;
}

hal::AlignmentConstPtr openAlignmentOrThrow(const std::string& halFile,
const hal::CLParserPtr& optionsParser){

auto alignment = hal::openHalAlignmentReadOnly(halFile, optionsParser);
if (alignment->getNumGenomes() == 0){
throw hal_exception("hal alignment is empty");
}
return alignment;
}

void validateInputOrThrow(const std::string& queryGenomeName,
const std::string& targetGenomeName,
const std::string& queryChromosome) {
if (queryGenomeName == "\"\"" || targetGenomeName == "\"\"" || queryChromosome == "\"\""){
throw hal_exception("--queryGenome and --targetGenome and --queryChromosome must be"
"specified");
}

if (queryGenomeName == targetGenomeName){
throw hal_exception("--queryGenome and --targetGenome must be"
"different");
}
}



int main(int argc, char *argv[]) {
hal::CLParserPtr optionsParser = initParser();
std::string halFile;
std::string outPslPath;
std::string queryGenomeName;
std::string targetGenomeName;
std::string queryChromosome;
hal_size_t minBlockSize;
hal_size_t maxAnchorDistance;
try {
optionsParser->parseOptions(argc, argv);
halFile = optionsParser->getArgument<std::string>("halFile");
queryGenomeName = optionsParser->getOption<std::string>("queryGenome");
targetGenomeName = optionsParser->getOption<std::string>("targetGenome");
outPslPath = optionsParser->getArgument<std::string>("outPslPath");
minBlockSize = optionsParser->getOption<hal_size_t>("minBlockSize");
maxAnchorDistance = optionsParser->getOption<hal_size_t>("maxAnchorDistance");
queryChromosome = optionsParser->getOption<std::string>("queryChromosome");
optionsParser->setDescription("convert psl alignments into synteny blocks");
}
catch(std::exception& e) {
std::cerr << e.what() << std::endl;
optionsParser->printUsage(std::cerr);
exit(1);
}

validateInputOrThrow(queryGenomeName,targetGenomeName,queryChromosome);
try {
auto alignment = openAlignmentOrThrow(halFile, optionsParser);
auto targetGenome = openGenomeOrThrow(alignment, targetGenomeName);
auto queryGenome = openGenomeOrThrow(alignment, queryGenomeName);


auto hal2psl = hal::Hal2Psl();
//std::cout << "reading hal " << std::endl;
auto blocks = hal2psl.convert2psl(alignment, queryGenome,
targetGenome, queryChromosome);


//std::cout << "merging " << blocks.size() << " blocks"<< std::endl;
auto merged_blocks = dag_merge(blocks,
minBlockSize, maxAnchorDistance);
//std::cout << "writing psl " << std::endl;
psl_io::write_psl(merged_blocks, outPslPath);
}
catch(std::exception& e)
{
std::cerr << "Exception caught: " << e.what() << std::endl;
return 1;
}
return 0;

}
102 changes: 102 additions & 0 deletions synteny/impl/psl_io.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
#include "psl_io.h"

namespace psl_io {
std::vector<std::string> split(const std::string &s, char delim) {
std::stringstream ss(s);
std::string item;
std::vector<std::string> elems;
while (std::getline(ss, item, delim)) {
elems.push_back(std::move(item));
}
return elems;
}


std::vector<PslBlock> get_blocks_set(const std::string psl) {
std::ifstream input(psl);
std::vector<PslBlock> blocks;
for (std::string line; std::getline(input, line);) {
auto cur_b = Psl(line).get_blocks();
blocks.insert(blocks.end(), cur_b.begin(), cur_b.end());
}
return blocks;
}

std::vector<int> get_qInserts(const std::vector<PslBlock>& blocks) {
std::vector<int> result;
for (int i=0; i < int(blocks.size())-1; ++i) {
auto dif = blocks[i+1].qStart - blocks[i].qEnd;
if (dif > 0) {
result.push_back(dif);
}
}
return result;
}

std::vector<int> get_tInserts(const std::vector<PslBlock>& blocks) {
std::vector<int> result;
for (int i=0; i < int(blocks.size())-1; ++i) {
auto dif = blocks[i+1].tStart - blocks[i].tEnd;
if (dif > 0) {
result.push_back(dif);
//std::cout << blocks[i+1].tStart << " " << blocks[i].tEnd << " " << dif << std::endl;
}
}
return result;
}

Psl construct_psl(std::vector<PslBlock> blocks) {
auto psl = Psl();
psl.match = std::accumulate(blocks.begin(), blocks.end(), 0,
[] (int total, PslBlock item)
{ return total + item.qEnd - item.qStart; });
psl.misMatch = 0;
psl.repMatch = 0;
psl.nCount = 0;
auto qInserts = get_qInserts(blocks);
psl.qNumInsert = qInserts.size();
psl.qBaseInsert = std::accumulate(qInserts.begin(), qInserts.end(), 0);
auto tInserts = get_tInserts(blocks);
psl.tNumInsert = tInserts.size();
psl.tBaseInsert = std::accumulate(tInserts.begin(), tInserts.end(), 0);
psl.qName = blocks[0].qName;
psl.qSize = blocks[0].qSize;
psl.qStart = blocks[0].qStart;
psl.qEnd = blocks.back().qEnd;
psl.tName = blocks[0].tName;
psl.tSize = blocks[0].tSize;
psl.strand = blocks[0].strand;
psl.blockCount = blocks.size();
psl.blocks = blocks;
//if (psl.strand == "++") {
psl.tStart = blocks.front().tStart;
psl.tEnd = blocks.back().tEnd;
/*}
else if (psl.strand == "+-") {
psl.tEnd = psl.tSize - blocks[0].tStart;
psl.tStart = psl.tSize - blocks.back().tEnd;
//??for (auto b: psl.blocks) {
// b.tStart = psl.tSize - b.tStart;
//}
//std::reverse(psl.blocks.begin(),psl.blocks.end());
}*/
return psl;
}

void write_psl(const std::vector<std::vector<PslBlock> >& merged_blocks,
const std::string& outFilePath) {
std::ofstream ofs;
ofs.open (outFilePath, std::ofstream::out);
for (auto path : merged_blocks){
auto psl = construct_psl(path);
ofs << psl << std::endl;
}
ofs.close();
}
}
Loading

0 comments on commit 9b639fd

Please sign in to comment.