Skip to content

Commit

Permalink
kminmer binary coverage preem filter method
Browse files Browse the repository at this point in the history
  • Loading branch information
AlanZhangUCSC committed Dec 7, 2024
1 parent 3eaea51 commit 8a20052
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 50 deletions.
86 changes: 70 additions & 16 deletions src/haplotype_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,72 @@ void noFilter(
std::cerr << "Finished noFilter: " << nodes.size() << " nodes" << std::endl;
}

std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::string& node, int n_order, std::unordered_map<std::string, std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges) {
void filter_method_mbc(
std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestor, const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets,
const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, std::vector<bool>& excludeReads,
const std::unordered_map<std::string, double>& kminmer_binary_coverage
) {
std::cerr << "Filter method mbc: filter out haplotypes that do not have a unique best read score" << std::endl;

std::vector<std::pair<std::string, double>> kminmer_binary_coverage_vec;
for (const auto& node : kminmer_binary_coverage) {
if (leastRecentIdenticalAncestor.find(node.first) != leastRecentIdenticalAncestor.end()) continue;
double curCoverage = node.second;
if (identicalSets.find(node.first) != identicalSets.end()) {
for (const auto& identicalNode : identicalSets.at(node.first)) {
if (kminmer_binary_coverage.at(identicalNode) > curCoverage) {
curCoverage = kminmer_binary_coverage.at(identicalNode);
}
}
}
kminmer_binary_coverage_vec.emplace_back(std::make_pair(node.first, curCoverage));
}

std::sort(kminmer_binary_coverage_vec.begin(), kminmer_binary_coverage_vec.end(), [](const auto& a, const auto& b) {
return a.second > b.second;
});

std::vector<std::string> probableNodes;
int numProbableNodes = 0;
for (const auto& [node, coverage] : kminmer_binary_coverage_vec) {
if (coverage == 1.0) {
probableNodes.push_back(node);
} else if (numProbableNodes < 1000) {
probableNodes.push_back(node);
++numProbableNodes;
}
}

exclude_noninformative_reads(excludeReads, probableNodes, allScores);

size_t numExcludedReads = std::count(excludeReads.begin(), excludeReads.end(), true);

std::cerr << "Excluding " << numExcludedReads << " reads in total" << std::endl;

probs.resize(allScores.begin()->second.size() - numExcludedReads, probableNodes.size());

size_t colIndex = 0;
for (const auto& node : probableNodes) {
if (leastRecentIdenticalAncestor.find(node) != leastRecentIdenticalAncestor.end()) {
std::cerr << "Error: Node " << node << " has a least recent identical ancestor." << std::endl;
exit(1);
}
const auto& curNodeScores = allScores.at(node);
size_t rowIndex = 0;
for (size_t i = 0; i < curNodeScores.size(); ++i) {
if (excludeReads[i]) continue;
probs(rowIndex, colIndex) = curNodeScores[i].second;
++rowIndex;
}
nodes.push_back(node);
++colIndex;
}
std::cerr << "Finished mbc filter: " << nodes.size() << " nodes" << std::endl;

}

std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::string& node, int n_order) {
std::unordered_set<std::string> result;

if (T->allNodes.find(node) == T->allNodes.end()) {
Expand All @@ -138,24 +203,14 @@ std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::stri
// Add children to the queue
for (Node* child : T->allNodes.at(current_node)->children) {
if (visited.find(child->identifier) == visited.end()) {
const auto& childKminmerChanges = kminmerChanges.find(child->identifier)->second;
if (childKminmerChanges.first.size() > 0) {
bfs_queue.push({child->identifier, level + 1});
} else {
bfs_queue.push({child->identifier, level});
}
bfs_queue.push({child->identifier, level + 1});
visited.insert(child->identifier);
}
}
}
// Add parent to the queue (if exists)
if (T->allNodes.at(current_node)->parent && visited.find(T->allNodes.at(current_node)->parent->identifier) == visited.end()) {
const auto& parentKminmerChanges = kminmerChanges.find(T->allNodes.at(current_node)->parent->identifier)->second;
if (parentKminmerChanges.first.size() > 0) {
bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level + 1});
} else {
bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level});
}
bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level + 1});
visited.insert(T->allNodes.at(current_node)->parent->identifier);
}
}
Expand All @@ -168,8 +223,7 @@ std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::stri
void filter_method_uhs(
std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets,
const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order,
std::unordered_map<std::string, std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges
const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order
) {
std::cerr << "Filter method 1: filter out haplotypes that do not have a unique best read score" << std::endl;

Expand Down Expand Up @@ -233,7 +287,7 @@ void filter_method_uhs(
}
identicalSet.insert(filteredNode);
for (const auto& node : identicalSet) {
std::unordered_set<std::string> nth_order_neighbors = get_nth_order_neighbors(T, node, n_order, kminmerChanges);
std::unordered_set<std::string> nth_order_neighbors = get_nth_order_neighbors(T, node, n_order);
for (const auto& neighbor : nth_order_neighbors) {
std::string neighbor_leastRecentIdenticalAncestor = neighbor;
if (leastRecentIdenticalAncestors.find(neighbor) != leastRecentIdenticalAncestors.end()) {
Expand Down
4 changes: 3 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ Developer options:
--genotype-from-sam Generate VCF from SAM file using mutation spectrum as prior.
--sam-file <path> Path to SAM file to generate VCF from.
--ref-file <path> Path to reference FASTA file to generate VCF from.
--save-jaccard Save jaccard index between reads and haplotypes to <prefix>.jaccard.txt
)";


Expand Down Expand Up @@ -289,6 +290,7 @@ int main(int argc, const char** argv) {
bool prior = args["--prior"] && args["--prior"].isBool() ? args["--prior"].asBool() : false;
bool placement_per_read = args["--place-per-read"] && args["--place-per-read"].isBool() ? args["--place-per-read"].asBool() : false;
bool genotype_from_sam = args["--genotype-from-sam"] && args["--genotype-from-sam"].isBool() ? args["--genotype-from-sam"].asBool() : false;
bool save_jaccard = args["--save-jaccard"] && args["--save-jaccard"].isBool() ? args["--save-jaccard"].asBool() : false;

int k = std::stoi(args["-k"].asString());
int s = std::stoi(args["-s"].asString());
Expand Down Expand Up @@ -446,7 +448,7 @@ int main(int argc, const char** argv) {
std::cerr << "Reference node (" << refNode << ") specified but not found in the pangenome." << std::endl;
return 1;
}
pmi::place(T, index_input, reads1, reads2, mutMat, prefix, refFileName, samFileName, bamFileName, mpileupFileName, vcfFileName, aligner, refNode);
pmi::place(T, index_input, reads1, reads2, mutMat, prefix, refFileName, samFileName, bamFileName, mpileupFileName, vcfFileName, aligner, refNode, save_jaccard);
}

auto end = std::chrono::high_resolution_clock::now();
Expand Down
32 changes: 14 additions & 18 deletions src/mgsr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,16 @@ namespace mgsr {
// hash begs
std::unordered_map<size_t, std::set<std::map<int32_t, positionInfo>::iterator, IteratorComparator>> hashToPositionsMap;

void addPosition(const int32_t& beg, const int32_t& end, const size_t& fhash, const size_t& rhash, const bool& rev, std::unordered_set<size_t>& affectedSeedmers, std::vector<int32_t>& add_kminmerChangesVec) {
void addPosition(const int32_t& beg, const int32_t& end, const size_t& fhash, const size_t& rhash, const bool& rev, std::unordered_set<size_t>& affectedSeedmers) {
auto it = positionMap.emplace(beg, positionInfo(end, fhash, rhash, rev)).first;
if (fhash != rhash) {
size_t minHash = std::min(fhash, rhash);
hashToPositionsMap[minHash].insert(it);
affectedSeedmers.insert(minHash);
add_kminmerChangesVec.push_back(beg);
}
}

void subPosition(std::map<int32_t, positionInfo>::iterator& it, const int32_t& end, const size_t& fhash, const size_t& rhash, const bool& rev, std::unordered_set<size_t>& affectedSeedmers, std::vector<int32_t>& add_kminmerChangesVec, std::vector<int32_t>& del_kminmerChangesVec) {
void subPosition(std::map<int32_t, positionInfo>::iterator& it, const int32_t& end, const size_t& fhash, const size_t& rhash, const bool& rev, std::unordered_set<size_t>& affectedSeedmers) {
const auto& [oldEnd, oldFHash, oldRHash, oldRev] = it->second;
const auto& beg = it->first;
if (oldFHash != oldRHash) {
Expand All @@ -69,7 +68,6 @@ namespace mgsr {
hashToPositionIt->second.erase(it);
if (hashToPositionIt->second.empty()) hashToPositionsMap.erase(hashToPositionIt);
affectedSeedmers.insert(minHash);
del_kminmerChangesVec.push_back(beg);
}

it->second.endPos = end;
Expand All @@ -80,11 +78,10 @@ namespace mgsr {
size_t minHash = std::min(fhash, rhash);
hashToPositionsMap[minHash].insert(it);
affectedSeedmers.insert(minHash);
add_kminmerChangesVec.push_back(beg);
}
}

void delPosition(std::map<int32_t, positionInfo>::iterator& it, std::unordered_set<size_t>& affectedSeedmers, std::vector<int32_t>& del_kminmerChangesVec) {
void delPosition(std::map<int32_t, positionInfo>::iterator& it, std::unordered_set<size_t>& affectedSeedmers) {
const auto& [end, fhash, rhash, rev] = it->second;
const auto& beg = it->first;
if (fhash != rhash) {
Expand All @@ -93,7 +90,6 @@ namespace mgsr {
hashToPositionIt->second.erase(it);
if (hashToPositionIt->second.empty()) hashToPositionsMap.erase(hashToPositionIt);
affectedSeedmers.insert(minHash);
del_kminmerChangesVec.push_back(beg);
}
positionMap.erase(it);
}
Expand Down Expand Up @@ -336,8 +332,6 @@ namespace mgsr {
void updateSeedmersIndex(const std::vector<std::tuple<int64_t, bool, bool, std::optional<size_t>, std::optional<size_t>, std::optional<bool>, std::optional<bool>, std::optional<int64_t>, std::optional<int64_t>>>& seedChanges,
std::map<uint32_t, seeding::onSeedsHash>& onSeedsHashMap,
mgsr::seedmers& seedmersIndex,
std::vector<int32_t>& add_kminmerChanges,
std::vector<int32_t>& del_kminmerChanges,
std::unordered_set<size_t>& affectedSeedmers,
const int& seedK,
const int& seedL,
Expand All @@ -355,7 +349,7 @@ namespace mgsr {
while (positionMapIt != positionMap.end()) {
const auto& [toEraseEnd, toEraseFHash, toEraseRHash, toEraseRev] = positionMapIt->second;
backTrackPositionMapChAdd.emplace_back(std::make_tuple(positionMapIt->first, toEraseEnd, toEraseFHash, toEraseRHash, toEraseRev));
seedmersIndex.delPosition(positionMapIt, affectedSeedmers, del_kminmerChanges);
seedmersIndex.delPosition(positionMapIt, affectedSeedmers);
++positionMapIt;
}
return;
Expand Down Expand Up @@ -394,10 +388,10 @@ namespace mgsr {
if (isReplacement) {
const auto& [oldEnd, oldFHash, oldRHash, oldRev] = curKminmerPositionIt->second;
backTrackPositionMapChAdd.emplace_back(std::make_tuple(curKminmerPositionIt->first, oldEnd, oldFHash, oldRHash, oldRev));
seedmersIndex.subPosition(curKminmerPositionIt, lastKminmerSeedIt->second.endPos, curforwardHash, curReverseHash, curReverseHash < curforwardHash, affectedSeedmers, add_kminmerChanges, del_kminmerChanges);
seedmersIndex.subPosition(curKminmerPositionIt, lastKminmerSeedIt->second.endPos, curforwardHash, curReverseHash, curReverseHash < curforwardHash, affectedSeedmers);
} else {
backTrackPositionMapErase.emplace_back(firstKminmerSeedIt->first);
seedmersIndex.addPosition(firstKminmerSeedIt->first, lastKminmerSeedIt->second.endPos, curforwardHash, curReverseHash, curReverseHash < curforwardHash, affectedSeedmers, add_kminmerChanges);
seedmersIndex.addPosition(firstKminmerSeedIt->first, lastKminmerSeedIt->second.endPos, curforwardHash, curReverseHash, curReverseHash < curforwardHash, affectedSeedmers);
}

processedSeedBegs.insert(firstKminmerSeedIt->first);
Expand All @@ -417,10 +411,10 @@ namespace mgsr {
if (isReplacement) {
const auto& [oldEnd, oldFHash, oldRHash, oldRev] = curKminmerPositionIt->second;
backTrackPositionMapChAdd.emplace_back(std::make_tuple(curKminmerPositionIt->first, oldEnd, oldFHash, oldRHash, oldRev));
seedmersIndex.subPosition(curKminmerPositionIt, lastKminmerSeedIt->second.endPos, forwardKminmerHash, reverseKminmerHash, reverseKminmerHash < forwardKminmerHash, affectedSeedmers, add_kminmerChanges, del_kminmerChanges);
seedmersIndex.subPosition(curKminmerPositionIt, lastKminmerSeedIt->second.endPos, forwardKminmerHash, reverseKminmerHash, reverseKminmerHash < forwardKminmerHash, affectedSeedmers);
} else {
backTrackPositionMapErase.emplace_back(firstKminmerSeedIt->first);
seedmersIndex.addPosition(firstKminmerSeedIt->first, lastKminmerSeedIt->second.endPos, forwardKminmerHash, reverseKminmerHash, reverseKminmerHash < forwardKminmerHash, affectedSeedmers, add_kminmerChanges);
seedmersIndex.addPosition(firstKminmerSeedIt->first, lastKminmerSeedIt->second.endPos, forwardKminmerHash, reverseKminmerHash, reverseKminmerHash < forwardKminmerHash, affectedSeedmers);
}

processedSeedBegs.insert(firstKminmerSeedIt->first);
Expand All @@ -438,7 +432,7 @@ namespace mgsr {
const auto& [toEraseEnd, toEraseFHash, toEraseRHash, toEraseRev] = toEraseIt->second;
backTrackPositionMapChAdd.emplace_back(std::make_tuple(toEraseIt->first, toEraseEnd, toEraseFHash, toEraseRHash, toEraseRev));
processedSeedBegs.insert(toEraseIt->first);
seedmersIndex.delPosition(toEraseIt, affectedSeedmers, del_kminmerChanges);
seedmersIndex.delPosition(toEraseIt, affectedSeedmers);
}
}
}
Expand All @@ -448,7 +442,7 @@ namespace mgsr {
while (lastPositionMapIt->first > maxBegCoord) {
const auto& [toEraseEnd, toEraseFHash, toEraseRHash, toEraseRev] = lastPositionMapIt->second;
backTrackPositionMapChAdd.emplace_back(std::make_tuple(lastPositionMapIt->first, toEraseEnd, toEraseFHash, toEraseRHash, toEraseRev));
seedmersIndex.delPosition(lastPositionMapIt, affectedSeedmers, del_kminmerChanges);
seedmersIndex.delPosition(lastPositionMapIt, affectedSeedmers);
--lastPositionMapIt;
}
}
Expand Down Expand Up @@ -888,7 +882,7 @@ namespace mgsr {
const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets, Eigen::MatrixXd& probs,
std::vector<std::string>& nodes, Eigen::VectorXd& props, double& llh, const std::string& preEMFilterMethod, const int& preEMFilterNOrder,
const int& emFilterRound, const int& checkFrequency, const int& removeIteration, const double& insigProp,
const int& roundsRemove, const double& removeThreshold, const bool& leafNodesOnly, std::unordered_map<std::string,std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges,
const int& roundsRemove, const double& removeThreshold, const bool& leafNodesOnly, const std::unordered_map<std::string, double>& kminmer_binary_coverage,
std::string excludeNode
) {
if (excludeNode.empty()) {
Expand All @@ -905,8 +899,10 @@ namespace mgsr {
std::cerr << "pre-EM filter nodes size: " << allScores.size() - leastRecentIdenticalAncestors.size() << "\n" << std::endl;
if (preEMFilterMethod == "null") {
haplotype_filter::noFilter(nodes, probs, allScores, leastRecentIdenticalAncestors, lowScoreReads, numLowScoreReads, excludeNode, excludeReads);
} else if (preEMFilterMethod == "mbc") {
haplotype_filter::filter_method_mbc(nodes, probs, allScores, leastRecentIdenticalAncestors, identicalSets, lowScoreReads, numLowScoreReads, excludeNode, excludeReads, kminmer_binary_coverage);
} else if (preEMFilterMethod == "uhs") {
haplotype_filter::filter_method_uhs(nodes, probs, allScores, leastRecentIdenticalAncestors, identicalSets, lowScoreReads, numLowScoreReads, excludeNode, excludeReads, T, preEMFilterNOrder, kminmerChanges);
haplotype_filter::filter_method_uhs(nodes, probs, allScores, leastRecentIdenticalAncestors, identicalSets, lowScoreReads, numLowScoreReads, excludeNode, excludeReads, T, preEMFilterNOrder);
} else if (preEMFilterMethod == "uhs2") {
haplotype_filter::filter_method_uhs_2(nodes, probs, allScores, leastRecentIdenticalAncestors, identicalSets, lowScoreReads, numLowScoreReads, excludeNode, excludeReads, T, preEMFilterNOrder);
} else if (preEMFilterMethod == "hsc1") {
Expand Down
Loading

0 comments on commit 8a20052

Please sign in to comment.