Skip to content

Commit

Permalink
collapsing edges with 0 add/sub kminmers during preem filter method b…
Browse files Browse the repository at this point in the history
…y unique highest read score
  • Loading branch information
AlanZhangUCSC committed Dec 5, 2024
1 parent f217ee0 commit 3eaea51
Show file tree
Hide file tree
Showing 3 changed files with 276 additions and 41 deletions.
265 changes: 243 additions & 22 deletions src/haplotype_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,69 @@ using namespace panmanUtils;

namespace haplotype_filter {

void exclude_noninformative_reads(
std::vector<bool>& excludeReads, std::vector<std::string>& selectedNodes,
const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores
) {
const auto& refScores = allScores.at(selectedNodes[0]);
std::vector<bool> informativeReads;
if (selectedNodes.size() > 1) {
informativeReads.resize(refScores.size(), false);
} else {
informativeReads = std::vector<bool>(refScores.size(), true);
}

for (size_t i = 1; i < selectedNodes.size(); ++i) {
const auto& curScores = allScores.at(selectedNodes[i]);
for (size_t j = 0; j < curScores.size(); ++j) {
if (excludeReads[j]) {
informativeReads[j] = true;
continue;
}
if (curScores[j].first != refScores[j].first) {
informativeReads[j] = true;
}
}
}

size_t numInformativeReads = std::count(informativeReads.begin(), informativeReads.end(), true);
std::cout << "Excluding " << informativeReads.size() - numInformativeReads << " reads that are not informative" << std::endl;
std::cerr << "Excluding " << informativeReads.size() - numInformativeReads << " reads that are not informative" << std::endl;

for (size_t i = 0; i < informativeReads.size(); ++i) {
if (excludeReads[i]) continue;
if (!informativeReads[i]) {
excludeReads[i] = true;
}
}
}

void noFilter(
std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads,
const std::string& excludeNode, const std::vector<bool>& excludeReads
const std::string& excludeNode, std::vector<bool>& excludeReads
) {
std::cerr << "No preEM filtering" << std::endl;

size_t numHighDuplicateReads = std::count(excludeReads.begin(), excludeReads.end(), true);

std::cerr << "Excluding " << numHighDuplicateReads << " reads with high number of duplicates" << std::endl;

std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
size_t index = 0;
for (const auto& [nodeName, nodeScores] : allScores) {
if (leastRecentIdenticalAncestors.count(nodeName)) {
continue;
}
allNodes[index] = nodeName;
++index;
}

exclude_noninformative_reads(excludeReads, allNodes, allScores);

size_t numExcludedReads = std::count(excludeReads.begin(), excludeReads.end(), true);

std::cerr << "Excluding " << numExcludedReads << " reads with high number of duplicates" << std::endl;
std::cerr << "Excluding " << numExcludedReads << " reads in total" << std::endl;

if (!excludeNode.empty()) {
probs.resize(allScores.begin()->second.size() - numExcludedReads, allScores.size() - leastRecentIdenticalAncestors.size() - 1);
Expand All @@ -60,7 +113,7 @@ void noFilter(
std::cerr << "Finished noFilter: " << nodes.size() << " nodes" << std::endl;
}

std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::string& node, int n_order) {
std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::string& node, int n_order, std::unordered_map<std::string, std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges) {
std::unordered_set<std::string> result;

if (T->allNodes.find(node) == T->allNodes.end()) {
Expand All @@ -85,14 +138,24 @@ std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::stri
// Add children to the queue
for (Node* child : T->allNodes.at(current_node)->children) {
if (visited.find(child->identifier) == visited.end()) {
bfs_queue.push({child->identifier, level + 1});
const auto& childKminmerChanges = kminmerChanges.find(child->identifier)->second;
if (childKminmerChanges.first.size() > 0) {
bfs_queue.push({child->identifier, level + 1});
} else {
bfs_queue.push({child->identifier, level});
}
visited.insert(child->identifier);
}
}
}
// Add parent to the queue (if exists)
if (T->allNodes.at(current_node)->parent && visited.find(T->allNodes.at(current_node)->parent->identifier) == visited.end()) {
bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level + 1});
const auto& parentKminmerChanges = kminmerChanges.find(T->allNodes.at(current_node)->parent->identifier)->second;
if (parentKminmerChanges.first.size() > 0) {
bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level + 1});
} else {
bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level});
}
visited.insert(T->allNodes.at(current_node)->parent->identifier);
}
}
Expand All @@ -102,10 +165,11 @@ std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::stri

}

void filter_method_1(
void filter_method_uhs(
std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets,
const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order
const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order,
std::unordered_map<std::string, std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges
) {
std::cerr << "Filter method 1: filter out haplotypes that do not have a unique best read score" << std::endl;

Expand All @@ -127,23 +191,23 @@ void filter_method_1(
for (auto& vec : scoresMatrix) vec.resize(allScores.size() - leastRecentIdenticalAncestors.size());


std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
size_t colIndex = 0;
for (const auto& [nodeName, nodeScores] : allScores) {
if (leastRecentIdenticalAncestors.count(nodeName)) {
continue;
}
allNodes[colIndex] = nodeName;
size_t rowIndex = 0;
for (size_t j = 0; j < nodeScores.size(); ++j) {
if (excludeReads[j]) {
std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
size_t colIndex = 0;
for (const auto& [nodeName, nodeScores] : allScores) {
if (leastRecentIdenticalAncestors.count(nodeName)) {
continue;
}
scoresMatrix[rowIndex][colIndex] = nodeScores[j].first;
++rowIndex;
allNodes[colIndex] = nodeName;
size_t rowIndex = 0;
for (size_t j = 0; j < nodeScores.size(); ++j) {
if (excludeReads[j]) {
continue;
}
scoresMatrix[rowIndex][colIndex] = nodeScores[j].first;
++rowIndex;
}
++colIndex;
}
++colIndex;
}

for (size_t i = 0; i < scoresMatrix.size(); ++i) {
const auto& curReadScores = scoresMatrix[i];
Expand All @@ -169,7 +233,7 @@ for (const auto& [nodeName, nodeScores] : allScores) {
}
identicalSet.insert(filteredNode);
for (const auto& node : identicalSet) {
std::unordered_set<std::string> nth_order_neighbors = get_nth_order_neighbors(T, node, n_order);
std::unordered_set<std::string> nth_order_neighbors = get_nth_order_neighbors(T, node, n_order, kminmerChanges);
for (const auto& neighbor : nth_order_neighbors) {
std::string neighbor_leastRecentIdenticalAncestor = neighbor;
if (leastRecentIdenticalAncestors.find(neighbor) != leastRecentIdenticalAncestors.end()) {
Expand All @@ -184,6 +248,163 @@ for (const auto& [nodeName, nodeScores] : allScores) {
}
}

if (nodes.size() == 0) {
nodes = allNodes;
}

probs.resize(scoresMatrix.size(), nodes.size());
colIndex = 0;
for (const auto& node : nodes) {
const auto& curNodeScores = allScores.find(node)->second;
size_t rowIndex = 0;
for (size_t j = 0; j < curNodeScores.size(); ++j) {
if (excludeReads[j]) continue;
probs(rowIndex, colIndex) = curNodeScores[j].second;
++rowIndex;
}
++colIndex;
}

std::cout << "Finished filter method 1: " << nodes.size() << " nodes.. prob matrix size: " << probs.rows() << " x " << probs.cols() << std::endl;
}

std::unordered_set<std::string> get_neighborhoods(Tree *T, const std::string& node) {
std::unordered_set<std::string> result;

// Check if the node exists in the tree
if (T->allNodes.find(node) == T->allNodes.end()) {
return result;
}

Node* currentNode = T->allNodes[node];

if (currentNode->children.empty()) {
// Case 1: Leaf Node - go up 2 levels to the grandparent
if (!currentNode->parent || !currentNode->parent->parent) {
return result; // If no grandparent exists, return empty set
}

Node* grandparent = currentNode->parent->parent;

// Add the grandparent and all descendants of the grandparent
std::queue<Node*> queue;
queue.push(grandparent);

while (!queue.empty()) {
Node* descendant = queue.front();
queue.pop();

result.insert(descendant->identifier);

for (Node* child : descendant->children) {
queue.push(child);
}
}
} else {
// Case 2: Internal Node - go up 1 level to the parent
if (!currentNode->parent) {
return result; // If no parent exists, return empty set
}

Node* parent = currentNode->parent;

// Add all nodes 2 edges down from the parent
for (Node* child : parent->children) {
for (Node* grandchild : child->children) {
result.insert(grandchild->identifier);
}
}
}

return result;
}

void filter_method_uhs_2(
std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets,
const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order
) {
std::cerr << "Filter method 1: filter out haplotypes that do not have a unique best read score" << std::endl;

std::unordered_set<std::string> filteredNodes;
size_t numExcludedReads = std::count(excludeReads.begin(), excludeReads.end(), true);

std::cerr << "Excluding " << numExcludedReads << " reads with high number of duplicates" << std::endl;

size_t totalNumReads = allScores.begin()->second.size();
if (totalNumReads < numExcludedReads) {
std::cerr << "Error: total number of reads (" << totalNumReads << ") is less than the number of excluded reads (" << numExcludedReads << ")" << std::endl;
exit(1);
}
std::vector<std::vector<int32_t>> scoresMatrix(totalNumReads - numExcludedReads);
if (allScores.size() < leastRecentIdenticalAncestors.size()) {
std::cerr << "Error: number of nodes (" << allScores.size() << ") is less than the number of least recent identical ancestors (" << leastRecentIdenticalAncestors.size() << ")" << std::endl;
exit(1);
}
for (auto& vec : scoresMatrix) vec.resize(allScores.size() - leastRecentIdenticalAncestors.size());


std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
size_t colIndex = 0;
for (const auto& [nodeName, nodeScores] : allScores) {
if (leastRecentIdenticalAncestors.count(nodeName)) {
continue;
}
allNodes[colIndex] = nodeName;
size_t rowIndex = 0;
for (size_t j = 0; j < nodeScores.size(); ++j) {
if (excludeReads[j]) {
continue;
}
scoresMatrix[rowIndex][colIndex] = nodeScores[j].first;
++rowIndex;
}
++colIndex;
}

for (size_t i = 0; i < scoresMatrix.size(); ++i) {
const auto& curReadScores = scoresMatrix[i];
int32_t highestScore = 0;
std::string highestScoringNode = "";
for (size_t j = 0; j < curReadScores.size(); ++j) {
if (curReadScores[j] == 0) continue;
if (curReadScores[j] > highestScore) {
highestScore = curReadScores[j];
highestScoringNode = allNodes[j];
} else if (curReadScores[j] == highestScore) {
highestScoringNode = "";
}
}
if (highestScoringNode != "") filteredNodes.insert(highestScoringNode);
}

std::unordered_set<std::string> nodes_seen;
for (const auto& filteredNode : filteredNodes) {
std::unordered_set<std::string> identicalSet{};
if (identicalSets.find(filteredNode) != identicalSets.end()) {
identicalSet = identicalSets.at(filteredNode);
}
identicalSet.insert(filteredNode);
for (const auto& node : identicalSet) {
std::unordered_set<std::string> neighborhoods = get_neighborhoods(T, node);
for (const auto& neighbor : neighborhoods) {
std::string neighbor_leastRecentIdenticalAncestor = neighbor;
if (leastRecentIdenticalAncestors.find(neighbor) != leastRecentIdenticalAncestors.end()) {
neighbor_leastRecentIdenticalAncestor = leastRecentIdenticalAncestors.at(neighbor);
}
if (nodes_seen.find(neighbor_leastRecentIdenticalAncestor) == nodes_seen.end() &&
leastRecentIdenticalAncestors.find(neighbor_leastRecentIdenticalAncestor) == leastRecentIdenticalAncestors.end()) {
nodes.push_back(neighbor_leastRecentIdenticalAncestor);
nodes_seen.insert(neighbor_leastRecentIdenticalAncestor);
}
}
}
}

if (nodes.size() == 0) {
nodes = allNodes;
}

probs.resize(scoresMatrix.size(), nodes.size());
colIndex = 0;
for (const auto& node : nodes) {
Expand Down
Loading

0 comments on commit 3eaea51

Please sign in to comment.