collapsing edges with 0 add/sub kminmers during preem filter method b…

…y unique highest read score
amkram · Dec 5, 2024 · 3eaea51 · 3eaea51
1 parent f217ee0
commit 3eaea51
Show file tree

Hide file tree

Showing 3 changed files with 276 additions and 41 deletions.
diff --git a/src/haplotype_filter.hpp b/src/haplotype_filter.hpp
@@ -26,16 +26,69 @@ using namespace panmanUtils;
 
 namespace haplotype_filter {
 
+void exclude_noninformative_reads(
+  std::vector<bool>& excludeReads, std::vector<std::string>& selectedNodes,
+  const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores
+) {
+  const auto& refScores = allScores.at(selectedNodes[0]);
+  std::vector<bool> informativeReads;
+  if (selectedNodes.size() > 1) {
+    informativeReads.resize(refScores.size(), false);
+  } else {
+    informativeReads = std::vector<bool>(refScores.size(), true);
+  }
+
+  for (size_t i = 1; i < selectedNodes.size(); ++i) {
+    const auto& curScores = allScores.at(selectedNodes[i]);
+    for (size_t j = 0; j < curScores.size(); ++j) {
+      if (excludeReads[j]) {
+        informativeReads[j] = true;
+        continue;
+      }
+      if (curScores[j].first != refScores[j].first) {
+        informativeReads[j] = true;
+      }
+    }
+  }
+
+  size_t numInformativeReads = std::count(informativeReads.begin(), informativeReads.end(), true);
+  std::cout << "Excluding " << informativeReads.size() - numInformativeReads << " reads that are not informative" << std::endl;
+  std::cerr << "Excluding " << informativeReads.size() - numInformativeReads << " reads that are not informative" << std::endl;
+
+  for (size_t i = 0; i < informativeReads.size(); ++i) {
+    if (excludeReads[i]) continue;
+    if (!informativeReads[i]) {
+      excludeReads[i] = true;
+    }
+  }
+}
+
 void noFilter(
   std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
   const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads,
-  const std::string& excludeNode, const std::vector<bool>& excludeReads
+  const std::string& excludeNode, std::vector<bool>& excludeReads
 ) {
   std::cerr << "No preEM filtering" << std::endl;
 
+  size_t numHighDuplicateReads = std::count(excludeReads.begin(), excludeReads.end(), true);
+
+  std::cerr << "Excluding " << numHighDuplicateReads << " reads with high number of duplicates" << std::endl;
+
+  std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
+  size_t index = 0;
+  for (const auto& [nodeName, nodeScores] : allScores) {
+    if (leastRecentIdenticalAncestors.count(nodeName)) {
+      continue;
+    }
+    allNodes[index] = nodeName;
+    ++index;
+  }
+
+  exclude_noninformative_reads(excludeReads, allNodes, allScores);
+
   size_t numExcludedReads = std::count(excludeReads.begin(), excludeReads.end(), true);
 
-  std::cerr << "Excluding " << numExcludedReads << " reads with high number of duplicates" << std::endl;
+  std::cerr << "Excluding " << numExcludedReads << " reads in total" << std::endl;
 
   if (!excludeNode.empty()) {
     probs.resize(allScores.begin()->second.size() - numExcludedReads, allScores.size() - leastRecentIdenticalAncestors.size() - 1);
@@ -60,7 +113,7 @@ void noFilter(
   std::cerr << "Finished noFilter: " << nodes.size() << " nodes" << std::endl;
 }
 
-std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::string& node, int n_order) {
+std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::string& node, int n_order, std::unordered_map<std::string, std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges) {
   std::unordered_set<std::string> result;
 
   if (T->allNodes.find(node) == T->allNodes.end()) {
@@ -85,14 +138,24 @@ std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::stri
         // Add children to the queue
         for (Node* child : T->allNodes.at(current_node)->children) {
           if (visited.find(child->identifier) == visited.end()) {
-            bfs_queue.push({child->identifier, level + 1});
+            const auto& childKminmerChanges = kminmerChanges.find(child->identifier)->second;
+            if (childKminmerChanges.first.size() > 0) {
+              bfs_queue.push({child->identifier, level + 1});
+            } else {
+              bfs_queue.push({child->identifier, level});
+            }
             visited.insert(child->identifier);
           }
         }
       }
       // Add parent to the queue (if exists)
       if (T->allNodes.at(current_node)->parent && visited.find(T->allNodes.at(current_node)->parent->identifier) == visited.end()) {
-        bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level + 1});
+        const auto& parentKminmerChanges = kminmerChanges.find(T->allNodes.at(current_node)->parent->identifier)->second;
+        if (parentKminmerChanges.first.size() > 0) {
+          bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level + 1});
+        } else {
+          bfs_queue.push({T->allNodes.at(current_node)->parent->identifier, level});
+        }
         visited.insert(T->allNodes.at(current_node)->parent->identifier);
       }
     }
@@ -102,10 +165,11 @@ std::unordered_set<std::string> get_nth_order_neighbors(Tree *T, const std::stri
 
 }
 
-void filter_method_1(
+void filter_method_uhs(
   std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
   const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets,
-  const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order
+  const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order,
+  std::unordered_map<std::string, std::pair<std::vector<int32_t>, std::vector<int32_t>>>& kminmerChanges
 ) {
   std::cerr << "Filter method 1: filter out haplotypes that do not have a unique best read score" << std::endl;
 
@@ -127,23 +191,23 @@ void filter_method_1(
   for (auto& vec : scoresMatrix) vec.resize(allScores.size() - leastRecentIdenticalAncestors.size());
 
 
-std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
-size_t colIndex = 0;
-for (const auto& [nodeName, nodeScores] : allScores) {
-  if (leastRecentIdenticalAncestors.count(nodeName)) {
-    continue;
-  }
-  allNodes[colIndex] = nodeName;
-  size_t rowIndex = 0;
-  for (size_t j = 0; j < nodeScores.size(); ++j) {
-    if (excludeReads[j]) {
+  std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
+  size_t colIndex = 0;
+  for (const auto& [nodeName, nodeScores] : allScores) {
+    if (leastRecentIdenticalAncestors.count(nodeName)) {
       continue;
     }
-    scoresMatrix[rowIndex][colIndex] = nodeScores[j].first;
-    ++rowIndex;
+    allNodes[colIndex] = nodeName;
+    size_t rowIndex = 0;
+    for (size_t j = 0; j < nodeScores.size(); ++j) {
+      if (excludeReads[j]) {
+        continue;
+      }
+      scoresMatrix[rowIndex][colIndex] = nodeScores[j].first;
+      ++rowIndex;
+    }
+    ++colIndex;
   }
-  ++colIndex;
-}
 
   for (size_t i = 0; i < scoresMatrix.size(); ++i) {
     const auto& curReadScores = scoresMatrix[i];
@@ -169,7 +233,7 @@ for (const auto& [nodeName, nodeScores] : allScores) {
     }
     identicalSet.insert(filteredNode);
     for (const auto& node : identicalSet) {
-      std::unordered_set<std::string> nth_order_neighbors = get_nth_order_neighbors(T, node, n_order);
+      std::unordered_set<std::string> nth_order_neighbors = get_nth_order_neighbors(T, node, n_order, kminmerChanges);
       for (const auto& neighbor : nth_order_neighbors) {
         std::string neighbor_leastRecentIdenticalAncestor = neighbor;
         if (leastRecentIdenticalAncestors.find(neighbor) != leastRecentIdenticalAncestors.end()) {
@@ -184,6 +248,163 @@ for (const auto& [nodeName, nodeScores] : allScores) {
     }
   } 
 
+  if (nodes.size() == 0) {
+    nodes = allNodes;
+  }
+
+  probs.resize(scoresMatrix.size(), nodes.size());
+  colIndex = 0;
+  for (const auto& node : nodes) {
+    const auto& curNodeScores = allScores.find(node)->second;
+    size_t rowIndex = 0;
+    for (size_t j = 0; j < curNodeScores.size(); ++j) {
+      if (excludeReads[j]) continue;
+      probs(rowIndex, colIndex) = curNodeScores[j].second;
+      ++rowIndex;
+    }
+    ++colIndex;
+  }
+
+  std::cout << "Finished filter method 1: " << nodes.size() << " nodes.. prob matrix size: " << probs.rows() << " x " << probs.cols() << std::endl;
+}
+
+std::unordered_set<std::string> get_neighborhoods(Tree *T, const std::string& node) {
+  std::unordered_set<std::string> result;
+
+  // Check if the node exists in the tree
+  if (T->allNodes.find(node) == T->allNodes.end()) {
+    return result;
+  }
+
+  Node* currentNode = T->allNodes[node];
+
+  if (currentNode->children.empty()) {
+    // Case 1: Leaf Node - go up 2 levels to the grandparent
+    if (!currentNode->parent || !currentNode->parent->parent) {
+      return result; // If no grandparent exists, return empty set
+    }
+
+    Node* grandparent = currentNode->parent->parent;
+
+    // Add the grandparent and all descendants of the grandparent
+    std::queue<Node*> queue;
+    queue.push(grandparent);
+
+    while (!queue.empty()) {
+      Node* descendant = queue.front();
+      queue.pop();
+
+      result.insert(descendant->identifier);
+
+      for (Node* child : descendant->children) {
+        queue.push(child);
+      }
+    }
+  } else {
+    // Case 2: Internal Node - go up 1 level to the parent
+    if (!currentNode->parent) {
+      return result; // If no parent exists, return empty set
+    }
+
+    Node* parent = currentNode->parent;
+
+    // Add all nodes 2 edges down from the parent
+    for (Node* child : parent->children) {
+      for (Node* grandchild : child->children) {
+        result.insert(grandchild->identifier);
+      }
+    }
+  }
+
+  return result;
+}
+
+void filter_method_uhs_2(
+  std::vector<std::string>& nodes, Eigen::MatrixXd& probs, const std::unordered_map<std::string, tbb::concurrent_vector<std::pair<int32_t, double>>>& allScores,
+  const std::unordered_map<std::string, std::string>& leastRecentIdenticalAncestors, const std::unordered_map<std::string, std::unordered_set<std::string>>& identicalSets,
+  const std::vector<bool>& lowScoreReads, const size_t& numLowScoreReads, const std::string& excludeNode, const std::vector<bool>& excludeReads, Tree *T, int n_order
+) {
+  std::cerr << "Filter method 1: filter out haplotypes that do not have a unique best read score" << std::endl;
+
+  std::unordered_set<std::string> filteredNodes;
+  size_t numExcludedReads = std::count(excludeReads.begin(), excludeReads.end(), true);
+
+  std::cerr << "Excluding " << numExcludedReads << " reads with high number of duplicates" << std::endl;
+
+  size_t totalNumReads = allScores.begin()->second.size();
+  if (totalNumReads < numExcludedReads) {
+    std::cerr << "Error: total number of reads (" << totalNumReads << ") is less than the number of excluded reads (" << numExcludedReads << ")" << std::endl;
+    exit(1);
+  }
+  std::vector<std::vector<int32_t>> scoresMatrix(totalNumReads - numExcludedReads);
+  if (allScores.size() < leastRecentIdenticalAncestors.size()) {
+    std::cerr << "Error: number of nodes (" << allScores.size() << ") is less than the number of least recent identical ancestors (" << leastRecentIdenticalAncestors.size() << ")" << std::endl;
+    exit(1);
+  }
+  for (auto& vec : scoresMatrix) vec.resize(allScores.size() - leastRecentIdenticalAncestors.size());
+
+
+  std::vector<std::string> allNodes(allScores.size() - leastRecentIdenticalAncestors.size());
+  size_t colIndex = 0;
+  for (const auto& [nodeName, nodeScores] : allScores) {
+    if (leastRecentIdenticalAncestors.count(nodeName)) {
+      continue;
+    }
+    allNodes[colIndex] = nodeName;
+    size_t rowIndex = 0;
+    for (size_t j = 0; j < nodeScores.size(); ++j) {
+      if (excludeReads[j]) {
+        continue;
+      }
+      scoresMatrix[rowIndex][colIndex] = nodeScores[j].first;
+      ++rowIndex;
+    }
+    ++colIndex;
+  }
+
+  for (size_t i = 0; i < scoresMatrix.size(); ++i) {
+    const auto& curReadScores = scoresMatrix[i];
+    int32_t highestScore = 0;
+    std::string highestScoringNode = "";
+    for (size_t j = 0; j < curReadScores.size(); ++j) {
+      if (curReadScores[j] == 0) continue;
+      if (curReadScores[j] > highestScore) {
+        highestScore = curReadScores[j];
+        highestScoringNode = allNodes[j];
+      } else if (curReadScores[j] == highestScore) {
+        highestScoringNode = "";
+      }
+    }
+    if (highestScoringNode != "") filteredNodes.insert(highestScoringNode);
+  }
+
+  std::unordered_set<std::string> nodes_seen;
+  for (const auto& filteredNode : filteredNodes) {
+    std::unordered_set<std::string> identicalSet{};
+    if (identicalSets.find(filteredNode) != identicalSets.end()) {
+      identicalSet = identicalSets.at(filteredNode);
+    }
+    identicalSet.insert(filteredNode);
+    for (const auto& node : identicalSet) {
+      std::unordered_set<std::string> neighborhoods = get_neighborhoods(T, node);
+      for (const auto& neighbor : neighborhoods) {
+        std::string neighbor_leastRecentIdenticalAncestor = neighbor;
+        if (leastRecentIdenticalAncestors.find(neighbor) != leastRecentIdenticalAncestors.end()) {
+          neighbor_leastRecentIdenticalAncestor = leastRecentIdenticalAncestors.at(neighbor);
+        }
+        if (nodes_seen.find(neighbor_leastRecentIdenticalAncestor) == nodes_seen.end() &&
+            leastRecentIdenticalAncestors.find(neighbor_leastRecentIdenticalAncestor) == leastRecentIdenticalAncestors.end()) {
+          nodes.push_back(neighbor_leastRecentIdenticalAncestor);
+          nodes_seen.insert(neighbor_leastRecentIdenticalAncestor);
+        }
+      }
+    }
+  } 
+
+  if (nodes.size() == 0) {
+    nodes = allNodes;
+  }
+
   probs.resize(scoresMatrix.size(), nodes.size());
   colIndex = 0;
   for (const auto& node : nodes) {