Skip to content

Commit c939b7a

Browse files
committed
GSVar: improved CNV liftover to HG38 (fuzzy matching)
1 parent 0464958 commit c939b7a

File tree

2 files changed

+61
-31
lines changed

2 files changed

+61
-31
lines changed

src/GSvar/NGSDReplicationWidget.cpp

+60-30
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "NGSD.h"
33
#include "GSvarHelper.h"
44
#include "Settings.h"
5+
#include <QDir>
56

67
NGSDReplicationWidget::NGSDReplicationWidget(QWidget* parent)
78
: QWidget(parent)
@@ -440,7 +441,7 @@ void NGSDReplicationWidget::replicateVariantData()
440441
}
441442

442443
//TODO NGSDReplicationWidget:
443-
//- CNVs: report_configuration_cnv, variant_validation CNVs
444+
//- CNVs: variant_validation CNVs
444445
//- SVs: report_configuration_sv, variant_validation SVs
445446
//- CNVs somatic: somatic_report_configuration_cnv
446447
//- misc: gaps, cfdna_panel_genes, cfdna_panels
@@ -539,7 +540,7 @@ void NGSDReplicationWidget::performPostChecks()
539540
}
540541
}
541542

542-
int NGSDReplicationWidget::liftOverCnv(int source_cnv_id, int callset_id, bool debug_output)
543+
int NGSDReplicationWidget::liftOverCnv(int source_cnv_id, int callset_id, QString& error_message)
543544
{
544545
CopyNumberVariant var = db_source_->cnv(source_cnv_id);
545546

@@ -551,41 +552,39 @@ int NGSDReplicationWidget::liftOverCnv(int source_cnv_id, int callset_id, bool d
551552
}
552553
catch(Exception& e)
553554
{
554-
if (debug_output) qDebug() << "-1" << var.toString() << e.message();
555+
error_message = "lift-over failed: " + e.message().replace("\t", " ").replace("<br>", " ").replace("&nbsp;", " ");
555556
return -1;
556557
}
557558

558559
//check new chromosome is ok
559560
if (!coords.chr().isNonSpecial())
560561
{
561-
if (debug_output) qDebug() << "-2" << var.toString() << coords.chr().str()+":"+QString::number(coords.start())+"-"+QString::number(coords.end());
562+
563+
error_message = "chromosome is now special chromosome: " + coords.chr().strNormalized(true);
562564
return -2;
563565
}
564566

565-
//check sequence context is the same (ref +-5 bases). If it is not, the strand might have changed, e.g. in NIPA1, GDF2, ANKRD35, TPTE, ...
566-
bool strand_changed = false;
567-
Sequence context_old = genome_index_->seq(var.chr(), var.start()-5, 10);
568-
Sequence context_new = genome_index_hg38_->seq(coords.chr(), coords.start()-5, 10);
569-
if (context_old!=context_new)
567+
//check for CNV in target NGSD
568+
QString cn = db_source_->getValue("SELECT cn FROM cnv WHERE id="+QString::number(source_cnv_id)).toString();
569+
QString cnv_id = db_target_->getValue("SELECT id FROM cnv WHERE cnv_callset_id="+QString::number(callset_id)+" AND chr='"+coords.chr().strNormalized(true)+"' AND start='"+QString::number(coords.start())+"' AND end='"+QString::number(coords.start())+"' AND cn="+cn, true).toString();
570+
if (cnv_id.isEmpty())
570571
{
571-
context_new.reverseComplement();
572-
if (context_old==context_new)
573-
{
574-
strand_changed = true;
575-
}
576-
else
572+
//no exact match, try fuzzy match (at least 70% of original/liftover size)
573+
int size_hg19 = var.size();
574+
int size_hg38 = coords.length();
575+
576+
SqlQuery query = db_target_->getQuery();
577+
query.exec("SELECT * FROM cnv WHERE cnv_callset_id='" + QString::number(callset_id) + "' AND cn='"+cn+"' AND chr='" + coords.chr().strNormalized(true) + "' AND end>" + QString::number(coords.start()) + " AND start<" + QString::number(coords.end()) + "");
578+
while (query.next())
577579
{
578-
context_new.reverseComplement();
579-
if (debug_output) qDebug() << "-3" << var.toString() << coords.chr().str()+":"+QString::number(coords.start())+"-"+QString::number(coords.end()) << "old_context="+context_old << "new_context="+context_new;
580-
return -3;
580+
int size = query.value("end").toInt() - query.value("start").toInt();
581+
if (size > 0.7 * size_hg19 && size > 0.7 * size_hg38)
582+
{
583+
return query.value("id").toInt();
584+
}
581585
}
582-
}
583586

584-
//check for CNV in target NGSD
585-
QString cnv_id = db_target_->cnvId(CopyNumberVariant(coords.chr(), coords.start(), coords.end()), callset_id, false);
586-
if (cnv_id.isEmpty())
587-
{
588-
if (debug_output) qDebug() << "-4" << var.toString() << coords.chr().str()+":"+QString::number(coords.start())+"-"+QString::number(coords.end()) << "strand_changed="+QString(strand_changed?"yes":"no");
587+
error_message = "CNV not found in NGSD";
589588
return -4;
590589
}
591590

@@ -594,6 +593,11 @@ int NGSDReplicationWidget::liftOverCnv(int source_cnv_id, int callset_id, bool d
594593

595594
void NGSDReplicationWidget::updateReportConfigCnv()
596595
{
596+
QFile file(QCoreApplication::applicationDirPath() + QDir::separator() + "liftover_report_config_cnvs.tsv");
597+
file.open(QIODevice::WriteOnly);
598+
QTextStream debug_stream(&file);
599+
debug_stream << "#ps\tCNV\tcn\tsize_kb\tregs\tll_per_regs\tfound_in_HG38\terror\tcomments\n";
600+
597601
QString table = "report_configuration_cnv";
598602
QStringList fields = db_target_->tableInfo(table).fieldNames();
599603

@@ -698,6 +702,7 @@ void NGSDReplicationWidget::updateReportConfigCnv()
698702
int target_cnv_id = -1;
699703

700704
QString ps_id = db_source_->getValue("SELECT processed_sample_id FROM report_configuration WHERE id=" + query.value("report_configuration_id").toString()).toString();
705+
QString ps = db_source_->processedSampleName(ps_id);
701706
QVariant callset_id = db_target_->getValue("SELECT id FROM cnv_callset WHERE processed_sample_id=" + ps_id);
702707
if (!callset_id.isValid())
703708
{
@@ -707,19 +712,44 @@ void NGSDReplicationWidget::updateReportConfigCnv()
707712

708713
bool causal_variant = query.value("causal").toInt()==1;
709714
bool pathogenic_variant = query.value("class").toInt()>3;
710-
target_cnv_id = liftOverCnv(source_cnv_id, callset_id.toInt(), causal_variant||pathogenic_variant);
715+
QString error_message = "";
716+
target_cnv_id = liftOverCnv(source_cnv_id, callset_id.toInt(), error_message);
717+
718+
int cn = db_source_->getValue("SELECT cn FROM cnv WHERE id=" + QString::number(source_cnv_id), false).toInt();
719+
int ll = -1;
720+
int regs = -1;
721+
QString metrics = db_source_->getValue("SELECT quality_metrics FROM cnv WHERE id=" + QString::number(source_cnv_id), false).toString();
722+
foreach(QString metric, metrics.split(","))
723+
{
724+
metric.replace("\"", "");
725+
metric.replace("{", "");
726+
metric.replace("}", "");
727+
if (metric.contains("loglikelihood"))
728+
{
729+
ll = metric.split(":")[1].toInt();
730+
}
731+
if (metric.contains("regions") || metric.contains("no_of_regions"))
732+
{
733+
regs = metric.split(":")[1].toInt();
734+
}
735+
}
736+
QStringList comments;
737+
if (causal_variant) comments << "causal";
738+
if (pathogenic_variant) comments << "pathogenic";
739+
CopyNumberVariant var = db_source_->cnv(source_cnv_id);
740+
741+
debug_stream << ps << "\t" << var.toString() << "\t" << cn << "\t" << QString::number((double)var.size()/1000, 'f', 2) << "\t" << regs << "\t" << QString::number((double)ll/regs, 'f', 2) << "\t" << (target_cnv_id>=0 ? "yes" : "no") << "\t" << "\t" << error_message << comments.join(", ") << "\n";
742+
debug_stream.flush();
711743

712744
//warn if causal/pathogenic CNV could not be lifed
713745
if (target_cnv_id<0)
714746
{
715747
if (causal_variant)
716748
{
717-
QString ps = db_source_->processedSampleName(ps_id);
718749
addWarning("Causal CNV " + db_source_->cnv(source_cnv_id).toString() + " of sample " + ps + " could not be lifted (" + QString::number(target_cnv_id) + ")!");
719750
}
720-
else if (pathogenic_variant && target_cnv_id<0)
751+
else if (pathogenic_variant)
721752
{
722-
QString ps = db_source_->processedSampleName(ps_id);
723753
addWarning("Pathogenic CNV " + db_source_->cnv(source_cnv_id).toString() + " of sample " + ps + " (class " + query.value("class").toString() + ") could not be lifted (" + QString::number(target_cnv_id) + ")!");
724754
}
725755
}
@@ -738,7 +768,7 @@ void NGSDReplicationWidget::updateReportConfigCnv()
738768
}
739769
try
740770
{
741-
//TODO q_add.exec();
771+
q_add.exec();
742772
}
743773
catch (Exception& e)
744774
{
@@ -765,7 +795,7 @@ void NGSDReplicationWidget::updateReportConfigCnv()
765795
QStringList details;
766796
if (c_added>0) details << "added " + QString::number(c_added);
767797
if (c_not_mappable>0) details << "skipped unmappable CNVs " + QString::number(c_not_mappable);
768-
if (c_not_in_ngsd>0) details << "skipped CNVs not in NGSD " + QString::number(c_not_in_ngsd);
798+
if (c_not_in_ngsd>0) details << "skipped CNVs not found in NGSD " + QString::number(c_not_in_ngsd);
769799
if (c_no_callset>0) details << "skipped CNVs of samples without callset " + QString::number(c_no_callset);
770800
if (c_kept>0) details << "kept " + QString::number(c_kept);
771801
if (c_updated>0) details << "updated " + QString::number(c_updated);

src/GSvar/NGSDReplicationWidget.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ protected slots:
3131

3232
int liftOverVariant(int source_variant_id, bool debug_output);
3333
void updateTable(QString table, bool contains_variant_id=false, QString where_clause="");
34-
int liftOverCnv(int source_cnv_id, int callset_id, bool debug_output);
34+
int liftOverCnv(int source_cnv_id, int callset_id, QString& error_message);
3535
void updateReportConfigCnv();
3636

3737
private:

0 commit comments

Comments
 (0)