Fix preprocessing lincs (theislab#11)

* Ignore config full lincs
* Add pp for full lincs dataset
* Minor changes
* Fix de keys derived from 'cov_drug_dose_name'
* Move imports for faster db access
* Change dataset to "_pp" for full lincs
* Minor changes
* Move SMILES addition to pp folder
* Replace '/' in dict keys for .h5ad compatibility
* Add check for keys in `adata.uns['rank_genes_groups_cov']`
AsclepiusInformatica · Aug 9, 2021 · 6167a9a · 6167a9a
1 parent bb65b05
commit 6167a9a
Showing 4 changed files with 946 additions and 674 deletions.
diff --git a/.gitignore b/.gitignore
@@ -144,3 +144,4 @@ dmypy.json
 # VSCode 
 settings.json
 
+scripts/config_full_lincs.yaml
diff --git a/compert/seml_sweep_icb.py b/compert/seml_sweep_icb.py
@@ -6,14 +6,10 @@
 import os
 import json
 import time
-import torch
 import seml
 import numpy as np
 import pandas as pd
-from compert.train import custom_collate, evaluate
-from compert.data import load_dataset_splits
-from compert.model import ComPert
-from compert.graph_model.graph_model import Drugemb
+import torch
 
 ex = Experiment()
 seml.setup_logger(ex)
@@ -60,6 +56,8 @@ def init_dataset(self, dataset_type: str, data_params: dict):
         Since we set prefix="dataset ", this method only gets passed the respective sub-dictionary, enabling a modular
         experiment design.
         """
+        from compert.data import load_dataset_splits
+
         if dataset_type == "kang":
             self.datasets, self.dataset = load_dataset_splits(
                 **data_params, return_dataset=True
@@ -75,6 +73,8 @@ def init_dataset(self, dataset_type: str, data_params: dict):
 
     @ex.capture(prefix="model")
     def init_drug_embedding(self, gnn_model: dict, hparams: dict):
+        from compert.graph_model.graph_model import Drugemb
+
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model_type = gnn_model["model_type"]
         dim = hparams["dim"]
@@ -100,6 +100,8 @@ def init_drug_embedding(self, gnn_model: dict, hparams: dict):
 
     @ex.capture(prefix="model")
     def init_model(self, hparams: dict, additional_params: dict):
+        from compert.model import ComPert
+
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         self.autoencoder = ComPert(
@@ -113,6 +115,8 @@ def init_model(self, hparams: dict, additional_params: dict):
         )
 
     def update_datasets(self):
+        from compert.train import custom_collate
+
         self.datasets.update(
             {
                 "loader_tr": torch.utils.data.DataLoader(
@@ -131,6 +135,7 @@ def init_all(self, seed):
         """
         Sequentially run the sub-initializers of the experiment.
         """
+
         self.seed = seed
         self.init_dataset()
         self.init_drug_embedding()
@@ -147,6 +152,8 @@ def train(
         save_checkpoints: bool,
         save_dir: str,
     ):
+        from compert.train import evaluate
+
         print(f"CWD: {os.getcwd()}")
         print(f"Save dir: {save_dir}")
         print(f"Is path?: {os.path.exists(save_dir)}")

diff --git a/preprocessing/lincs.ipynb b/preprocessing/lincs.ipynb
diff --git a/notebooks/lincs_SMILES.ipynb b/preprocessing/lincs_SMILES.ipynb
Original file line number	Diff line number	Diff line change
		@@ -144,3 +144,4 @@ dmypy.json
		# VSCode
		settings.json

		scripts/config_full_lincs.yaml