diff --git a/.gitignore b/.gitignore
index 9f9b444..b0f4bf7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,4 @@ dmypy.json
 # VSCode 
 settings.json
 
+scripts/config_full_lincs.yaml
diff --git a/compert/seml_sweep_icb.py b/compert/seml_sweep_icb.py
index 987a771..eaaf1c9 100644
--- a/compert/seml_sweep_icb.py
+++ b/compert/seml_sweep_icb.py
@@ -6,14 +6,10 @@
 import os
 import json
 import time
-import torch
 import seml
 import numpy as np
 import pandas as pd
-from compert.train import custom_collate, evaluate
-from compert.data import load_dataset_splits
-from compert.model import ComPert
-from compert.graph_model.graph_model import Drugemb
+import torch
 
 ex = Experiment()
 seml.setup_logger(ex)
@@ -60,6 +56,8 @@ def init_dataset(self, dataset_type: str, data_params: dict):
         Since we set prefix="dataset ", this method only gets passed the respective sub-dictionary, enabling a modular
         experiment design.
         """
+        from compert.data import load_dataset_splits
+
         if dataset_type == "kang":
             self.datasets, self.dataset = load_dataset_splits(
                 **data_params, return_dataset=True
@@ -75,6 +73,8 @@ def init_dataset(self, dataset_type: str, data_params: dict):
 
     @ex.capture(prefix="model")
     def init_drug_embedding(self, gnn_model: dict, hparams: dict):
+        from compert.graph_model.graph_model import Drugemb
+
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model_type = gnn_model["model_type"]
         dim = hparams["dim"]
@@ -100,6 +100,8 @@ def init_drug_embedding(self, gnn_model: dict, hparams: dict):
 
     @ex.capture(prefix="model")
     def init_model(self, hparams: dict, additional_params: dict):
+        from compert.model import ComPert
+
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         self.autoencoder = ComPert(
@@ -113,6 +115,8 @@ def init_model(self, hparams: dict, additional_params: dict):
         )
 
     def update_datasets(self):
+        from compert.train import custom_collate
+
         self.datasets.update(
             {
                 "loader_tr": torch.utils.data.DataLoader(
@@ -131,6 +135,7 @@ def init_all(self, seed):
         """
         Sequentially run the sub-initializers of the experiment.
         """
+
         self.seed = seed
         self.init_dataset()
         self.init_drug_embedding()
@@ -147,6 +152,8 @@ def train(
         save_checkpoints: bool,
         save_dir: str,
     ):
+        from compert.train import evaluate
+
         print(f"CWD: {os.getcwd()}")
         print(f"Save dir: {save_dir}")
         print(f"Is path?: {os.path.exists(save_dir)}")
diff --git a/preprocessing/lincs.ipynb b/preprocessing/lincs.ipynb
index 0f6c7ae..88a5ac7 100644
--- a/preprocessing/lincs.ipynb
+++ b/preprocessing/lincs.ipynb
@@ -3,90 +3,178 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {},
+   "source": [
+    "import pandas as pd\n",
+    "import scanpy as sc\n",
+    "sc.set_figure_params(dpi=100, frameon=False)\n",
+    "sc.logging.print_header()"
+   ],
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
-     "text": [
-      "/home/icb/yuge.ji/miniconda3/envs/py37/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    },
-    {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
-      "scanpy==1.6.0 anndata==0.7.4 umap==0.4.6 numpy==1.19.2 scipy==1.6.1 pandas==1.2.3 scikit-learn==0.23.2 statsmodels==0.11.1 python-igraph==0.8.3 leidenalg==0.8.3\n"
+      "scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 pynndescent==0.5.2\n"
      ]
     }
    ],
-   "source": [
-    "import pandas as pd\n",
-    "import scanpy as sc\n",
-    "sc.set_figure_params(dpi=100, frameon=False)\n",
-    "sc.logging.print_header()"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "source": [
+    "import os\n",
+    "os.chdir('./../')\n",
+    "from compert.helper import rank_genes_groups_by_cov"
+   ],
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
+     "name": "stderr",
      "text": [
-      "/home/icb/yuge.ji/miniconda3/envs/py37/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
+      "Using backend: pytorch\n"
      ]
     }
    ],
-   "source": [
-    "import os\n",
-    "os.chdir('./../')\n",
-    "from compert.helper import rank_genes_groups_by_cov"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "import warnings\n",
-    "warnings.filterwarnings('ignore')"
-   ]
+    "warnings.filterwarnings('ignore')   "
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
    "source": [
-    "adata = sc.read('datasets/lincs.h5ad')"
-   ]
+    "full = True \n",
+    "load_adata = True \n",
+    "adata_in = 'datasets/lincs_full.h5ad' if full else 'datasets/lincs.h5ad'\n",
+    "adata = sc.read(adata_in) if load_adata else None\n",
+    "\n",
+    "adata_out = ''.join(adata_in.split('.')[:-1]) + '_pp.h5ad'\n",
+    "adata_out"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "'datasets/lincs_full_pp.h5ad'"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 4
+    }
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "adata.obs['condition'] = adata.obs['pert_iname']\n",
+    "adata.obs['condition'] = adata.obs['condition'].str.replace('/','|')\n",
+    "\n",
     "adata.obs['cell_type'] = adata.obs['cell_id']\n",
     "adata.obs['dose_val'] = adata.obs['pert_dose']\n",
     "adata.obs['cov_drug_dose_name'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.condition.astype(str) + '_' + adata.obs.dose_val.astype(str)\n",
-    "adata.obs['control'] = (adata.obs['condition'] == 'DMSO').astype(int)"
-   ]
+    "adata.obs['control'] = (adata.obs['condition'] == 'DMSO').astype(int)\n",
+    "\n",
+    "# adata.obs['cov_drug_dose_name'] = adata.obs['cov_drug_dose_name'].str.replace('/','|')"
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "source": [
+    "pd.crosstab(adata.obs.condition, adata.obs.cell_type)"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
+      "text/plain": [
+       "cell_type                                         A375  A549  A673  AGS  ASC  \\\n",
+       "condition                                                                      \n",
+       "(+)-3-(1-propyl-piperidin-3-yl)-phenol               0     0     0    0    0   \n",
+       "(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin     0     0     0    0    0   \n",
+       "1,2,3,4,5,6-hexabromocyclohexane                     5     5     0    0    3   \n",
+       "1,2,3,4-tetrahydroisoquinoline                       0     0     0    0    0   \n",
+       "1,2-dichlorobenzene                                  3     6     0    0    4   \n",
+       "...                                                ...   ...   ...  ...  ...   \n",
+       "zonisamide                                          23     6     0    0    0   \n",
+       "zopiclone                                            0     0     0    0    0   \n",
+       "zosuquidar                                          20     6     0    0    3   \n",
+       "zoxazolamine                                         6     5     0    0    4   \n",
+       "zuclopenthixol                                       3     6     0    0    4   \n",
+       "\n",
+       "cell_type                                         ASC.C  BT20  CD34  CL34  \\\n",
+       "condition                                                                   \n",
+       "(+)-3-(1-propyl-piperidin-3-yl)-phenol                0     0     0     0   \n",
+       "(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin      0     0     0     0   \n",
+       "1,2,3,4,5,6-hexabromocyclohexane                      0     0     0     0   \n",
+       "1,2,3,4-tetrahydroisoquinoline                        0     0     0     0   \n",
+       "1,2-dichlorobenzene                                   0     0     0     0   \n",
+       "...                                                 ...   ...   ...   ...   \n",
+       "zonisamide                                            0     0     0     0   \n",
+       "zopiclone                                             0     0     0     0   \n",
+       "zosuquidar                                            0     0     0     0   \n",
+       "zoxazolamine                                          0     0     0     0   \n",
+       "zuclopenthixol                                        0     0     0     0   \n",
+       "\n",
+       "cell_type                                         CORL23  ...  SW620  SW948  \\\n",
+       "condition                                                 ...                 \n",
+       "(+)-3-(1-propyl-piperidin-3-yl)-phenol                 0  ...      0      0   \n",
+       "(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin       0  ...      0      0   \n",
+       "1,2,3,4,5,6-hexabromocyclohexane                       0  ...      0      0   \n",
+       "1,2,3,4-tetrahydroisoquinoline                         0  ...      0      0   \n",
+       "1,2-dichlorobenzene                                    0  ...      0      0   \n",
+       "...                                                  ...  ...    ...    ...   \n",
+       "zonisamide                                             0  ...      0      0   \n",
+       "zopiclone                                              0  ...      0      0   \n",
+       "zosuquidar                                             0  ...      0      0   \n",
+       "zoxazolamine                                           0  ...      0      0   \n",
+       "zuclopenthixol                                         0  ...      0      0   \n",
+       "\n",
+       "cell_type                                         T3M10  THP1  TYKNU  U266  \\\n",
+       "condition                                                                    \n",
+       "(+)-3-(1-propyl-piperidin-3-yl)-phenol                0     0      0     0   \n",
+       "(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin      0     0      0     0   \n",
+       "1,2,3,4,5,6-hexabromocyclohexane                      0     0      0     0   \n",
+       "1,2,3,4-tetrahydroisoquinoline                        0     0      0     0   \n",
+       "1,2-dichlorobenzene                                   0     0      0     0   \n",
+       "...                                                 ...   ...    ...   ...   \n",
+       "zonisamide                                            0     0      0     0   \n",
+       "zopiclone                                             0     0      0     0   \n",
+       "zosuquidar                                            0     0      0     0   \n",
+       "zoxazolamine                                          0     0      0     0   \n",
+       "zuclopenthixol                                        0     0      0     0   \n",
+       "\n",
+       "cell_type                                         U937  VCAP  WSUDLCL2  YAPC  \n",
+       "condition                                                                     \n",
+       "(+)-3-(1-propyl-piperidin-3-yl)-phenol               0     0         0     0  \n",
+       "(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin     0     0         0     0  \n",
+       "1,2,3,4,5,6-hexabromocyclohexane                     0     0         0     0  \n",
+       "1,2,3,4-tetrahydroisoquinoline                       0     0         0     0  \n",
+       "1,2-dichlorobenzene                                  0    12         0     0  \n",
+       "...                                                ...   ...       ...   ...  \n",
+       "zonisamide                                           0     9         0    18  \n",
+       "zopiclone                                            0     0         0     0  \n",
+       "zosuquidar                                           0    10         0    17  \n",
+       "zoxazolamine                                         0    10         0     0  \n",
+       "zuclopenthixol                                       0    12         0     0  \n",
+       "\n",
+       "[20551 rows x 83 columns]"
+      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -155,8 +243,8 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>1B Parent</th>\n",
-       "      <td>8</td>\n",
+       "      <th>(+)-3-(1-propyl-piperidin-3-yl)-phenol</th>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -179,8 +267,8 @@
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2-methoxyestradiol</th>\n",
-       "      <td>18</td>\n",
+       "      <th>(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin</th>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -200,12 +288,12 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>17</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3,6-dimethoxyflavone</th>\n",
+       "      <th>1,2,3,4,5,6-hexabromocyclohexane</th>\n",
+       "      <td>5</td>\n",
        "      <td>5</td>\n",
-       "      <td>3</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>3</td>\n",
@@ -227,31 +315,19 @@
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3-amino-benzamide</th>\n",
-       "      <td>30</td>\n",
-       "      <td>61</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
+       "      <th>1,2,3,4-tetrahydroisoquinoline</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
        "      <td>...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>4</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
        "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>9</td>\n",
-       "      <td>2</td>\n",
-       "      <td>18</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5-methoxy-alpha-methyltryptamine</th>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -261,17 +337,29 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1,2-dichlorobenzene</th>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
        "      <td>0</td>\n",
-       "      <td>...</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "      <td>12</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -299,36 +387,36 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>xanthoxyline</th>\n",
-       "      <td>3</td>\n",
-       "      <td>5</td>\n",
+       "      <th>zonisamide</th>\n",
+       "      <td>23</td>\n",
+       "      <td>6</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>4</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>...</td>\n",
        "      <td>0</td>\n",
+       "      <td>...</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>7</td>\n",
        "      <td>0</td>\n",
+       "      <td>9</td>\n",
        "      <td>0</td>\n",
+       "      <td>18</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>yohimbine</th>\n",
-       "      <td>35</td>\n",
-       "      <td>16</td>\n",
+       "      <th>zopiclone</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>4</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -342,17 +430,17 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>26</td>\n",
        "      <td>0</td>\n",
-       "      <td>18</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>zacopride</th>\n",
-       "      <td>9</td>\n",
-       "      <td>11</td>\n",
-       "      <td>0</td>\n",
+       "      <th>zosuquidar</th>\n",
+       "      <td>20</td>\n",
+       "      <td>6</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "      <td>3</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -366,14 +454,14 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>19</td>\n",
-       "      <td>0</td>\n",
+       "      <td>10</td>\n",
        "      <td>0</td>\n",
+       "      <td>17</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>zaprinast</th>\n",
-       "      <td>21</td>\n",
+       "      <th>zoxazolamine</th>\n",
        "      <td>6</td>\n",
+       "      <td>5</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>4</td>\n",
@@ -390,178 +478,197 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>12</td>\n",
+       "      <td>10</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>18</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>zileuton</th>\n",
-       "      <td>32</td>\n",
-       "      <td>13</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
+       "      <th>zuclopenthixol</th>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
        "      <td>...</td>\n",
-       "      <td>3</td>\n",
-       "      <td>4</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
        "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>19</td>\n",
-       "      <td>2</td>\n",
-       "      <td>18</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>12</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>1001 rows × 82 columns</p>\n",
+       "<p>20551 rows × 83 columns</p>\n",
        "</div>"
-      ],
-      "text/plain": [
-       "cell_type                         A375  A549  A673  AGS  ASC  ASC.C  BT20  \\\n",
-       "condition                                                                   \n",
-       "1B Parent                            8     0     0    0    0      0     0   \n",
-       "2-methoxyestradiol                  18     0     0    0    0      0     0   \n",
-       "3,6-dimethoxyflavone                 5     3     0    0    3      0     0   \n",
-       "3-amino-benzamide                   30    61     2    2    0      0     0   \n",
-       "5-methoxy-alpha-methyltryptamine     0     0     0    0    0      0     0   \n",
-       "...                                ...   ...   ...  ...  ...    ...   ...   \n",
-       "xanthoxyline                         3     5     0    0    4      0     0   \n",
-       "yohimbine                           35    16     0    0    4      0     0   \n",
-       "zacopride                            9    11     0    0    0      0     0   \n",
-       "zaprinast                           21     6     0    0    4      0     0   \n",
-       "zileuton                            32    13     2    2    0      0     0   \n",
-       "\n",
-       "cell_type                         CD34  CL34  CORL23  ...  SW620  SW948  \\\n",
-       "condition                                             ...                 \n",
-       "1B Parent                            0     0       0  ...      0      0   \n",
-       "2-methoxyestradiol                   0     0       0  ...      0      0   \n",
-       "3,6-dimethoxyflavone                 0     0       0  ...      0      0   \n",
-       "3-amino-benzamide                    0     2       2  ...      4      4   \n",
-       "5-methoxy-alpha-methyltryptamine     0     0       0  ...      0      0   \n",
-       "...                                ...   ...     ...  ...    ...    ...   \n",
-       "xanthoxyline                         0     0       0  ...      0      0   \n",
-       "yohimbine                            0     0       0  ...      0      0   \n",
-       "zacopride                            0     0       0  ...      0      0   \n",
-       "zaprinast                            0     0       0  ...      0      0   \n",
-       "zileuton                             0     1       1  ...      3      4   \n",
-       "\n",
-       "cell_type                         T3M10  THP1  TYKNU  U266  U937  VCAP  \\\n",
-       "condition                                                                \n",
-       "1B Parent                             0     0      0     0     0     0   \n",
-       "2-methoxyestradiol                    0     0      0     0     0     0   \n",
-       "3,6-dimethoxyflavone                  0     0      0     0     0     0   \n",
-       "3-amino-benzamide                     2     2      2     0     2     9   \n",
-       "5-methoxy-alpha-methyltryptamine      0     0      0     0     0     0   \n",
-       "...                                 ...   ...    ...   ...   ...   ...   \n",
-       "xanthoxyline                          0     0      0     0     0     7   \n",
-       "yohimbine                             0     0      0     0     0    26   \n",
-       "zacopride                             0     0      0     0     0    19   \n",
-       "zaprinast                             0     0      0     0     0    12   \n",
-       "zileuton                              2     1      2     0     2    19   \n",
-       "\n",
-       "cell_type                         WSUDLCL2  YAPC  \n",
-       "condition                                         \n",
-       "1B Parent                                0     0  \n",
-       "2-methoxyestradiol                       0    17  \n",
-       "3,6-dimethoxyflavone                     0     0  \n",
-       "3-amino-benzamide                        2    18  \n",
-       "5-methoxy-alpha-methyltryptamine         0     0  \n",
-       "...                                    ...   ...  \n",
-       "xanthoxyline                             0     0  \n",
-       "yohimbine                                0    18  \n",
-       "zacopride                                0     0  \n",
-       "zaprinast                                0    18  \n",
-       "zileuton                                 2    18  \n",
-       "\n",
-       "[1001 rows x 82 columns]"
       ]
      },
-     "execution_count": 6,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 6
     }
    ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "source": [
-    "pd.crosstab(adata.obs.condition, adata.obs.cell_type)"
-   ]
+    "drug_abundance = adata.obs.condition.value_counts()\n",
+    "suff_drug_abundance = drug_abundance.index[drug_abundance>5]"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "source": [
+    "# Delete conditions with an insufficient # of observations\n",
+    "adata = adata[adata.obs.condition.isin(suff_drug_abundance)].copy()\n",
+    "adata "
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "AnnData object with n_obs × n_vars = 1023036 × 978\n",
+       "    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control'\n",
+       "    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'\n",
+       "    uns: 'cydata_pull'"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 8
+    }
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
    "source": [
     "Calculate differential genes manually, such that the genes are the same per condition."
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "scrolled": true
-   },
+   "execution_count": 9,
+   "source": [
+    "%%time\n",
+    "from tqdm.notebook import tqdm\n",
+    "import numpy as np \n",
+    "\n",
+    "de_genes = {}\n",
+    "de_genes_quick = {}\n",
+    "\n",
+    "adata_df = adata.to_df()\n",
+    "adata_df['condition'] = adata.obs.condition\n",
+    "dmso = adata_df[adata_df.condition == \"DMSO\"].mean()\n",
+    "\n",
+    "for cond, df in tqdm(adata_df.groupby('condition')): \n",
+    "    if cond != 'DMSO':\n",
+    "        drug_mean = df.mean()\n",
+    "        de_50_idx = np.argsort(abs(drug_mean-dmso))[-50:]\n",
+    "        de_genes_quick[cond] = drug_mean.index[de_50_idx].values\n",
+    "\n",
+    "if full: \n",
+    "    de_genes = de_genes_quick\n",
+    "\n",
+    "else:\n",
+    "    sc.tl.rank_genes_groups(\n",
+    "        adata,\n",
+    "        groupby='condition', \n",
+    "        reference='DMSO',\n",
+    "        rankby_abs=True,\n",
+    "        n_genes=50\n",
+    "    )\n",
+    "    for cond in tqdm(np.unique(adata.obs['condition'])):\n",
+    "        if cond != 'DMSO':\n",
+    "            df = sc.get.rank_genes_groups_df(adata, group=cond)  # this takes a while\n",
+    "            de_genes[cond] = df['names'][:50].values\n"
+   ],
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'\n"
-     ]
+     "output_type": "display_data",
+     "data": {
+      "text/plain": [
+       "  0%|          | 0/17990 [00:00<?, ?it/s]"
+      ],
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "74e98aa7a092439e84b5babed884e28d"
+      }
+     },
+     "metadata": {}
     },
     {
-     "name": "stderr",
      "output_type": "stream",
-     "text": [
-      "... storing 'cov_drug_dose_name' as categorical\n"
-     ]
-    },
-    {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
-      "CPU times: user 4min 4s, sys: 412 ms, total: 4min 4s\n",
-      "Wall time: 4min 5s\n"
+      "CPU times: user 41.5 s, sys: 877 ms, total: 42.4 s\n",
+      "Wall time: 42.4 s\n"
      ]
     }
    ],
+   "metadata": {
+    "scrolled": true
+   }
+  },
+  {
+   "cell_type": "markdown",
    "source": [
-    "%%time\n",
-    "sc.tl.rank_genes_groups(\n",
-    "    adata,\n",
-    "    groupby='condition', \n",
-    "    reference='DMSO',\n",
-    "    rankby_abs=True,\n",
-    "    n_genes=50\n",
-    ")\n",
-    "\n",
-    "de_genes = {}\n",
-    "for cond in adata.obs['condition']:\n",
-    "    if cond != 'DMSO':\n",
-    "        df = sc.get.rank_genes_groups_df(adata, group=cond)  # this takes a while\n",
-    "        de_genes[cond] = df['names'][:50].values"
-   ]
+    "The mapping in `rank_genes_groups_cov` might cause problems when a drug name contains '_'"
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 10,
    "source": [
-    "adata.uns['rank_genes_groups_cov'] = {cond: de_genes[cond.split('_')[1]] for cond in adata.obs['cov_drug_dose_name'].unique() if cond.split('_')[1] != 'DMSO'}"
-   ]
+    "def extract_drug(cond): \n",
+    "    return '_'.join(cond.split('_')[1:-1])\n",
+    "\n",
+    "adata.obs['cov_drug_dose_name'].apply(lambda s: len(s.split('_'))).value_counts()"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "3    1022382\n",
+       "4        654\n",
+       "Name: cov_drug_dose_name, dtype: int64"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 10
+    }
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "scrolled": true
-   },
+   "execution_count": 11,
+   "source": [
+    "adata.uns['rank_genes_groups_cov'] = {cond: de_genes_quick[extract_drug(cond)] for cond in adata.obs.cov_drug_dose_name.unique() if extract_drug(cond) != 'DMSO'}"
+   ],
    "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
    "source": [
     "adata.obs['split'] = 'train'\n",
     "\n",
@@ -580,17 +687,73 @@
     "    copy=True\n",
     ").obs.index\n",
     "adata.obs['split'].loc[test_idx] = 'test'"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {
+    "scrolled": true
+   }
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "scrolled": true
-   },
+   "execution_count": 13,
+   "source": [
+    "pd.crosstab(adata.obs['split'], adata.obs['condition'])"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
+      "text/plain": [
+       "condition  (+)-3-(1-propyl-piperidin-3-yl)-phenol  \\\n",
+       "split                                               \n",
+       "ood                                             0   \n",
+       "test                                            5   \n",
+       "train                                          13   \n",
+       "\n",
+       "condition  (+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin  \\\n",
+       "split                                                         \n",
+       "ood                                                       0   \n",
+       "test                                                      0   \n",
+       "train                                                    18   \n",
+       "\n",
+       "condition  1,2,3,4,5,6-hexabromocyclohexane  1,2,3,4-tetrahydroisoquinoline  \\\n",
+       "split                                                                         \n",
+       "ood                                       0                               0   \n",
+       "test                                      8                               4   \n",
+       "train                                    29                              14   \n",
+       "\n",
+       "condition  1,2-dichlorobenzene  1,2-propylene-glycol  1-benzylimidazole  \\\n",
+       "split                                                                     \n",
+       "ood                          0                     0                  0   \n",
+       "test                        13                     5                  9   \n",
+       "train                       58                    22                 40   \n",
+       "\n",
+       "condition  1-methylisoquinoline  1-monopalmitin  1-phenylbiguanide  ...  \\\n",
+       "split                                                               ...   \n",
+       "ood                           0               0                  0  ...   \n",
+       "test                          6               7                  3  ...   \n",
+       "train                        12              27                 25  ...   \n",
+       "\n",
+       "condition  ziprasidone  zofenopril-calcium  zolantidine  zolmitriptan  \\\n",
+       "split                                                                   \n",
+       "ood                  0                   0            0             0   \n",
+       "test                37                  23           12            25   \n",
+       "train              152                 103           52            87   \n",
+       "\n",
+       "condition  zolpidem  zonisamide  zopiclone  zosuquidar  zoxazolamine  \\\n",
+       "split                                                                  \n",
+       "ood               0           0          0           0             0   \n",
+       "test             30          38          2          31            13   \n",
+       "train           175         185         10         151            79   \n",
+       "\n",
+       "condition  zuclopenthixol  \n",
+       "split                      \n",
+       "ood                     0  \n",
+       "test                   15  \n",
+       "train                  68  \n",
+       "\n",
+       "[3 rows x 17990 columns]"
+      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -610,27 +773,27 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th>condition</th>\n",
-       "      <th>1B Parent</th>\n",
-       "      <th>2-methoxyestradiol</th>\n",
-       "      <th>3,6-dimethoxyflavone</th>\n",
-       "      <th>3-amino-benzamide</th>\n",
-       "      <th>5-methoxy-alpha-methyltryptamine</th>\n",
-       "      <th>ABT-737</th>\n",
-       "      <th>AG-490</th>\n",
-       "      <th>AG-14361</th>\n",
-       "      <th>AICA-ribonucleotide</th>\n",
-       "      <th>ALW-II-38-3</th>\n",
+       "      <th>(+)-3-(1-propyl-piperidin-3-yl)-phenol</th>\n",
+       "      <th>(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin</th>\n",
+       "      <th>1,2,3,4,5,6-hexabromocyclohexane</th>\n",
+       "      <th>1,2,3,4-tetrahydroisoquinoline</th>\n",
+       "      <th>1,2-dichlorobenzene</th>\n",
+       "      <th>1,2-propylene-glycol</th>\n",
+       "      <th>1-benzylimidazole</th>\n",
+       "      <th>1-methylisoquinoline</th>\n",
+       "      <th>1-monopalmitin</th>\n",
+       "      <th>1-phenylbiguanide</th>\n",
        "      <th>...</th>\n",
-       "      <th>veliparib</th>\n",
-       "      <th>vinburnine</th>\n",
-       "      <th>voglibose</th>\n",
-       "      <th>wiskostatin</th>\n",
-       "      <th>xanthohumol</th>\n",
-       "      <th>xanthoxyline</th>\n",
-       "      <th>yohimbine</th>\n",
-       "      <th>zacopride</th>\n",
-       "      <th>zaprinast</th>\n",
-       "      <th>zileuton</th>\n",
+       "      <th>ziprasidone</th>\n",
+       "      <th>zofenopril-calcium</th>\n",
+       "      <th>zolantidine</th>\n",
+       "      <th>zolmitriptan</th>\n",
+       "      <th>zolpidem</th>\n",
+       "      <th>zonisamide</th>\n",
+       "      <th>zopiclone</th>\n",
+       "      <th>zosuquidar</th>\n",
+       "      <th>zoxazolamine</th>\n",
+       "      <th>zuclopenthixol</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>split</th>\n",
@@ -665,13 +828,13 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>64</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>...</td>\n",
-       "      <td>135</td>\n",
+       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -684,143 +847,241 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>test</th>\n",
-       "      <td>2</td>\n",
-       "      <td>17</td>\n",
-       "      <td>4</td>\n",
-       "      <td>59</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>125</td>\n",
-       "      <td>23</td>\n",
-       "      <td>66</td>\n",
-       "      <td>50</td>\n",
-       "      <td>89</td>\n",
-       "      <td>...</td>\n",
-       "      <td>188</td>\n",
-       "      <td>24</td>\n",
-       "      <td>20</td>\n",
-       "      <td>7</td>\n",
-       "      <td>9</td>\n",
+       "      <td>13</td>\n",
+       "      <td>5</td>\n",
        "      <td>9</td>\n",
-       "      <td>60</td>\n",
-       "      <td>26</td>\n",
-       "      <td>33</td>\n",
-       "      <td>51</td>\n",
+       "      <td>6</td>\n",
+       "      <td>7</td>\n",
+       "      <td>3</td>\n",
+       "      <td>...</td>\n",
+       "      <td>37</td>\n",
+       "      <td>23</td>\n",
+       "      <td>12</td>\n",
+       "      <td>25</td>\n",
+       "      <td>30</td>\n",
+       "      <td>38</td>\n",
+       "      <td>2</td>\n",
+       "      <td>31</td>\n",
+       "      <td>13</td>\n",
+       "      <td>15</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>train</th>\n",
-       "      <td>20</td>\n",
-       "      <td>100</td>\n",
-       "      <td>30</td>\n",
-       "      <td>343</td>\n",
+       "      <td>13</td>\n",
+       "      <td>18</td>\n",
+       "      <td>29</td>\n",
        "      <td>14</td>\n",
-       "      <td>583</td>\n",
-       "      <td>130</td>\n",
-       "      <td>272</td>\n",
-       "      <td>249</td>\n",
-       "      <td>509</td>\n",
-       "      <td>...</td>\n",
-       "      <td>936</td>\n",
-       "      <td>150</td>\n",
-       "      <td>105</td>\n",
        "      <td>58</td>\n",
-       "      <td>58</td>\n",
-       "      <td>53</td>\n",
-       "      <td>266</td>\n",
-       "      <td>81</td>\n",
+       "      <td>22</td>\n",
+       "      <td>40</td>\n",
+       "      <td>12</td>\n",
+       "      <td>27</td>\n",
+       "      <td>25</td>\n",
+       "      <td>...</td>\n",
+       "      <td>152</td>\n",
+       "      <td>103</td>\n",
+       "      <td>52</td>\n",
+       "      <td>87</td>\n",
        "      <td>175</td>\n",
-       "      <td>279</td>\n",
+       "      <td>185</td>\n",
+       "      <td>10</td>\n",
+       "      <td>151</td>\n",
+       "      <td>79</td>\n",
+       "      <td>68</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>3 rows × 1001 columns</p>\n",
+       "<p>3 rows × 17990 columns</p>\n",
        "</div>"
-      ],
-      "text/plain": [
-       "condition  1B Parent  2-methoxyestradiol  3,6-dimethoxyflavone  \\\n",
-       "split                                                            \n",
-       "ood                0                   0                     0   \n",
-       "test               2                  17                     4   \n",
-       "train             20                 100                    30   \n",
-       "\n",
-       "condition  3-amino-benzamide  5-methoxy-alpha-methyltryptamine  ABT-737  \\\n",
-       "split                                                                     \n",
-       "ood                        0                                 0       64   \n",
-       "test                      59                                 4      125   \n",
-       "train                    343                                14      583   \n",
-       "\n",
-       "condition  AG-490  AG-14361  AICA-ribonucleotide  ALW-II-38-3  ...  veliparib  \\\n",
-       "split                                                          ...              \n",
-       "ood             0         0                    0            0  ...        135   \n",
-       "test           23        66                   50           89  ...        188   \n",
-       "train         130       272                  249          509  ...        936   \n",
-       "\n",
-       "condition  vinburnine  voglibose  wiskostatin  xanthohumol  xanthoxyline  \\\n",
-       "split                                                                      \n",
-       "ood                 0          0            0            0             0   \n",
-       "test               24         20            7            9             9   \n",
-       "train             150        105           58           58            53   \n",
-       "\n",
-       "condition  yohimbine  zacopride  zaprinast  zileuton  \n",
-       "split                                                 \n",
-       "ood                0          0          0         0  \n",
-       "test              60         26         33        51  \n",
-       "train            266         81        175       279  \n",
-       "\n",
-       "[3 rows x 1001 columns]"
       ]
      },
-     "execution_count": 10,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 13
     }
    ],
-   "source": [
-    "pd.crosstab(adata.obs['split'], adata.obs['condition'])"
-   ]
+   "metadata": {
+    "scrolled": true
+   }
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 14,
    "source": [
-    "del(adata.uns['rank_genes_groups'])  # too large"
-   ]
+    "try: \n",
+    "    del(adata.uns['rank_genes_groups'])  # too large\n",
+    "except: \n",
+    "    print('All good.')"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "All good.\n"
+     ]
+    }
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 15,
    "source": [
     "# code compatibility\n",
     "from scipy import sparse\n",
     "adata.X = sparse.csr_matrix(adata.X)"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
+   "execution_count": 16,
+   "source": [
+    "sc.write('datasets/lincs_full_pp.h5ad', adata)"
+   ],
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
+     "name": "stderr",
      "text": [
+      "... storing 'condition' as categorical\n",
+      "... storing 'cov_drug_dose_name' as categorical\n",
       "... storing 'split' as categorical\n"
      ]
     }
    ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "source": [
+    "print('all done.')"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "all done.\n"
+     ]
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
    "source": [
-    "sc.write('datasets/lincs.h5ad', adata)"
-   ]
+    "### Check that `adata.uns['rank_genes_groups_cov']` has all entries in `adata.obs.cov_drug_dose_name` as keys"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "source": [
+    "for i, k in enumerate(adata.obs.cov_drug_dose_name.unique()):\n",
+    "    try: \n",
+    "        adata.uns['rank_genes_groups_cov'][k]\n",
+    "    except: \n",
+    "        print(f\"{i}: {k}\") if 'DMSO' not in k else None"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Checking the same for the stored adata object"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "source": [
+    "adata_2 = sc.read('datasets/lincs_full_pp.h5ad')"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "source": [
+    "for i, k in enumerate(adata_2.obs.cov_drug_dose_name.unique()):\n",
+    "    try: \n",
+    "        adata_2.uns['rank_genes_groups_cov'][k]\n",
+    "    except: \n",
+    "        print(f\"{i}: {k}\") if 'DMSO' not in k else None"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "source": [
+    "set(list(adata.uns['rank_genes_groups_cov'])) - set((list(adata_2.uns['rank_genes_groups_cov'])))"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "set()"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 21
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "source": [
+    "set((list(adata_2.uns['rank_genes_groups_cov']))) - set(list(adata.uns['rank_genes_groups_cov']))"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "set()"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 22
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.8.8 64-bit ('py38': conda)"
   },
   "language_info": {
    "codemirror_mode": {
@@ -832,9 +1093,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.0"
+   "version": "3.8.8"
+  },
+  "interpreter": {
+   "hash": "6ea99325af0b09b2c098ddbe7d13703f3e2b6f04181042d146addb2812ce9602"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/notebooks/lincs_SMILES.ipynb b/preprocessing/lincs_SMILES.ipynb
similarity index 60%
rename from notebooks/lincs_SMILES.ipynb
rename to preprocessing/lincs_SMILES.ipynb
index c4194db..30f2b4d 100644
--- a/notebooks/lincs_SMILES.ipynb
+++ b/preprocessing/lincs_SMILES.ipynb
@@ -2,287 +2,255 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "ee79bb85",
-   "metadata": {},
+   "execution_count": 1,
+   "source": [
+    "import os \n",
+    "import scanpy as sc\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np \n",
+    "import pandas as pd\n",
+    "sc.set_figure_params(dpi=80, frameon=False)\n",
+    "sc.logging.print_header()\n",
+    "os.getcwd()"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
       "scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 pynndescent==0.5.2\n"
      ]
     },
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "'/mnt/home/icb/leon.hetzel/git/CPA_graphs/notebooks'"
+       "'/home/icb/leon.hetzel/git/CPA_graphs/preprocessing'"
       ]
      },
-     "execution_count": 2,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 1
     }
    ],
-   "source": [
-    "import os \n",
-    "import scanpy as sc\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np \n",
-    "import pandas as pd\n",
-    "sc.set_figure_params(dpi=80, frameon=False)\n",
-    "sc.logging.print_header()\n",
-    "os.getcwd()"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "831629f0",
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 2,
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "0995dff1",
-   "metadata": {},
    "source": [
     "### Loading LINCS and reference data"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "8d354ea2",
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 3,
    "source": [
-    "full = False\n",
+    "full = True\n",
+    "load_adata = True \n",
     "if full:\n",
-    "    adata = sc.read('../datasets/lincs_full.h5ad')\n",
+    "    adata_in = '../datasets/lincs_full_pp.h5ad'\n",
     "    adata_out = '../datasets/lincs_full_smiles.h5ad' \n",
     "else: \n",
-    "    adata = sc.read('../datasets/lincs.h5ad')\n",
-    "    adata_out = '../datasets/lincs_smiles.h5ad'  \n"
-   ]
+    "    adata_in = '../datasets/lincs.h5ad'\n",
+    "    adata_out = '../datasets/lincs_smiles.h5ad'  \n",
+    "adata = sc.read(adata_in) if load_adata else None\n"
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "1a0a245f",
-   "metadata": {},
    "source": [
     "Checking number of drugs for LINCS"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "2b58a210",
-   "metadata": {},
+   "source": [
+    "pert_id_unique = pd.Series(np.unique(adata.obs.pert_id))\n",
+    "print(f\"# of unique perturbations: {len(pert_id_unique)}\")"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "# of unique perturbations: 1120\n"
+      "# of unique perturbations: 18743\n"
      ]
     }
    ],
-   "source": [
-    "pert_id_unique = pd.Series(np.unique(adata.obs.pert_id))\n",
-    "print(f\"# of unique perturbations: {len(pert_id_unique)}\")"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "69660129",
-   "metadata": {},
    "source": [
     "Loading reference dataframe and restricting to `'pert_id'` and `'canonical_smiles'`"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "77d69791",
-   "metadata": {},
+   "source": [
+    "reference_df = pd.read_csv('../datasets/GSE92742_Broad_LINCS_pert_info.txt', delimiter = \"\\t\")\n",
+    "reference_df = reference_df.loc[reference_df.pert_id.isin(pert_id_unique), ['pert_id', 'canonical_smiles']]\n",
+    "reference_df.canonical_smiles.value_counts()"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "-666                                                                                           6\n",
-       "CS(=O)(=O)CCNCc1ccc(o1)-c1ccc2ncnc(Nc3ccc(OCc4cccc(F)c4)c(Cl)c3)c2c1                           2\n",
-       "restricted                                                                                     2\n",
-       "C[C@@H](CO)N1C[C@@H](C)[C@@H](CN(C)CC2CC2)OCCCC[C@@H](C)Oc3ccc(NS(=O)(=O)c4cn(C)cn4)cc3C1=O    1\n",
-       "Cc1cc(Cl)ccc1OCCCC(=O)NO                                                                       1\n",
-       "                                                                                              ..\n",
-       "CC(COc1ccccc1)N(CCCl)Cc1ccccc1                                                                 1\n",
-       "N1c2ccccc2Oc2ccccc12                                                                           1\n",
-       "Cc1ccc(C)c(NS(=O)(=O)c2cc(ccc2Cl)C(=O)Nc3ccccc3c4ccccc4)c1                                     1\n",
-       "Cn1cnc(CCNC(=O)C[C@@H]2CC[C@H]3[C@@H](COC[C@H](O)CN3C(=O)Nc3ccc(cc3)C(F)(F)F)O2)c1             1\n",
-       "OCCNCCNc1ccc(NCCNCCO)c2C(=O)c3c(O)ccc(O)c3C(=O)c12                                             1\n",
-       "Name: canonical_smiles, Length: 981, dtype: int64"
+       "-666                                                                                                                                                  63\n",
+       "restricted                                                                                                                                            14\n",
+       "CCC1=C[C@@H]2C[N@](C1)Cc1c([nH]c3ccccc13)[C@@](C2)(C(=O)OC)c1cc2c(cc1OC)N(C)[C@@H]1[C@]22CCN3CC=C[C@@](CC)([C@@H]23)[C@@H](OC(C)=O)[C@]1(O)C(=O)OC     2\n",
+       "CN(\\N=C\\c1cnc2ccc(Br)cn12)S(=O)(=O)c1cc(ccc1C)[N+]([O-])=O                                                                                             2\n",
+       "CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)COCCN                                                                                                     2\n",
+       "                                                                                                                                                      ..\n",
+       "CC(C)(C)[S@@](=O)N1Cc2cc(nc(c2[C@H]1CCO)-c1cccc(c1)-c1cccnc1)C(=O)N[C@H]1CCN(Cc2ccccc2)C1                                                              1\n",
+       "C[C@H](CO)N1C[C@H](C)[C@H](CN(C)Cc2ccccc2)OCCCC[C@@H](C)Oc3ccc(NS(=O)(=O)c4cccs4)cc3C1=O                                                               1\n",
+       "COc1ccc(Cl)cc1Nc2nnc(s2)c3ccncc3                                                                                                                       1\n",
+       "C[C@@H](NC(=O)C[C@H]1C[C@@H]2[C@@H](Oc3ccc(NC(=O)c4cccnc4)cc23)[C@H](CO)O1)c5ccccc5                                                                    1\n",
+       "Cc1ccc2N=C3N(CCC3(O)C(=O)c2c1)c1ccccc1                                                                                                                 1\n",
+       "Name: canonical_smiles, Length: 17770, dtype: int64"
       ]
      },
-     "execution_count": 5,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 5
     }
    ],
-   "source": [
-    "reference_df = pd.read_csv('../datasets/GSE92742_Broad_LINCS_pert_info.txt', delimiter = \"\\t\")\n",
-    "reference_df = reference_df.loc[reference_df.pert_id.isin(pert_id_unique), ['pert_id', 'canonical_smiles']]\n",
-    "reference_df.canonical_smiles.value_counts()"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "23ecebcc",
-   "metadata": {},
+   "source": [
+    "cond = ~pert_id_unique.isin(reference_df.pert_id)\n",
+    "print(f\"From {len(pert_id_unique)} total drugs, {cond.sum()} were not part of the reference dataframe.\")"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "From 1120 total drugs, 132 were not part of the reference dataframe.\n"
+      "From 18743 total drugs, 890 were not part of the reference dataframe.\n"
      ]
     }
    ],
-   "source": [
-    "cond = ~pert_id_unique.isin(reference_df.pert_id)\n",
-    "print(f\"From {len(pert_id_unique)} total drugs, {cond.sum()} were not part of the reference dataframe.\")"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "b4f02fdf",
-   "metadata": {},
    "source": [
     "Adding `'canoncical_smiles'` column to `adata.obs` via `pd.merge`"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "75dacdc1",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "adata.obs = adata.obs.reset_index().merge(reference_df, how=\"left\").set_index('index')"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "531c0ddc",
-   "metadata": {},
    "source": [
     "Removing invalid SMILES strings "
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "7ba405b3",
-   "metadata": {},
+   "source": [
+    "adata.obs.canonical_smiles = adata.obs.canonical_smiles.astype('str')\n",
+    "invalid_smiles = adata.obs.canonical_smiles.isin(['-666', 'restricted', 'nan'])\n",
+    "print(f'Among {len(adata)} observations, {100*invalid_smiles.sum()/len(adata):.2f}% ({invalid_smiles.sum()}) do not have a valid SMILES string')\n",
+    "adata = adata[~invalid_smiles]"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "Among 199620 observations, 11.54% (23033) do not have a valid SMILES string\n"
+      "Among 1023036 observations, 13.66% (139764) do not have a valid SMILES string\n"
      ]
     }
    ],
-   "source": [
-    "adata.obs.canonical_smiles = adata.obs.canonical_smiles.astype('str')\n",
-    "invalid_smiles = adata.obs.canonical_smiles.isin(['-666', 'restricted', 'nan'])\n",
-    "print(f'Among {len(adata)} observations, {100*invalid_smiles.sum()/len(adata):.2f}% ({invalid_smiles.sum()}) do not have a valid SMILES string')\n",
-    "adata = adata[~invalid_smiles]"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "3b104625",
-   "metadata": {},
    "source": [
     "Remove invalid `'pert_dose'` value: `-666`"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "94346678",
-   "metadata": {},
+   "source": [
+    "cond = adata.obs.pert_dose.isin([-666])\n",
+    "adata = adata[~cond]\n",
+    "print(f\"A total of {cond.sum()} observations have invalid dose values\")"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "A total of 0 observations have invalid dose values\n"
+      "A total of 42592 observations have invalid dose values\n"
      ]
     }
    ],
-   "source": [
-    "cond = adata.obs.pert_dose.isin([-666])\n",
-    "adata = adata[~cond]\n",
-    "print(f\"A total of {cond.sum()} observations have invalid dose values\")"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "9de4b204",
-   "metadata": {},
+   "source": [
+    "drugs_validation = adata.obs.canonical_smiles.value_counts() < 6\n",
+    "valid_drugs = drugs_validation.index[~drugs_validation]\n",
+    "cond = adata.obs.canonical_smiles.isin(valid_drugs)\n",
+    "print(f\"A total of {(~cond).sum()} observation belong to drugs which do not have enough replicates\")\n",
+    "adata = adata[cond]"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "A total of 0 observation belong to drugs which do not have enough replicates\n"
+      "A total of 3 observation belong to drugs which do not have enough replicates\n"
      ]
     }
    ],
-   "source": [
-    "drugs_validation = adata.obs.canonical_smiles.value_counts() < 6\n",
-    "valid_drugs = drugs_validation.index[~drugs_validation]\n",
-    "cond = adata.obs.canonical_smiles.isin(valid_drugs)\n",
-    "print(f\"A total of {(~cond).sum()} observation belong to drugs which do not have enough replicates\")\n",
-    "adata = adata[cond]"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "7eabc68c",
-   "metadata": {},
    "source": [
     "Checking that SMILES are valid according to `rdkit` "
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "1c23d3c5",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "View of AnnData object with n_obs × n_vars = 176587 × 978\n",
-       "    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles'\n",
-       "    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'\n",
-       "    uns: 'rank_genes_groups_cov'"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "from rdkit import Chem\n",
     "\n",
@@ -311,243 +279,265 @@
     "    return dataframe\n",
     "\n",
     "adata"
-   ]
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "View of AnnData object with n_obs × n_vars = 840677 × 978\n",
+       "    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles'\n",
+       "    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'\n",
+       "    uns: 'cydata_pull', 'rank_genes_groups_cov'"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 11
+    }
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "0bf04aa2",
-   "metadata": {},
+   "source": [
+    "cond = remove_invalid_smiles(adata.obs, smiles_key='canonical_smiles', return_condition=True)\n",
+    "adata = adata[cond]"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
       "A total of 0 have invalid SMILES strings\n"
      ]
     }
    ],
-   "source": [
-    "cond = remove_invalid_smiles(adata.obs, smiles_key='canonical_smiles', return_condition=True)\n",
-    "adata = adata[cond]"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "5f298667",
-   "metadata": {},
    "source": [
     "### Add additional drugbank info to `adata.obs`"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "a045fdbd",
-   "metadata": {},
+   "source": [
+    "from os.path import exists\n",
+    "\n",
+    "drugbank_path = '../datasets/drug_bank/drugbank_all.csv'\n",
+    "if exists(drugbank_path): \n",
+    "    drugbank_df = pd.read_csv(drugbank_path)\n",
+    "else: \n",
+    "    print(f'Invalid path: {drugbank_path}')"
+   ],
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
+     "name": "stderr",
      "text": [
       "/home/icb/leon.hetzel/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (54,62) have mixed types.Specify dtype option on import or set low_memory=False.\n",
       "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
      ]
     }
    ],
-   "source": [
-    "from os.path import exists\n",
-    "\n",
-    "drugbank_path = '../datasets/drug_bank/drugbank_all.csv'\n",
-    "if exists(drugbank_path): \n",
-    "    drugbank_df = pd.read_csv(drugbank_path)\n",
-    "else: \n",
-    "    print(f'Invalid path: {drugbank_path}')"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 14,
-   "id": "70712bc3",
-   "metadata": {},
+   "source": [
+    "from rdkit.Chem import CanonSmiles\n",
+    "\n",
+    "drugs_canonical = pd.Series(np.unique(adata.obs.canonical_smiles)).apply(CanonSmiles)\n",
+    "db_canonical_smiles = drugbank_df.SMILES.apply(CanonSmiles)\n",
+    "n_overlap = drugs_canonical.isin(db_canonical_smiles).sum()\n",
+    "print(f'From a total of {len(drugs_canonical)}, {100*n_overlap/len(drugs_canonical):.2f}% ({n_overlap}) is also available in drugbank.')"
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "From a total of 979, 20.43% (200) is also available in drugbank.\n"
+      "From a total of 17767, 7.72% (1371) is also available in drugbank.\n"
      ]
     },
     {
-     "name": "stderr",
      "output_type": "stream",
+     "name": "stderr",
      "text": [
-      "RDKit WARNING: [22:31:51] WARNING: not removing hydrogen atom without neighbors\n",
-      "RDKit WARNING: [22:31:51] WARNING: not removing hydrogen atom without neighbors\n",
-      "RDKit WARNING: [22:31:51] WARNING: not removing hydrogen atom without neighbors\n"
+      "RDKit WARNING: [11:57:09] WARNING: not removing hydrogen atom without neighbors\n",
+      "RDKit WARNING: [11:57:09] WARNING: not removing hydrogen atom without neighbors\n",
+      "RDKit WARNING: [11:57:09] WARNING: not removing hydrogen atom without neighbors\n"
      ]
     }
    ],
-   "source": [
-    "from rdkit.Chem import CanonSmiles\n",
-    "\n",
-    "drugs_canonical = pd.Series(np.unique(adata.obs.canonical_smiles)).apply(CanonSmiles)\n",
-    "db_canonical_smiles = drugbank_df.SMILES.apply(CanonSmiles)\n",
-    "n_overlap = drugs_canonical.isin(db_canonical_smiles).sum()\n",
-    "print(f'From a total of {len(drugs_canonical)}, {100*n_overlap/len(drugs_canonical):.2f}% ({n_overlap}) is also available in drugbank.')"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "0459f6ed",
-   "metadata": {},
+   "source": [
+    "cond = db_canonical_smiles.isin(drugs_canonical)\n",
+    "drugbank_df.loc[cond, ['ATC_level_1']].value_counts()"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
        "ATC_level_1                               \n",
-       "an                                            68\n",
-       "ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS    42\n",
-       "NERVOUS SYSTEM                                22\n",
-       "CARDIOVASCULAR SYSTEM                         16\n",
-       "ALIMENTARY TRACT AND METABOLISM               13\n",
-       "GENITO URINARY SYSTEM AND SEX HORMONES         8\n",
-       "RESPIRATORY SYSTEM                             6\n",
-       "SENSORY ORGANS                                 6\n",
-       "DERMATOLOGICALS                                5\n",
-       "INSECTICIDES AND REPELLENTS                    4\n",
-       "MUSCULO-SKELETAL SYSTEM                        4\n",
-       "ANTIINFECTIVES FOR SYSTEMIC USE                2\n",
-       "BLOOD AND BLOOD FORMING ORGANS                 2\n",
-       "VARIOUS                                        2\n",
+       "an                                            393\n",
+       "NERVOUS SYSTEM                                184\n",
+       "CARDIOVASCULAR SYSTEM                         152\n",
+       "ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS     98\n",
+       "ALIMENTARY TRACT AND METABOLISM                93\n",
+       "ANTIINFECTIVES FOR SYSTEMIC USE                81\n",
+       "RESPIRATORY SYSTEM                             78\n",
+       "GENITO URINARY SYSTEM AND SEX HORMONES         60\n",
+       "MUSCULO-SKELETAL SYSTEM                        58\n",
+       "DERMATOLOGICALS                                51\n",
+       "SENSORY ORGANS                                 47\n",
+       "INSECTICIDES AND REPELLENTS                    29\n",
+       "VARIOUS                                        21\n",
+       "BLOOD AND BLOOD FORMING ORGANS                 17\n",
+       "EXCL. SEX HORMONES AND INSULINS                 7\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 15,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 15
     }
    ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
    "source": [
-    "cond = db_canonical_smiles.isin(drugs_canonical)\n",
-    "drugbank_df.loc[cond, ['ATC_level_1']].value_counts()"
-   ]
+    "### Add `train`, `test`, `ood` split for the full LINCS dataset (if not already present in `adata.obs`)"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "if 'split' not in list(adata.obs):\n",
+    "    print(\"Addig 'split' to 'adata.obs'.\")\n",
+    "    unique_drugs = np.unique(adata.obs.canonical_smiles)\n",
+    "    drugs_train, drugs_tmp = train_test_split(unique_drugs, test_size=0.2)\n",
+    "    drugs_val, drugs_test = train_test_split(drugs_tmp, test_size=0.5)\n",
+    "\n",
+    "    adata.obs['split'] = 'train'\n",
+    "    adata.obs.loc[adata.obs.canonical_smiles.isin(drugs_val), 'split'] = 'test'\n",
+    "    adata.obs.loc[adata.obs.canonical_smiles.isin(drugs_test), 'split'] = 'ood'"
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "b939230e",
-   "metadata": {},
    "source": [
     "### Check that `.obs.split=='test'` has sufficient samples for `pert_id` and `cell_id`"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "id": "dfcaf235",
-   "metadata": {
-    "scrolled": true
-   },
+   "execution_count": 17,
+   "source": [
+    "adata.obs.split.value_counts()"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "train    144579\n",
-       "test      27592\n",
-       "ood        4416\n",
+       "train    699463\n",
+       "test     133383\n",
+       "ood        7831\n",
        "Name: split, dtype: int64"
       ]
      },
-     "execution_count": 16,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 17
     }
    ],
-   "source": [
-    "adata.obs.split.value_counts()"
-   ]
+   "metadata": {
+    "scrolled": true
+   }
   },
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "3cde65ff",
-   "metadata": {},
+   "source": [
+    "cond_test = adata.obs.split.isin(['test'])\n",
+    "adata.obs.loc[cond_test, 'cell_id'].value_counts()"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "MCF7        4091\n",
-       "PC3         3330\n",
-       "VCAP        2244\n",
-       "HT29        2201\n",
-       "A375        2130\n",
-       "            ... \n",
-       "H1299         22\n",
-       "TYKNU         21\n",
-       "COV644        20\n",
-       "WSUDLCL2      20\n",
-       "HS27A          2\n",
+       "MCF7        18766\n",
+       "VCAP        17841\n",
+       "PC3         17414\n",
+       "A375        11188\n",
+       "HT29        10970\n",
+       "            ...  \n",
+       "HUES3          93\n",
+       "SNUC5          89\n",
+       "NCIH1694       86\n",
+       "SKMEL28        77\n",
+       "HS27A           9\n",
        "Name: cell_id, Length: 82, dtype: int64"
       ]
      },
-     "execution_count": 18,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 18
     }
    ],
-   "source": [
-    "cond_test = adata.obs.split.isin(['test'])\n",
-    "adata.obs.loc[cond_test, 'cell_id'].value_counts()"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": 19,
-   "id": "d00bb734",
-   "metadata": {},
+   "source": [
+    "adata.obs.loc[cond_test, 'pert_id'].value_counts()"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "DMSO             7465\n",
-       "BRD-A19037878     465\n",
-       "BRD-K49328571     325\n",
-       "BRD-K21680192     190\n",
-       "BRD-K59369769     172\n",
-       "                 ... \n",
-       "BRD-A58753560       1\n",
-       "BRD-K48116811       1\n",
-       "BRD-K26373640       1\n",
-       "BRD-K24652731       1\n",
-       "BRD-K43813806       1\n",
-       "Name: pert_id, Length: 975, dtype: int64"
+       "BRD-K60230970    904\n",
+       "BRD-K50691590    882\n",
+       "BRD-K81418486    694\n",
+       "DMSO             618\n",
+       "BRD-A19500257    538\n",
+       "                ... \n",
+       "BRD-K15466057      1\n",
+       "BRD-K11528380      1\n",
+       "BRD-K16621373      1\n",
+       "BRD-K40628295      1\n",
+       "BRD-K55232867      1\n",
+       "Name: pert_id, Length: 15649, dtype: int64"
       ]
      },
-     "execution_count": 19,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 19
     }
    ],
-   "source": [
-    "adata.obs.loc[cond_test, 'pert_id'].value_counts()"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
-   "id": "96ff7145",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "pert_id: 179/975 converted back to 'train' due to insufficient # of samples.\n",
-      "cell_id: 1/82 converted back to 'train' due to insufficient # of samples.\n"
-     ]
-    }
-   ],
+   "execution_count": 20,
    "source": [
     "pert_count_treshold = 5\n",
     "cov_count_treshold = 20\n",
@@ -560,141 +550,148 @@
     "\n",
     "cond = cond_test & adata.obs.pert_id.isin(pert_id_neg.index[pert_id_neg])\n",
     "cond |= cond_test & adata.obs.cell_id.isin(cov_id_neg.index[cov_id_neg])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "ec49475c",
-   "metadata": {},
+   ],
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
+     "name": "stdout",
      "text": [
-      "split['test']: 506/176587 samples are converted back to 'train'.\n"
+      "pert_id: 9257/15649 converted back to 'train' due to insufficient # of samples.\n",
+      "cell_id: 1/82 converted back to 'train' due to insufficient # of samples.\n"
      ]
     }
    ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
    "source": [
     "adata.obs['split1'] = adata.obs.split.copy()\n",
     "adata.obs.loc[cond, 'split1'] = 'train'\n",
     "print(f\"split['test']: {cond.sum()}/{len(cond)} samples are converted back to 'train'.\")"
-   ]
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
+      "Trying to set attribute `.obs` of view, copying.\n"
+     ]
+    },
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "split['test']: 18885/840677 samples are converted back to 'train'.\n"
+     ]
+    }
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "id": "0cfeddcb",
-   "metadata": {},
+   "execution_count": 22,
+   "source": [
+    "adata.obs.split1.value_counts()"
+   ],
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "train    145085\n",
-       "test      27086\n",
-       "ood        4416\n",
+       "train    718348\n",
+       "test     114498\n",
+       "ood        7831\n",
        "Name: split1, dtype: int64"
       ]
      },
-     "execution_count": 24,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 22
     }
    ],
-   "source": [
-    "adata.obs.split1.value_counts()"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "id": "3b6bd388",
-   "metadata": {},
+   "execution_count": 23,
+   "source": [
+    "adata.write(adata_out)\n",
+    "adata"
+   ],
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
+     "name": "stderr",
      "text": [
       "... storing 'pert_id' as categorical\n",
       "... storing 'canonical_smiles' as categorical\n"
      ]
     },
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "AnnData object with n_obs × n_vars = 176587 × 978\n",
-       "    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles', 'split1'\n",
+       "AnnData object with n_obs × n_vars = 840677 × 978\n",
+       "    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles', 'split1'\n",
        "    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'\n",
-       "    uns: 'rank_genes_groups_cov'"
+       "    uns: 'cydata_pull', 'rank_genes_groups_cov'"
       ]
      },
-     "execution_count": 25,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 23
     }
    ],
-   "source": [
-    "adata.write(adata_out)\n",
-    "adata"
-   ]
+   "metadata": {}
   },
   {
    "cell_type": "markdown",
-   "id": "ab3d1a36",
-   "metadata": {},
    "source": [
     "### Loading the result for `adata_out`"
-   ]
+   ],
+   "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "3724ff2d",
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 24,
    "source": [
     "adata = sc.read(adata_out)"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "9100f758",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "AnnData object with n_obs × n_vars = 176587 × 978\n",
-       "    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles', 'split1'\n",
-       "    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'\n",
-       "    uns: 'rank_genes_groups_cov'"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
+   "cell_type": "markdown",
+   "source": [
+    "### Check that `adata.uns['rank_genes_groups_cov']` has all entries in `adata.obs.cov_drug_dose_name` as keys"
    ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
    "source": [
-    "adata"
-   ]
+    "for i, k in enumerate(adata.obs.cov_drug_dose_name.unique()):\n",
+    "    try: \n",
+    "        adata.uns['rank_genes_groups_cov'][k]\n",
+    "    except: \n",
+    "        print(f\"{i}: {k}\") if 'DMSO' not in k else None"
+   ],
+   "outputs": [],
+   "metadata": {}
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0139f1c8",
-   "metadata": {},
+   "source": [],
    "outputs": [],
-   "source": []
+   "metadata": {}
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.8.8 64-bit ('py38': conda)"
   },
   "language_info": {
    "codemirror_mode": {
@@ -707,8 +704,11 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.8.8"
+  },
+  "interpreter": {
+   "hash": "6ea99325af0b09b2c098ddbe7d13703f3e2b6f04181042d146addb2812ce9602"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file