add CellO cell type classification

bichkd · Jan 12, 2023 · 8e45b46 · 8e45b46
1 parent db2caa3
commit 8e45b46
Show file tree

Hide file tree

Showing 4 changed files with 1,639 additions and 239 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,15 @@
 a FAIR and scalable interactive visual analytics app for scRNA-Seq data
 
 ## Setting up
+
+Download and unpack the CellO resources (needed for the CellO cell type classification):
+```bash
+mkdir scratch/cello_resources
+curl https://deweylab.biostat.wisc.edu/cell_type_classification/resources_v2.0.0.tar.gz >scratch/cello_resources/resources_v2.0.0.tar.gz
+tar -C scratch/cello_resources -zxf scratch/cello_resources/resources_v2.0.0.tar.gz
+```
+
+
 ```bash
 docker-compose up
 conda env create -f data_import/environment.yml
@@ -11,3 +20,11 @@ make reset_database test_studydata_import
 # 'normal_studydata': real life studies (i.e. with full amount of cells and genes)
 make normal_studydata_import
 ```
+
+## Manually executing the study data preparation Jupyter notebooks
+
+The notebooks are run in headless mode by `make`. To create new notebooks or explore datasets interactively, start JupyterLab:
+
+```bash
+(cd data_import && PYTHONPATH=$(pwd) jupyter-lab)
+```
diff --git a/data_import/environment.yml b/data_import/environment.yml
@@ -4,8 +4,10 @@ channels:
   - bioconda
 dependencies:
   - python=3.10
-  - pip
   - pyjaspar
+  # cello-classify needs pygraphviz, which requires the graphviz headers for compilation
+  - graphviz
+  - pip
   - pip:
     - pandas
     - numpy
@@ -17,7 +19,6 @@ dependencies:
     - sqlalchemy
     - psycopg2-binary
     - jupyterlab
-    - 'dramatiq[watch]'
-    - dramatiq-pg
     - pybiomart
     - lxml
+    - cello-classify
diff --git a/data_import/h5ad_preparation.py b/data_import/h5ad_preparation.py
@@ -8,6 +8,8 @@
 from anndata import AnnData
 import logging
 from pathlib import Path
+import cello
+
 import Density_Sampling.density_sampling as density_sampling
 
 logging.basicConfig(format='%(asctime)s.%(msecs)03d %(process)d %(levelname)s %(name)s:%(lineno)d %(message)s',
@@ -292,3 +294,21 @@ def calculate_differentially_expressed_genes(
 
     _cellenium_uns_dictionary(adata)['differentially_expressed_genes'] = result_dataframe.copy()
     return result_dataframe
+
+
+def cello_classify_celltypes(adata: AnnData, cello_clustering_attribute: str) -> None:
+    """Annotate the cells of a human dataset with CellO cell type predictions.
+
+    Runs ``cello.scanpy_cello`` on a copy of ``adata`` that excludes all genes
+    whose names start with "RPS" or "RPL", then copies CellO's
+    'Most specific cell type' column into ``adata.obs['CellO_celltype']`` and
+    makes 'CellO_celltype' the first entry of
+    ``adata.uns['cellenium']['main_sample_attributes']``.
+
+    ``adata`` is modified in place. If
+    ``adata.uns['cellenium']['taxonomy_id']`` is not 9606, the function only
+    logs a message and returns without changing anything.
+
+    :param adata: annotated data matrix; must carry the ``uns['cellenium']``
+        dictionary with 'taxonomy_id' and 'main_sample_attributes'
+    :param cello_clustering_attribute: name of the clustering that is passed
+        to CellO as ``clust_key``
+    """
+    if adata.uns['cellenium']['taxonomy_id'] != 9606:
+        logging.info('skipping CellO classification, taxonomy_id is not human')
+        return
+    resource_dir = basedir.joinpath('cello_resources')
+    os.makedirs(resource_dir, exist_ok=True)
+    # Mahmoud: CellO sometimes misclassifies cells because of ribosomal protein
+    # genes, so they are filtered out before the CellO call.
+    is_ribosomal = adata.var_names.str.startswith(("RPS", "RPL"))
+    adata_cello = adata[:, ~is_ribosomal].copy()
+    cello.scanpy_cello(adata_cello, clust_key=cello_clustering_attribute, rsrc_loc=resource_dir, term_ids=True)
+    adata.obs['CellO_celltype'] = adata.obs.join(adata_cello.obs['Most specific cell type'])['Most specific cell type']
+
+    # List the CellO annotation first. Any entry left over from an earlier call
+    # is dropped so that re-running the classification does not add duplicates.
+    other_attributes = [
+        attribute
+        for attribute in adata.uns['cellenium']['main_sample_attributes']
+        if attribute != 'CellO_celltype'
+    ]
+    adata.uns['cellenium']['main_sample_attributes'] = ['CellO_celltype', *other_attributes]
+
diff --git a/data_import/public_data/pancreas_atlas.ipynb b/data_import/public_data/pancreas_atlas.ipynb