ToryDeng · Feb 10, 2023
diff --git a/‎README.md
+5-4 b/‎README.md
+5-4
diff --git a/‎benchmark/_recorder.py
+1-1 b/‎benchmark/_recorder.py
+1-1
diff --git a/‎benchmark/cluster/spatial/functions.py
-1 b/‎benchmark/cluster/spatial/functions.py
-1
diff --git a/‎benchmark/dataset/_io.py
+1-1 b/‎benchmark/dataset/_io.py
+1-1
diff --git a/‎benchmark/dataset/_load.py
+1 b/‎benchmark/dataset/_load.py
+1
diff --git a/‎benchmark/run_benchmark.py
+4-1 b/‎benchmark/run_benchmark.py
+4-1
diff --git a/‎benchmark/selection/spatial/functions.py
+1-1 b/‎benchmark/selection/spatial/functions.py
+1-1
diff --git a/‎main.py
+6-2 b/‎main.py
+6-2
diff --git a/‎tests/test_benchmark.py
+1-1 b/‎tests/test_benchmark.py
+1-1
diff --git a/‎tutorials/2023-02 14_54_26 scrna.xlsx
6.23 KB b/‎tutorials/2023-02 14_54_26 scrna.xlsx
6.23 KB
diff --git a/‎tutorials/2023-02 14_54_32 scrna.xlsx
6.32 KB b/‎tutorials/2023-02 14_54_32 scrna.xlsx
6.32 KB
diff --git a/‎tutorials/2023-02 14_55_29 spatial.xlsx
5.93 KB b/‎tutorials/2023-02 14_55_29 spatial.xlsx
5.93 KB
diff --git a/‎tutorials/2023-02 14_55_43 spatial.xlsx
6.05 KB b/‎tutorials/2023-02 14_55_43 spatial.xlsx
6.05 KB
diff --git a/‎tutorials/cache/clustering_result/mouse_brain/random_select/2000/random_clustering/0.npy
21.1 KB b/‎tutorials/cache/clustering_result/mouse_brain/random_select/2000/random_clustering/0.npy
21.1 KB
diff --git a/‎tutorials/cache/clustering_result/mouse_brain/random_select/2000/spaGCN/0.npy
21.1 KB b/‎tutorials/cache/clustering_result/mouse_brain/random_select/2000/spaGCN/0.npy
21.1 KB
diff --git a/‎tutorials/cache/selected_genes/mouse_brain/random_select/2000.npy
29.4 KB b/‎tutorials/cache/selected_genes/mouse_brain/random_select/2000.npy
29.4 KB
diff --git a/‎tutorials/read_records.ipynb
+239 b/‎tutorials/read_records.ipynb
+239
diff --git a/‎tutorials/run_benchmarks.ipynb
+702 b/‎tutorials/run_benchmarks.ipynb
+702
@@ -24,10 +24,10 @@ cl_cfg = {'clustering_method': 2}
 run_bench(data_cfg, fs_cfg, cl_cfg, modality='scrna', metrics=['ARI', 'NMI'])
 ```
 
-The evaluation results will be automatically saved as an XLSX file:
+The evaluation results will be automatically saved as an XLSX file in the working directory:
 
 ```text
-2023-02 23:12:47 scrna.xlsx
+2023-02 14_54_32 scrna.xlsx
 ```
 
 Other software features are:
@@ -45,6 +45,7 @@ Other software features are:
 
 | Name  | Language | Reference |
 | :---: | :---:    | :---:     |
+| GeneClust | Python | [paper](https://doi.org/10.1093/bib/bbad042) |
 | vst   | Python   | [paper](https://doi.org/10.1016/j.cell.2019.05.031) |
 | mvp   | Python   | [paper](https://www.nature.com/articles/nbt.3192) |
 | triku | Python   | [paper](https://doi.org/10.1093/gigascience/giac017) |
@@ -62,14 +63,14 @@ Other software features are:
 
 | Name  | Language | Reference |
 | :---: | :---:    | :---:     |
+| SC3s | Python  | [paper](https://doi.org/10.1186/s12859-022-05085-z) |
 | Seurat | R       | [paper](https://doi.org/10.1016/j.cell.2021.04.048) |
 | SHARP  | R       | [paper](http://www.genome.org/cgi/doi/10.1101/gr.254557.119) |
 | TSCAN | R       | [paper](https://doi.org/10.1093/nar/gkw430) |
 | CIDR | R       | [paper](https://doi.org/10.1186/s13059-017-1188-0) |
-| SC3s | Python  | [paper](https://doi.org/10.1186/s12859-022-05085-z) |
 
-### spatial transcriptomics
 
+### spatial transcriptomics
 #### Feature selection
 
 | Name  | Language | Reference |
 
@@ -56,7 +56,7 @@ def store_metrics_to_records(
 
 
 def write_records(records: Dict[str, pd.DataFrame], modality: Literal['scrna', 'spatial']):
-    record_name = f"{datetime.now().strftime('%Y-%m %H:%M:%S')} {modality}"
+    record_name = f"{datetime.now().strftime('%Y-%m %H_%M_%S')} {modality}"
     writer = pd.ExcelWriter(f'{record_name}.xlsx')
     for metric, record in records.items():
         record.to_excel(writer, sheet_name=metric, index=True)
 
@@ -35,7 +35,6 @@ def spaGCN_clustering(
         adata: ad.AnnData,
         img: np.ndarray,
         k: int,
-        # shape: Literal['hexagon', 'square'] = 'hexagon',
         random_state: int = 0
 ):
     # prepare positional information
 
@@ -10,7 +10,7 @@
 import scanpy as sc
 from loguru import logger
 
-from benchmark.dataset._utils import is_normalized, to_dense
+from ._utils import is_normalized, to_dense
 
 
 @logger.catch
 
@@ -83,4 +83,5 @@ def load_data(data_name: str, data_props: Dict[str, Union[os.PathLike, str]], mo
     else:
         if 'image_path' in data_props.keys():
             logger.warning("The image will not be loaded when using scRNA-seq data.")
+        return adata, None
 
@@ -12,6 +12,7 @@
 from .cluster import generally_cluster_obs
 from .dataset import load_data
 from .selection import generally_select_features
+from scGeneClust._utils import set_logger
 
 
 def run_bench(
@@ -61,7 +62,8 @@ def run_bench(
         function. The benchmark will call the function like `custom_fs_function(adata, n_selected_genes, **kwargs)`,
         and the return values must be an ndarray that contains features selected by the function. You can write a
         wrapper function to work around incompatible parameters/return values.
-        - list_of_numbers_of_selected_genes: a list of numbers of genes needed to be selected.
+        - list_of_numbers_of_selected_genes: a list of numbers of genes needed to be selected. If the function
+        internally determines the number of selected genes (e.g. GeneClust), write the list as `['auto']`.
 
     cl_cfg
         Configurations of downstream cell clustering/domain detection methods. It should be a dict in the format
@@ -94,6 +96,7 @@ def run_bench(
     -------
     None
     """
+    set_logger()
     if cl_kwarg is None:
         cl_kwarg = dict()
     if fs_kwarg is None:
 
@@ -63,7 +63,7 @@ def SPARKX(adata: ad.AnnData) -> pd.DataFrame:
         raw_adata = adata.raw.to_adata()
         pandas2ri.activate()
         spark = importr("SPARK")
-        stats, res_stest, res_mtest = spark.sparkx(raw_adata.X.T, raw_adata.obsm['spatial'], numCores=os.cpu_count()-1, verbose=False)
+        stats, res_stest, res_mtest = spark.sparkx(raw_adata.X.T, raw_adata.obsm['spatial'], verbose=False)
         pandas2ri.deactivate()
         results = pandas2ri.rpy2py(res_mtest).sort_values('adjustedPval')
         return pd.DataFrame({'Gene': raw_adata.var_names, 'Importance': results['adjustedPval']})
@@ -5,6 +5,10 @@
 # @Software: PyCharm
 
 import scanpy as sc
+from benchmark.selection.spatial.functions import SPARKX
 
-
-# TODO: add README, tutorials
+# TODO: add tutorials
+# TODO: add Seurat clustering for SRT
+# TODO: revise logger.catch
+# TODO: add gif of logging info
+# TODO: check GeneClust dependency
@@ -30,7 +30,7 @@ def test_scrna_benchmark():
         'to_remove': [np.nan, 'Megakaryocytes']
     }}
     fs_cfg = {'seurat_v3': [1000, 2000], random_select: [500], 'GeneClust-ps': ['auto']}
-    cl_cfg = {'KMeans': 2}
+    cl_cfg = {'KMeans': 2,  random_clustering: 1}
 
     run_bench(data_cfg, fs_cfg, cl_cfg, ['ARI', 'NMI'], 'scrna', clean_cache=True, fs_kwarg={'seed': 123}, cl_kwarg={'k':5, 'seed': 123})
     rm_cache("./cache")
 
@@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "4d4e2599-6ed9-4dc6-b63d-67ce42b68b50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2a44a6b1-e37d-44ed-8f32-443a2e71d280",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "record = pd.read_excel(\"2023-02 14_54_32 scrna.xlsx\", sheet_name=None, index_col=[0,1,2], header=[0,1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "cf271805-9220-4a14-9f44-b1064593db16",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>fs_method</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">seurat_v3</th>\n",
+       "      <th>random_select</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>n_genes</th>\n",
+       "      <th>1000</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>500</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dataset</th>\n",
+       "      <th>clustering_method</th>\n",
+       "      <th>run</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"3\" valign=\"top\">PBMC3k</th>\n",
+       "      <th rowspan=\"2\" valign=\"top\">KMeans</th>\n",
+       "      <th>0</th>\n",
+       "      <td>0.760368</td>\n",
+       "      <td>0.598463</td>\n",
+       "      <td>0.291028</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.845754</td>\n",
+       "      <td>0.598547</td>\n",
+       "      <td>0.293021</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>random_clustering</th>\n",
+       "      <th>0</th>\n",
+       "      <td>0.000263</td>\n",
+       "      <td>0.000263</td>\n",
+       "      <td>0.000263</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "fs_method                     seurat_v3           random_select\n",
+       "n_genes                            1000      2000          500 \n",
+       "dataset clustering_method run                                  \n",
+       "PBMC3k  KMeans            0    0.760368  0.598463      0.291028\n",
+       "                          1    0.845754  0.598547      0.293021\n",
+       "        random_clustering 0    0.000263  0.000263      0.000263"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "record['ARI']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "dc16ad98-5b68-4da1-a9ca-d842f6d199b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>fs_method</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">seurat_v3</th>\n",
+       "      <th>random_select</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>n_genes</th>\n",
+       "      <th>1000</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>500</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dataset</th>\n",
+       "      <th>clustering_method</th>\n",
+       "      <th>run</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"3\" valign=\"top\">PBMC3k</th>\n",
+       "      <th rowspan=\"2\" valign=\"top\">KMeans</th>\n",
+       "      <th>0</th>\n",
+       "      <td>0.769744</td>\n",
+       "      <td>0.751971</td>\n",
+       "      <td>0.464386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.820645</td>\n",
+       "      <td>0.751988</td>\n",
+       "      <td>0.461544</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>random_clustering</th>\n",
+       "      <th>0</th>\n",
+       "      <td>0.002664</td>\n",
+       "      <td>0.002664</td>\n",
+       "      <td>0.002664</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "fs_method                     seurat_v3           random_select\n",
+       "n_genes                            1000      2000          500 \n",
+       "dataset clustering_method run                                  \n",
+       "PBMC3k  KMeans            0    0.769744  0.751971      0.464386\n",
+       "                          1    0.820645  0.751988      0.461544\n",
+       "        random_clustering 0    0.002664  0.002664      0.002664"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "record['NMI']"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "bioinfo",
+   "language": "python",
+   "name": "bioinfo"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}