Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
quentinblampey committed Sep 4, 2023
0 parents commit b3e4303
Show file tree
Hide file tree
Showing 11 changed files with 579 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignoring scyan tutorials (notebooks) for language statistics on Github
*.ipynb -linguist-detectable
115 changes: 115 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Misc
cytospace
show_small.py
archive/
workflow/.snakemake
workflow/logs
*.ignore.*
explore

# OS related
.DS_Store

# Documentation
site

# Code Ocean related
code
results

# Logs
outputs
multirun
lightning_logs
checkpoints
run.log

# Data files
data/*/*
!data/aml/default.csv
!data/aml/short.h5ad
!data/bmmc/default.csv
!data/poised/default.csv
!data/debarcoding/default.csv
!data/README.md

# Jupyter Notebook
.ipynb_checkpoints
exploration/*
*.ipynb
!notebooks/*.ipynb

# pyenv
.python-version

# VisualStudioCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
**/.vscode

# Lightning-Hydra-Template
logs/
wandb/
.env
.autoenv

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
!cli/src/lib
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Misc
jobs/
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Spatial-omics preprocessing and analysis

TODO: write README
Empty file added sopa/__init__.py
Empty file.
66 changes: 66 additions & 0 deletions sopa/io/qptif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import argparse
import re
import shutil
from pathlib import Path
from typing import List

import tifffile as tf

from .xenium import save_ome_tif


def get_channel_name(description: str) -> str:
    """Extract the channel name from a page's OME-XML description.

    Args:
        description: XML description string containing a ``<Name>...</Name>`` tag.

    Returns:
        The text of the first ``<Name>`` tag.

    Raises:
        ValueError: if the description contains no ``<Name>`` tag (the original
            code raised an opaque ``AttributeError`` on ``None`` in this case).
    """
    match = re.search(r"<Name>(.*?)</Name>", description)
    if match is None:
        raise ValueError(f"No <Name>...</Name> tag found in description: {description!r}")
    return match.group(1)


def read_series(path: Path) -> List[tf.TiffPageSeries]:
    """Open the qptiff file at ``path`` and return its sub-series, largest first.

    The file's first series holds one sub-series per pyramid level; they are
    returned sorted by decreasing pixel count (full resolution first).
    """
    with tf.TiffFile(path) as tif:
        by_size = sorted(tif.series[0].series, key=lambda s: s.size)
        return list(reversed(by_size))


def write_zarr(path: Path, series: List[tf.TiffPageSeries], names: List[str]) -> None:
    """Save the full-resolution series as a zarr store with labelled channels.

    Args:
        path: Destination zarr path; an existing store there is replaced.
        series: Pyramid series as returned by ``read_series`` (largest first).
        names: Channel names, one per entry along the ``c`` dimension.
    """
    import dask.array as da
    import xarray as xr

    full_res = series[0]
    # NOTE(review): ``_axes`` is a private tifffile attribute — assumed to hold
    # the axis labels (e.g. "CYX") for this series; confirm against tifffile.
    image = xr.DataArray(
        da.asarray(full_res.asarray()),
        dims=list(full_res._axes.lower()),
        coords={"c": names},
    )

    # Replace any previous export at the same location.
    if path.exists():
        shutil.rmtree(path)

    print("Saving xarray")
    xr.Dataset({"image": image}).to_zarr(path)


def main(args):
    """Convert the qptiff at ``args.path`` into an OME-TIFF at ``args.output``.

    Reads the pyramid series, extracts one channel name per page of the
    full-resolution level, and writes the result via ``save_ome_tif``.
    """
    input_path = Path(args.path)
    output_path = Path(args.output)

    assert not output_path.exists(), f"Output path {output_path} already exists"

    series = read_series(input_path)
    # NOTE(review): ``_pages`` is a private tifffile attribute — assumed to be
    # one page per channel of the full-resolution series.
    channel_names = [get_channel_name(p.description) for p in series[0]._pages]

    save_ome_tif(output_path, series, channel_names)


if __name__ == "__main__":
    # CLI entry point: both arguments are required string paths.
    parser = argparse.ArgumentParser()
    for short, long, help_text in (
        ("-p", "--path", "Path to the qptiff file"),
        ("-o", "--output", "Path to the morphology.ome.tif file"),
    ):
        parser.add_argument(short, long, type=str, required=True, help=help_text)

    main(parser.parse_args())
3 changes: 3 additions & 0 deletions sopa/io/xenium/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .groups import save_groups
from .ome import save_ome_tif
from .polygons import save_polygons
39 changes: 39 additions & 0 deletions sopa/io/xenium/groups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
import zarr


def add_group(root: zarr.Group, index: int, categories: np.ndarray):
    """Store one grouping as CSC-style ``indices``/``indptr`` arrays under ``root``.

    Cells are grouped by category value (in sorted ``np.unique`` order):
    ``indices`` concatenates the cell positions of each category, and
    ``indptr`` marks where each category's slice starts.
    """
    group = root.create_group(index)

    per_category = [
        np.flatnonzero(categories == category) for category in np.unique(categories)
    ]
    cumulative = np.cumsum([len(block) for block in per_category])

    indices = np.concatenate(per_category)
    indptr = np.concatenate([[0], cumulative[:-1]])

    group.array("indices", indices, dtype="uint32", chunks=(len(indices),))
    group.array("indptr", indptr, dtype="uint32", chunks=(len(indptr),))


def save_groups(path: str, df: pd.DataFrame):
    """Write cell groupings to a zipped zarr store (Xenium Explorer format).

    Args:
        path: Destination path of the ``.zarr.zip`` store.
        df: One column per grouping; each column holds a category per cell.
    """
    ATTRS = {
        "major_version": 1,
        "minor_version": 0,
        "number_groupings": 1,
        "grouping_names": [],
        "group_names": [],
    }

    with zarr.ZipStore(path, mode="w") as store:
        g = zarr.group(store=store)
        cell_groups = g.create_group("cell_groups")

        for i, name in enumerate(df.columns):
            ATTRS["grouping_names"].append(name)
            # Use sorted-unique order to match add_group, which orders its
            # indices/indptr blocks by np.unique — the original used
            # df[name].unique() (appearance order), misaligning labels
            # with the index blocks whenever the two orders differ.
            ATTRS["group_names"].append([str(cat) for cat in np.unique(df[name])])

            add_group(cell_groups, i, df[name])

        cell_groups.attrs.put(ATTRS)
43 changes: 43 additions & 0 deletions sopa/io/xenium/ome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pathlib import Path
from typing import List

import tifffile as tf


def save_ome_tif(
    output_path: Path,
    series: List[tf.TiffPageSeries],
    channel_names: List[str],
    pixelsize: float = 0.2125,
) -> None:
    """Write a pyramidal OME-TIFF with the given channel names.

    Args:
        output_path: Destination ``.ome.tif`` path.
        series: Pyramid levels, full resolution first; level ``i`` is assumed
            to be downsampled by ``2**i`` relative to level 0.
        channel_names: One name per channel.
        pixelsize: Physical pixel size of the full-resolution level, in µm.
    """
    with tf.TiffWriter(output_path, bigtiff=True) as tif:
        metadata = {
            "SignificantBits": 8,
            "PhysicalSizeX": pixelsize,
            "PhysicalSizeXUnit": "µm",
            "PhysicalSizeY": pixelsize,
            "PhysicalSizeYUnit": "µm",
            "Channel": {"Name": channel_names},
        }
        options = dict(
            photometric="minisblack",
            tile=(1024, 1024),
            compression="jpeg2000",
            resolutionunit="CENTIMETER",
        )
        # Full-resolution level; sub-resolutions are stored as SubIFDs.
        tif.write(
            series[0].asarray(),
            subifds=len(series) - 1,
            resolution=(1e4 / pixelsize, 1e4 / pixelsize),  # pixels per cm
            metadata=metadata,
            **options,
        )

        for i in range(1, len(series)):
            print(f"Writing sub-resolution {i}...")
            # Level i is downsampled by 2**i, so its pixel size grows by 2**i
            # and its resolution (pixels per cm) shrinks by the same factor.
            # The original multiplied by 2**i instead of dividing, inverting
            # the resolution tag of every sub-resolution level.
            tif.write(
                series[i].asarray(),
                subfiletype=1,
                resolution=(1e4 / (pixelsize * 2**i), 1e4 / (pixelsize * 2**i)),
                **options,
            )
88 changes: 88 additions & 0 deletions sopa/io/xenium/polygons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from math import ceil
from pathlib import Path

import numpy as np
import zarr

# Column metadata for the "cell_summary" array written by save_polygons:
# one row per cell, seven float64 columns (the array itself is zero-filled).
CELLS_SUMMARY_ATTRS = {
    "column_descriptions": [
        "Cell centroid in X",
        "Cell centroid in Y",
        "Cell area",
        "Nucleus centroid in X",
        "Nucleus centroid in Y",
        "Nucleus area",
        "z_level",
    ],
    "column_names": [
        "cell_centroid_x",
        "cell_centroid_y",
        "cell_area",
        "nucleus_centroid_x",
        "nucleus_centroid_y",
        "nucleus_area",
        "z_level",
    ],
}

# Root attributes of the segmentation zarr group: two polygon sets
# ("nucleus" and "cell") sharing the same vertex layout.
# NOTE: save_polygons adds a "number_cells" key at save time.
GROUP_ATTRS = {
    "major_version": 5,
    "minor_version": 0,
    "name": "CellSegmentationDataset",
    "polygon_set_descriptions": [
        "DAPI-based nuclei segmentation",
        "Expansion of nuclei boundaries by 15 \u03bcm",
    ],
    "polygon_set_display_names": ["Nucleus boundaries", "Cell boundaries"],
    "polygon_set_names": ["nucleus", "cell"],
    "spatial_units": "microns",
}


def save_polygons(path: Path, coordinates: np.ndarray) -> None:
    """Write cell polygons to a zipped zarr store (CellSegmentationDataset layout).

    The same coordinates are written for both polygon sets (nucleus and cell).

    Args:
        path: Destination path of the ``.zarr.zip`` store.
        coordinates: Flattened polygon vertices, one row per cell — assumed
            shape ``(num_cells, 2 * n_vertices)`` with interleaved (x, y)
            pairs; TODO confirm against the caller.
    """
    num_cells = len(coordinates)
    cells_fourth = ceil(num_cells / 4)
    cells_half = ceil(num_cells / 2)

    # Build a per-call copy instead of mutating the module-level GROUP_ATTRS
    # constant (the original wrote "number_cells" into the shared dict, a
    # side effect that leaked across calls).
    group_attrs = {**GROUP_ATTRS, "number_cells": num_cells}

    # Axis 0 indexes the two polygon sets (nucleus, cell) — identical here.
    polygon_vertices = np.stack([coordinates, coordinates])
    num_points = polygon_vertices.shape[2]
    n_vertices = num_points // 2  # interleaved (x, y) pairs

    with zarr.ZipStore(path, mode="w") as store:
        g = zarr.group(store=store)
        g.attrs.put(group_attrs)

        g.array(
            "polygon_vertices",
            polygon_vertices,
            dtype="float32",
            chunks=(1, cells_fourth, ceil(num_points / 4)),
        )

        # Column 0: 1-based cell ids; column 1: constant 1 (meaning unclear
        # from this file — presumably a dataset/version flag; verify).
        cell_id = np.ones((num_cells, 2))
        cell_id[:, 0] = np.arange(1, num_cells + 1)
        g.array("cell_id", cell_id, dtype="uint32", chunks=(cells_half, 1))

        # Placeholder summary statistics (all zeros); see CELLS_SUMMARY_ATTRS.
        g.array(
            "cell_summary",
            np.zeros((num_cells, 7)),
            dtype="float64",
            chunks=(num_cells, 1),
        )
        g["cell_summary"].attrs.put(CELLS_SUMMARY_ATTRS)

        # Every polygon of both sets has the same vertex count.
        g.array(
            "polygon_num_vertices",
            np.full((2, num_cells), n_vertices),
            dtype="int32",
            chunks=(1, cells_half),
        )

        # 1-based mask value per cell, matching cell_id's first column.
        g.array(
            "seg_mask_value",
            np.arange(1, num_cells + 1),
            dtype="uint32",
            chunks=(cells_half,),
        )
Loading

0 comments on commit b3e4303

Please sign in to comment.