Use Zarr library directly as a parallel write sink. #60

Open · wants to merge 5 commits into base: master
Changes from 1 commit
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Dec 21, 2024
commit 1d0ac9ea1f52e006ad027616d7ee3820928c9eca
74 changes: 33 additions & 41 deletions ALLCools/count_matrix/dataset.py
@@ -1,20 +1,22 @@
import pathlib
import subprocess
import tempfile
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import lru_cache
from shutil import rmtree
import tempfile


import numpy as np
import pandas as pd
import pybedtools
import pysam
import xarray as xr
import zarr
import zarr.convenience
import zarr.creation
import zarr.hierarchy
import zarr.storage
from numcodecs import blosc
from scipy import stats
import zarr, zarr.creation, zarr.convenience, zarr.hierarchy, zarr.storage

from ALLCools.utilities import parse_chrom_size, parse_mc_pattern

@@ -185,7 +187,7 @@ def _count_single_region_set(allc_table, region_config, obs_dim, region_dim):
data = xr.DataArray(
np.array([sample_data]),
coords=[[sample], region_ids, total_mc_types, ["mc", "cov"]],
dims=[obs_dim, region_dim, "mc_type", "count_type"]
dims=[obs_dim, region_dim, "mc_type", "count_type"],
)
total_data.append(data)
total_data = xr.Dataset({f"{region_dim}_da": xr.concat(total_data, dim=obs_dim)})
@@ -209,9 +211,7 @@ def _calculate_pv(data, reverse_value, obs_dim, var_dim, cutoff=0.9):
return pv


def _count_single_zarr(
allc_table, region_config, obs_dim, region_dim, chunk_start, regiongroup, count_dtype="uint32"
):
def _count_single_zarr(allc_table, region_config, obs_dim, region_dim, chunk_start, regiongroup, count_dtype="uint32"):
"""Process single region set and its quantifiers."""
# count all ALLC and mC types that's needed for quantifiers if this region_dim
count_ds = _count_single_region_set(
@@ -228,8 +228,9 @@ def _count_single_zarr(
count_da = count_ds.sel(mc_type=count_mc_types)[f"{region_dim}_da"]
max_int = np.iinfo(count_dtype).max
count_da = xr.where(count_da > max_int, max_int, count_da)
regiongroup[f"{region_dim}_da"][
chunk_start : chunk_start + allc_table.index.size, :, :, :] = count_da.astype(count_dtype).data
regiongroup[f"{region_dim}_da"][chunk_start : chunk_start + allc_table.index.size, :, :, :] = count_da.astype(
count_dtype
).data
# deal with hypo-score, hyper-score quantifiers
for quant in region_config["quant"]:
if quant.quant_type == "hypo-score":
@@ -253,7 +254,9 @@ def _count_single_zarr(
var_dim=region_dim,
**quant.kwargs,
)
regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][chunk_start : chunk_start + allc_table.index.size, :] = data.data
regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][
chunk_start : chunk_start + allc_table.index.size, :
] = data.data

return True

@@ -312,7 +315,7 @@ def generate_dataset(
# prepare regions and determine quantifiers
pathlib.Path(output_path).mkdir(exist_ok=True)
z = zarr.storage.DirectoryStore(path=output_path)
root = zarr.hierarchy.group(store = z, overwrite = True)
root = zarr.hierarchy.group(store=z, overwrite=True)
datasets, tmpdir = _determine_datasets(regions, quantifiers, chrom_size_path)
# copy chrom_size_path to output_path
subprocess.run(["cp", "-f", chrom_size_path, f"{output_path}/chrom_sizes.txt"], check=True)
@@ -324,12 +327,9 @@ def generate_dataset(
bed.index.name = region_dim
region_size = bed.index.size
dsobs = regiongroup.array(
name=obs_dim,
data=allc_table.index.values,
chunks=(chunk_size),
dtype=f"<U{max_length}"
name=obs_dim, data=allc_table.index.values, chunks=(chunk_size), dtype=f"<U{max_length}"
)
dsobs.attrs['_ARRAY_DIMENSIONS'] = [obs_dim]
dsobs.attrs["_ARRAY_DIMENSIONS"] = [obs_dim]
# append region bed to the saved ds
ds = xr.Dataset()
for col, data in bed.items():
@@ -350,41 +350,33 @@ def generate_dataset(
name=f"{region_dim}_da",
shape=(n_sample, region_size, len(count_mc_types), 2),
chunks=(chunk_size, region_size, len(count_mc_types), 2),
dtype="uint32"
)
DA.attrs['_ARRAY_DIMENSIONS']=[obs_dim, region_dim, "mc_type", "count_type"]
count = regiongroup.array(
name="count_type",
data=(["mc", "cov"]),
dtype="<U3"
)
count.attrs['_ARRAY_DIMENSIONS']=["count_type"]
mc = regiongroup.array(
name="mc_type",
data=count_mc_types,
dtype="<U3"
dtype="uint32",
)
mc.attrs['_ARRAY_DIMENSIONS']=["mc_type"]
DA.attrs["_ARRAY_DIMENSIONS"] = [obs_dim, region_dim, "mc_type", "count_type"]
count = regiongroup.array(name="count_type", data=(["mc", "cov"]), dtype="<U3")
count.attrs["_ARRAY_DIMENSIONS"] = ["count_type"]
mc = regiongroup.array(name="mc_type", data=count_mc_types, dtype="<U3")
mc.attrs["_ARRAY_DIMENSIONS"] = ["mc_type"]
# deal with hypo-score, hyper-score quantifiers
for quant in region_config["quant"]:
if quant.quant_type == "hypo-score":
for mc_type in quant.mc_types:
hypo = regiongroup.empty (
name = f"{region_dim}_da_{mc_type}-hypo-score",
hypo = regiongroup.empty(
name=f"{region_dim}_da_{mc_type}-hypo-score",
shape=(allc_table.size, region_size),
chunks = (chunk_size, region_size),
dtype = "float16"
chunks=(chunk_size, region_size),
dtype="float16",
)
hypo.attrs['_ARRAY_DIMENSIONS']=[obs_dim, region_dim]
hypo.attrs["_ARRAY_DIMENSIONS"] = [obs_dim, region_dim]
elif quant.quant_type == "hyper-score":
for mc_type in quant.mc_types:
hyper = regiongroup.empty (
name = f"{region_dim}_da_{mc_type}-hyper-score",
hyper = regiongroup.empty(
name=f"{region_dim}_da_{mc_type}-hyper-score",
shape=(allc_table.size, region_size),
chunks = (chunk_size, region_size),
dtype = "float16"
chunks=(chunk_size, region_size),
dtype="float16",
)
hyper.attrs['_ARRAY_DIMENSIONS']=[obs_dim, region_dim]
hyper.attrs["_ARRAY_DIMENSIONS"] = [obs_dim, region_dim]
blosc.use_threads = False
with ProcessPoolExecutor(cpu) as exe:
futures = {}
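The dataset.py changes above are the heart of this PR: full-size Zarr arrays are created up front in a DirectoryStore, tagged with `_ARRAY_DIMENSIONS` so xarray can later open the store as a Dataset, and each worker process then writes its own chunk of samples directly into the shared arrays. Below is a minimal, self-contained sketch of that pattern; the array name, dimension names, shapes, chunk size, and worker count are illustrative placeholders, not the PR's actual values.

```python
# Sketch only: a hypothetical standalone example of the parallel-write-sink
# pattern, not ALLCools code. Names and sizes are made up for illustration.
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import zarr
import zarr.hierarchy
import zarr.storage
from numcodecs import blosc

N_SAMPLE, N_REGION, CHUNK_SIZE = 100, 5000, 10
OUTPUT_PATH = "example_dataset.zarr"


def write_chunk(chunk_start):
    """Compute one chunk of samples and write it straight into the shared store."""
    root = zarr.open_group(OUTPUT_PATH, mode="r+")
    da = root["region_da"]
    # Stand-in for the real per-sample counting work.
    counts = np.random.randint(0, 1000, size=(CHUNK_SIZE, N_REGION), dtype="uint32")
    da[chunk_start : chunk_start + CHUNK_SIZE, :] = counts
    return chunk_start


if __name__ == "__main__":
    store = zarr.storage.DirectoryStore(path=OUTPUT_PATH)
    root = zarr.hierarchy.group(store=store, overwrite=True)
    # Chunk along the sample axis so each worker only touches its own chunks.
    da = root.zeros(
        name="region_da",
        shape=(N_SAMPLE, N_REGION),
        chunks=(CHUNK_SIZE, N_REGION),
        dtype="uint32",
    )
    # xarray reads this attribute to recover dimension names from the store.
    da.attrs["_ARRAY_DIMENSIONS"] = ["cell", "region"]

    # Keep Blosc single-threaded while multiple processes compress chunks.
    blosc.use_threads = False
    with ProcessPoolExecutor(4) as exe:
        futures = [exe.submit(write_chunk, start) for start in range(0, N_SAMPLE, CHUNK_SIZE)]
        for future in as_completed(futures):
            future.result()
```

Because every worker writes a disjoint, chunk-aligned slice, no inter-process locking or post-hoc merge step is needed; the diff's `blosc.use_threads = False` serves the same purpose as in this sketch.
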
20 changes: 10 additions & 10 deletions docs/CONDUCT.md
@@ -8,19 +8,19 @@ In the interest of fostering an open and welcoming environment, we as contributo

Examples of behavior that contributes to creating a positive environment include:

- Using welcoming and inclusive language
- Being respectful of differing viewpoints and experiences
- Gracefully accepting constructive criticism
- Focusing on what is best for the community
- Showing empathy towards other community members
- Using welcoming and inclusive language
- Being respectful of differing viewpoints and experiences
- Gracefully accepting constructive criticism
- Focusing on what is best for the community
- Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

- The use of sexualized language or imagery and unwelcome sexual attention or advances
- Trolling, insulting/derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or electronic address, without explicit permission
- Other conduct which could reasonably be considered inappropriate in a professional setting
- The use of sexualized language or imagery and unwelcome sexual attention or advances
- Trolling, insulting/derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or electronic address, without explicit permission
- Other conduct which could reasonably be considered inappropriate in a professional setting

## Our Responsibilities

14 changes: 7 additions & 7 deletions docs/CONTRIBUTING.md
@@ -9,9 +9,9 @@ Report bugs using GitHub issues.

If you are reporting a bug, please include:

- Your operating system name and version.
- Any details about your local setup that might be helpful in troubleshooting.
- Detailed steps to reproduce the bug.
- Your operating system name and version.
- Any details about your local setup that might be helpful in troubleshooting.
- Detailed steps to reproduce the bug.

## Fix Bugs

@@ -35,10 +35,10 @@ The best way to send feedback is to file an issue on GitHub.

If you are proposing a feature:

- Explain in detail how it would work.
- Keep the scope as narrow as possible, to make it easier to implement.
- Remember that this is a volunteer-driven project, and that contributions
are welcome :)
- Explain in detail how it would work.
- Keep the scope as narrow as possible, to make it easier to implement.
- Remember that this is a volunteer-driven project, and that contributions
are welcome :)

## Get Started

12 changes: 6 additions & 6 deletions docs/README.md
@@ -8,10 +8,10 @@ ALLCools documentation

If you'd like to develop on and build the ALLCools book, you should:

- Clone this repository and run
- Run `pip install -r requirements.txt` (it is recommended you do this within a virtual environment)
- (Recommended) Remove the existing `ALLCools/_build/` directory
- Run `jupyter-book build ALLCools/`
- Clone this repository and run
- Run `pip install -r requirements.txt` (it is recommended you do this within a virtual environment)
- (Recommended) Remove the existing `ALLCools/_build/` directory
- Run `jupyter-book build ALLCools/`

A fully-rendered HTML version of the book will be built in `ALLCools/_build/html/`.

@@ -21,8 +21,8 @@ The html version of the book is hosted on the `gh-pages` branch of this repo. A

If you wish to disable this automation, you may remove the GitHub actions workflow and build the book manually by:

- Navigating to your local build; and running,
- `ghp-import -n -p -f ALLCools/_build/html`
- Navigating to your local build; and running,
- `ghp-import -n -p -f ALLCools/_build/html`

This will automatically push your build to the `gh-pages` branch. More information on this hosting process can be found [here](https://jupyterbook.org/publish/gh-pages.html#manually-host-your-book-with-github-pages).

12 changes: 6 additions & 6 deletions docs/allcools/cell_level/basic/intro_basic_clustering.md
@@ -14,19 +14,19 @@ The dataset we used for 100Kb clustering documentation comes from the hippocampu

#### Download Input Files

- Cell metadata: ADD DOWNLOAD URL
- single-cell ALLC files: ADD DOWNLOAD URL
- MCDS files: ADD DOWNLOAD URL
- Cell metadata: ADD DOWNLOAD URL
- single-cell ALLC files: ADD DOWNLOAD URL
- MCDS files: ADD DOWNLOAD URL

### For 5Kb bins clustering

The dataset we used for 5Kb clustering documentation comes from human PBMC (ADD REFERENCE).

#### Download Input Files

- Cell metadata: ADD DOWNLOAD URL
- single-cell ALLC files: ADD DOWNLOAD URL
- MCDS files: ADD DOWNLOAD URL
- Cell metadata: ADD DOWNLOAD URL
- single-cell ALLC files: ADD DOWNLOAD URL
- MCDS files: ADD DOWNLOAD URL

## Prepare your own datasets

16 changes: 8 additions & 8 deletions docs/allcools/intro.md
@@ -21,10 +21,10 @@ ALLCools documentation organization.

## Authors

- Hanqing Liu, developer, initial conception
- Jingtian Zhou, developer, 5kb clustering algorithms
- Wei Tian
- Jiaying Xu
- Hanqing Liu, developer, initial conception
- Jingtian Zhou, developer, 5kb clustering algorithms
- Wei Tian
- Jiaying Xu

## Support

@@ -37,10 +37,10 @@ figclass: margin
Click on this to create a page specific issue.
```

- The source code is on [github](https://github.com/lhqing/ALLCools);
- For releases and changelog, please check out the [github releases page](https://github.com/lhqing/ALLCools/releases);
- For bugs and feature requests, please use the [issue tracker](https://github.com/lhqing/ALLCools/issues).
- For page-specific issues, please use the "open issue" button on the top-right toggle.
- The source code is on [github](https://github.com/lhqing/ALLCools);
- For releases and changelog, please check out the [github releases page](https://github.com/lhqing/ALLCools/releases);
- For bugs and feature requests, please use the [issue tracker](https://github.com/lhqing/ALLCools/issues).
- For page-specific issues, please use the "open issue" button on the top-right toggle.

## Citing ALLCools

22 changes: 11 additions & 11 deletions docs/allcools/start/analysis_steps.md
@@ -20,11 +20,11 @@ In general, the **cellular analysis** is focused on individual cells' overall di

### Sections

- [Basic walk-through of the clustering analysis](../cell_level/basic/intro_basic_clustering.md).
- [Step-by-step description of the clustering analysis](../cell_level/step_by_step/intro_step_by_step_clustering.md).
- [Identification of Differentially Methylated Genes clusters](../cell_level/dmg/intro_dmg.md).
- [Cell-level data integration](../cell_level/integration/intro_integration.md).
- [Potential cell doublets identification](../cell_level/doublets/intro_doublets.md).
- [Basic walk-through of the clustering analysis](../cell_level/basic/intro_basic_clustering.md).
- [Step-by-step description of the clustering analysis](../cell_level/step_by_step/intro_step_by_step_clustering.md).
- [Identification of Differentially Methylated Genes clusters](../cell_level/dmg/intro_dmg.md).
- [Cell-level data integration](../cell_level/integration/intro_integration.md).
- [Potential cell doublets identification](../cell_level/doublets/intro_doublets.md).

### Input

@@ -49,12 +49,12 @@ Specifically, this strategy starts from a cell-by-5kb-bin hypo-methylation score

### Sections

- [Prepare pseudo-bulk ALLC files](../cluster_level/intro.md)
- [Call Differentially Methylated Region (DMR)](../cluster_level/RegionDS/01a.call_dmr)
- [DMR annotation](../cluster_level/RegionDS/02.annotation.ipynb)
- [DMR motif analysis (finding upstream regulators of DMRs)](../cluster_level/RegionDS/intro_motif.md)
- [DMR - Gene correlation analysis (finding downstream targets of DMRs)](../cluster_level/Correlation/intro_corr)
- [Enhancer prediction with REPTILE algorithm](../cluster_level/REPTILE/intro_reptile.md)
- [Prepare pseudo-bulk ALLC files](../cluster_level/intro.md)
- [Call Differentially Methylated Region (DMR)](../cluster_level/RegionDS/01a.call_dmr)
- [DMR annotation](../cluster_level/RegionDS/02.annotation.ipynb)
- [DMR motif analysis (finding upstream regulators of DMRs)](../cluster_level/RegionDS/intro_motif.md)
- [DMR - Gene correlation analysis (finding downstream targets of DMRs)](../cluster_level/Correlation/intro_corr)
- [Enhancer prediction with REPTILE algorithm](../cluster_level/REPTILE/intro_reptile.md)

### Basic process of genomic analysis

4 changes: 2 additions & 2 deletions docs/allcools/start/installation.md
@@ -123,8 +123,8 @@ conda deactivate

Here are some optional packages which might be hard to install on some old systems.

- `rpy2` (R and the R package pvclust) is used for the cluster dendrogram.
- `tpot` is used in REPTILE model.
- `rpy2` (R and the R package pvclust) is used for the cluster dendrogram.
- `tpot` is used in REPTILE model.

```shell
mamba install -n allcools rpy2