Skip to content

Commit

Permalink
Increase chunksize for insert cells mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
qiagu committed Feb 14, 2021
1 parent 7d7224c commit 58e593d
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 10 deletions.
27 changes: 17 additions & 10 deletions cycif_db/cyc_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def add_sample(self, sample):
log.info("Added sample {}.".format(repr(sample)))
return sample

def insert_cells_mappings(self, sample_id, cells, chunksize=2000,
def insert_cells_mappings(self, sample_id, cells, chunksize=10000,
**kwargs):
""" Insert cell quantification data into cells table.
Expand All @@ -121,18 +121,22 @@ def insert_cells_mappings(self, sample_id, cells, chunksize=2000,
marker_db_keys = [self.marker_header_to_dbkey(x) for x in markers]
other_columns = [self.other_feature_to_dbcolumn(x) for x in others]

if isinstance(cells, str):
for df in pd.read_csv(cells, chunksize=chunksize, **kwargs):
if isinstance(cells, DataFrame):
count = cells.shape[0]
for i in range(0, count, chunksize):
df = cells[i: i+chunksize]
self._batch_insert_cells_mappings(df, markers, marker_db_keys,
others, other_columns,
sample_id)

else: # cells is DataFrame
for i in range(0, cells.shape[0], chunksize):
df = cells[i: i+chunksize]
else:
count = 0
for df in pd.read_csv(cells, chunksize=chunksize, iterator=True,
**kwargs):
self._batch_insert_cells_mappings(df, markers, marker_db_keys,
others, other_columns,
sample_id)
count += df.shape[0]
log.info("Added total %d cell records!" % count)

def _batch_insert_cells_mappings(self, dataframe, markers, marker_db_keys,
others, other_columns, sample_id):
Expand Down Expand Up @@ -245,7 +249,7 @@ def insert_sample_markers(self, sample_id, markers, **kwargs):
log.info("Added %d entries of sample marker association!"
% len(associates))

def add_sample_complex(self, sample, cells, markers,
def add_sample_complex(self, sample, cells, markers, chunksize=10000,
dry_run=False, **kwargs):
""" Insert the quantification result from a single sample
into database, including cell quantification table and
Expand All @@ -259,6 +263,8 @@ def add_sample_complex(self, sample, cells, markers,
If str, it's path string to a csv file.
markers: str or pandas.DataFrame object.
If str, it's path string to a csv file.
        chunksize: int or None.
Used in `pd.read_csv`. Read in chunks.
dry_run: bool, default is False.
Whether to run the sample adding without commit.
kwargs: keywords parameter.
Expand All @@ -274,7 +280,8 @@ def add_sample_complex(self, sample, cells, markers,
"against the unique constraint or it has invalid `id`!")
try:
sample = self.add_sample(sample)
self.insert_cells_mappings(sample.id, cells, **kwargs)
self.insert_cells_mappings(sample.id, cells, chunksize=chunksize,
**kwargs)
self.insert_sample_markers(sample.id, markers, **kwargs)
if not dry_run:
self.commit()
Expand Down Expand Up @@ -309,7 +316,7 @@ def delete_sample(self, id=None, name=None, tag=None):
.filter(func.lower(Sample.name) == name.lower())\
.filter((Sample.tag == tag)
| (func.lower(Sample.tag) == str(tag).lower()))\
.delete()
.delete(synchronize_session='fetch')

self.commit()

Expand Down
4 changes: 4 additions & 0 deletions scripts/add_sample_complex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import logging
import pathlib
import sys
import time

work_dir = pathlib.Path(__file__).absolute().parent.parent
sys.path.insert(1, str(work_dir))
Expand Down Expand Up @@ -91,6 +92,9 @@
log.info(f"The path to Cells: {cells_path}.")
log.info(f"The path to Markers: {markers_path}.")

start_time = time.time()
with CycSession() as csess:
csess.add_sample_complex(
sample, cells_path, markers_path, dry_run=args.dry_run)
end_time = time.time()
log.info("Finished in %.10f s" % (end_time - start_time))

0 comments on commit 58e593d

Please sign in to comment.