Skip to content

Commit

Permalink
add more parameters to MAGIC.py script, 10x_HDF5 data loading, and sa…
Browse files Browse the repository at this point in the history
…ving data to csv from gui
  • Loading branch information
pkathail committed Jul 10, 2017
1 parent b776e99 commit 0f2421d
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 30 deletions.
36 changes: 26 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,14 @@ A python GUI is now available for MAGIC. After following the installation steps
MAGIC can be run using the command line script `MAGIC.py` with the following parameters:

$> MAGIC.py -h
usage: MAGIC.py [-h] -d D -o O [-g G] [--gene-name-file GN]
[--use-ensemble-ids] [--cell-axis CA] [--skip-rows SKIP_ROWS]
[--skip-columns SKIP_COLUMNS] [-n] [-l L]
[--mols-per-cell-min MOLS_PER_CELL_MIN]
[--mols-per-cell-max MOLS_PER_CELL_MAX] [-p P]
[--pca-non-random] [-t T] [-k K] [-ka KA] [-e E] [-r R]
{csv,10x,10x_HDF5,mtx}
usage: MAGIC.py [-h] -d I -o O [-n] [-l L] [-g G] [-p N] [--pca-non-random]
[-t T] [-k K] [-ka KA] [-e E] [-r R]
{csv,10x,mtx}

run MAGIC

positional arguments:
Expand All @@ -46,21 +49,34 @@ MAGIC can be run using the command line script `MAGIC.py` with the following par
optional arguments:
-h, --help show this help message and exit

required arguments:
data loading parameters:
-d D, --data-file D File path of input data file.
-o O, --output-file O
File path of where to save the MAGIC imputed data (in
csv format).

normalization parameters:
-g G, --genome G Genome must be specified when loading 10x_HDF5 data.
--gene-name-file GN Gene name file must be specified when loading mtx
data.
--use-ensemble-ids Use ensemble IDs instead of gene names.
--cell-axis CA When loading a csv, specify whether cells are on rows
or columns (Default = 'rows').
--skip-rows SKIP_ROWS
When loading a csv, number of rows to skip after the
header row (Default = 0).
--skip-columns SKIP_COLUMNS
When loading a csv, number of columns to skip after
the header columns (Default = 0).
normalization/filtering parameters:
-n, --no-normalize Do not perform library size normalization on the data
-l L, --log-transform L
Log-transform data with the specified pseudocount.
--mols-per-cell-min MOLS_PER_CELL_MIN
Minimum molecules/cell to use in filtering.
--mols-per-cell-max MOLS_PER_CELL_MAX
Maximum molecules/cell to use in filtering.

MAGIC parameters:
-g G, --gene-name-file G
Gene name file must be specified when loading mtx
data.
-p P, --pca-components P
Number of pca components to use when running MAGIC
(Default = 20).
Expand Down
49 changes: 39 additions & 10 deletions src/magic/MAGIC.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,39 @@ def error(self, message):
def parse_args(args):
p = NewArgumentParser(description='run MAGIC')
p.add_argument('filetype',
choices=['csv', '10x', 'mtx'],
choices=['csv', '10x', '10x_HDF5', 'mtx'],
help='what is the file type of your original data?')

a = p.add_argument_group('required arguments')
a = p.add_argument_group('data loading parameters')
a.add_argument('-d', '--data-file', metavar='D', required=True,
help='File path of input data file.')
a.add_argument('-o', '--output-file', metavar='O', required=True,
help='File path of where to save the MAGIC imputed data (in csv format).')

n = p.add_argument_group('normalization parameters')
a.add_argument('-g', '--genome', metavar='G',
help='Genome must be specified when loading 10x_HDF5 data.')
a.add_argument('--gene-name-file', metavar='GN',
help='Gene name file must be specified when loading mtx data.')
a.add_argument('--use-ensemble-ids', default=False, action='store_true',
help='Use ensemble IDs instead of gene names.')
a.add_argument('--cell-axis', metavar='CA', default='rows',
choices=['rows', 'columns'],
help='When loading a csv, specify whether cells are on rows or columns (Default = \'rows\').')
a.add_argument('--skip-rows', default=0,
help='When loading a csv, number of rows to skip after the header row (Default = 0).')
a.add_argument('--skip-columns', default=0,
help='When loading a csv, number of columns to skip after the header columns (Default = 0).')

n = p.add_argument_group('normalization/filtering parameters')
n.add_argument('-n', '--no-normalize', default=True, action='store_false',
help='Do not perform library size normalization on the data')
n.add_argument('-l', '--log-transform', metavar='L', default=None,
help='Log-transform data with the specified pseudocount.')
n.add_argument('--mols-per-cell-min', default=0,
help='Minimum molecules/cell to use in filtering.')
n.add_argument('--mols-per-cell-max', default=np.inf,
help='Maximum molecules/cell to use in filtering.')

m = p.add_argument_group('MAGIC parameters')
m.add_argument('-g', '--gene-name-file', metavar='G',
help='Gene name file must be specified when loading mtx data.')
m.add_argument('-p', '--pca-components', metavar='P', default=20,
help='Number of pca components to use when running MAGIC (Default = 20).')
m.add_argument('--pca-non-random', default=True, action='store_false',
Expand All @@ -61,20 +77,33 @@ def main(args: list = None):
try:
if args.filetype == 'csv':
scdata = magic.mg.SCData.from_csv(os.path.expanduser(args.data_file),
data_type='sc-seq', normalize=args.no_normalize)
data_type='sc-seq', normalize=False,
cell_axis=0 if args.cell_axis=='rows' else 1,
rows_after_header_to_skip=args.skip_rows,
cols_after_header_to_skip=args.skip_columns)
elif args.filetype == 'mtx':
scdata = magic.mg.SCData.from_mtx(os.path.expanduser(args.data_file),
os.path.expanduser(args.gene_name_file), normalize=args.no_normalize)
os.path.expanduser(args.gene_name_file), normalize=False)
elif args.filetype == '10x':
scdata = magic.mg.SCData.from_10x(args.data_file, normalize=args.no_normalize)
scdata = magic.mg.SCData.from_10x(args.data_file, normalize=False,
use_ensemble_id=args.use_ensemble_ids)

elif args.filetype == '10x_HDF5':
scdata = magic.mg.SCData.from_10x_HDF5(args.data_file, args.genome, normalize=False,
use_ensemble_id=args.use_ensemble_ids)

scdata.filter_scseq_data(filter_cell_min=args.mols_per_cell_min, filter_cell_max=args.mols_per_cell_max)

if args.no_normalize:
scdata = scdata.normalize_scseq_data()

if args.log_transform != None:
scdata.log_transform_scseq_data(pseudocount=args.log_transform)

scdata.run_magic(n_pca_components=args.pca_components, random_pca=args.pca_non_random, t=args.t,
k=args.k, ka=args.ka, epsilon=args.epsilon, rescale_percent=args.rescale)

scdata.save_magic_to_csv(os.path.expanduser(args.output_file))
scdata.magic.to_csv(os.path.expanduser(args.output_file))

except:
raise
Expand Down
1 change: 1 addition & 0 deletions src/magic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from . import MAGIC_core
from . import mg
from . import magic_gui
from . import MAGIC

__version__ = "0.0"
87 changes: 84 additions & 3 deletions src/magic/magic_gui.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/local/bin/python3

import warnings
warnings.simplefilter("ignore", UserWarning)
import matplotlib
matplotlib.use("TkAgg")
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg
Expand Down Expand Up @@ -39,7 +40,9 @@ def initialize(self):
self.fileMenu.add_command(label="Load csv file", command=self.loadCSV)
self.fileMenu.add_command(label="Load sparse data file", command=self.loadMTX)
self.fileMenu.add_command(label="Load 10x file", command=self.load10x)
self.fileMenu.add_command(label="Load 10x HDF5 file", command=self.load10xHDF5)
self.fileMenu.add_command(label="Load saved session from pickle file", command=self.loadPickle)
self.fileMenu.add_command(label="Save selected data to csv", state='disabled', command=self.saveDataToCSV)
self.fileMenu.add_command(label="Save session to pickle file", state='disabled', command=self.saveData)
self.fileMenu.add_command(label="Exit", command=self.quitMAGIC)

Expand Down Expand Up @@ -260,6 +263,69 @@ def load10x(self):

self.wait_window(self.fileInfo)

def load10xHDF5(self):
self.dataFileName = filedialog.askopenfilename(title='Load data file', initialdir='~/.magic/data')
if(self.dataFileName != None):
#pop up data options menu
self.fileInfo = tk.Toplevel()
self.fileInfo.title("Data options")
tk.Label(self.fileInfo, text=u"File name: ").grid(column=0, row=0)
tk.Label(self.fileInfo, text=self.dataFileName).grid(column=1, row=0)

tk.Label(self.fileInfo,text=u"Name:" ,fg="black",bg="white").grid(column=0, row=1)
self.fileNameEntryVar = tk.StringVar()
self.fileNameEntryVar.set('Data ' + str(len(self.data)))
tk.Entry(self.fileInfo, textvariable=self.fileNameEntryVar).grid(column=1,row=1)

tk.Label(self.fileInfo, text=u"Genome:", fg="black",bg="white").grid(column=0, row=2)
self.genomeVar = tk.StringVar()
tk.Entry(self.fileInfo, textvariable=self.genomeVar).grid(column=1, row=2)

tk.Label(self.fileInfo, text=u"Gene names:").grid(column=0, row=3)
self.geneVar = tk.IntVar()
self.geneVar.set(0)
tk.Radiobutton(self.fileInfo, text='Use ensemble IDs', variable=self.geneVar, value=1).grid(column=1, row=3)
tk.Radiobutton(self.fileInfo, text='Use gene names', variable=self.geneVar, value=0).grid(column=2, row=3)

tk.Button(self.fileInfo, text="Compute data statistics", command=partial(self.showRawDataDistributions, file_type='10x')).grid(column=0, row=4)

#filter parameters
self.filterCellMinVar = tk.StringVar()
tk.Label(self.fileInfo,text=u"Filter by molecules per cell. Min:" ,fg="black",bg="white").grid(column=0, row=5)
tk.Entry(self.fileInfo, textvariable=self.filterCellMinVar).grid(column=1,row=5)

self.filterCellMaxVar = tk.StringVar()
tk.Label(self.fileInfo, text=u" Max:" ,fg="black",bg="white").grid(column=2, row=5)
tk.Entry(self.fileInfo, textvariable=self.filterCellMaxVar).grid(column=3,row=5)

self.filterGeneNonzeroVar = tk.StringVar()
tk.Label(self.fileInfo,text=u"Filter by nonzero cells per gene. Min:" ,fg="black",bg="white").grid(column=0, row=6)
tk.Entry(self.fileInfo, textvariable=self.filterGeneNonzeroVar).grid(column=1,row=6)

self.filterGeneMolsVar = tk.StringVar()
tk.Label(self.fileInfo,text=u"Filter by molecules per gene. Min:" ,fg="black",bg="white").grid(column=0, row=7)
tk.Entry(self.fileInfo, textvariable=self.filterGeneMolsVar).grid(column=1,row=7)

#normalize
self.normalizeVar = tk.BooleanVar()
self.normalizeVar.set(True)
tk.Checkbutton(self.fileInfo, text=u"Normalize by library size", variable=self.normalizeVar).grid(column=0, row=8, columnspan=4)

#log transform
self.logTransform = tk.BooleanVar()
self.logTransform.set(False)
tk.Checkbutton(self.fileInfo, text=u"Log-transform data", variable=self.logTransform).grid(column=0, row=9)

self.pseudocount = tk.DoubleVar()
self.pseudocount.set(0.1)
tk.Label(self.fileInfo, text=u"Pseudocount (for log-transform)", fg="black",bg="white").grid(column=1, row=9)
tk.Entry(self.fileInfo, textvariable=self.pseudocount).grid(column=2, row=9)

tk.Button(self.fileInfo, text="Cancel", command=self.fileInfo.destroy).grid(column=1, row=11)
tk.Button(self.fileInfo, text="Load", command=partial(self.processData, file_type='10x_HDF5')).grid(column=2, row=11)

self.wait_window(self.fileInfo)

def getGeneNameFile(self):
self.geneNameFile = filedialog.askopenfilename(title='Select gene name file', initialdir='~/.magic/data')
tk.Label(self.fileInfo,text=self.geneNameFile.split('/')[-1] ,fg="black",bg="white").grid(column=1, row=2)
Expand Down Expand Up @@ -325,6 +391,9 @@ def processData(self, file_type='csv'):
elif file_type == '10x':
scdata = magic.mg.SCData.from_10x(self.dataDir, use_ensemble_id=self.geneVar.get(),
normalize=False)
elif file_type == '10x_HDF5':
scdata = magic.mg.SCData.from_10x_HDF5(os.path.expanduser(self.dataFileName), self.genomeVar.get(),
use_ensemble_id=self.geneVar.get(), normalize=False)

if file_type != 'pickle':
if len(self.filterCellMinVar.get()) > 0 or len(self.filterCellMaxVar.get()) > 0 or len(self.filterGeneNonzeroVar.get()) > 0 or len(self.filterGeneMolsVar.get()) > 0:
Expand Down Expand Up @@ -352,7 +421,8 @@ def processData(self, file_type='csv'):
self.analysisMenu.entryconfig(1, state='normal')
self.analysisMenu.entryconfig(2, state='normal')
self.analysisMenu.entryconfig(3, state='normal')
self.fileMenu.entryconfig(4, state='normal')
self.fileMenu.entryconfig(5, state='normal')
self.fileMenu.entryconfig(6, state='normal')
self.visMenu.entryconfig(0, state='normal')
self.visMenu.entryconfig(1, state='normal')
self.concatButton = tk.Button(self, text=u"Concatenate selected datasets", state='disabled', wraplength=80, command=self.concatenateData)
Expand All @@ -367,10 +437,20 @@ def processData(self, file_type='csv'):
def saveData(self):
for key in self.data_list.selection():
name = self.data_list.item(key)['text'].split(' (')[0]
pickleFileName = filedialog.asksaveasfilename(title=name + ': save data', defaultextension='.p', initialfile=key)
pickleFileName = filedialog.asksaveasfilename(title=name + ': save data', defaultextension='.p', initialfile=name)
if pickleFileName != None:
self.data[name]['scdata'].save(pickleFileName)

def saveDataToCSV(self):
for key in self.data_list.selection():
name = self.data_list.item(key)['text'].split(' (')[0]
if name in self.data.keys():
CSVFileName = filedialog.asksaveasfilename(title=name + ': save data', defaultextension='.csv', initialfile=name)
if CSVFileName != None:
self.data[name]['scdata'].to_csv(CSVFileName)
else:
print('Must select an original or MAGIC imputed data set to save.')

def concatenateData(self):
self.concatOptions = tk.Toplevel()
self.concatOptions.title("Concatenate data sets")
Expand Down Expand Up @@ -673,6 +753,7 @@ def _runMagic(self):
self.data_list.insert(self.curKey, 'end', text=name + ' MAGIC' +
' (' + str(self.data[name]['scdata'].magic.data.shape[0]) +
' x ' + str(self.data[name]['scdata'].magic.data.shape[1]) + ')', open=True)

self.magicProgress.destroy()

def plotPCAVariance(self):
Expand Down
59 changes: 52 additions & 7 deletions src/magic/mg.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from collections import defaultdict, Counter
from subprocess import call, Popen, PIPE
import glob

import tables
import numpy as np
import pandas as pd

Expand All @@ -31,7 +31,7 @@
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from scipy.spatial.distance import squareform
from scipy.sparse import csr_matrix, find, vstack, hstack, issparse
from scipy.sparse import csr_matrix, find, vstack, hstack, issparse, csc_matrix
from scipy.sparse.linalg import eigs
from numpy.linalg import norm
from scipy.stats import gaussian_kde
Expand All @@ -52,7 +52,7 @@


def qualitative_colors(n):
""" Generalte list of colors
""" Generate list of colors
:param n: Number of colors
"""
return sns.color_palette('Set1', n)
Expand Down Expand Up @@ -123,10 +123,10 @@ def save(self, fout: str): # -> None:
with open(fout, 'wb') as f:
pickle.dump(vars(self), f)

def save_magic_to_csv(self, fout: str):
if not isinstance(self.magic, magic.mg.SCData):
raise RuntimeError('Must call run_magic() on data before saving output to csv.')
self.magic.data.to_csv(fout)
def to_csv(self, fout: str):
temp = self.data
temp.columns = [col.split(self._data_prefix)[1] for col in temp.columns]
temp.to_csv(fout)

@classmethod
def load(cls, fin):
Expand Down Expand Up @@ -423,6 +423,51 @@ def from_10x(cls, data_dir, use_ensemble_id=True, normalize=True):

return scdata

@classmethod
def from_10x_HDF5(cls, filename, genome, use_ensemble_id=True, normalize=True):

with tables.open_file(filename, 'r') as f:
try:
group = f.get_node(f.root, genome)
except tables.NoSuchNodeError:
print("That genome does not exist in this file.")
return None
if use_ensemble_id:
gene_names = getattr(group, 'genes').read()
else:
gene_names = getattr(group, 'gene_names').read()
barcodes = getattr(group, 'barcodes').read()
data = getattr(group, 'data').read()
indices = getattr(group, 'indices').read()
indptr = getattr(group, 'indptr').read()
shape = getattr(group, 'shape').read()
matrix = csc_matrix((data, indices, indptr), shape=shape)

dataMatrix = pd.DataFrame(matrix.todense(), columns=np.array([b.decode() for b in barcodes]),
index=np.array([g.decode() for g in gene_names]))
dataMatrix = dataMatrix.transpose()

#Remove empty cells
print('Removing empty cells')
cell_sums = dataMatrix.sum(axis=1)
to_keep = np.where(cell_sums > 0)[0]
dataMatrix = dataMatrix.ix[dataMatrix.index[to_keep], :].astype(np.float32)

#Remove empty genes
print('Removing empty genes')
gene_sums = dataMatrix.sum(axis=0)
to_keep = np.where(gene_sums > 0)[0]
dataMatrix = dataMatrix.ix[:, to_keep].astype(np.float32)

# Construct class object
scdata = cls( dataMatrix, data_type='sc-seq' )

# Normalize if specified
if normalize==True:
scdata = scdata.normalize_scseq_data( )

return scdata

def filter_scseq_data(self, filter_cell_min=0, filter_cell_max=np.inf, filter_gene_nonzero=None, filter_gene_mols=None):

sums = self.data.sum(axis=1)
Expand Down

0 comments on commit 0f2421d

Please sign in to comment.