add more parameters to MAGIC.py script, 10x_HDF5 data loading, and sa…

…ving data to csv from gui
dpeerlab · Jul 10, 2017 · 0f2421d · 0f2421d
1 parent b776e99
commit 0f2421d
Show file tree

Hide file tree

Showing 5 changed files with 202 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -33,11 +33,14 @@ A python GUI is now available for MAGIC. After following the installation steps
 MAGIC can be run using the command line script `MAGIC.py` with the following parameters:
 
 		$> MAGIC.py -h
+		usage: MAGIC.py [-h] -d D -o O [-g G] [--gene-name-file GN]
+        		        [--use-ensemble-ids] [--cell-axis CA] [--skip-rows SKIP_ROWS]
+                		[--skip-columns SKIP_COLUMNS] [-n] [-l L]
+                		[--mols-per-cell-min MOLS_PER_CELL_MIN]
+                		[--mols-per-cell-max MOLS_PER_CELL_MAX] [-p P]
+                		[--pca-non-random] [-t T] [-k K] [-ka KA] [-e E] [-r R]
+                		{csv,10x,10x_HDF5,mtx}
 		
-		usage: MAGIC.py [-h] -d I -o O [-n] [-l L] [-g G] [-p N] [--pca-non-random]
-                		[-t T] [-k K] [-ka KA] [-e E] [-r R]
-                		{csv,10x,mtx}
-
 		run MAGIC
 
 		positional arguments:
@@ -46,21 +49,34 @@ MAGIC can be run using the command line script `MAGIC.py` with the following par
 		optional arguments:
 		  -h, --help            show this help message and exit
 
-		required arguments:
+		data loading parameters:
 		  -d D, --data-file D   File path of input data file.
 		  -o O, --output-file O
 		                        File path of where to save the MAGIC imputed data (in
 		                        csv format).
-
-		normalization parameters:
+		  -g G, --genome G      Genome must be specified when loading 10x_HDF5 data.
+		  --gene-name-file GN   Gene name file must be specified when loading mtx
+		                        data.
+		  --use-ensemble-ids    Use ensemble IDs instead of gene names.
+		  --cell-axis CA        When loading a csv, specify whether cells are on rows
+		                        or columns (Default = 'rows').
+		  --skip-rows SKIP_ROWS
+		                        When loading a csv, number of rows to skip after the
+		                        header row (Default = 0).
+		  --skip-columns SKIP_COLUMNS
+		                        When loading a csv, number of columns to skip after
+		                        the header columns (Default = 0).
+		
+		normalization/filtering parameters:
 		  -n, --no-normalize    Do not perform library size normalization on the data
 		  -l L, --log-transform L
 		                        Log-transform data with the specified pseudocount.
+		  --mols-per-cell-min MOLS_PER_CELL_MIN
+		                        Minimum molecules/cell to use in filtering.
+		  --mols-per-cell-max MOLS_PER_CELL_MAX
+		                        Maximum molecules/cell to use in filtering.
 
 		MAGIC parameters:
-		  -g G, --gene-name-file G
-		                        Gene name file must be specified when loading mtx
-		                        data.
 		  -p P, --pca-components P
 		                        Number of pca components to use when running MAGIC
 		                        (Default = 20).

diff --git a/src/magic/MAGIC.py b/src/magic/MAGIC.py
@@ -18,23 +18,39 @@ def error(self, message):
 def parse_args(args):
 	p = NewArgumentParser(description='run MAGIC')
 	p.add_argument('filetype',
-                   choices=['csv', '10x', 'mtx'],
+                   choices=['csv', '10x', '10x_HDF5', 'mtx'],
                    help='what is the file type of your original data?')
 
-	a = p.add_argument_group('required arguments')
+	a = p.add_argument_group('data loading parameters')
 	a.add_argument('-d', '--data-file', metavar='D', required=True,
                    help='File path of input data file.')
 	a.add_argument('-o', '--output-file', metavar='O', required=True,
 				   help='File path of where to save the MAGIC imputed data (in csv format).')
-
-	n = p.add_argument_group('normalization parameters')
+	a.add_argument('-g', '--genome', metavar='G',
+					help='Genome must be specified when loading 10x_HDF5 data.')
+	a.add_argument('--gene-name-file', metavar='GN', 
+					help='Gene name file must be specified when loading mtx data.')
+	a.add_argument('--use-ensemble-ids', default=False, action='store_true',
+					help='Use ensemble IDs instead of gene names.')
+	a.add_argument('--cell-axis', metavar='CA', default='rows',
+					choices=['rows', 'columns'], 
+					help='When loading a csv, specify whether cells are on rows or columns (Default = \'rows\').')
+	a.add_argument('--skip-rows', default=0,
+					help='When loading a csv, number of rows to skip after the header row (Default = 0).')
+	a.add_argument('--skip-columns', default=0,
+					help='When loading a csv, number of columns to skip after the header columns (Default = 0).')
+
+	n = p.add_argument_group('normalization/filtering parameters')
 	n.add_argument('-n', '--no-normalize', default=True, action='store_false',
 					help='Do not perform library size normalization on the data')
 	n.add_argument('-l', '--log-transform', metavar='L', default=None,
 					help='Log-transform data with the specified pseudocount.')
+	n.add_argument('--mols-per-cell-min', default=0,
+					help='Minimum molecules/cell to use in filtering.')
+	n.add_argument('--mols-per-cell-max', default=np.inf,
+					help='Maximum molecules/cell to use in filtering.')
+
 	m = p.add_argument_group('MAGIC parameters')
-	m.add_argument('-g', '--gene-name-file', metavar='G', 
-					help='Gene name file must be specified when loading mtx data.')
 	m.add_argument('-p', '--pca-components', metavar='P', default=20,
 				   help='Number of pca components to use when running MAGIC (Default = 20).')
 	m.add_argument('--pca-non-random', default=True, action='store_false',
@@ -61,20 +77,33 @@ def main(args: list = None):
 	try:
 		if args.filetype == 'csv':
 			scdata = magic.mg.SCData.from_csv(os.path.expanduser(args.data_file), 
-											  data_type='sc-seq', normalize=args.no_normalize)
+											  data_type='sc-seq', normalize=False, 
+											  cell_axis=0 if args.cell_axis=='rows' else 1,
+											  rows_after_header_to_skip=args.skip_rows,
+											  cols_after_header_to_skip=args.skip_columns)
 		elif args.filetype == 'mtx':
 			scdata = magic.mg.SCData.from_mtx(os.path.expanduser(args.data_file),
-											  os.path.expanduser(args.gene_name_file), normalize=args.no_normalize)
+											  os.path.expanduser(args.gene_name_file), normalize=False)
 		elif args.filetype == '10x':
-			scdata = magic.mg.SCData.from_10x(args.data_file, normalize=args.no_normalize)
+			scdata = magic.mg.SCData.from_10x(args.data_file, normalize=False, 
+											  use_ensemble_id=args.use_ensemble_ids)
+
+		elif args.filetype == '10x_HDF5':
+			scdata = magic.mg.SCData.from_10x_HDF5(args.data_file, args.genome, normalize=False,
+												   use_ensemble_id=args.use_ensemble_ids)
+
+		scdata.filter_scseq_data(filter_cell_min=args.mols_per_cell_min, filter_cell_max=args.mols_per_cell_max)
+
+		if args.no_normalize:
+			scdata = scdata.normalize_scseq_data()
 
 		if args.log_transform != None:
 			scdata.log_transform_scseq_data(pseudocount=args.log_transform)
 
 		scdata.run_magic(n_pca_components=args.pca_components, random_pca=args.pca_non_random, t=args.t,
 						 k=args.k, ka=args.ka, epsilon=args.epsilon, rescale_percent=args.rescale)
 
-		scdata.save_magic_to_csv(os.path.expanduser(args.output_file))
+		scdata.magic.to_csv(os.path.expanduser(args.output_file))
 
 	except:
 		raise

diff --git a/src/magic/__init__.py b/src/magic/__init__.py
@@ -1,5 +1,6 @@
 from . import MAGIC_core
 from . import mg
 from . import magic_gui
+from . import MAGIC
 
 __version__ = "0.0"
diff --git a/src/magic/magic_gui.py b/src/magic/magic_gui.py
@@ -1,5 +1,6 @@
 #!/usr/local/bin/python3
-
+import warnings
+warnings.simplefilter("ignore", UserWarning)
 import matplotlib
 matplotlib.use("TkAgg")
 from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg
@@ -39,7 +40,9 @@ def initialize(self):
         self.fileMenu.add_command(label="Load csv file", command=self.loadCSV)
         self.fileMenu.add_command(label="Load sparse data file", command=self.loadMTX)
         self.fileMenu.add_command(label="Load 10x file", command=self.load10x)
+        self.fileMenu.add_command(label="Load 10x HDF5 file", command=self.load10xHDF5)
         self.fileMenu.add_command(label="Load saved session from pickle file", command=self.loadPickle)
+        self.fileMenu.add_command(label="Save selected data to csv", state='disabled', command=self.saveDataToCSV)
         self.fileMenu.add_command(label="Save session to pickle file", state='disabled', command=self.saveData)
         self.fileMenu.add_command(label="Exit", command=self.quitMAGIC)
 
@@ -260,6 +263,69 @@ def load10x(self):
 
             self.wait_window(self.fileInfo)
 
+    def load10xHDF5(self):
+        self.dataFileName = filedialog.askopenfilename(title='Load data file', initialdir='~/.magic/data')
+        if(self.dataFileName != None):
+            #pop up data options menu
+            self.fileInfo = tk.Toplevel()
+            self.fileInfo.title("Data options")
+            tk.Label(self.fileInfo, text=u"File name: ").grid(column=0, row=0)
+            tk.Label(self.fileInfo, text=self.dataFileName).grid(column=1, row=0)
+
+            tk.Label(self.fileInfo,text=u"Name:" ,fg="black",bg="white").grid(column=0, row=1)
+            self.fileNameEntryVar = tk.StringVar()
+            self.fileNameEntryVar.set('Data ' + str(len(self.data)))
+            tk.Entry(self.fileInfo, textvariable=self.fileNameEntryVar).grid(column=1,row=1)
+
+            tk.Label(self.fileInfo, text=u"Genome:", fg="black",bg="white").grid(column=0, row=2)
+            self.genomeVar = tk.StringVar()
+            tk.Entry(self.fileInfo, textvariable=self.genomeVar).grid(column=1, row=2)
+
+            tk.Label(self.fileInfo, text=u"Gene names:").grid(column=0, row=3)
+            self.geneVar = tk.IntVar()
+            self.geneVar.set(0)
+            tk.Radiobutton(self.fileInfo, text='Use ensemble IDs', variable=self.geneVar, value=1).grid(column=1, row=3)
+            tk.Radiobutton(self.fileInfo, text='Use gene names', variable=self.geneVar, value=0).grid(column=2, row=3)
+
+            tk.Button(self.fileInfo, text="Compute data statistics", command=partial(self.showRawDataDistributions, file_type='10x')).grid(column=0, row=4)
+
+            #filter parameters
+            self.filterCellMinVar = tk.StringVar()
+            tk.Label(self.fileInfo,text=u"Filter by molecules per cell. Min:" ,fg="black",bg="white").grid(column=0, row=5)
+            tk.Entry(self.fileInfo, textvariable=self.filterCellMinVar).grid(column=1,row=5)
+
+            self.filterCellMaxVar = tk.StringVar()
+            tk.Label(self.fileInfo, text=u" Max:" ,fg="black",bg="white").grid(column=2, row=5)
+            tk.Entry(self.fileInfo, textvariable=self.filterCellMaxVar).grid(column=3,row=5)
+
+            self.filterGeneNonzeroVar = tk.StringVar()
+            tk.Label(self.fileInfo,text=u"Filter by nonzero cells per gene. Min:" ,fg="black",bg="white").grid(column=0, row=6)
+            tk.Entry(self.fileInfo, textvariable=self.filterGeneNonzeroVar).grid(column=1,row=6)
+
+            self.filterGeneMolsVar = tk.StringVar()
+            tk.Label(self.fileInfo,text=u"Filter by molecules per gene. Min:" ,fg="black",bg="white").grid(column=0, row=7)
+            tk.Entry(self.fileInfo, textvariable=self.filterGeneMolsVar).grid(column=1,row=7)
+
+            #normalize
+            self.normalizeVar = tk.BooleanVar()
+            self.normalizeVar.set(True)
+            tk.Checkbutton(self.fileInfo, text=u"Normalize by library size", variable=self.normalizeVar).grid(column=0, row=8, columnspan=4)
+
+            #log transform
+            self.logTransform = tk.BooleanVar()
+            self.logTransform.set(False)
+            tk.Checkbutton(self.fileInfo, text=u"Log-transform data", variable=self.logTransform).grid(column=0, row=9)
+
+            self.pseudocount = tk.DoubleVar()
+            self.pseudocount.set(0.1)
+            tk.Label(self.fileInfo, text=u"Pseudocount (for log-transform)", fg="black",bg="white").grid(column=1, row=9)
+            tk.Entry(self.fileInfo, textvariable=self.pseudocount).grid(column=2, row=9)
+
+            tk.Button(self.fileInfo, text="Cancel", command=self.fileInfo.destroy).grid(column=1, row=11)
+            tk.Button(self.fileInfo, text="Load", command=partial(self.processData, file_type='10x_HDF5')).grid(column=2, row=11)
+
+            self.wait_window(self.fileInfo)
+
     def getGeneNameFile(self):
         self.geneNameFile = filedialog.askopenfilename(title='Select gene name file', initialdir='~/.magic/data')
         tk.Label(self.fileInfo,text=self.geneNameFile.split('/')[-1] ,fg="black",bg="white").grid(column=1, row=2)
@@ -325,6 +391,9 @@ def processData(self, file_type='csv'):
         elif file_type == '10x':
             scdata = magic.mg.SCData.from_10x(self.dataDir, use_ensemble_id=self.geneVar.get(),
                                               normalize=False)
+        elif file_type == '10x_HDF5':
+            scdata = magic.mg.SCData.from_10x_HDF5(os.path.expanduser(self.dataFileName), self.genomeVar.get(),
+                                                   use_ensemble_id=self.geneVar.get(), normalize=False)
 
         if file_type != 'pickle':
             if len(self.filterCellMinVar.get()) > 0 or len(self.filterCellMaxVar.get()) > 0 or len(self.filterGeneNonzeroVar.get()) > 0 or len(self.filterGeneMolsVar.get()) > 0:
@@ -352,7 +421,8 @@ def processData(self, file_type='csv'):
         self.analysisMenu.entryconfig(1, state='normal')
         self.analysisMenu.entryconfig(2, state='normal')
         self.analysisMenu.entryconfig(3, state='normal')
-        self.fileMenu.entryconfig(4, state='normal')
+        self.fileMenu.entryconfig(5, state='normal')
+        self.fileMenu.entryconfig(6, state='normal')
         self.visMenu.entryconfig(0, state='normal')
         self.visMenu.entryconfig(1, state='normal')
         self.concatButton = tk.Button(self, text=u"Concatenate selected datasets", state='disabled', wraplength=80, command=self.concatenateData)
@@ -367,10 +437,20 @@ def processData(self, file_type='csv'):
     def saveData(self):
         for key in self.data_list.selection():
             name = self.data_list.item(key)['text'].split(' (')[0]
-            pickleFileName = filedialog.asksaveasfilename(title=name + ': save data', defaultextension='.p', initialfile=key)
+            pickleFileName = filedialog.asksaveasfilename(title=name + ': save data', defaultextension='.p', initialfile=name)
             if pickleFileName != None:
                 self.data[name]['scdata'].save(pickleFileName)
 
+    def saveDataToCSV(self):
+        for key in self.data_list.selection():
+            name = self.data_list.item(key)['text'].split(' (')[0]
+            if name in self.data.keys():
+                CSVFileName = filedialog.asksaveasfilename(title=name + ': save data', defaultextension='.csv', initialfile=name)
+                if CSVFileName != None:
+                    self.data[name]['scdata'].to_csv(CSVFileName)          
+            else:
+                print('Must select an original or MAGIC imputed data set to save.')
+
     def concatenateData(self):
         self.concatOptions = tk.Toplevel()
         self.concatOptions.title("Concatenate data sets")
@@ -673,6 +753,7 @@ def _runMagic(self):
         self.data_list.insert(self.curKey, 'end', text=name + ' MAGIC' +
                               ' (' + str(self.data[name]['scdata'].magic.data.shape[0]) + 
                               ' x ' + str(self.data[name]['scdata'].magic.data.shape[1]) + ')', open=True)
+
         self.magicProgress.destroy()
 
     def plotPCAVariance(self):

diff --git a/src/magic/mg.py b/src/magic/mg.py
@@ -9,7 +9,7 @@
 from collections import defaultdict, Counter
 from subprocess import call, Popen, PIPE
 import glob
-
+import tables
 import numpy as np
 import pandas as pd
 
@@ -31,7 +31,7 @@
 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from scipy.spatial.distance import squareform
-from scipy.sparse import csr_matrix, find, vstack, hstack, issparse
+from scipy.sparse import csr_matrix, find, vstack, hstack, issparse, csc_matrix
 from scipy.sparse.linalg import eigs
 from numpy.linalg import norm
 from scipy.stats import gaussian_kde
@@ -52,7 +52,7 @@
 
 
 def qualitative_colors(n):
-    """ Generalte list of colors
+    """ Generate list of colors
     :param n: Number of colors
     """
     return sns.color_palette('Set1', n)
@@ -123,10 +123,10 @@ def save(self, fout: str):  # -> None:
         with open(fout, 'wb') as f:
             pickle.dump(vars(self), f)
 
-    def save_magic_to_csv(self, fout: str):
-        if not isinstance(self.magic, magic.mg.SCData):
-            raise RuntimeError('Must call run_magic() on data before saving output to csv.')
-        self.magic.data.to_csv(fout)
+    def to_csv(self, fout: str):
+        temp = self.data
+        temp.columns = [col.split(self._data_prefix)[1] for col in temp.columns]
+        temp.to_csv(fout)
 
     @classmethod
     def load(cls, fin):
@@ -423,6 +423,51 @@ def from_10x(cls, data_dir, use_ensemble_id=True, normalize=True):
 
         return scdata
 
+    @classmethod
+    def from_10x_HDF5(cls, filename, genome, use_ensemble_id=True, normalize=True):
+
+        with tables.open_file(filename, 'r') as f:
+            try:
+                group = f.get_node(f.root, genome)
+            except tables.NoSuchNodeError:
+                print("That genome does not exist in this file.")
+                return None
+            if use_ensemble_id:
+                gene_names = getattr(group, 'genes').read()
+            else:
+                gene_names = getattr(group, 'gene_names').read()
+            barcodes = getattr(group, 'barcodes').read()
+            data = getattr(group, 'data').read()
+            indices = getattr(group, 'indices').read()
+            indptr = getattr(group, 'indptr').read()
+            shape = getattr(group, 'shape').read()
+            matrix = csc_matrix((data, indices, indptr), shape=shape)
+
+            dataMatrix = pd.DataFrame(matrix.todense(), columns=np.array([b.decode() for b in barcodes]), 
+                                      index=np.array([g.decode() for g in gene_names]))
+            dataMatrix = dataMatrix.transpose()
+
+            #Remove empty cells
+            print('Removing empty cells')
+            cell_sums = dataMatrix.sum(axis=1)
+            to_keep = np.where(cell_sums > 0)[0]
+            dataMatrix = dataMatrix.ix[dataMatrix.index[to_keep], :].astype(np.float32)
+
+            #Remove empty genes
+            print('Removing empty genes')
+            gene_sums = dataMatrix.sum(axis=0)
+            to_keep = np.where(gene_sums > 0)[0]
+            dataMatrix = dataMatrix.ix[:, to_keep].astype(np.float32)
+
+            # Construct class object
+            scdata = cls( dataMatrix, data_type='sc-seq' )
+
+            # Normalize if specified
+            if normalize==True:
+                scdata = scdata.normalize_scseq_data( )
+
+            return scdata
+
     def filter_scseq_data(self, filter_cell_min=0, filter_cell_max=np.inf, filter_gene_nonzero=None, filter_gene_mols=None):
 
         sums = self.data.sum(axis=1)