Skip to content

Commit

Permalink
Revert "Added support for RSEM. It's a bit hackey."
Browse files Browse the repository at this point in the history
These changes dont belong in master, so I've reverted and put them intoa  new branch.

This reverts commit 1ae8e2e.
  • Loading branch information
nboley committed Jul 15, 2015
1 parent 1ae8e2e commit 24c1d5f
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 97 deletions.
21 changes: 1 addition & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,8 @@ Peaks are grouped by overlap, and then for each oracle peak a single peak from e

Output
------
RSEM input data has a different output than the other input types - see below.

### Peak Output file format
### Output file format
The output format mimics the input file type, with some additional fields.

We provide an example for narrow peak files - note that the first 6 columns
Expand Down Expand Up @@ -159,24 +158,6 @@ The summit of this peak in replicate 1.

[rep N data]

### RSEM Output file format

1. gene_id string
RSEM gene id - the first column of the input file.

2. rep 1 score float
The RSEM estimated posterior count mean for replicate 1 - column 7 of the RSEM output file.

3. rep 2 score float
The RSEM estimated posterior count mean for replicate 2 - column 7 of the RSEM output file.

4. localIDR float
-log10(Local IDR value)

5. globalIDR float
-log10(Global IDR value)



### Plot output

Expand Down
2 changes: 1 addition & 1 deletion idr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def log(*args, level=None):
MAX_MIX_PARAM = 0.99

MIN_RHO = 0.10
MAX_RHO = 0.9999
MAX_RHO = 0.99

MIN_SIGMA = 0.20
MAX_SIGMA = 20
Expand Down
84 changes: 8 additions & 76 deletions idr/idr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import numpy

from scipy.stats.stats import rankdata
from numpy.random import normal

from collections import namedtuple, defaultdict, OrderedDict
from itertools import chain
Expand Down Expand Up @@ -249,10 +248,6 @@ def merge_peaks(all_s_peaks, pk_agg_fn, oracle_pks=None,
merged_peaks.sort(key=lambda x:x.merged_signal, reverse=True)
return merged_peaks

def rank_signal(signal):
rank = numpy.lexsort((numpy.random.random(len(signal)), signal)).argsort()
return numpy.array(rank, dtype=numpy.int)

def build_rank_vectors(merged_peaks):
# allocate memory for the ranks vector
s1 = numpy.zeros(len(merged_peaks))
Expand All @@ -261,7 +256,11 @@ def build_rank_vectors(merged_peaks):
for i, x in enumerate(merged_peaks):
s1[i], s2[i] = x.signals

return ( rank_signal(s1), rank_signal(s2) )
rank1 = numpy.lexsort((numpy.random.random(len(s1)), s1)).argsort()
rank2 = numpy.lexsort((numpy.random.random(len(s2)), s2)).argsort()

return ( numpy.array(rank1, dtype=numpy.int),
numpy.array(rank2, dtype=numpy.int) )

def build_idr_output_line_with_bed6(
m_pk, IDR, localIDR, output_file_type, signal_type):
Expand Down Expand Up @@ -468,7 +467,7 @@ def PossiblyGzippedFile(fname):
parser.add_argument( '--peak-list', '-p', type=PossiblyGzippedFile,
help='If provided, all peaks will be taken from this file.')
parser.add_argument( '--input-file-type', default='narrowPeak',
choices=['narrowPeak', 'broadPeak', 'bed', 'rsem'],
choices=['narrowPeak', 'broadPeak', 'bed'],
help='File type of --samples and --peak-list.')

parser.add_argument( '--rank',
Expand Down Expand Up @@ -720,67 +719,9 @@ def make_boxplots(sample_i):
matplotlib.pyplot.savefig(args.output_file.name + ".png")
return

def load_rsem_samples(fp):
signal = {}
for line in fp:
data = line.split()
gene_id = data[0]
mean_cnt = float(data[7])
mean_cnt_sd = float(data[8])
signal[gene_id] = (mean_cnt, mean_cnt_sd)
return signal

def load_and_rank_rsem_samples(samples_fps):
merged_peaks = defaultdict(list)
for i, fp in enumerate(samples_fps):
for key, (val, sd) in load_rsem_samples(fp).items():
merged_peaks[key].append(val)

genes = []
rvs = [ [] for i in range(len(samples_fps))]
for i, (key, vals) in enumerate(merged_peaks.items()):
# skip genes with less than 10 counts
if all(val < 10 for val in vals):
continue
genes.append(key)
for val, rv in zip(vals, rvs):
rv.append(val)

return genes, [numpy.array(rv) for rv in rvs]


def process_rsem_data(args):
genes, (s1, s2) = load_and_rank_rsem_samples(args.samples)

r1 = rank_signal(s1)
r2 = rank_signal(s2)

#for g, (x, y) in zip(genes, zip(s1, s2)):
# print( g, x, y )
#assert False

localIDRs, IDRs = fit_model_and_calc_idr(
r1, r2,
starting_point=(
args.initial_mu, args.initial_sigma,
args.initial_rho, args.initial_mix_param),
max_iter=args.max_iter,
convergence_eps=args.convergence_eps,
fix_mu=args.fix_mu, fix_sigma=args.fix_sigma )

if args.plot:
plot(args, [s1, s2], [r1, r2], IDRs)

for gene, vals, IDR, localIDR in sorted(
zip(genes, zip(s1, s2), IDRs, localIDRs), key=lambda x:x[2]):
args.output_file.write("{}\t{}\t{}\t{}\n".format(
gene, "\t".join(("%.2f" % x).ljust(10) for x in vals),
"%.2f" % -math.log10(max(1e-5, localIDR)),
"%.2f" % -math.log10(max(1e-5, IDR))))

return
def main():
args = parse_args()

def process_peak_data(args):
# load and merge peaks
merged_peaks, signal_type = load_samples(args)
s1 = numpy.array([pk.signals[0] for pk in merged_peaks])
Expand Down Expand Up @@ -826,15 +767,6 @@ def process_peak_data(args):
useBackwardsCompatibleOutput=args.use_old_output_format)

args.output_file.close()
return

def main():
args = parse_args()
if args.input_file_type == 'rsem':
process_rsem_data(args)
else:
process_peak_data(args)


if __name__ == '__main__':
try:
Expand Down

0 comments on commit 24c1d5f

Please sign in to comment.