Skip to content

Commit

Permalink
updated isoform fraction calculation
Browse files Browse the repository at this point in the history
made calculation relative to highest expression isoform. removed flag
—relative-path. GTF output now contains absolute fraction as an
attribute
  • Loading branch information
kalahasty committed Feb 16, 2016
1 parent 709e8f5 commit 84e162d
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 68 deletions.
80 changes: 33 additions & 47 deletions taco/lib/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from path_graph import KMER_EXPR, smooth_graph, reconstruct_path, \
create_optimal_path_graph, get_lost_nodes
from path_finder import find_paths
from bx.cluster import ClusterTree


__author__ = "Matthew Iyer and Yashar Niknafs"
Expand Down Expand Up @@ -60,31 +59,29 @@ def defaults():
self.max_paths = 0
self.isoform_frac = 0
self.max_isoforms = 0
self.relative_frac = False
return self


class Isoform(object):
__slots__ = ('expr', 'path', 'gene_frac', 'path_frac',
__slots__ = ('expr', 'path', 'rel_frac', 'abs_frac',
'gene_id', 'tss_id')

def __init__(self, path=None, expr=0.0, path_frac=1.0, gene_frac=1.0):
def __init__(self, path=None, expr=0.0, rel_frac=1.0, abs_frac=1.0):
self.path = path
self.expr = expr
self.path_frac = path_frac
self.gene_frac = gene_frac
self.rel_frac = rel_frac
self.abs_frac = abs_frac
self.gene_id = -1
self.tss_id = -1


class Cluster(object):

def __init__(self, relative_frac=False):
def __init__(self):
self._id = 0
self.expr = 1e-10
self.nodes = set()
self.paths = []
self.relative_frac = relative_frac

def __len__(self):
return len(self.paths)
Expand All @@ -97,12 +94,11 @@ def _add(self, path, expr):
def iterpaths(self):
if len(self.paths) == 0:
return
best_expr = self.paths[0][1]
for path, expr in self.paths:
if self.relative_frac:
frac = expr / self.paths[0][1]
else:
frac = expr / self.expr
yield path, expr, frac
rel_frac = 0.0 if best_expr == 0.0 else expr / best_expr
abs_frac = 0.0 if self.expr == 0.0 else expr / self.expr
yield path, expr, rel_frac, abs_frac

def merge(self, *others):
for other in others:
Expand All @@ -112,7 +108,7 @@ def merge(self, *others):
self.paths.sort(key=itemgetter(1), reverse=True)

@staticmethod
def build(paths, min_frac=0.0, relative_frac=False):
def build(paths, min_frac=0.0):
filtered = []
clusters = {}
_id = 0
Expand All @@ -123,25 +119,24 @@ def build(paths, min_frac=0.0, relative_frac=False):
if not c.nodes.isdisjoint(path)]
if len(matches) == 0:
# make new cluster
c = Cluster(relative_frac)
c = Cluster()
c._id = _id
_id += 1
clusters[c._id] = c
else:
# check frac in all clusters
if relative_frac:
fracs = []
for c in clusters.itervalues():
best_expr = c.paths[0][1] + 1e-10
fracs.append(expr / best_expr)
else:
fracs = [(expr / (c.expr + expr))
for c in clusters.itervalues()]
if any((x < min_frac) for x in fracs):
discard = False
for c in clusters.itervalues():
best_expr = c.paths[0][1]
rel_frac = 0.0 if best_expr == 0.0 else expr / best_expr
if rel_frac < min_frac:
discard = True
break
if discard:
filtered.append((path, expr))
continue
c = matches[0]
# merge clusters
c = matches[0]
c.merge(*matches[1:])
for c2 in matches[1:]:
del clusters[c2._id]
Expand All @@ -163,7 +158,7 @@ def next(self):


def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
transcript_id, expr, gene_frac, path_frac):
transcript_id, expr, rel_frac, abs_frac):
tx_start = exons[0].start
tx_end = exons[-1].end
strand_str = Strand.to_gtf(strand)
Expand All @@ -177,12 +172,12 @@ def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
f.feature = 'transcript'
f.start = tx_start
f.end = tx_end
f.score = int(round(1000.0 * gene_frac))
f.score = int(round(1000.0 * rel_frac))
f.strand = strand_str
f.phase = '.'
f.attrs = {'expr': '%.3f' % expr,
'frac': '%.3f' % gene_frac,
'path_frac': '%.3f' % path_frac}
'rel_frac': '%.5f' % rel_frac,
'abs_frac': '%.5f' % abs_frac}
f.attrs.update(attr_dict)
yield f
for e in exons:
Expand All @@ -192,7 +187,7 @@ def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
f.feature = 'exon'
f.start = e.start
f.end = e.end
f.score = int(round(1000.0 * gene_frac))
f.score = int(round(1000.0 * rel_frac))
f.strand = strand_str
f.phase = '.'
f.attrs = {}
Expand Down Expand Up @@ -294,28 +289,20 @@ def assemble_isoforms(sgraph, config):
for kmer_path, expr in find_paths(K, K.graph['source'],
K.graph['sink'],
path_frac=config.path_frac,
max_paths=config.max_paths,
relative_frac=config.relative_frac):
max_paths=config.max_paths):
path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
logging.debug("\texpr=%f length=%d" % (expr, len(path)))
paths.append((path, expr))
# build gene clusters
clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac,
relative_frac=config.relative_frac)
clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
logging.debug('\tclusters: %d filtered: %d' %
(len(clusters), len(filtered)))
gene_isoforms = []
if len(paths) > 0:
if config.relative_frac:
max_expr = paths[0][1]
else:
max_expr = source_expr
for cluster in clusters:
isoforms = []
for path, expr, gene_frac in cluster.iterpaths():
isoforms.append(Isoform(path=path, expr=expr,
path_frac=(expr / max_expr),
gene_frac=gene_frac))
for path, expr, rel_frac, abs_frac in cluster.iterpaths():
isoforms.append(Isoform(path=path, expr=expr, rel_frac=rel_frac,
abs_frac=abs_frac))
# apply max isoforms limit (per cluster)
if config.max_isoforms > 0:
isoforms = isoforms[:config.max_isoforms]
Expand Down Expand Up @@ -370,13 +357,13 @@ def assemble_gene(sgraph, locus_id_str, config):
tss_id=tss_id_str,
transcript_id=t_id_str,
expr=isoform.expr,
gene_frac=isoform.gene_frac,
path_frac=isoform.path_frac):
rel_frac=isoform.rel_frac,
abs_frac=isoform.abs_frac):
print >>config.assembly_gtf_fh, str(f)
# write to BED
name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
fields = write_bed(sgraph.chrom, name, sgraph.strand,
int(round(1000.0 * isoform.gene_frac)),
int(round(1000.0 * isoform.rel_frac)),
isoform.path)
print >>config.assembly_bed_fh, '\t'.join(fields)

Expand Down Expand Up @@ -432,7 +419,6 @@ def assemble(**kwargs):
- max_paths
- isoform_frac
- max_isoforms
- relative_frac
Input file attributes:
- transfrags_gtf_file
Expand Down
12 changes: 2 additions & 10 deletions taco/lib/path_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def _find_path(nodes, exprs, succ, isource, isink):
return path, expr


def find_paths(G, source, sink, path_frac=0, max_paths=0, relative_frac=False):
def find_paths(G, source, sink, path_frac=0, max_paths=0):
# initialize path finding
nodes = nx.topological_sort(G)
indexes = dict((n, i) for i, n in enumerate(nodes))
Expand All @@ -60,22 +60,15 @@ def find_paths(G, source, sink, path_frac=0, max_paths=0, relative_frac=False):
for n in nodes:
exprs.append(G.node[n][KMER_EXPR])
succ.append([indexes[x] for x in G.successors_iter(n)])

# don't run if all nodes are zero
if exprs[isource] < MIN_SCORE:
return []

# find highest scoring path
path, expr = _find_path(nodes, exprs, succ, isource, isink)
results = [(path, expr)]

# define threshold score to stop producing suboptimal paths
if relative_frac:
lowest_expr = expr * path_frac
else:
lowest_expr = exprs[isource] * path_frac
lowest_expr = expr * path_frac
lowest_expr = max(MIN_SCORE, lowest_expr)

# iterate to find suboptimal paths
iterations = 1
while True:
Expand All @@ -89,5 +82,4 @@ def find_paths(G, source, sink, path_frac=0, max_paths=0, relative_frac=False):
results.append((path, expr))
iterations += 1
logging.debug("\tpath finding iterations=%d" % iterations)
# return (path,score) tuples sorted from high -> low score
return results
11 changes: 0 additions & 11 deletions taco/lib/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,16 +178,6 @@ def create():
default=Args.MAX_PATHS,
help='dynamic programming algorithm will stop '
'after finding N paths [default=%(default)s]')
advgrp.add_argument('--relative-frac', dest='relative_frac',
action='store_true',
default=Args.RELATIVE_FRAC,
help='Compute isoform fraction relative to '
'highest expression isoform in gene. By default, '
'TACO computes absolute fraction compared to the '
'total gene expression [default=%(default)s]')
advgrp.add_argument('--no-relative-frac', dest='relative_frac',
action='store_false',
help='Disable relative isoform fraction')
parser.add_argument('sample_file', nargs='?')
return parser

Expand Down Expand Up @@ -225,7 +215,6 @@ def log(args, func=logging.info):
args.path_graph_loss_threshold))
func(fmt.format('path frac:', args.path_frac))
func(fmt.format('max paths:', args.max_paths))
func(fmt.format('relative isoform fraction:', args.relative_frac))

@staticmethod
def parse():
Expand Down

0 comments on commit 84e162d

Please sign in to comment.