080_LING.py

#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Linguistic Complexity of a Genome
Rosalind ID: LING
Rosalind #: 080
URL: http://rosalind.info/problems/ling/
'''

from scripts import SuffixTree
from math import log

with open('data/rosalind_ling.txt') as input_data:
    dna = input_data.read().strip()
    n = len(dna)

# After removing the termination symbol $, if necessary, each edge corresponds to len(edge) substrings.
# Specifically, the substrings are generated by concatonating previous edges with the prefixes of the current edge.
edges = [edge if edge[1] != n+1 else [edge[0], n] for edge in SuffixTree(dna).edges.values()]
sub = float(sum([edge[1]-edge[0] for edge in edges]))

# The number of possible substrings of length k is min(4^k, n-k-1).
# Minimum because it depends on if there's  enough room in the string to fit all of the 4^k possible substrings.
m = float(sum([n-k+1 if k > log(n+1)/log(4) else 4**k for k in xrange(1, n+1)]))  # log's and if statement to avoid 4^k when k large.

# Print and save the answer.
print sub/m
with open('output/080_LING.txt', 'w') as output_file:
    output_file.write(str(sub/m))