phbradley · agartland · Oct 10, 2017 · Sep 19, 2017 · Sep 19, 2017 · Sep 19, 2017
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,7 @@
 *.pyc
 *~
+.cache/
+datasets/
+db/
+external/
+testing_ref/
diff --git a/api.md b/api.md
@@ -0,0 +1,46 @@
+# API branch
+---
+
+
+# Single TCR example
+Here's an example of how you can process a single nucleotide sequence:
+
+```python
+betaNT = 'CGGGGGGGGTACCNTTGNTTAGGTCCTCTACACGGTTAACCTGGTCCCCGAACCGAAGGTCAATAGGGCCTGTATACTGCTGGCACAGAAGTACACAGCTGAGTCCCTGGGTTCTGAGGGCTGGATCTTCAGAGTGGAGTCANN'
+
+betaQuals = '12.12.12.12.12.22.9.8.6.6.6.8.3.0.3.10.3.0.3.10.10.11.20.25.30.37.37.29.27.14.14.15.27.30.41.47.36.50.50.50.42.42.57.57.43.47.53.47.47.47.47.47.47.50.54.57.57.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.57.57.57.57.59.59.59.57.57.57.57.57.57.57.57.59.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.59.59.59.59.59.57.57.57.59.57.57.43.37.28.28.21.28.23.37.28.30.15.19.17.15.21.20.25.3.0.0'
+
+import tcrdist as td
+chain = td.processing.processNT('human', 'B', betaNT, betaQuals)
+```
+
+# Pipeline example
+Here's an example of a TCR pipeline with `tcrdist`:
+
+```python
+import tcrdist as td
+psDf = td.datasets.loadPSData('human_pairseqs_v1')
+probDf = td.processing.computeProbs(psDf)
+psDf = psDf.join(probDf)
+clonesDf = td.processing.identifyClones(psDf)
+```
+
+# Testing
+To run all tests:
+
+`py.test tcrdist`
+
+or to pass all print statements through for debugging:
+
+`py.test tcrdist -s`
+
+Note that importing and running functions in `tcrdist` generates a log file, `tcrdist.log`. You can set the logging `level` by modifying the parameter in the base `__init__.py` file using one of the following:
+
+```
+logging.ERROR
+logging.WARNING
+logging.INFO
+logging.DEBUG
+```
+
+For details see documentation on the `logging` [module](https://docs.python.org/2/library/logging.html).
diff --git a/tcrdist/__init__.py b/tcrdist/__init__.py
@@ -0,0 +1,20 @@
+from __future__ import print_function
+import logging
+logging.basicConfig(filename='tcrdist.log',
+                    level=logging.WARNING,
+                    format='[%(asctime)s] [%(levelname)s] [%(name)s: %(lineno)s]\n\t%(message)s',
+                    datefmt='%m/%d/%Y %I:%M:%S %p',
+                    filemode='w')
+logger = logging.getLogger('__init__.py')
+logger.debug('Begining package imports')
+
+from . import processing
+from . import utils
+from . import plotting
+from .objects import TCRClone, TCRChain
+from . import datasets
+
+__all__ = ['processing',
+           'utils',
+           'plotting',
+           'datasets']
diff --git a/tcrdist/all_genes.py b/tcrdist/all_genes.py
@@ -0,0 +1,252 @@
+import os.path as op
+import pandas as pd
+import logging
+
+from .amino_acids import amino_acids
+from .blosum import blosum
+from .paths import path_to_db, db_file
+from . import translation
+from .objects import TCR_Gene
+
+logger = logging.getLogger('all_genes.py')
+
+gap_character = '.'
+
+all_genes = {}
+
+db_file = op.join(path_to_db, db_file)
+
+assert op.exists(db_file)
+
+dbDf = pd.read_csv(db_file, delimiter='\t')
+
+for rowi, row in dbDf.iterrows():
+    try:
+        g = TCR_Gene( row )
+    except:
+        print(row)
+        raise
+    if g.organism not in all_genes:
+        all_genes[g.organism] = {} # map from id to TCR_Gene objects
+    all_genes[g.organism][g.id] = g
+
+
+for organism,genes in all_genes.iteritems():
+
+    for ab in 'AB':
+        org_merged_loopseqs = {}
+        for id,g in genes.iteritems():
+            if g.chain == ab and g.region == 'V':
+                loopseqs = g.cdrs[:-1] ## exclude CDR3 Nterm
+                org_merged_loopseqs[id] = ' '.join( loopseqs )
+
+        all_loopseq_nbrs = {}
+        all_loopseq_nbrs_mm1 = {}
+        for id1,seq1 in org_merged_loopseqs.iteritems():
+            g1 = genes[id1]
+            cpos = g1.cdr_columns[-1][0] - 1 #0-indexed
+            alseq1 = g1.alseq
+            minlen = cpos+1
+            assert len(alseq1) >= minlen
+            if alseq1[cpos] != 'C':
+                logger.error('funny cpos: %s %s %s',id1,alseq1,g1.cdrs[-1])
+
+            all_loopseq_nbrs[id1] = []
+            all_loopseq_nbrs_mm1[id1] = []
+            for id2,seq2 in org_merged_loopseqs.iteritems():
+                g2 = genes[id2]
+                alseq2 = g2.alseq
+                assert len(alseq2) >= minlen
+                assert len(seq1) == len(seq2)
+                if seq1 == seq2:
+                    all_loopseq_nbrs[id1].append( id2 )
+                    all_loopseq_nbrs_mm1[id1].append( id2 )
+                    continue
+
+                ## count mismatches between these two, maybe count as an "_mm1" nbr
+                loop_mismatches = 0
+                loop_mismatches_cdrx = 0
+                loop_mismatch_seqs =[]
+                spaces=0
+                for a,b in zip( seq1,seq2):
+                    if a==' ':
+                        spaces+=1
+                        continue
+                    if a!= b:
+                        if a in '*.' or b in '*.':
+                            loop_mismatches += 10
+                            break
+                        else:
+                            assert a in amino_acids and b in amino_acids
+                            if spaces<=1:
+                                loop_mismatches += 1
+                                loop_mismatch_seqs.append( ( a,b ) )
+                            else:
+                                assert spaces==2
+                                loop_mismatches_cdrx += 1
+                            if loop_mismatches>1:
+                                break
+                if loop_mismatches <=1:
+                    all_mismatches = 0
+                    for a,b in zip( alseq1[:cpos+2],alseq2[:cpos+2]):
+                        if a!= b:
+                            if a in '*.' or b in '*.':
+                                all_mismatches += 10
+                            else:
+                                assert a in amino_acids and b in amino_acids
+                                all_mismatches += 1
+                    #dist = tcr_distances.blosum_sequence_distance( seq1, seq2, gap_penalty=10 )
+                    if loop_mismatches<=1 and loop_mismatches + loop_mismatches_cdrx <= 2 and all_mismatches<=10:
+                        if loop_mismatches == 1:
+                            blscore= blosum[(loop_mismatch_seqs[0][0],loop_mismatch_seqs[0][1])]
+                        else:
+                            blscore = 100
+                        if blscore>=1:
+                            all_loopseq_nbrs_mm1[id1].append( id2 )
+                            if loop_mismatches>0:
+                                mmstring = ','.join(['%s/%s'%(x[0],x[1]) for x in loop_mismatch_seqs])
+                                gene1 = id1[:id1.index('*')]
+                                gene2 = id2[:id2.index('*')]
+                                if gene1 != gene2:
+                                    logger.error('v_mismatches: %s %s %s %s %s %s %s %s %s',organism,mmstring,blscore,id1,id2,\
+                                        loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq1)
+                                    logger.error('v_mismatches: %s %s %s %s %s %s %s %s %s',organism,mmstring,blscore,id1,id2,\
+                                        loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq2)
+
+
+        for id in all_loopseq_nbrs:
+            rep = min( all_loopseq_nbrs[id] )
+            assert org_merged_loopseqs[id] == org_merged_loopseqs[ rep ]
+            genes[id].rep = rep
+            logger.debug('vrep %s %15s %15s %s'%(organism, id, rep, org_merged_loopseqs[id]))
+
+
+        ## merge mm1 nbrs to guarantee transitivity
+        while True:
+            new_nbrs = False
+            for id1 in all_loopseq_nbrs_mm1:
+                new_id1_nbrs = False
+                for id2 in all_loopseq_nbrs_mm1[id1]:
+                    for id3 in all_loopseq_nbrs_mm1[id2]:
+                        if id3 not in all_loopseq_nbrs_mm1[id1]:
+                            all_loopseq_nbrs_mm1[id1].append( id3 )
+                            logger.debug('new_nbr: %s %s %s %s %s',id1,'<--->',id2,'<--->',id3)
+                            new_id1_nbrs = True
+                            break
+                    if new_id1_nbrs:
+                        break
+                if new_id1_nbrs:
+                    new_nbrs = True
+            logger.debug('new_nbrs: %s, %s, %s',ab,organism,new_nbrs)
+            if not new_nbrs:
+                break
+
+        for id in all_loopseq_nbrs_mm1:
+            rep = min( all_loopseq_nbrs_mm1[id] )
+            genes[id].mm1_rep = rep
+            logger.debug('mm1vrep %s %15s %15s %s'%(organism, id, rep,org_merged_loopseqs[id]))
+
+
+    ## setup Jseq reps
+    for ab in 'AB':
+        jloopseqs = {}
+        for id,g in genes.iteritems():
+            if g.chain == ab and g.region == 'J':
+                num = len( g.cdrs[0].replace( gap_character, '' ) )
+                jloopseq = g.protseq[:num+3] ## go all the way up to and including the GXG
+                jloopseqs[id] = jloopseq
+        all_jloopseq_nbrs = {}
+        for id1,seq1 in jloopseqs.iteritems():
+            all_jloopseq_nbrs[id1] = []
+            for id2,seq2 in jloopseqs.iteritems():
+                if seq1 == seq2:
+                    all_jloopseq_nbrs[id1].append( id2 )
+        for id in all_jloopseq_nbrs:
+            rep = min( all_jloopseq_nbrs[id] )
+            genes[id].rep = rep
+            genes[id].mm1_rep = rep # just so we have an mm1_rep field defined...
+            assert jloopseqs[id] == jloopseqs[ rep ]
+            logger.debug('jrep %s %15s %15s %15s'%(organism, id, rep, jloopseqs[id]))
+
+def get_cdr3_and_j_match_counts( organism, ab, qseq, j_gene, min_min_j_matchlen = 3,
+                                 extended_cdr3 = False ):
+    #fasta = all_fasta[organism]
+    jg = all_genes[organism][j_gene]
+
+    errors = []
+
+    ## qseq starts at CA...
+    assert qseq[0] == 'C'
+
+    num_genome_j_positions_in_loop = len(jg.cdrs[0].replace(gap_character,''))-2
+    #num_genome_j_positions_in_loop = all_num_genome_j_positions_in_loop[organism][ab][j_gene]
+
+    if extended_cdr3: num_genome_j_positions_in_loop += 2 ## up to but not including GXG
+
+    ## history: was only for alpha
+    aseq = qseq[:] ## starts at the C position
+
+    ja_gene = j_gene
+    assert ja_gene in fasta
+    ja_seq = jg.protseq #fasta[ ja_gene ]
+
+    min_j_matchlen = min_min_j_matchlen+3
+
+    while min_j_matchlen >= min_min_j_matchlen:
+        ntrim =0
+        while ntrim+min_j_matchlen<len(ja_seq) and ja_seq[ntrim:ntrim+min_j_matchlen] not in aseq:
+            ntrim += 1
+
+        jatag = ja_seq[ntrim:ntrim+min_j_matchlen]
+        if jatag in aseq:
+            break
+        else:
+            min_j_matchlen -= 1
+
+    if jatag not in aseq:
+        logger.error('whoah',ab,aseq,ja_seq )
+        errors.append( 'j{}tag_not_in_aseq'.format(ab) )
+        return '-',[100,0],errors
+    elif ja_seq.count( jatag ) != 1:
+        logger.error('whoah2',ab,aseq,ja_seq )
+        errors.append( 'multiple_j{}tag_in_jseq'.format(ab) )
+        return '-',[100,0],errors
+    else:
+        pos = aseq.find( jatag )
+        looplen = pos - ntrim + num_genome_j_positions_in_loop
+        if not extended_cdr3:
+            aseq = aseq[3:]
+            looplen -= 3 ## dont count CAX
+        if len(aseq)<looplen:
+            logger.error('short',ab,aseq,ja_seq )
+            errors.append( ab+'seq_too_short' )
+            return '-',[100,0],errors
+
+        cdrseq = aseq[:looplen ]
+
+    ## now count mismatches in the J gene, beyond the cdrseq
+    j_seq = jg.protseq #fasta[ j_gene ] ## not sure why we do this again (old legacy code)
+    if qseq.count( cdrseq ) > 1:
+        logger.error('multiple cdrseq occurrences %s %s'%(qseq,cdrseq))
+        errors.append('multiple_cdrseq_occ')
+        return '-',[100,0],errors
+    assert qseq.count(cdrseq) == 1
+    start_counting_qseq = qseq.find(cdrseq)+len(cdrseq)
+    start_counting_jseq = num_genome_j_positions_in_loop
+    j_match_counts = [0,0]
+    #assert extended_cdr3 ## otherwise I think this count is not right?
+    #print 'here',start_counting_qseq,start_counting_jseq,len(qseq)
+    for qpos in range( start_counting_qseq, len(qseq)):
+        jpos = start_counting_jseq + (qpos-start_counting_qseq)
+        #print 'here',qpos,jpos
+        if jpos>= len(j_seq): break
+        if qseq[qpos] == j_seq[jpos]:
+            j_match_counts[1] += 1
+        else:
+            j_match_counts[0] += 1
+
+    return cdrseq, j_match_counts,errors
+
+if __name__ == '__main__':
+    ## show J alignments
+    pass
diff --git a/tcrdist/amino_acids.py b/tcrdist/amino_acids.py
@@ -0,0 +1,61 @@
+
+amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', \
+               'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+
+#aana = amino_acids + ['a','c','g','t']
+
+longer_names={'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
+              'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
+              'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
+              'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
+              'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'}
+
+short_to_long = {}
+for rsd in longer_names.keys():short_to_long[longer_names[rsd]] = rsd
+
+
+HP = {'I': 0.73, 'F': 0.61, 'V': 0.54, 'L': 0.53, 'W': 0.37,
+      'M': 0.26, 'A': 0.25, 'G': 0.16, 'C': 0.04, 'Y': 0.02,
+      'P': -0.07, 'T': -0.18, 'S': -0.26, 'H': -0.40, 'E': -0.62,
+      'N': -0.64, 'Q': -0.69, 'D': -0.72, 'K': -1.10, 'R': -1.76}
+
+HP['X'] = sum(HP.values())/20.
+
+GES = {'F': -3.7, 'M': -3.4, 'I': -3.1, 'L': -2.8, 'V': -2.6,
+       'C': -2.0, 'W': -1.9, 'A': -1.6, 'T': -1.2, 'G': -1.0,
+       'S': -0.6, 'P': 0.2,  'Y': 0.7,  'H': 3.0,  'Q': 4.1,
+       'N': 4.8,  'E': 8.2,  'K': 8.8,  'D': 9.2,  'R': 12.3}
+
+GES['X'] = sum(GES.values())/20.
+
+## KD values (Kyte-Doolittle) taken from http://web.expasy.org/protscale/pscale/Hphob.Doolittle.html
+
+KD = {'A': 1.8, 'C': 2.5, 'E': -3.5, 'D': -3.5, 'G': -0.4, 'F': 2.8, 'I': 4.5, 'H': -3.2, 'K': -3.9, 'M': 1.9, 'L': 3.8, 'N': -3.5, 'Q': -3.5, 'P': -1.6, 'S': -0.8, 'R': -4.5, 'T': -0.7, 'W': -0.9, 'V': 4.2, 'Y': -1.3}
+assert len(KD) == 20
+
+aa_charge =  {}
+for a in amino_acids: aa_charge[a] = 0.0
+aa_charge['K'] = 1.0
+aa_charge['R'] = 1.0
+aa_charge['D'] = -1.0
+aa_charge['E'] = -1.0
+aa_charge['X'] = 0.0
+
+
+if __name__ == '__main__':
+
+    ## covariation between different HP scales
+    from scipy import stats
+
+    ges = [ -1*GES[x] for x in amino_acids ]
+    kd  = [  -1*KD[x] for x in amino_acids ]
+    hp  = [  -1*HP[x] for x in amino_acids ]
+
+    slope, intercept, r_ges_kd, p_value, std_err = stats.linregress(ges,kd)
+    slope, intercept, r_ges_hp, p_value, std_err = stats.linregress(ges,hp)
+    slope, intercept, r_kd_hp, p_value, std_err = stats.linregress(kd,hp)
+
+    print('r_ges_kd:',r_ges_kd)
+    print('r_ges_hp:',r_ges_hp)
+    print('r_kd_hp:',r_kd_hp)
+