From 823c883a41bd00dafa8d22a1113a34fde2ddff92 Mon Sep 17 00:00:00 2001
From: berylramadhian <berylramadhian@gmail.com>
Date: Mon, 2 Dec 2019 23:50:56 +0900
Subject: [PATCH] Fix broken coordinate data parsed from PDB files

---
 data_multi_processor.py | 63 +++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/data_multi_processor.py b/data_multi_processor.py
index 365ec8b..d55dcc7 100644
--- a/data_multi_processor.py
+++ b/data_multi_processor.py
@@ -88,7 +88,7 @@ def data_processing(path,id_name, atom_types, cutoff):
     l =[]
     with open(path_file, 'r') as f:
         for line in f:
-            if line.startswith('ATOM') or line.startswith('TER'):
+            if line.startswith('ATOM'):
                 clean_line = (line.rstrip()).split()
                 #check for alignment mistakes within data, a row with spacing alignment error has 11 length after splitted by whitespace
                 if len(clean_line) == 11:
@@ -96,6 +96,16 @@ def data_processing(path,id_name, atom_types, cutoff):
                     split = [clean_line[-2][:4], clean_line[-2][4:]]
                     clean_line[-2] = split[1]
                     clean_line.insert(-2, split[0])
+                #check if coordinate data columns have collided into one token (most likely the x and y coordinates)
+                if len(clean_line[6])>=13:
+                    split = [clean_line[6][:-8], clean_line[6][-8:]]
+                    last_elem = clean_line.pop()
+                    clean_line[-1] = last_elem
+                    clean_line.insert(6, split[0])
+                    clean_line[7] = split[1]
+                l.append(clean_line)
+            elif line.startswith('TER'):
+                clean_line = (line.rstrip()).split()
                 l.append(clean_line)
             elif line.startswith('ENDMDL'):
                 break
@@ -160,7 +170,7 @@ def data_multi_processing(path,id_name, atom_types, cutoff, pool):
     l =[]
     with open(path_file, 'r') as f:
         for line in f:
-            if line.startswith('ATOM') or line.startswith('TER'):
+            if line.startswith('ATOM'):
                 clean_line = (line.rstrip()).split()
                 #check for alignment mistakes within data, a row with spacing alignment error has 11 length after splitted by whitespace
                 if len(clean_line) == 11:
@@ -168,6 +178,16 @@ def data_multi_processing(path,id_name, atom_types, cutoff, pool):
                     split = [clean_line[-2][:4], clean_line[-2][4:]]
                     clean_line[-2] = split[1]
                     clean_line.insert(-2, split[0])
+                #check if coordinate data columns have collided into one token (most likely the x and y coordinates)
+                if len(clean_line[6])>=13:
+                    split = [clean_line[6][:-8], clean_line[6][-8:]]
+                    last_elem = clean_line.pop()
+                    clean_line[-1] = last_elem
+                    clean_line.insert(6, split[0])
+                    clean_line[7] = split[1]
+                l.append(clean_line)
+            elif line.startswith('TER'):
+                clean_line = (line.rstrip()).split()
                 l.append(clean_line)
             elif line.startswith('ENDMDL'):
                 break
@@ -197,7 +217,7 @@ def data_multi_processing_mp(params):
     l =[]
     with open(path_file, 'r') as f:
         for line in f:
-            if line.startswith('ATOM') or line.startswith('TER'):
+            if line.startswith('ATOM'):
                 clean_line = (line.rstrip()).split()
                 #check for alignment mistakes within data, a row with spacing alignment error has 11 length after splitted by whitespace
                 if len(clean_line) == 11:
@@ -205,6 +225,16 @@ def data_multi_processing_mp(params):
                     split = [clean_line[-2][:4], clean_line[-2][4:]]
                     clean_line[-2] = split[1]
                     clean_line.insert(-2, split[0])
+                #check if coordinate data columns have collided into one token (most likely the x and y coordinates)
+                if len(clean_line[6])>=13:
+                    split = [clean_line[6][:-8], clean_line[6][-8:]]
+                    last_elem = clean_line.pop()
+                    clean_line[-1] = last_elem
+                    clean_line.insert(6, split[0])
+                    clean_line[7] = split[1]
+                l.append(clean_line)
+            elif line.startswith('TER'):
+                clean_line = (line.rstrip()).split()
                 l.append(clean_line)
             elif line.startswith('ENDMDL'):
                 break
@@ -267,7 +297,7 @@ def unit_test_y_data():
 #    print(len(complex_files))
 #    
 ##    test_file = path+'/'+complex_files[2]
-#    test_file = path+'/2wy2.ent.pdb'
+#    test_file = path+'/1f5r.ent.pdb'
 #    print(test_file)
 #    
 #    '''
@@ -276,7 +306,7 @@ def unit_test_y_data():
 #    l =[]
 #    with open(test_file, 'r') as f:
 #        for line in f:
-#            if line.startswith('ATOM') or line.startswith('TER'):
+#            if line.startswith('ATOM'):
 #                clean_line = (line.rstrip()).split()
 #                #check for alignment mistakes within data, a row with spacing alignment error has 11 length after splitted by whitespace
 #                if len(clean_line) == 11:
@@ -284,12 +314,25 @@ def unit_test_y_data():
 #                    split = [clean_line[-2][:4], clean_line[-2][4:]]
 #                    clean_line[-2] = split[1]
 #                    clean_line.insert(-2, split[0])
+#                #check if coordinate data columns have collided into one token (most likely the x and y coordinates)
+#                if len(clean_line[6])>=13:
+#                    split = [clean_line[6][:-8], clean_line[6][-8:]]
+#                    last_elem = clean_line.pop()
+#                    clean_line[-1] = last_elem
+#                    clean_line.insert(6, split[0])
+#                    clean_line[7] = split[1]
+#                l.append(clean_line)
+#            elif line.startswith('TER'):
+#                clean_line = (line.rstrip()).split()
 #                l.append(clean_line)
 #            elif line.startswith('ENDMDL'):
 #                break
 #    df_atoms = (pd.DataFrame(l)).rename(columns={0:'record', 6:'x_coor', 7:'y_coor', 8:'z_coor', 11:'atom_type'})
-#    
-#
+#    for i in range(len(l)):
+#        print(i, l[i])
+#        
+#    print(l[2013])
+#    print(len(l[293][6].split('-')))
 #    print(df_atoms)
 #    
 #    '''
@@ -303,7 +346,8 @@ def unit_test_y_data():
 #    
 #    print(df_atoms.index[df_atoms['record'] == 'TER'].tolist())
 #    print(l_df)
-    
+#    
+#    print(df_atoms.iloc[293])
     '''
     multiprocessing unit test
     '''
@@ -351,6 +395,7 @@ def unit_test_y_data():
     #initialize parameters
     path = 'C:/Users/beryl/Documents/Computational Science/Kanazawa/Thesis/Dataset/PP'
     complex_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+    #print(complex_files)
     
     atom_types = ['C','N','O','F','P','S','Cl','Br','I']
     cutoff = 12
@@ -415,5 +460,5 @@ def unit_test_y_data():
     except FileNotFoundError:
         print('File is not found')
     saved_ids = [d['id'] for d in data]
-    print('processed protein IDs = ',saved_ids)
+    print('processed protein IDs = ',saved_ids, print(len(saved_ids)))
     
\ No newline at end of file