31
31
32
32
33
33
"""
34
+ from nlu .pipe .pipe_logic import PipeUtils
34
35
class ColSubstitutionUtils ():
35
36
"""Utils for substituting col names in Pythonify to short and meaningful names.
36
37
Uses custom rename methods for either PySpark or Pandas
37
38
"""
39
+ from sparknlp .annotator import MarianTransformer
40
+ cleanable_splits = ['ner_converter' ,'spell' ,'ner_to_chunk_converter' ,'train' ,'classify' ,'ner' ,'med_ner' ,'dl' ,'match' ,'clean' ,'sentiment' ,'embed' ,'embed_sentence' ,'embed_chunk' ,'explain' ,'pos' ,'resolve_chunk' ,'resolve' ,]
41
+ all_langs = ['en' ,'et' ,'bh' ,'am' ,'da' ,'fr' ,'de' ,'it' ,'nb' ,'no' ,'nn' ,'pl' ,'pt' ,'ru' ,'es' ,'af' ,'ar' ,'hy' ,'eu' ,'bn' ,'br' ,'bg' ,'ca' ,'cs' ,'eo' ,'fi' ,'gl' ,'el' ,'ha' ,'he' ,'hi' ,'hu' ,'id' ,'ga' ,'ja' ,'la' ,'lv' ,'mr' ,'fa' ,'ro' ,'sk' ,'sl' ,'so' ,'st' ,'sw' ,'sv' ,'th' ,'tr' ,'uk' ,'yo' ,'zu' ,'zh' ,'xx' ,'ur' ,'ko' ]
38
42
@staticmethod
39
43
def substitute_col_names (df ,anno_2_ex ,pipe ,drop_debug_cols = True ):
40
44
"""
@@ -49,7 +53,7 @@ def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
49
53
if pipe .has_licensed_components :
50
54
from nlu .pipe .col_substitution import col_substitution_HC
51
55
from nlu .pipe .col_substitution import substitution_map_HC
52
-
56
+ deducted_component_names = ColSubstitutionUtils . deduct_component_names ( pipe )
53
57
for c in pipe .components :
54
58
is_unique = True # TODO infer this properly
55
59
cols_to_substitute = ColSubstitutionUtils .get_final_output_cols_of_component (c ,df ,anno_2_ex )
@@ -64,7 +68,7 @@ def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
64
68
new_cols .update (dict (zip (cols_to_substitute ,cols_to_substitute )))
65
69
continue
66
70
# dict, key=old_col, value=new_col. Some cols that are deemed irrelevant may be omitted and missing from the dict. This behaviour can be disabled by setting drop_debug_cols=False
67
- new_cols = {** new_cols , ** (substitution_fn (c ,cols_to_substitute ,is_unique ))}
71
+ new_cols = {** new_cols , ** (substitution_fn (c ,cols_to_substitute ,deducted_component_names [ c ] ))}
68
72
69
73
return df .rename (columns = new_cols )[new_cols .values ()] if drop_debug_cols else df .rename (columns = new_cols )
70
74
@@ -84,9 +88,65 @@ def get_final_output_cols_of_component(c,df,anno_2_ex):
84
88
# find all metadata fields generated by component
85
89
for col in df .columns :
86
90
if 'meta_' + configs .output_col_prefix in col :
87
- meta_col_name = 'meta_' + configs .output_col_prefix + col .split ('meta_' + configs .output_col_prefix )[- 1 ]
88
- if meta_col_name in df .columns :result_cols .append (meta_col_name )
91
+ base_meta_prefix = 'meta_' + configs .output_col_prefix
92
+ meta_col_name = base_meta_prefix + col .split (base_meta_prefix )[- 1 ]
93
+ if meta_col_name in df .columns :
94
+ # special case for overlapping names with _
95
+ if col .split (base_meta_prefix )[- 1 ].split ('_' )[1 ].isnumeric () and not c .info .outputs [0 ].split ('_' )[- 1 ].isnumeric (): continue
96
+ if col .split (base_meta_prefix )[- 1 ].split ('_' )[1 ].isnumeric () and c .info .outputs [0 ].split ('_' )[- 1 ].isnumeric ():
97
+ id1 = int (col .split (base_meta_prefix )[- 1 ].split ('_' )[1 ])
98
+ id2 = int (c .info .outputs [0 ].split ('_' )[- 1 ])
99
+ if id1 != id2 : continue
100
+ result_cols .append (meta_col_name )
89
101
else : logger .info (f"Could not find meta col for c={ c } , col={ col } . Ommiting col.." )
90
102
return result_cols
91
103
92
104
105
@staticmethod
def deduct_component_names(pipe):
    """Deduce a short, meaningful name for every component of ``pipe``.

    Intended for embeddings, classifiers, resolvers, relation extractors, etc.

    Returns a dict that maps every component in ``pipe.components`` to a
    string name. The value ``'UNIQUE'`` means no name has to be inserted for
    that component and the default unique naming schema should be used. This
    is the case when the component's model type is not name-deductable, or
    when no other component of the same ``info.type`` is in the pipe and the
    model type is not flagged as always name-deductable.

    A model type counts as name-deductable when it is listed in the open
    source list or, if the pipe has licensed components, in the licensed
    list. Being listed in either one is sufficient.
    """
    import nlu.pipe.col_substitution.name_deduction.name_deductable_annotators_OS as deductable_OS
    deductable_HC = None
    if pipe.has_licensed_components:
        # The licensed lists are only importable/needed for licensed pipes.
        import nlu.pipe.col_substitution.name_deduction.name_deductable_annotators_HC as deductable_HC

    max_depth = 10
    deduct = ColSubstitutionUtils.deduct_name_from_nlu_ref_at_depth
    result_names = {}
    for c in pipe.components:
        result_names[c] = 'UNIQUE'  # assumed unique, unless updated below
        model_type = type(c.model)

        name_deductable = model_type in deductable_OS.name_deductable_OS
        if deductable_HC is not None and not name_deductable:
            name_deductable = model_type in deductable_HC.name_deductable_HC
        if not name_deductable:
            continue

        always_deductable = model_type in deductable_OS.always_name_deductable_OS
        if deductable_HC is not None and not always_deductable:
            always_deductable = model_type in deductable_HC.always_name_deductable_HC

        same_components = [
            other_c for other_c in pipe.components
            if other_c is not c and other_c.info.type == c.info.type
        ]
        if not same_components and not always_deductable:
            continue  # no name insertion required, stays 'UNIQUE'

        # Make sure the name is unique among the components of the same type:
        # add one more nlu_ref split at a time until the candidate differs
        # from every sibling's name at that same depth. Comparing at the same
        # depth matters, otherwise refs like 'a.b' and 'a.b.c' would both
        # settle on 'a_b'.
        cur_depth = 1
        c_name = deduct(c, cur_depth)
        while cur_depth < max_depth and c_name in [deduct(other_c, cur_depth) for other_c in same_components]:
            cur_depth += 1
            c_name = deduct(c, cur_depth)
        result_names[c] = c_name
    return result_names
142
+
143
@staticmethod
def deduct_name_from_nlu_ref_at_depth(c, depth=1):
    """Derive a column-name fragment from the component's ``info.nlu_ref``.

    For ``MarianTransformer`` models, the part of the reference after the
    last ``'xx.'`` is returned with ``'marian.'`` removed; ``depth`` is
    ignored.

    For every other model, the reference is split on ``'.'``. Leading splits
    that are language codes (``ColSubstitutionUtils.all_langs``) or generic
    action words (``ColSubstitutionUtils.cleanable_splits``) are dropped, and
    the first ``depth`` remaining splits are joined with ``'_'``. If nothing
    remains after dropping, the unmodified ``nlu_ref`` is returned.

    :param c: component exposing ``model`` and ``info.nlu_ref``
    :param depth: number of name-relevant splits to keep
    :return: the deduced name as a string
    """
    # NOTE(review): imported locally so the name resolves no matter whether
    # the file-level import sits at module or class scope (class-scope names
    # are not visible from inside a method body).
    from sparknlp.annotator import MarianTransformer

    if isinstance(c.model, MarianTransformer):
        return c.info.nlu_ref.split('xx.')[-1].replace('marian.', '')

    splits = c.info.nlu_ref.split('.')
    # Remove all name-irrelevant leading splits. Guard against running out of
    # splits: a ref such as 'en.ner' consists only of irrelevant parts and
    # must reach the fallback below instead of raising IndexError.
    while splits and (
        splits[0] in ColSubstitutionUtils.all_langs
        or splits[0] in ColSubstitutionUtils.cleanable_splits
    ):
        splits.pop(0)

    if not splits:
        return c.info.nlu_ref
    return '_'.join(splits[:depth])
151
+
152
+
0 commit comments