Skip to content

Commit 53b0487

Browse files
committed
annotator name deduction integration into column name substitution
1 parent 5f88266 commit 53b0487

7 files changed

+295
-118
lines changed

nlu/pipe/col_substitution/col_name_substitution_utils.py

+64-4
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,14 @@
3131
3232
3333
"""
34+
from nlu.pipe.pipe_logic import PipeUtils
3435
class ColSubstitutionUtils():
3536
"""Utils for substituting col names in Pythonify to short and meaningful names.
3637
Uses custom rename methods for either PySpark or Pandas
3738
"""
39+
from sparknlp.annotator import MarianTransformer
40+
cleanable_splits = ['ner_converter','spell','ner_to_chunk_converter','train','classify','ner','med_ner','dl','match','clean','sentiment','embed','embed_sentence','embed_chunk','explain','pos','resolve_chunk','resolve',]
41+
all_langs = ['en','et','bh','am','da','fr','de','it','nb','no','nn','pl','pt','ru','es','af','ar','hy','eu','bn','br','bg','ca','cs','eo','fi','gl','el','ha','he','hi','hu','id','ga','ja','la','lv','mr','fa','ro','sk','sl','so','st','sw','sv','th','tr','uk','yo','zu','zh','xx','ur','ko']
3842
@staticmethod
3943
def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
4044
"""
@@ -49,7 +53,7 @@ def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
4953
if pipe.has_licensed_components :
5054
from nlu.pipe.col_substitution import col_substitution_HC
5155
from nlu.pipe.col_substitution import substitution_map_HC
52-
56+
deducted_component_names = ColSubstitutionUtils.deduct_component_names(pipe)
5357
for c in pipe.components :
5458
is_unique = True # TODO infer this properly
5559
cols_to_substitute = ColSubstitutionUtils.get_final_output_cols_of_component(c,df,anno_2_ex)
@@ -64,7 +68,7 @@ def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
6468
new_cols.update(dict(zip(cols_to_substitute,cols_to_substitute)))
6569
continue
6670
# dict, key=old_col, value=new_col. Columns deemed irrelevant may be omitted from the dict. This behaviour can be disabled by setting drop_debug_cols=False
67-
new_cols = {**new_cols, **(substitution_fn(c,cols_to_substitute,is_unique))}
71+
new_cols = {**new_cols, **(substitution_fn(c,cols_to_substitute,deducted_component_names[c]))}
6872

6973
return df.rename(columns = new_cols)[new_cols.values()] if drop_debug_cols else df.rename(columns = new_cols)
7074

@@ -84,9 +88,65 @@ def get_final_output_cols_of_component(c,df,anno_2_ex):
8488
# find all metadata fields generated by component
8589
for col in df.columns :
8690
if 'meta_'+ configs.output_col_prefix in col:
87-
meta_col_name = 'meta_'+ configs.output_col_prefix + col.split('meta_'+ configs.output_col_prefix)[-1]
88-
if meta_col_name in df.columns :result_cols.append(meta_col_name)
91+
base_meta_prefix = 'meta_'+ configs.output_col_prefix
92+
meta_col_name = base_meta_prefix + col.split(base_meta_prefix)[-1]
93+
if meta_col_name in df.columns :
94+
# special case for overlapping names with _
95+
if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and not c.info.outputs[0].split('_')[-1].isnumeric(): continue
96+
if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and c.info.outputs[0].split('_')[-1].isnumeric():
97+
id1 = int(col.split(base_meta_prefix)[-1].split('_')[1])
98+
id2 = int(c.info.outputs[0].split('_')[-1])
99+
if id1 != id2 : continue
100+
result_cols.append(meta_col_name)
89101
else : logger.info(f"Could not find meta col for c={c}, col={col}. Ommiting col..")
90102
return result_cols
91103

92104

105+
@staticmethod
def deduct_component_names(pipe):
    """Deduce a meaningful name for embeddings, classifiers, resolvers, relation extractors, etc.

    Returns a dict that maps every component in ``pipe.components`` to a string name.
    The value 'UNIQUE' means no name has to be inserted for that component and the
    default unique naming schema should be used. Any other value is a name derived
    from the component's ``info.nlu_ref``.

    A name is only deduced for components whose model type is listed as name
    deductable, and only if another component of the same ``info.type`` exists in
    the pipe or the model type is listed as always name deductable.
    """
    import nlu.pipe.col_substitution.name_deduction.name_deductable_annotators_OS as deductable_OS
    deductable_HC = None
    if pipe.has_licensed_components:
        # Only import the licensed definitions when the pipe actually needs them
        import nlu.pipe.col_substitution.name_deduction.name_deductable_annotators_HC as deductable_HC

    max_depth = 10  # upper bound on how many nlu_ref splits are joined while resolving collisions
    name_at_depth = ColSubstitutionUtils.deduct_name_from_nlu_ref_at_depth
    result_names = {}
    for c in pipe.components:
        result_names[c] = 'UNIQUE'  # assumed unique, unless updated in the following steps
        model_type = type(c.model)
        is_always_name_deductable_component = False
        if deductable_HC is not None:
            # NOTE(review): in a licensed pipe a component must pass both the HC and the OS
            # membership checks below to get a name - confirm the HC list also covers OS annotators
            if model_type not in deductable_HC.name_deductable_HC: continue
            if model_type in deductable_HC.always_name_deductable_HC: is_always_name_deductable_component = True

        if model_type not in deductable_OS.name_deductable_OS: continue
        if model_type in deductable_OS.always_name_deductable_OS: is_always_name_deductable_component = True

        same_components = [other_c for other_c in pipe.components
                           if other_c is not c and other_c.info.type == c.info.type]
        if not same_components and not is_always_name_deductable_component:
            continue  # no name insertion required, 'UNIQUE' stays

        # Make sure the name is unique among the components of the same type.
        # Both sides are deduced at the SAME depth; comparing a deeper name of c against
        # depth-1 names of the others could let two components end up with equal names.
        cur_depth = 1
        other_names = [name_at_depth(other_c, cur_depth) for other_c in same_components]
        c_name = name_at_depth(c, cur_depth)
        while c_name in other_names and cur_depth < max_depth:
            cur_depth += 1
            other_names = [name_at_depth(other_c, cur_depth) for other_c in same_components]
            c_name = name_at_depth(c, cur_depth)
        result_names[c] = c_name
    return result_names
142+
143+
@staticmethod
def deduct_name_from_nlu_ref_at_depth(c, depth=1):
    """Deduce a name for component ``c`` from its ``info.nlu_ref``.

    For a MarianTransformer model, the part of the reference after the last 'xx.'
    is returned with 'marian.' removed. Otherwise the reference is split on '.',
    leading splits that are language codes (``all_langs``) or generic action words
    (``cleanable_splits``) are dropped, and the first ``depth`` remaining splits are
    joined with '_'. If nothing remains, the full ``nlu_ref`` is returned.

    :param c: component providing ``model`` and ``info.nlu_ref``
    :param depth: number of name relevant splits to join
    :return: the deduced name
    """
    # Imported locally: names bound in the class body are not in scope inside method bodies
    from sparknlp.annotator import MarianTransformer
    nlu_ref = c.info.nlu_ref
    if isinstance(c.model, MarianTransformer):
        return nlu_ref.split('xx.')[-1].replace('marian.', '')
    splits = nlu_ref.split('.')
    # Skip all name irrelevant leading splits; the bounds check keeps a reference made
    # only of irrelevant splits from raising IndexError
    first_relevant = 0
    while first_relevant < len(splits) and (
            splits[first_relevant] in ColSubstitutionUtils.all_langs
            or splits[first_relevant] in ColSubstitutionUtils.cleanable_splits):
        first_relevant += 1
    relevant_splits = splits[first_relevant:]
    if not relevant_splits:
        return nlu_ref
    return '_'.join(relevant_splits[:depth])
151+
152+

nlu/pipe/col_substitution/col_substitution_HC.py

+10-16
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
logger = logging.getLogger('nlu')
44

5-
def substitute_ner_internal_converter_cols(c, cols, is_unique):
5+
def substitute_ner_internal_converter_cols(c, cols, nlu_identifier):
66
"""
77
Fetched fields are:
88
- entities@<storage_ref>_results
@@ -11,8 +11,7 @@ def substitute_ner_internal_converter_cols(c, cols, is_unique):
1111
- entities@<storage_ref>_confidence
1212
"""
1313
new_cols = {}
14-
nlu_identifier = extract_nlu_identifier(c)
15-
new_base_name = 'entities' if is_unique else f'entities_{nlu_identifier}'
14+
new_base_name = 'entities' if nlu_identifier=='UNIQUE' else f'entities_{nlu_identifier}'
1615
for col in cols :
1716
if 'results' in col : new_cols[col] = new_base_name
1817
elif '_beginnings' in col : new_cols[col] = f'{new_base_name}_begin'
@@ -30,7 +29,7 @@ def substitute_ner_internal_converter_cols(c, cols, is_unique):
3029

3130

3231

33-
def substitute_chunk_resolution_cols(c, cols, is_unique=True):
32+
def substitute_chunk_resolution_cols(c, cols, nlu_identifier=True):
3433
"""
3534
Substitute col name for Resolution. For Resolution, some name will be infered, and entity_resolution_<name> will become the base name schema
3635
all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
@@ -52,8 +51,7 @@ def substitute_chunk_resolution_cols(c, cols, is_unique=True):
5251
token -> Token index
5352
"""
5453
new_cols = {}
55-
c_name = extract_nlu_identifier(c)
56-
new_base_name = f'entity_resolution' if is_unique else f'entity_resolution_{c_name}'
54+
new_base_name = f'entity_resolution' if nlu_identifier=='UNIQUE' else f'entity_resolution_{nlu_identifier}'
5755
for col in cols :
5856
if '_results' in col and 'all_k' not in col : new_cols[col] = f'{new_base_name}_code' # resolved code
5957
elif '_beginnings' in col : new_cols[col] = f'{new_base_name}_begin'
@@ -84,7 +82,7 @@ def substitute_chunk_resolution_cols(c, cols, is_unique=True):
8482
return new_cols
8583

8684

87-
def substitute_sentence_resolution_cols(c, cols, is_unique=True):
85+
def substitute_sentence_resolution_cols(c, cols, nlu_identifier=True):
8886
"""
8987
Substitute col name for Resolution. For Resolution, some name will be infered, and sentence_resolution_<name> will become the base name schema
9088
all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
@@ -106,8 +104,7 @@ def substitute_sentence_resolution_cols(c, cols, is_unique=True):
106104
token -> Token index
107105
"""
108106
new_cols = {}
109-
c_name = extract_nlu_identifier(c)
110-
new_base_name = f'sentence_resolution' if is_unique else f'sentence_resolution_{c_name}'
107+
new_base_name = f'sentence_resolution' if nlu_identifier=='UNIQUE' else f'sentence_resolution_{nlu_identifier}'
111108
for col in cols :
112109
if '_results' in col and 'all_k' not in col : new_cols[col] = f'{new_base_name}_code' # resolved code
113110
elif '_beginnings' in col : new_cols[col] = f'{new_base_name}_begin'
@@ -139,13 +136,13 @@ def substitute_sentence_resolution_cols(c, cols, is_unique=True):
139136

140137

141138

142-
def substitute_assertion_cols(c, cols, is_unique=True):
139+
def substitute_assertion_cols(c, cols, nlu_identifier=True):
143140
"""
144141
Substitute col name for Assertion. For Assertion, some name will be infered, and assertion_<sub_field> defines the base name schema
145142
Assert should always be unique
146143
"""
147144
new_cols = {}
148-
c_name = extract_nlu_identifier(c)
145+
# c_name = extract_nlu_identifier(c)
149146
new_base_name = f'assertion'# if is_unique else f'sentence_resolution_{c_name}'
150147
for col in cols :
151148
if '_results' in col : new_cols[col] = f'{new_base_name}' # resolved code
@@ -170,7 +167,6 @@ def substitute_de_identification_cols(c, cols, is_unique=True):
170167
de_identify should always be unique
171168
"""
172169
new_cols = {}
173-
c_name = extract_nlu_identifier(c)
174170
new_base_name = f'de_identified'# if is_unique else f'sentence_resolution_{c_name}'
175171
for col in cols :
176172
if '_results' in col : new_cols[col] = f'{new_base_name}' # resolved code
@@ -184,10 +180,9 @@ def substitute_de_identification_cols(c, cols, is_unique=True):
184180

185181
return new_cols
186182

187-
def extract_nlu_identifier(c):return "<name>"
188183

189184

190-
def substitute_relation_cols(c, cols, is_unique=True):
185+
def substitute_relation_cols(c, cols, nlu_identifier=True):
191186
"""
192187
Substitute col name for de-identification. For de-identification, some name will be infered, and de_identified_<sub_field> defines the base name schema
193188
de_identify should always be unique
@@ -206,8 +201,7 @@ def substitute_relation_cols(c, cols, is_unique=True):
206201
207202
"""
208203
new_cols = {}
209-
c_name = extract_nlu_identifier(c)
210-
new_base_name = f'relation' if is_unique else f'relation_{c_name}'
204+
new_base_name = f'relation' if nlu_identifier=='UNIQUE' else f'relation_{nlu_identifier}'
211205
for col in cols :
212206
if '_results' in col : new_cols[col] = f'{new_base_name}' # resolved code
213207
elif '_beginnings' in col : new_cols[col] = f'{new_base_name}_begin'

0 commit comments

Comments
 (0)