luca-martial
diff --git a/‎nlu/pipe/col_substitution/col_name_substitution_utils.py
+64-4 b/‎nlu/pipe/col_substitution/col_name_substitution_utils.py
+64-4
diff --git a/‎nlu/pipe/col_substitution/col_substitution_HC.py
+10-16 b/‎nlu/pipe/col_substitution/col_substitution_HC.py
+10-16
@@ -31,10 +31,14 @@
 
 
 """
+from nlu.pipe.pipe_logic import PipeUtils
 class ColSubstitutionUtils():
     """Utils for substituting col names in Pythonify to short and meaningful names.
     Uses custom rename methods for either PySpark or Pandas
     """
+    from sparknlp.annotator import MarianTransformer
+    cleanable_splits = ['ner_converter','spell','ner_to_chunk_converter','train','classify','ner','med_ner','dl','match','clean','sentiment','embed','embed_sentence','embed_chunk','explain','pos','resolve_chunk','resolve',]
+    all_langs        = ['en','et','bh','am','da','fr','de','it','nb','no','nn','pl','pt','ru','es','af','ar','hy','eu','bn','br','bg','ca','cs','eo','fi','gl','el','ha','he','hi','hu','id','ga','ja','la','lv','mr','fa','ro','sk','sl','so','st','sw','sv','th','tr','uk','yo','zu','zh','xx','ur','ko']
     @staticmethod
     def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
         """
@@ -49,7 +53,7 @@ def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
         if pipe.has_licensed_components :
             from nlu.pipe.col_substitution import col_substitution_HC
             from nlu.pipe.col_substitution import substitution_map_HC
-
+        deducted_component_names = ColSubstitutionUtils.deduct_component_names(pipe)
         for c in pipe.components :
             is_unique = True # TODO infer this properly
             cols_to_substitute = ColSubstitutionUtils.get_final_output_cols_of_component(c,df,anno_2_ex)
@@ -64,7 +68,7 @@ def substitute_col_names(df,anno_2_ex,pipe,drop_debug_cols=True):
                 new_cols.update(dict(zip(cols_to_substitute,cols_to_substitute)))
                 continue
             # dic, key=old_col, value=new_col. Some cols may be omitted and missing from the dic which are deemed irrelevant. Behaviour can be disabled by setting drop_debug_cols=False
-            new_cols = {**new_cols, **(substitution_fn(c,cols_to_substitute,is_unique))}
+            new_cols = {**new_cols, **(substitution_fn(c,cols_to_substitute,deducted_component_names[c]))}
 
         return df.rename(columns = new_cols)[new_cols.values()] if drop_debug_cols else df.rename(columns = new_cols)
 
@@ -84,9 +88,65 @@ def get_final_output_cols_of_component(c,df,anno_2_ex):
         # find all metadata fields generated by component
         for col in df.columns :
             if 'meta_'+ configs.output_col_prefix in col:
-                meta_col_name = 'meta_'+ configs.output_col_prefix + col.split('meta_'+ configs.output_col_prefix)[-1]
-                if meta_col_name in df.columns :result_cols.append(meta_col_name)
+                base_meta_prefix = 'meta_'+ configs.output_col_prefix
+                meta_col_name = base_meta_prefix + col.split(base_meta_prefix)[-1]
+                if meta_col_name in df.columns :
+                    # special case for overlapping names with _
+                    if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and not c.info.outputs[0].split('_')[-1].isnumeric(): continue
+                    if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and c.info.outputs[0].split('_')[-1].isnumeric():
+                        id1 = int(col.split(base_meta_prefix)[-1].split('_')[1])
+                        id2 = int(c.info.outputs[0].split('_')[-1])
+                        if id1 != id2 : continue
+                    result_cols.append(meta_col_name)
                 else : logger.info(f"Could not find meta col for c={c}, col={col}. Ommiting col..")
         return result_cols
 
 
+    @staticmethod
+    def deduct_component_names(pipe):
+        """Deduct a meaningful name for embeddings, classifiers, resolvers, relation extractors, etc.
+
+        Returns a dict that maps every component of the pipe to a string name.
+        The value 'UNIQUE' means no name has to be inserted for that component and the default naming schema is used.
+        A name is deducted from the NLU reference of a component when its annotator class is listed as name deductable
+        (in the open source lists or, for pipes with licensed components, in the licensed HC lists) and either another
+        component of the same type exists in the pipe or the annotator class is listed as always name deductable.
+
+        :param pipe: pipeline object; `pipe.components` and `pipe.has_licensed_components` are read
+        :return: dict mapping each component of `pipe.components` to its deducted name or to 'UNIQUE'
+        """
+        import nlu.pipe.col_substitution.name_deduction.name_deductable_annotators_OS as deductable_OS
+        # The licensed (HC) lists are only imported for pipes that contain licensed components
+        deductable_HC = None
+        if pipe.has_licensed_components :
+            import nlu.pipe.col_substitution.name_deduction.name_deductable_annotators_HC as deductable_HC
+        deduct_at_depth = ColSubstitutionUtils.deduct_name_from_nlu_ref_at_depth
+        max_depth = 10
+        result_names = {}
+        for c in pipe.components :
+            result_names[c] = 'UNIQUE' # assumed unique, unless updated in the following steps
+            model_type = type(c.model)
+            # A component qualifies when EITHER list knows its annotator class. Chaining both membership checks
+            # with `continue` would require a class to be in the licensed AND the open source list at once.
+            in_os = model_type in deductable_OS.name_deductable_OS
+            in_hc = deductable_HC is not None and model_type in deductable_HC.name_deductable_HC
+            if not (in_os or in_hc): continue
+            is_always_name_deductable_component = (
+                (in_os and model_type in deductable_OS.always_name_deductable_OS) or
+                (in_hc and model_type in deductable_HC.always_name_deductable_HC))
+
+            same_components = [other_c for other_c in pipe.components
+                               if other_c is not c and other_c.info.type == c.info.type]
+            if not same_components and not is_always_name_deductable_component: continue # no name insertion required
+
+            # Make sure each name is unique among the components of same type.
+            # All candidates are compared at the SAME depth, the depth grows until the name of c stops colliding.
+            cur_depth = 1
+            c_name = deduct_at_depth(c, cur_depth)
+            other_names = [deduct_at_depth(other_c, cur_depth) for other_c in same_components]
+            while c_name in other_names and cur_depth < max_depth:
+                cur_depth += 1
+                c_name = deduct_at_depth(c, cur_depth)
+                other_names = [deduct_at_depth(other_c, cur_depth) for other_c in same_components]
+            result_names[c] = c_name
+        return result_names
+
+    @staticmethod
+    def deduct_name_from_nlu_ref_at_depth(c, depth=1):
+        """Derive a short name for a component from its NLU reference.
+
+        For MarianTransformer models everything after the last 'xx.' of the reference is used, with 'marian.' removed.
+        For every other model the reference is split on '.', leading language codes and generic splits
+        (see `all_langs` and `cleanable_splits`) are dropped and the first `depth` remaining splits are joined with '_'.
+        If nothing remains after dropping, the full NLU reference is returned.
+
+        :param c: component; `c.model` and `c.info.nlu_ref` are read
+        :param depth: number of remaining splits to use for the name
+        :return: the deducted name as string
+        """
+        # Names bound in the class body are not visible by bare name inside a method, so import locally
+        from sparknlp.annotator import MarianTransformer
+        if isinstance(c.model, MarianTransformer): return c.info.nlu_ref.split('xx.')[-1].replace('marian.','')
+        splits = c.info.nlu_ref.split('.')
+        # remove all name irrelevant splits, stop when no split is left instead of indexing an empty list
+        while splits and (splits[0] in ColSubstitutionUtils.all_langs or splits[0] in ColSubstitutionUtils.cleanable_splits): splits.pop(0)
+        if not splits: return c.info.nlu_ref
+        return '_'.join(splits[:depth])
+
+
@@ -2,7 +2,7 @@
 import logging
 logger = logging.getLogger('nlu')
 
-def substitute_ner_internal_converter_cols(c, cols, is_unique):
+def substitute_ner_internal_converter_cols(c, cols, nlu_identifier):
     """
     Fetched fields are:
     - entities@<storage_ref>_results
@@ -11,8 +11,7 @@ def substitute_ner_internal_converter_cols(c, cols, is_unique):
         - entities@<storage_ref>_confidence
     """
     new_cols = {}
-    nlu_identifier = extract_nlu_identifier(c)
-    new_base_name = 'entities' if is_unique else f'entities_{nlu_identifier}'
+    new_base_name = 'entities' if nlu_identifier=='UNIQUE' else f'entities_{nlu_identifier}'
     for col in cols :
         if 'results'     in col     : new_cols[col] = new_base_name
         elif '_beginnings' in col     : new_cols[col] = f'{new_base_name}_begin'
@@ -30,7 +29,7 @@ def substitute_ner_internal_converter_cols(c, cols, is_unique):
 
 
 
-def substitute_chunk_resolution_cols(c, cols, is_unique=True):
+def substitute_chunk_resolution_cols(c, cols, nlu_identifier=True):
     """
     Substitute col name for Resolution. For Resolution, some name will be infered, and entity_resolution_<name> will become the base name schema
 all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
@@ -52,8 +51,7 @@ def substitute_chunk_resolution_cols(c, cols, is_unique=True):
 token -> Token index
     """
     new_cols = {}
-    c_name   = extract_nlu_identifier(c)
-    new_base_name = f'entity_resolution' if is_unique else f'entity_resolution_{c_name}'
+    new_base_name = f'entity_resolution' if nlu_identifier=='UNIQUE'  else f'entity_resolution_{nlu_identifier}'
     for col in cols :
         if '_results'      in col    and 'all_k' not in col :  new_cols[col] = f'{new_base_name}_code' # resolved code
         elif '_beginnings' in col     : new_cols[col]  = f'{new_base_name}_begin'
@@ -84,7 +82,7 @@ def substitute_chunk_resolution_cols(c, cols, is_unique=True):
     return new_cols
 
 
-def substitute_sentence_resolution_cols(c, cols, is_unique=True):
+def substitute_sentence_resolution_cols(c, cols, nlu_identifier=True):
     """
     Substitute col name for Resolution. For Resolution, some name will be infered, and sentence_resolution_<name> will become the base name schema
 all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
@@ -106,8 +104,7 @@ def substitute_sentence_resolution_cols(c, cols, is_unique=True):
 token -> Token index
     """
     new_cols = {}
-    c_name   = extract_nlu_identifier(c)
-    new_base_name = f'sentence_resolution' if is_unique else f'sentence_resolution_{c_name}'
+    new_base_name = f'sentence_resolution' if nlu_identifier=='UNIQUE' else f'sentence_resolution_{nlu_identifier}'
     for col in cols :
         if '_results'      in col    and 'all_k' not in col :  new_cols[col] = f'{new_base_name}_code' # resolved code
         elif '_beginnings' in col     : new_cols[col]  = f'{new_base_name}_begin'
@@ -139,13 +136,13 @@ def substitute_sentence_resolution_cols(c, cols, is_unique=True):
 
 
 
-def substitute_assertion_cols(c, cols, is_unique=True):
+def substitute_assertion_cols(c, cols, nlu_identifier=True):
     """
     Substitute col name for Assertion. For Assertion, some name will be infered, and assertion_<sub_field> defines the base name schema
     Assert should always be unique
     """
     new_cols = {}
-    c_name   = extract_nlu_identifier(c)
+    # c_name   = extract_nlu_identifier(c)
     new_base_name = f'assertion'# if is_unique else f'sentence_resolution_{c_name}'
     for col in cols :
         if '_results'      in col     :  new_cols[col] = f'{new_base_name}' # resolved code
@@ -170,7 +167,6 @@ def substitute_de_identification_cols(c, cols, is_unique=True):
     de_identify should always be unique
     """
     new_cols = {}
-    c_name   = extract_nlu_identifier(c)
     new_base_name = f'de_identified'# if is_unique else f'sentence_resolution_{c_name}'
     for col in cols :
         if '_results'      in col     :  new_cols[col] = f'{new_base_name}' # resolved code
@@ -184,10 +180,9 @@ def substitute_de_identification_cols(c, cols, is_unique=True):
 
     return new_cols
 
-def extract_nlu_identifier(c):return "<name>"
 
 
-def substitute_relation_cols(c, cols, is_unique=True):
+def substitute_relation_cols(c, cols, nlu_identifier=True):
     """
     Substitute col name for de-identification. For de-identification, some name will be infered, and de_identified_<sub_field> defines the base name schema
     de_identify should always be unique
@@ -206,8 +201,7 @@ def substitute_relation_cols(c, cols, is_unique=True):
 
     """
     new_cols = {}
-    c_name   = extract_nlu_identifier(c)
-    new_base_name = f'relation' if is_unique else f'relation_{c_name}'
+    new_base_name = f'relation' if nlu_identifier=='UNIQUE' else f'relation_{nlu_identifier}'
     for col in cols :
         if '_results'      in col     : new_cols[col]  = f'{new_base_name}' # resolved code
         elif '_beginnings' in col     : new_cols[col]  = f'{new_base_name}_begin'