@@ -1,2 +1,3 @@
+from collections import namedtuple
 import os
 from basepy.log import logger
@@ -39,30 +40,43 @@ class SplitByFunction(Function):
     function_name = 'splitby'
 
     def process(self, data_file, context):
-        def write_to_group(group_name, line):
-            if group_name not in group_file_savers:
+        def write_to_group(group_names, line):
+            if group_names not in group_file_savers:
+                # Join the per-key values into a single label for the file name.
+                group_name = '-'.join(str(g) for g in group_names)
                 dst_path = os.path.join(context.temp_dir, f'{data_file.name}-group-{group_name}.jsonl')
                 file_saver = AtomicSaver(dst_path)
                 file_saver.setup()
-                group_file_savers[group_name] = file_saver
-            saver = group_file_savers[group_name]
+                group_file_savers[group_names] = file_saver
+            saver = group_file_savers[group_names]
             saver.part_file.write(line.encode('utf-8'))
             saver.part_file.write(b'\n')
 
         data_files = []
         group_file_savers = {}
-        split_key = self.args['key'][0]
+        split_keys = self.args['key']
+        tags = self.args.get('tags', {})
+        tags_with_group = {}
         file_reader = DataFileReader(data_file.file_path)
         for (data, line) in file_reader.readlines():
-            group_name = data.get(split_key)
-            if not group_name:
+            # Build a composite group key from all configured split keys.
+            group_names = tuple(data.get(split_key) for split_key in split_keys)
+            if group_names not in tags_with_group:
+                # Wrap the record in a namedtuple so tag templates can use
+                # attribute access, e.g. '{data.region}'.
+                data_object = namedtuple("DataObject", data.keys())(*data.values())
+                fill_tags = {}
+                for tag_k, tag_v in tags.items():
+                    fill_tags[tag_k] = tag_v.format(data=data_object)
+                tags_with_group[group_names] = fill_tags
+            if not all(group_names):
                 # TODO: warning
                 continue
-            write_to_group(group_name, line)
-
-        for saver in group_file_savers.values():
-            saver.__exit__(None, None, None)
-            data_files.append(context.create_data_file(file_path=saver.dest_path))
+            write_to_group(group_names, line)
+        for group_names, saver in group_file_savers.items():
+            saver.__exit__(None, None, None)
+            group_tags = tags_with_group.get(group_names) or None
+            data_files.append(context.create_data_file(file_path=saver.dest_path, tags=group_tags))
         return data_files
 
 
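For illustration, a splitby step with multiple keys and tag templates might be configured as below. This is a sketch: the args shape follows the code above, but the field names ('region', 'event', 'date') are assumptions, not taken from this diff.

    # Hypothetical splitby config (field names assumed).
    args = {
        'key': ['region', 'event'],        # one output group per (region, event) pair
        'tags': {
            'region': '{data.region}',     # filled via namedtuple attribute access
            'day': '{data.date}',
        },
    }
    # A record {'region': 'eu', 'event': 'click', 'date': '2021-01-01'} is written
    # to '<data_file.name>-group-eu-click.jsonl', and the resulting data file
    # carries tags {'region': 'eu', 'day': '2021-01-01'}.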
@@ -72,10 +86,16 @@ class SaveFunction(FunctionMultiMixin, Function):
     def process(self, data_file, context):
         logger.debug('save function process', data_file=data_file.file_path)
         location = self.args.get('location')
+        path_suffix = self.args.get('path_suffix')
+        if path_suffix:
+            path_suffix = path_suffix.format(**(data_file.tags or {}))  # placeholders filled from tags
+            if not path_suffix.endswith('/'):
+                path_suffix = path_suffix + '/'
         storage = context.get_storage(location)
         if not storage:
             raise Exception('No storage defined.')
-        storage.save(data_file.basename, data_file.file_path)
+        key = path_suffix + data_file.basename if path_suffix else data_file.basename
+        storage.save(key, data_file.file_path)
         return data_file
 
 
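A sketch of how path_suffix composes with the tags set upstream (placeholder names assumed):

    # Hypothetical save config; '{region}' and '{day}' must be keys in data_file.tags.
    args = {'location': 's3', 'path_suffix': '{region}/{day}'}
    # With tags {'region': 'eu', 'day': '2021-01-01'} the suffix is formatted to
    # 'eu/2021-01-01/' (trailing slash appended), so the storage key becomes
    # 'eu/2021-01-01/<data_file.basename>'.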
@@ -102,7 +122,7 @@ def process(self, data_file, context):
         file_saver.part_file.write(b'\n')
 
         file_saver.__exit__(None, None, None)
-        new_data_file = context.create_data_file(dst_path, file_type="index")
+        new_data_file = context.create_data_file(dst_path, file_type="index", tags=data_file.tags)
         return [data_file, new_data_file]
 
 
@@ -120,7 +140,7 @@ def process(self, data_file, context):
         for data, line in file_reader.readlines():
             f.write(json.dumps(common.flatten_dict(data)).encode('utf-8'))
             f.write(b'\n')
-        return context.create_data_file(file_path=dst_path)
+        return context.create_data_file(file_path=dst_path, tags=data_file.tags)
 
 
 class FormatFunction(FunctionMultiMixin, Function):
@@ -184,7 +204,7 @@ def process(self, data_file, context):
             pk_values.add(pk_value)
             f.write(json.dumps(data).encode('utf-8'))
             f.write(b'\n')
-        return data_file, context.create_data_file(file_path=dst_path)
+        return data_file, context.create_data_file(file_path=dst_path, tags=data_file.tags)
 
 
 class FilterFunction(FunctionMultiMixin, Function):
@@ -200,7 +220,7 @@ def process(self, data_file, context):
             tags = rule_config.get('tags')
             rule = rule_config.get('rule', "False")
 
-            dst_path = os.path.join(context.temp_dir, f'{data_file.name}-filter-{tags if tags else "default"}.jsonl')
+            dst_path = os.path.join(context.temp_dir, f'{data_file.name}-filter-{"_".join(tags.values()) if tags else "default"}.jsonl')
             file_saver = AtomicSaver(dst_path)
             file_saver.setup()
 
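Since a filter rule's tags is now a dict, the temp file name joins its values; a sketch (rule schema assumed):

    # Hypothetical filter rule config.
    rule_config = {'tags': {'kind': 'error', 'source': 'api'},
                   'rule': "data['level'] == 'error'"}
    # '_'.join(tags.values()) yields 'error_api', so the temp file is
    # '<data_file.name>-filter-error_api.jsonl'.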