Skip to content

Commit fc4a34c

Browse files
author
于泽
committed
resolve conflict
2 parents f30db2b + 016fde2 commit fc4a34c

File tree

4 files changed

+52
-31
lines changed

4 files changed

+52
-31
lines changed

dataspin/cli/test_data_generator.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,15 @@
1414
app_ids = ['APPIOXDKXIESP', 'APPOWLSLSDWLD', 'APPSISEKIDESS']
1515

1616
event_names = ['login', 'logout', 'enter_splash', 'leave_splash', 'session']
17-
17+
platform = ['android','ios']
1818

1919
def generate_json_data(bp_timestamp):
2020
data_dict = {
2121
'app_id': app_ids[randint(0, len(app_ids)-1)],
2222
'event_name': event_names[randint(0, len(event_names)-1)],
2323
'event_id': str(uuid.uuid4()),
2424
'bp_timestamp': bp_timestamp,
25+
'platform':platform[randint(0,len(platform)-1)],
2526
'device':{
2627
'brand':'',
2728
'os_type':'android',
@@ -37,7 +38,7 @@ def save_json_file(file_dir: str, file_name, data_list: List[str]):
3738
f.write('\n'.join(data_list).encode('utf-8'))
3839

3940

40-
def generate_test_data(file_dir='temp', file_numbers=1, data_counts=1000, duplicate_data_count=1, data_format='jsonl', time_range=2, time_unit='day'):
41+
def generate_test_data(file_dir='tmp/source', file_numbers=1, data_counts=1000, duplicate_data_count=1, data_format='jsonl', time_range=2, time_unit='day'):
4142
file_dir = file_dir.strip('/')
4243
if os.path.exists(file_dir):
4344
shutil.rmtree(file_dir)
@@ -89,7 +90,7 @@ def cli():
8990

9091
@cli.command()
9192
@click.option('--execute_duration', '-ed', type=click.IntRange(min=10), default=60, help='execute duration,unit is second,default 60')
92-
@click.option('--file_dir', '-fd', default='temp', help='file dir')
93+
@click.option('--file_dir', '-fd', default='tmp/source', help='file dir')
9394
@click.option('--file_numbers', '-fn', type=click.IntRange(min=1), default=1, help='file numbers')
9495
@click.option('--data_counts', '-dc', type=click.IntRange(min=10), default=100000, help='data counts')
9596
@click.option('--duplicate_data_count', '-dd', type=click.IntRange(min=0), default=1000, help='duplicate data counts')

dataspin/core.py

+6-6
Original file line number | Diff line number | Diff line change
@@ -166,14 +166,14 @@ def name(self):
166166

167167

168168
class DataFile:
169-
def __init__(self, file_path, file_type="table", tags=None):
169+
def __init__(self, file_path, file_type="table",tags = None):
170170
self.name, self.ext = os.path.splitext(os.path.basename(file_path))
171171
if self.ext in ['.gz']:
172172
self.name, ext = os.path.splitext(self.name)
173173
self.ext = f'{ext}{self.ext}'
174174
self.file_path = file_path
175-
self.file_type = file_type # table or index
176-
self.file_format = "jsonl" # can be jsonl, parquet
175+
self.file_type = file_type # table or index
176+
self.file_format = "jsonl" # can be jsonl, parquet
177177
self.tags = tags
178178

179179
@property
@@ -223,9 +223,9 @@ def set_data_files(self, data_files):
223223
logger.debug('set data files,', data_files=data_files)
224224
self.final_files = data_files
225225
self.files_history.append(data_files)
226-
227-
def create_data_file(self, file_path, file_type="table", data_format="jsonl", tags=None):
228-
datafile = DataFile(file_path=file_path, file_type=file_type, tags=tags)
226+
227+
def create_data_file(self, file_path, file_type="table", data_format="jsonl",tags=None):
228+
datafile = DataFile(file_path=file_path, file_type=file_type,tags=tags)
229229
datafile.data_format = data_format
230230
return datafile
231231

dataspin/functions/function.py

+37-17
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from collections import namedtuple
12
import os
23
from basepy.log import logger
34

@@ -39,30 +40,43 @@ class SplitByFunction(Function):
3940
function_name = 'splitby'
4041

4142
def process(self, data_file, context):
42-
def write_to_group(group_name, line):
43-
if group_name not in group_file_savers:
43+
def write_to_group(group_names, line):
44+
if group_names not in group_file_savers:
45+
group_name = '-'.join(group_names)
4446
dst_path = os.path.join(context.temp_dir, f'{data_file.name}-group-{group_name}.jsonl')
4547
file_saver = AtomicSaver(dst_path)
4648
file_saver.setup()
47-
group_file_savers[group_name] = file_saver
48-
saver = group_file_savers[group_name]
49+
group_file_savers[group_names] = file_saver
50+
saver = group_file_savers[group_names]
4951
saver.part_file.write(line.encode('utf-8'))
5052
saver.part_file.write(b'\n')
5153

5254
data_files = []
5355
group_file_savers = {}
54-
split_key = self.args['key'][0]
56+
split_keys = self.args['key']
57+
tags = self.args['tags']
58+
tags_with_group = {}
5559
file_reader = DataFileReader(data_file.file_path)
5660
for (data, line) in file_reader.readlines():
57-
group_name = data.get(split_key)
58-
if not group_name:
61+
group_names = []
62+
for split_key in split_keys:
63+
group_name = data.get(split_key)
64+
group_names.append(group_name)
65+
group_names = tuple(group_names)
66+
if not tags_with_group.get(group_names):
67+
object_name = namedtuple("DataObject", data.keys())(*data.values())
68+
fill_tags = {}
69+
for tag_k,tag_v in tags.items():
70+
fill_tags[tag_k] = tag_v.format(data=object_name)
71+
tags_with_group[group_names] = fill_tags
72+
if not group_names:
5973
# TODO: warning
6074
continue
61-
write_to_group(group_name, line)
62-
63-
for saver in group_file_savers.values():
64-
saver.__exit__(None, None, None)
65-
data_files.append(context.create_data_file(file_path=saver.dest_path))
75+
write_to_group(group_names, line)
76+
for group_names,saver in group_file_savers.items():
77+
saver.__exit__(None, None, None)
78+
tags = tags_with_group[group_names] if tags_with_group[group_names] else None
79+
data_files.append(context.create_data_file(file_path=saver.dest_path,tags=tags))
6680
return data_files
6781

6882

@@ -72,10 +86,16 @@ class SaveFunction(FunctionMultiMixin, Function):
7286
def process(self, data_file, context):
7387
logger.debug('save function process', data_file=data_file.file_path)
7488
location = self.args.get('location')
89+
path_suffix = self.args.get('path_suffix')
90+
if path_suffix:
91+
path_suffix = path_suffix.format(**data_file.tags)
92+
if not path_suffix.endswith('/'):
93+
path_suffix = path_suffix + '/'
7594
storage = context.get_storage(location)
7695
if not storage:
7796
raise Exception('No storage defined.')
78-
storage.save(data_file.basename, data_file.file_path)
97+
key = path_suffix + data_file.basename if path_suffix else data_file.basename
98+
storage.save(key, data_file.file_path)
7999
return data_file
80100

81101

@@ -102,7 +122,7 @@ def process(self, data_file, context):
102122
file_saver.part_file.write(b'\n')
103123

104124
file_saver.__exit__(None, None, None)
105-
new_data_file = context.create_data_file(dst_path, file_type="index")
125+
new_data_file = context.create_data_file(dst_path, file_type="index",tags= data_file.tags)
106126
return [data_file, new_data_file]
107127

108128

@@ -120,7 +140,7 @@ def process(self, data_file, context):
120140
for data, line in file_reader.readlines():
121141
f.write(json.dumps(common.flatten_dict(data)).encode('utf-8'))
122142
f.write(b'\n')
123-
return context.create_data_file(file_path=dst_path)
143+
return context.create_data_file(file_path = dst_path,tags = data_file.tags)
124144

125145

126146
class FormatFunction(FunctionMultiMixin, Function):
@@ -184,7 +204,7 @@ def process(self, data_file, context):
184204
pk_values.add(pk_value)
185205
f.write(json.dumps(data).encode('utf-8'))
186206
f.write(b'\n')
187-
return data_file, context.create_data_file(file_path=dst_path)
207+
return data_file, context.create_data_file(file_path=dst_path, tags=data_file.tags)
188208

189209

190210
class FilterFunction(FunctionMultiMixin, Function):
@@ -200,7 +220,7 @@ def process(self, data_file, context):
200220
tags = rule_config.get('tags')
201221
rule = rule_config.get('rule', "False")
202222

203-
dst_path = os.path.join(context.temp_dir, f'{data_file.name}-filter-{tags if tags else "default"}.jsonl')
223+
dst_path = os.path.join(context.temp_dir, f'{data_file.name}-filter-{"_".join(list(tags.values())) if tags else "default"}.jsonl')
204224
file_saver = AtomicSaver(dst_path)
205225
file_saver.setup()
206226

examples/simple.json

+5-5
Original file line number | Diff line number | Diff line change
@@ -58,11 +58,11 @@
5858
"args": {
5959
"filter_rules": [
6060
{
61-
"tags": "specific",
61+
"tags": {"filter": "specific"},
6262
"rule": "event_name == 'login' and app_id in ['APPIOXDKXIESP','APPOWLSLSDWLD']"
6363
},
6464
{
65-
"tags": null,
65+
"tags": {"filter": "default"},
6666
"rule": "True"
6767
}
6868
]
@@ -72,9 +72,8 @@
7272
"name": "split by app_id",
7373
"function": "splitby",
7474
"args": {
75-
"key": [
76-
"app_id"
77-
]
75+
"key": ["app_id"],
76+
"tags": {"app_id":"{data.app_id}","service":"dataspin"}
7877
}
7978
},
8079
{
@@ -106,6 +105,7 @@
106105
"function": "save",
107106
"args": {
108107
"location": "simple_target",
108+
"path_suffix": "{service}/filter={filter}/app_id={app_id}",
109109
"table_name": "table"
110110
}
111111
}

0 commit comments

Comments
 (0)