-
Notifications
You must be signed in to change notification settings - Fork 2
/
dvc.yaml
70 lines (70 loc) · 1.92 KB
/
dvc.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# DVC data processing pipeline
# Pipeline-wide template variables, referenced below as ${NAME}.
vars:
  # Directory that receives the generated dataset files.
  - DATA_DIR: data

  # Dictionary files produced by the download-eyo-dicts stage.
  - SAFE_DICT_PATH: data/safe.txt
  - NOT_SAFE_DICT_PATH: data/not-safe.txt

  # Wikipedia dump (download-ruwiki) and the segments extracted from it.
  - WIKI_DATA_PATH: data/ruwiki-latest-pages-articles.xml.bz2
  - SEGM_DATA_PATH: data/ruwiki-yo-segments.txt

  # Base file names of the generate-dataset outputs; the markups name is
  # prefixed with "train-" / "test-" in the stage definitions.
  - VOCAB_FILENAME: vocab.txt
  - MARKUPS_FILENAME: markups.jsonl.bz2

  # Configuration passed to the evaluation script, and where it saves metrics.
  - CONFIG_PATH: params.yaml
  - METRICS_PATH: reports/metrics.json
# Pipeline stages. Each stage lists the command to run, the files it depends
# on (deps) and the files it produces (outs / metrics). ${NAME} placeholders
# are filled in from the `vars` section of this file.
stages:
  # Runs the download script; produces the Wikipedia dump archive.
  download-ruwiki:
    cmd: scripts/download_russian_wiki_dump.sh
    deps:
      - scripts/download_russian_wiki_dump.sh
    outs:
      - ${WIKI_DATA_PATH}

  # Runs the download script; produces the "safe" and "not safe" dictionaries.
  download-eyo-dicts:
    cmd: scripts/download_eyo_kernel_dicts.sh
    deps:
      - scripts/download_eyo_kernel_dicts.sh
    outs:
      - ${SAFE_DICT_PATH}
      - ${NOT_SAFE_DICT_PATH}

  # Reads the Wikipedia dump and writes the segments file.
  # NOTE(review): unlike the download stages, the Python stages do not list
  # their own script as a dependency, so editing the script alone will not
  # invalidate the stage - confirm whether that is intended.
  extract-segments-from-wiki:
    # Folded scalar: the lines below are joined into a single command line.
    # ${data.extract} is not declared in `vars`; presumably it is a mapping
    # in params.yaml that DVC expands into CLI options - verify.
    cmd: >-
      python -m scripts.extract_segments_from_wiki
      ${data.extract}
      -i ${WIKI_DATA_PATH}
      -o ${SEGM_DATA_PATH}
    deps:
      - ${WIKI_DATA_PATH}
      - ${SAFE_DICT_PATH}
      - ${NOT_SAFE_DICT_PATH}
    outs:
      - ${SEGM_DATA_PATH}

  # Reads the segments file and writes the train/test markups and the
  # vocabulary into ${DATA_DIR}.
  generate-dataset:
    # ${data.generate} is not declared in `vars`; presumably it is a mapping
    # in params.yaml that DVC expands into CLI options - verify.
    cmd: >-
      python -m scripts.generate_dataset
      ${data.generate}
      --vocab-filename ${VOCAB_FILENAME}
      --markups-filename ${MARKUPS_FILENAME}
      -i ${SEGM_DATA_PATH}
      -O ${DATA_DIR}
    deps:
      - ${WIKI_DATA_PATH}
      - ${SEGM_DATA_PATH}
      - ${SAFE_DICT_PATH}
      - ${NOT_SAFE_DICT_PATH}
    outs:
      - ${DATA_DIR}/train-${MARKUPS_FILENAME}
      - ${DATA_DIR}/test-${MARKUPS_FILENAME}
      - ${DATA_DIR}/${VOCAB_FILENAME}

  # Runs the evaluation script on the test markups and vocabulary, saving
  # the result to ${METRICS_PATH}.
  evaluate-model:
    cmd: >-
      python -m scripts.evaluate_model
      --config-path ${CONFIG_PATH}
      --markups-path ${DATA_DIR}/test-${MARKUPS_FILENAME}
      --vocab-path ${DATA_DIR}/${VOCAB_FILENAME}
      --save-path ${METRICS_PATH}
    # NOTE(review): ${CONFIG_PATH} resolves to params.yaml, which is tracked
    # here as a whole-file dependency and again via `params: model`. Any edit
    # to params.yaml therefore re-runs this stage - confirm that is intended.
    deps:
      - ${DATA_DIR}/test-${MARKUPS_FILENAME}
      - ${DATA_DIR}/${VOCAB_FILENAME}
      - ${CONFIG_PATH}
    params:
      - model
    metrics:
      # cache: false keeps the metrics file out of the DVC cache.
      - ${METRICS_PATH}:
          cache: false