-
Notifications
You must be signed in to change notification settings - Fork 2
/
dvc.lock
50 lines (50 loc) · 1.51 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# dvc.lock: for each pipeline stage, records the command that was run together
# with the md5 checksum and size (in bytes) of every dependency and output.
# NOTE(review): this file is presumably regenerated by DVC, so hand-written
# comments may not survive the next pipeline run. Confirm before relying on them.
schema: '2.0'
stages:
  # Fetches the compressed Russian Wikipedia articles dump.
  download-ruwiki:
    cmd: scripts/download_russian_wiki_dump.sh
    outs:
    - path: data/ruwiki-latest-pages-articles.xml.bz2
      md5: 01bed521f67bedf76fd561377bd3c3ec
      size: 4996155242
  # Fetches the two dictionary text files (safe / not-safe).
  download-eyo-dicts:
    cmd: scripts/download_eyo_kernel_dicts.sh
    outs:
    - path: data/not-safe.txt
      md5: b840d675d69f7b37659a5950194bc629
      size: 242758
    - path: data/safe.txt
      md5: a0c4cb74b6dd0a97fd8f1c78495028dc
      size: 864955
  # Consumes the wiki dump and writes the extracted segments file.
  extract-segments-from-wiki:
    cmd: python scripts/extract_segments_from_wiki.py -n 1000000
    deps:
    - path: data/ruwiki-latest-pages-articles.xml.bz2
      md5: 01bed521f67bedf76fd561377bd3c3ec
      size: 4996155242
    outs:
    - path: data/ruwiki-yo-segments.txt
      md5: 11bf0e02feab1264a424b3b4b349dcfe
      size: 313125364
  # Consumes the segments file and writes the preprocessed CSV.
  prepare-segments-dataset:
    cmd: python scripts/prepare_segments_dataset.py --max-text-length 220
    deps:
    - path: data/ruwiki-yo-segments.txt
      md5: 11bf0e02feab1264a424b3b4b349dcfe
      size: 313125364
    outs:
    - path: data/ruwiki-yo-segments-preprocessed.csv
      md5: 8977e2f4302d57108d15846de18daf0f
      size: 354168207
  # Consumes the preprocessed CSV and writes the train and test CSVs.
  split-dataset:
    cmd: python scripts/split_dataset.py
    deps:
    - path: data/ruwiki-yo-segments-preprocessed.csv
      md5: 8977e2f4302d57108d15846de18daf0f
      size: 354168207
    outs:
    - path: data/test.csv
      md5: 4b0e7ca1bc2af46998685663839ffd5d
      size: 88853486
    - path: data/train.csv
      md5: 1a2a6b776b8cbdb75f73e50267d1e935
      size: 266739875