to_meg.sh
#!/bin/bash
#SBATCH --job-name=xp3mixedjsonl # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=40 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --partition=cpu_p1
#SBATCH --qos=qos_cpu-t3
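# The header above configures a CPU-only Slurm job; it would typically be
# submitted from a login node, e.g.:
#   sbatch to_meg.sh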
set -x -e
source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed
TOKENIZER_PATH="bigscience/tokenizer"
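# The OFFLINE flags above mean nothing is downloaded at job time, so the
# bigscience/tokenizer files must already be in the local Hugging Face cache.
# A minimal sketch of warming that cache (run once on a node with internet
# access, assuming the standard transformers API):
#   python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/tokenizer')"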
# Includes the prior xP3 code data plus the new commits data
LANGS=(
code
)
DATA_PATH=/gpfswork/rech/six/commun/bigscience-training/jsonls/xp3cappedmixednewcode
OUTPUT=/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong
mkdir -p $OUTPUT
for LANG in "${LANGS[@]}"; do
    cd $DATA_PATH/$LANG
    # Merge all shards into a single file
    cat *.jsonl > merged_dups_$LANG.jsonl
    # Drop exact duplicates (~1G / 37G for en) + shuffle
    sort -u merged_dups_$LANG.jsonl | shuf > merged_$LANG.jsonl
    cd $MEGATRON_DEEPSPEED_REPO
    # Binarize the inputs field
    python tools/preprocess_data.py \
        --input $DATA_PATH/$LANG/merged_$LANG.jsonl \
        --output-prefix $OUTPUT/xp3_$LANG \
        --dataset-impl mmap \
        --json-key inputs \
        --tokenizer-type PretrainedFromHF \
        --tokenizer-name-or-path $TOKENIZER_PATH \
        --workers 35
    # Binarize the targets field, appending an end-of-document token to each target
    python tools/preprocess_data.py \
        --input $DATA_PATH/$LANG/merged_$LANG.jsonl \
        --output-prefix $OUTPUT/xp3_$LANG \
        --dataset-impl mmap \
        --json-key targets \
        --tokenizer-type PretrainedFromHF \
        --tokenizer-name-or-path $TOKENIZER_PATH \
        --append-eod \
        --workers 35
done
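# A rough sketch of what the loop above should leave behind, assuming the
# standard Megatron-DeepSpeed preprocess_data.py naming scheme
# ("<output-prefix>_<json-key>_document.bin/.idx"); the exact suffix can vary
# between versions, so verify against your checkout:
#   ls $OUTPUT
#   # xp3_code_inputs_document.bin   xp3_code_inputs_document.idx
#   # xp3_code_targets_document.bin  xp3_code_targets_document.idx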