# sample_chunks_and_split.sh
# Make necessary directories.
# `-p` creates the babylm_data/ parent implicitly and makes the script safe to
# re-run (plain `mkdir` errors out when a directory already exists).
# babylm_50M is created here as well because the 50M sampling step passes it
# as --output_dir and later runs rm/wc on files inside it.
mkdir -p \
  babylm_data/babylm_100M \
  babylm_data/babylm_10M \
  babylm_data/babylm_50M \
  babylm_data/babylm_dev \
  babylm_data/babylm_test
# Make train 100M/test/dev
#
# sample_100m CORPUS P_KEEP P_KEEP_DEV SPLIT_AT SEED
#   Runs sample_chunks_and_split.py on preprocessed_data/CORPUS.txt with
#   babylm_data as the output directory; the remaining arguments are passed
#   straight through to the flags of the same name.
sample_100m() {
  python sample_chunks_and_split.py \
    --input_file "preprocessed_data/$1.txt" \
    --output_dir babylm_data \
    --p_keep "$2" \
    --p_keep_dev "$3" \
    --split_at "$4" \
    --seed "$5"
}

# One call per corpus, in the original order:
#           corpus          p_keep  p_keep_dev  split_at  seed
sample_100m bnc_spoken      0.833   0.0835      10000     3   # bnc
sample_100m childes         0.833   0.0835      10000     0   # childes
sample_100m gutenberg       0.528   0.0528      5000      13  # children's gutenberg
sample_100m open_subtitles  0.02    0.002       5000      2   # openSubtitles
sample_100m simple_wiki     0.475   0.0475      5000      18  # simple english wiki
sample_100m switchboard     0.833   0.0835      2000      11  # switchboard

## Move to appropriate directories
# Files sitting directly in babylm_data are sorted by extension into the
# directory of the corresponding split.
mv babylm_data/*.train babylm_data/babylm_100M
mv babylm_data/*.dev babylm_data/babylm_dev
mv babylm_data/*.test babylm_data/babylm_test
# Make data for 10M
#
# sample_10m CORPUS SPLIT_AT SEED
#   Re-samples babylm_data/babylm_100M/CORPUS.train into
#   babylm_data/babylm_10M, always with --p_keep 0.1 and --p_keep_dev 0.1.
sample_10m() {
  python sample_chunks_and_split.py \
    --input_file "babylm_data/babylm_100M/$1.train" \
    --output_dir babylm_data/babylm_10M \
    --p_keep 0.1 \
    --p_keep_dev 0.1 \
    --split_at "$2" \
    --seed "$3"
}

# One call per corpus, in the original order:
#          corpus          split_at  seed
sample_10m bnc_spoken      10000     3    # bnc
sample_10m childes         10000     31   # childes
sample_10m gutenberg       5000      5    # childrens gutenberg
sample_10m open_subtitles  5000      3    # openSubtitles
sample_10m simple_wiki     5000      6    # simple english wiki
sample_10m switchboard     2000      6    # switchboard

# Only the .train files are kept for this subset; remove the .dev and .test
# files from the 10M directory.
for ext in dev test; do
  rm babylm_data/babylm_10M/*."$ext"
done

# Print per-file and total word counts of the resulting training files.
wc -w babylm_data/babylm_10M/*.train
# Make data for 50M
#
# Ensure the output directory exists before sampling into it (no-op when it
# is already present).
# NOTE(review): sample_chunks_and_split.py is not shown here; this assumes it
# does not create --output_dir on its own -- confirm against the script.
mkdir -p babylm_data/babylm_50M

# sample_50m CORPUS SPLIT_AT SEED
#   Re-samples babylm_data/babylm_100M/CORPUS.train into
#   babylm_data/babylm_50M, always with --p_keep 0.5 and --p_keep_dev 0.1.
sample_50m() {
  python sample_chunks_and_split.py \
    --input_file "babylm_data/babylm_100M/$1.train" \
    --output_dir babylm_data/babylm_50M \
    --p_keep 0.5 \
    --p_keep_dev 0.1 \
    --split_at "$2" \
    --seed "$3"
}

# One call per corpus, in the original order:
#          corpus          split_at  seed
sample_50m bnc_spoken      10000     2    # bnc
sample_50m childes         10000     14   # childes
sample_50m gutenberg       5000      2    # childrens gutenberg
sample_50m open_subtitles  5000      0    # openSubtitles
sample_50m simple_wiki     5000      2    # simple english wiki
sample_50m switchboard     2000      1    # switchboard

# Only the .train files are kept for this subset; remove the .dev and .test
# files from the 50M directory.
rm babylm_data/babylm_50M/*.dev
rm babylm_data/babylm_50M/*.test

# Print per-file and total word counts of the resulting training files.
wc -w babylm_data/babylm_50M/*.train

# Bundle the whole babylm_data tree into a single archive.
zip -r babylm_data.zip babylm_data