-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathprepare_feverous_data.sh
More file actions
25 lines (22 loc) · 876 Bytes
/
prepare_feverous_data.sh
File metadata and controls
25 lines (22 loc) · 876 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Download the FEVEROUS raw data and build a Pyserini Lucene index from it.
#
# Run this from the directory that contains ./datasets/FEVEROUS: every path
# below is relative to the current working directory.
#
# Optional environment:
#   FEVEROUS_INDEX_THREADS  number of indexing threads (default: 40)

# Fail fast: without this, a failed `cd` would let the downloads land in the
# wrong directory and a failed download would still trigger the index build.
# No pipelines are used, so the POSIX-portable `set -eu` is sufficient and the
# script keeps working when invoked through plain `sh`.
set -eu

# download corpus for feverous
cd ./datasets/FEVEROUS
mkdir -p ./raw_data
cd ./raw_data
# NOTE(review): feverous-wiki-pages.zip is downloaded but never extracted by
# this script; only the -db archive is unpacked below.
for name in \
  feverous_train_challenges.jsonl \
  feverous_dev_challenges.jsonl \
  feverous-wiki-pages.zip \
  feverous-wiki-pages-db.zip
do
  # -c resumes a partial download and avoids creating "<file>.1" duplicates
  # when the script is re-run.
  wget -c "https://fever.ai/download/feverous/${name}"
done
# -o overwrites existing files without prompting, so a re-run never blocks
# waiting for interactive input.
unzip -o feverous-wiki-pages-db.zip
cd ..
mkdir -p ./corpus/jsonl_corpus

# build pyserini index from sqlite database
# (feverous_wikiv1.db is presumably what the archive above extracts to --
# confirm if the upstream archive layout changes.)
python build_jsonline_corpus_from_db.py \
  --db_path ./raw_data/feverous_wikiv1.db \
  --save_path ./corpus/jsonl_corpus/feverous_corpus.jsonl
python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input ./corpus/jsonl_corpus \
  --index ./corpus/index \
  --generator DefaultLuceneDocumentGenerator \
  --threads "${FEVEROUS_INDEX_THREADS:-40}" \
  --storePositions --storeDocvectors --storeRaw