From 29483cc5f4d0a41dc646112583cb4f7cbafac16a Mon Sep 17 00:00:00 2001 From: Andrew Moore Date: Mon, 15 Nov 2021 15:32:02 +0000 Subject: [PATCH 1/4] Sentence marker does not exist in the lexicon --- Dutch/simplified-pos-tagset-dut.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Dutch/simplified-pos-tagset-dut.txt b/Dutch/simplified-pos-tagset-dut.txt index 495d9b2..e78a117 100644 --- a/Dutch/simplified-pos-tagset-dut.txt +++ b/Dutch/simplified-pos-tagset-dut.txt @@ -10,6 +10,5 @@ conj conjuction art article pron pronoun punc puctuation -sent sentence marker From f3bdd2608d6cf2447aec4bbdd7e805e9b6c53230 Mon Sep 17 00:00:00 2001 From: Andrew Moore Date: Wed, 17 Nov 2021 08:10:35 +0000 Subject: [PATCH 2/4] Initial pos tagset generation script --- create_pos_tagsets.py | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 create_pos_tagsets.py diff --git a/create_pos_tagsets.py b/create_pos_tagsets.py new file mode 100644 index 0000000..6be1c89 --- /dev/null +++ b/create_pos_tagsets.py @@ -0,0 +1,78 @@ +from pathlib import Path +import json +from typing import List, Dict, Iterable +import csv +from collections import Counter +import re + +from tabulate import tabulate + +from test_collection import check_file + + +def read_tsv_file(file_path: str) -> Iterable[Dict[str, str]]: + with Path(file_path).open('r', newline='') as lexicon_data: + csv_reader = csv.DictReader(lexicon_data, delimiter='\t') + for row in csv_reader: + yield row + +def contains_pos_information(file_path: str) -> bool: + for value in read_tsv_file(file_path): + if 'pos' in value: + return True + else: + return False + +def single_lexicon_pos_count(file_path: str) -> Counter: + pos_counter = Counter() + for row in read_tsv_file(file_path): + pos_counter.update([row['pos'].lower()]) + return pos_counter + +def mwe_lexicon_pos_count(file_path: str) -> Counter: + pos_counter = Counter() + pos_matcher = re.compile(r'([^_\s]*)_([^_\s]*)') + for row in read_tsv_file(file_path): + mwe_template = row['mwe_template'] + pos_matches = pos_matcher.findall(mwe_template) + for _, pos_value in pos_matches: + pos_value = pos_value.lower() + #if pos_value == '*': + # continue + pos_counter.update([pos_value]) + return pos_counter + +if __name__ == '__main__': + json_data = Path(__file__, '..', 'language_resources.json').resolve() + + with json_data.open('r') as json_fp: + data = json.load(json_fp) + for language_code, meta_data in data.items(): + language_code: str + language_description: str = meta_data['language data']['description'] + language_and_code: str = f'{language_description} ({language_code})' + + resources: List[Dict[str, str]] = meta_data['resources'] + pos_label_counts = Counter() + resource_file_path = '' + for resource in resources: + resource_type = resource['data type'] + if resource_type == 'pos': + continue + resource_file_path = resource['file path'] + + + if resource_type == 'single': + if not contains_pos_information(resource_file_path): + continue + pos_label_counts += single_lexicon_pos_count(resource_file_path) + if resource_type == 'mwe': + pos_label_counts += mwe_lexicon_pos_count(resource_file_path) + if resource_file_path and pos_label_counts: + with Path(Path(resource_file_path).parent, 'generated_pos_tagset.tsv').open('w', newline='') as pos_tagset_fp: + csv_writer = csv.DictWriter(pos_tagset_fp, fieldnames=['POS', 'Count'], delimiter='\t') + csv_writer.writeheader() + for label, count in pos_label_counts.items(): + csv_writer.writerow({'POS': label, 'Count': count}) + + \ No newline at end of file From 74565cebd0e2abd34915b0d560114f7cbdff2272 Mon Sep 17 00:00:00 2001 From: Andrew Moore Date: Wed, 17 Nov 2021 08:25:15 +0000 Subject: [PATCH 3/4] Removed unused imports --- create_pos_tagsets.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/create_pos_tagsets.py b/create_pos_tagsets.py index 6be1c89..ef155e8 100644 --- a/create_pos_tagsets.py +++ b/create_pos_tagsets.py @@ -5,10 +5,6 @@ from collections import Counter import re -from tabulate import tabulate - -from test_collection import check_file - def read_tsv_file(file_path: str) -> Iterable[Dict[str, str]]: with Path(file_path).open('r', newline='') as lexicon_data: From 71c58c76033639e4f6681115aa1652a53756861f Mon Sep 17 00:00:00 2001 From: Andrew Moore Date: Wed, 17 Nov 2021 11:27:54 +0000 Subject: [PATCH 4/4] Generated POS tagsets --- Chinese/generated_pos_tagset.tsv | 20 ++++++ Czech/generated_pos_tagset.tsv | 12 ++++ Dutch/generated_pos_tagset.tsv | 12 ++++ Finnish/generated_pos_tagset.tsv | 16 +++++ French/generated_pos_tagset.tsv | 11 ++++ Italian/generated_pos_tagset.tsv | 15 +++++ Portuguese/generated_pos_tagset.tsv | 13 ++++ README.md | 27 ++++++++ Russian/generated_pos_tagset.tsv | 19 ++++++ Spanish/generated_pos_tagset.tsv | 48 ++++++++++++++ Swedish/generated_pos_tagset.tsv | 19 ++++++ Urdu/generated_pos_tagset.tsv | 99 +++++++++++++++++++++++++++++ Welsh/generated_pos_tagset.tsv | 14 ++++ 13 files changed, 325 insertions(+) create mode 100644 Chinese/generated_pos_tagset.tsv create mode 100644 Czech/generated_pos_tagset.tsv create mode 100644 Dutch/generated_pos_tagset.tsv create mode 100644 Finnish/generated_pos_tagset.tsv create mode 100644 French/generated_pos_tagset.tsv create mode 100644 Italian/generated_pos_tagset.tsv create mode 100644 Portuguese/generated_pos_tagset.tsv create mode 100644 Russian/generated_pos_tagset.tsv create mode 100644 Spanish/generated_pos_tagset.tsv create mode 100644 Swedish/generated_pos_tagset.tsv create mode 100644 Urdu/generated_pos_tagset.tsv create mode 100644 Welsh/generated_pos_tagset.tsv diff --git a/Chinese/generated_pos_tagset.tsv b/Chinese/generated_pos_tagset.tsv new file mode 100644 index 0000000..99d151e --- /dev/null +++ b/Chinese/generated_pos_tagset.tsv @@ -0,0 +1,20 @@ +POS Count +noun 47450 +num 900 +pron 356 +pnoun 8904 +adv 5971 +verb 31412 +det 411 +loc 371 +conj 428 +msr 763 +part 3778 +adj 9494 +prep 1614 +intj 272 +mark 192 +idiom 10 +fw 15 +ono 1 +punc 144 diff --git a/Czech/generated_pos_tagset.tsv b/Czech/generated_pos_tagset.tsv new file mode 100644 index 0000000..a62e185 --- /dev/null +++ b/Czech/generated_pos_tagset.tsv @@ -0,0 +1,12 @@ +POS Count +j 71 +v 6050 +r 66 +p 54 +d 1774 +c 58 +n 13057 +a 6670 +t 67 +i 61 +x 233 diff --git a/Dutch/generated_pos_tagset.tsv b/Dutch/generated_pos_tagset.tsv new file mode 100644 index 0000000..74e728f --- /dev/null +++ b/Dutch/generated_pos_tagset.tsv @@ -0,0 +1,12 @@ +POS Count +adj 769 +noun 2061 +verb 1000 +conj 33 +interj 21 +prep 33 +adv 208 +num 39 +pron 53 +intj 1 +art 2 diff --git a/Finnish/generated_pos_tagset.tsv b/Finnish/generated_pos_tagset.tsv new file mode 100644 index 0000000..e378c49 --- /dev/null +++ b/Finnish/generated_pos_tagset.tsv @@ -0,0 +1,16 @@ +POS Count +code 15 +abbrev 409 +proper 7931 +noun 26666 +comppart 608 +adjective 3395 +verb 3365 +interjection 86 +adverb 3243 +preposition 224 +conjunction 116 +numeral 91 +pronoun 73 +noun 3 +adverb 1 diff --git a/French/generated_pos_tagset.tsv b/French/generated_pos_tagset.tsv new file mode 100644 index 0000000..9c235a4 --- /dev/null +++ b/French/generated_pos_tagset.tsv @@ -0,0 +1,11 @@ +POS Count +prep 60 +noun 1633 +adv 147 +verb 448 +adj 264 +det 86 +pron 56 +conj 20 +null 2 +intj 8 diff --git a/Italian/generated_pos_tagset.tsv b/Italian/generated_pos_tagset.tsv new file mode 100644 index 0000000..1eeba97 --- /dev/null +++ b/Italian/generated_pos_tagset.tsv @@ -0,0 +1,15 @@ +POS Count +abr 114 +noun 22272 +verb 11056 +null 12 +pnoun 417 +adj 8153 +num 59 +prep 3130 +conj 181 +adv 1405 +pron 366 +intj 73 +art 659 +punc 2 diff --git a/Portuguese/generated_pos_tagset.tsv b/Portuguese/generated_pos_tagset.tsv new file mode 100644 index 0000000..4d8a329 --- /dev/null +++ b/Portuguese/generated_pos_tagset.tsv @@ -0,0 +1,13 @@ +POS Count +num 57 +pnoun 320 +adj 3428 +noun 9291 +verb 3096 +prep 1131 +adv 681 +intj 38 +pron 132 +conj 83 +det 289 +punc 28 diff --git a/README.md b/README.md index e6d206e..6ba482d 100644 --- a/README.md +++ b/README.md @@ -340,6 +340,33 @@ Number of unique values in lexicon file 2 7637 Number of unique values in common between the two files:3169 ``` +### Create Pos Tagsets + +The script creates a POS tagset per language stated within the [./language_resources.json](./language_resources.json) meta data file, which is explained in the [USAS Lexicon Meta Data section](#usas-lexicon-meta-data), and creates a POS tagset based on the POS tags used within the language's single and MWE semantic lexicon files. The POS tagset generated is then saved within each language's folder under the file name `generated_pos_tagset.tsv`. Each generated tagset has two fields `POS`, and `Count`, the `POS` field represents the POS tags, and the `Count` represents the number of times the associated tag has been used within the language's lexicon file(s). An example of this generated POS tagset is shown below, taken from the Welsh language folder: + +``` tsv +POS Count +verb 130197 +adv 123 +art 7 +conj 87 +pron 67 +prep 293 +noun 4358 +pnoun 6572 +adj 1542 +fw 40 +num 36 +intj 6 +* 2 +``` + +To run this script: + +``` bash +python create_pos_tagsets.py +``` + ### Python Requirements This has been tested with Python >= `3.7`, to install the relevant python requirements: diff --git a/Russian/generated_pos_tagset.tsv b/Russian/generated_pos_tagset.tsv new file mode 100644 index 0000000..5b8a6c9 --- /dev/null +++ b/Russian/generated_pos_tagset.tsv @@ -0,0 +1,19 @@ +POS Count +fw 6 +intj 13 +s 21003 +a 1931 +adv 171 +conj 17 +v 1701 +part 26 +pr 39 +a-pro 26 +com 4 +parenth 14 +a-num 37 +num 41 +praedic 17 +adv-pro 30 +s-pro 17 +* 1772 diff --git a/Spanish/generated_pos_tagset.tsv b/Spanish/generated_pos_tagset.tsv new file mode 100644 index 0000000..8fd0300 --- /dev/null +++ b/Spanish/generated_pos_tagset.tsv @@ -0,0 +1,48 @@ +POS Count +noun 2217 +prep 95 +num/noun 52 +pnoun 19993 +adv 238 +adj 947 +verb 717 +conj 37 +intj 3 +prep+art 2 +adj/noun 36 +pron 72 +verb/adj 2 +num 22 +adj/num/noun 13 +adj/nun 1 +num/adj 16 +adj/num 10 +adv/conj 2 +verb/adj/noun 1 +art 67 +verbo 1 +abbr 1 +abbr/noun 1 +prefix 1 +noun/adj 15 +verb + pron 1 +det 1 +noun/adv 1 +adv/adj 1 +adj/adv 1 +adj/pron 7 +verb/noun 6 +con/adj/pron 1 +noun/verb 2 +adj/nm 1 +adv/pron 1 +fw 7 +noun 1 +noun/num 1 +num/art 1 +art/art/pron 1 +num/art/pron 2 +art/pron 2 +num/adj/pron 1 +noun/pnoun 1 +port 10 diff --git a/Swedish/generated_pos_tagset.tsv b/Swedish/generated_pos_tagset.tsv new file mode 100644 index 0000000..56e2fa0 --- /dev/null +++ b/Swedish/generated_pos_tagset.tsv @@ -0,0 +1,19 @@ +POS Count +nn 9120 +pp 77 +in 25 +vb 2959 +av 3294 +ab 424 +pm 60 + 1860 +prefix 63 +pn 87 +sn 16 +pres part 7 +kn 19 +nl 41 +ie 1 +al 3 +nna 2 +pma 24 diff --git a/Urdu/generated_pos_tagset.tsv b/Urdu/generated_pos_tagset.tsv new file mode 100644 index 0000000..636cab8 --- /dev/null +++ b/Urdu/generated_pos_tagset.tsv @@ -0,0 +1,99 @@ +POS Count +at 1 +io 1 +cc 4 +at1 3 +rp 21 +to 2 +pph1 1 +vbz 1 +vbdz 1 +ppis1 1 +rg 12 +ppy 1 +pphs1 2 +iw 1 +ii 13 +vbr 2 +xx 1 +vhz 3 +rr 141 +vhn 1 +pphs2 1 +pp$ 1 +ddq 2 +ppis2 1 +vbdr 1 +vd0 1 +vbn 1 +appge 5 +vm 8 +rl 28 +vv0 557 +csw 2 +vvn 99 +pnqs 1 +pn1 13 +ppho2 1 +rrq 6 +ppho1 1 +rt 9 +mc 9 +vdd 1 +uh 16 +ppio1 1 +nn1 423 +rrr 12 +dd2 2 +jj 147 +csn 1 +da2 3 +nnt2 6 +vh0 1 +vbg 1 +da 3 +nnt1 13 +vdz 1 +vbm 2 +ppio2 1 +nn 16 +rrqv 1 +dd1 1 +rrt 6 +nnb 2 +ra 5 +vvd 39 +ge 1 +ja 1 +nn2 93 +cs 4 +md 3 +vvz 40 +vdn 1 +vhg 1 +np1 60 +jb 30 +nnl1 17 +ppx1 5 +vdg 1 +vvg 21 +nno 4 +ppx2 1 +vvi 9 +ddqge 1 +nnu 1 +nnl2 1 +nd1 4 +npm1 12 +jjr 5 +nnu2 3 +pnqo 1 +np2 2 +nn11 1 +npd1 4 +pn 1 +jj% 3 +vvn@ 1 +vmk 1 +jjt 2 +nno2 1 diff --git a/Welsh/generated_pos_tagset.tsv b/Welsh/generated_pos_tagset.tsv new file mode 100644 index 0000000..9320289 --- /dev/null +++ b/Welsh/generated_pos_tagset.tsv @@ -0,0 +1,14 @@ +POS Count +verb 130197 +adv 123 +art 7 +conj 87 +pron 67 +prep 293 +noun 4358 +pnoun 6572 +adj 1542 +fw 40 +num 36 +intj 6 +* 2