-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmorphroot.py
More file actions
executable file
·78 lines (62 loc) · 2.12 KB
/
morphroot.py
File metadata and controls
executable file
·78 lines (62 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/python
from typing import Dict
from settings import db_dir, CORPORA
import os
import csv
from morphemes import Morphemes # type: ignore
# Directory below the project's database root that holds the morpheme data.
morph_dir = f"{db_dir}/morphemes"
# CSV file persisting the word -> root cache between runs.
morph_data = f"{morph_dir}/roots.csv"

# Create the directory (including any missing parents) before it is handed to
# Morphemes. exist_ok=True avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs(morph_dir, exist_ok=True)

# Morpheme parser instance, constructed with morph_dir as its only argument.
m = Morphemes(morph_dir)

# In-memory cache: lower-cased word -> root text extracted for that word.
roots: Dict[str, str] = {}
def load_roots():
    """Fill the module-level ``roots`` cache from the ``morph_data`` CSV file.

    Every non-empty row contributes ``row[0] -> row[1]``. A missing file is
    not an error: the cache is left as it is. The resulting cache size is
    printed afterwards.

    Raises:
        ValueError: If a row names a word that is already in the cache.
        IndexError: If a non-empty row has fewer than two columns.
    """
    if os.path.exists(morph_data):
        # newline="" leaves line-ending handling to the csv module, as the
        # csv documentation asks for files passed to csv.reader.
        with open(morph_data, "r", newline="") as f:
            reader = csv.reader(f)
            for row in reader:
                if not row:
                    # Blank lines carry no data (they appear, for example,
                    # when the file was written without newline="" on
                    # Windows); skip them instead of failing on row[0].
                    continue
                word, root = row[0], row[1]
                # Explicit check rather than assert, so that duplicates are
                # still detected when Python runs with -O.
                if word in roots:
                    raise ValueError(
                        f"Duplicate word {word!r} at line {reader.line_num} "
                        f"of {morph_data}"
                    )
                roots[word] = root
    print(f"Loaded {len(roots)} morphemens")
def save_roots():
    """Write the module-level ``roots`` cache to ``morph_data`` as CSV.

    One ``word,root`` row is written per cache entry, ordered by word. Any
    existing file is overwritten.
    """
    # newline="" keeps the text layer from translating the csv module's own
    # "\r\n" row terminator; without it Windows output gets "\r\r\n", which
    # reads back as blank rows.
    with open(morph_data, "w", newline="") as f:
        # Keys are unique, so sorting the (word, root) pairs orders by word.
        csv.writer(f).writerows(sorted(roots.items()))
def _collect_root_text(tree):
    """Return the joined text of all "root" nodes in ``tree``, depth-first."""
    parts = []
    for node in tree:
        if node["type"] == "root":
            parts.append(node["text"])
        if "children" in node:
            # A node's own text (if it is a root) precedes its descendants'.
            parts.append(_collect_root_text(node["children"]))
    return "".join(parts)


def extract_root(word, tree):
    """Extract the root text of ``word`` from a parse tree and cache it.

    Args:
        word: Key under which the result is stored in the ``roots`` cache.
        tree: Iterable of node mappings. Each node must have a ``"type"``
            key; nodes whose type is ``"root"`` must also have ``"text"``.
            An optional ``"children"`` entry holds a nested tree of the same
            form.

    Returns:
        The concatenation, in depth-first order, of the ``"text"`` of every
        node whose type is ``"root"``; an empty string if there is none.
        Several roots are concatenated rather than rejected.
    """
    result = _collect_root_text(tree)
    # Store the result once for the complete tree, instead of overwriting the
    # cache entry with a partial value at every level of the recursion.
    roots[word] = result
    return result
def get_root_morpheme(word: str) -> str:
    """Return the root text for ``word``, consulting the ``roots`` cache first.

    The word is lower-cased before lookup. If the cache is empty it is first
    loaded via ``load_roots()``. On a cache miss, an alphabetic word is parsed
    with ``m.parse`` and its tree is passed to ``extract_root``. A word that
    is not purely alphabetic, or whose parse has no tree, is cached and
    returned as its own root.
    """
    if not roots:
        load_roots()

    key = word.lower()
    try:
        return roots[key]
    except KeyError:
        pass

    # Only purely alphabetic words are handed to the parser.
    tree = m.parse(key.lower()).get("tree") if key.isalpha() else None
    if tree:
        return extract_root(key, tree)

    # Fallback: the lower-cased word stands in for its own root.
    roots[key] = key
    return key
if __name__ == "__main__":
    # Script mode: run every corpus text file through story_tokenize with
    # get_root_morpheme as the callback, then persist the root cache.
    from glob import glob
    from util import story_tokenize
    from corpora import corpora

    for corpus in corpora:
        pattern = f"./corpora.{CORPORA}/{corpus}/*.txt"
        for tale_path in glob(pattern):
            print(tale_path)
            with open(tale_path) as tale:
                fulltext = tale.read()
            # Presumably story_tokenize calls get_root_morpheme per token,
            # which fills the roots cache as a side effect -- confirm in util.
            story_tokenize(get_root_morpheme, fulltext)

    # Write the cache once, after all corpora have been processed.
    save_roots()