-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
105 lines (90 loc) · 2.73 KB
/
utils.py
File metadata and controls
105 lines (90 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re
import pickle
import argparse
# Lowercase the text and keep only its final n space-separated tokens.
def norm_rsplit(text, n):
    lowered = text.lower()
    return lowered.rsplit(' ', n)[-n:]
# Tokenize text into runs of ASCII letters, lowercasing first so that
# uppercase input is not silently dropped by the [a-z] character class.
def re_split(text):
    return [match.group(0) for match in re.finditer('[a-z]+', text.lower())]
# Pattern matching every character that is NOT a lowercase ASCII letter.
# Compiled once at import time instead of on every clean() call.
_NON_ALPHA_RE = re.compile('[^a-z]')

def clean(word):
    '''
    Lowercase *word* and strip every non [a-z] character.

    :param word: string to normalize
    :returns: the lowercased word with all non-letter characters removed
    '''
    return _NON_ALPHA_RE.sub('', word.lower())
# Yield every length-n contiguous slice (sliding window) of sequence l;
# with n=2 this produces the (word a, word b) adjacent pairs.
def chunks(l, n):
    last_start = len(l) - n
    start = 0
    while start <= last_start:
        yield l[start:start + n]
        start += 1
#load training corpus
def load_corpus(corpus_path):
    '''
    Read the entire training corpus into memory as one string.

    :param corpus_path: file path to where you have the downloaded corpus
    :returns: full corpus text
    '''
    # Text-mode read() already returns str, so the original str() cast
    # was redundant and has been dropped.
    with open(corpus_path, 'r', encoding='utf-8') as corpus:
        return corpus.read()
#saving parameters from trained models
def save_models(word_model, tuple_model, save_path):
    '''
    Pickle both trained models into a single file.

    :param word_model: unique word count dict to save
    :param tuple_model: (word a, word b) tuple count dict to save
    :param save_path: location to save models
    '''
    # Context manager guarantees the file handle is closed even if
    # pickle.dump raises (the original bare open() leaked the handle).
    # protocol=2 is kept for backward compatibility with existing readers.
    with open(save_path, 'wb') as model_file:
        pickle.dump({'words_model': word_model,
                     'word_tuples_model': tuple_model},
                    model_file,
                    protocol=2)
#loads the pre-trained models for inference
def load_models(model_path):
    '''
    Unpickle the models previously written by save_models.

    :param model_path: location where pre-trained models are left
    :returns: WORDS_MODEL (unique word count dict) and
              WORD_TUPLES_MODEL ((word a, word b) tuple count dict)
    '''
    # Context manager closes the handle; the original pickle.load(open(...))
    # left the file open until garbage collection.
    # NOTE(review): pickle.load on an untrusted model file can execute
    # arbitrary code — only load files you trust.
    with open(model_path, 'rb') as model_file:
        mod = pickle.load(model_file)
    return mod['words_model'], mod['word_tuples_model']
# Normalize the first element of the prefix sequence (lowercase,
# non-letter characters removed) via the module's clean() helper.
def process_input_prefix(prefix):
    first_element = prefix[0]
    return clean(first_element)
#remove punctuation, lowercase
#
# print(prefix, prefix[0])
# re.findall('[a-z]+', text.lower())
# print(type(prefix), type(prefix[0]))
# return [clean(p) for p in prefix]
# def parse_args():
# parser = argparse.ArgumentParser()
# parser.add_argument("echo")
# args = parser.parse_args()
# print(args.echo)
def clean_and_parse(input):
    '''
    Split the raw input on single spaces and normalize every token.

    :param input: raw user input string
    :returns: (list of clean()-ed tokens, True if input ended with a space)
    '''
    ends_with_space = input.endswith(" ")
    if ends_with_space:
        input = input[:-1]
    cleaned_tokens = [clean(token) for token in input.split(" ")]
    return (cleaned_tokens, ends_with_space)
#Used to define potential mistakes for last letter of prefix to autocomplete
# Maps each lowercase letter to the keys physically adjacent to it on a
# QWERTY keyboard — the plausible single-key typos for that letter.
# NOTE: 'i' has no entry of its own, though it appears as a neighbor of
# other keys; lookups should handle a missing key.
NEARBY_KEYS = {
'a': 'qwsz',
'b': 'vghn',
'c': 'xdfv',
'd': 'erfcxs',
'e': 'rdsw',
'f': 'rtgvcd',
'g': 'tyhbvf',
'h': 'yujnbg',
'j': 'uikmnh',
'k': 'iolmj',
'l': 'opk',
'm': 'njk',
'n': 'bhjm',
'o': 'iklp',
'p': 'ol',
'q': 'wa',
'r': 'edft',
's': 'wedxza',
't': 'rfgy',
'u': 'yhji',
'v': 'cfgb',
'w': 'qase',
'x': 'zsdc',
'y': 'tghu',
'z': 'asx'
}