-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
105 lines (90 loc) · 2.73 KB
/
utils.py
File metadata and controls
105 lines (90 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re
import pickle
import argparse
# Lowercase the text and keep only its final n space-separated tokens.
def norm_rsplit(text, n):
    lowered = text.lower()
    return lowered.rsplit(' ', n)[-n:]
# Tokenize text into runs of ASCII letters, lowercasing first so that
# uppercase input is not silently dropped by the [a-z] character class.
def re_split(text):
    return [match.group(0) for match in re.finditer('[a-z]+', text.lower())]
# Pattern matching every character that is NOT a lowercase ASCII letter.
# Compiled once at import time instead of on every clean() call.
_NON_ALPHA_RE = re.compile('[^a-z]')

def clean(word):
    '''
    Lowercase *word* and strip every non [a-z] character.

    :param word: string to normalize
    :returns: the lowercased word with all non-letter characters removed
    '''
    return _NON_ALPHA_RE.sub('', word.lower())
# Yield every length-n contiguous slice (sliding window) of sequence l;
# with n=2 this produces the (word a, word b) adjacent pairs.
def chunks(l, n):
    last_start = len(l) - n
    start = 0
    while start <= last_start:
        yield l[start:start + n]
        start += 1
#load training corpus
def load_corpus(corpus_path):
    '''
    Read the entire training corpus into memory as one string.

    :param corpus_path: file path to where you have the downloaded corpus
    :returns: full corpus text
    '''
    # Text-mode read() already returns str, so the original str() cast
    # was redundant and has been dropped.
    with open(corpus_path, 'r', encoding='utf-8') as corpus:
        return corpus.read()
#saving parameters from trained models
def save_models(word_model, tuple_model, save_path):
    '''
    Pickle both trained models into a single file.

    :param word_model: unique word count dict to save
    :param tuple_model: (word a, word b) tuple count dict to save
    :param save_path: location to save models
    '''
    # Context manager guarantees the file handle is closed even if
    # pickle.dump raises (the original bare open() leaked the handle).
    # protocol=2 is kept for backward compatibility with existing readers.
    with open(save_path, 'wb') as model_file:
        pickle.dump({'words_model': word_model,
                     'word_tuples_model': tuple_model},
                    model_file,
                    protocol=2)
#loads the pre-trained models for inference
def load_models(model_path):
    '''
    Unpickle the models previously written by save_models.

    :param model_path: location where pre-trained models are left
    :returns: WORDS_MODEL (unique word count dict) and
              WORD_TUPLES_MODEL ((word a, word b) tuple count dict)
    '''
    # Context manager closes the handle; the original pickle.load(open(...))
    # left the file open until garbage collection.
    # NOTE(review): pickle.load on an untrusted model file can execute
    # arbitrary code — only load files you trust.
    with open(model_path, 'rb') as model_file:
        mod = pickle.load(model_file)
    return mod['words_model'], mod['word_tuples_model']
# Normalize the first element of the prefix sequence (lowercase,
# non-letter characters removed) via the module's clean() helper.
def process_input_prefix(prefix):
    first_element = prefix[0]
    return clean(first_element)
#remove punctuation, lowercase
#
# print(prefix, prefix[0])
# re.findall('[a-z]+', text.lower())
# print(type(prefix), type(prefix[0]))
# return [clean(p) for p in prefix]
# def parse_args():
# parser = argparse.ArgumentParser()
# parser.add_argument("echo")
# args = parser.parse_args()
# print(args.echo)
def clean_and_parse(input):
    '''
    Split the raw input on single spaces and normalize every token.

    :param input: raw user input string
    :returns: (list of clean()-ed tokens, True if input ended with a space)
    '''
    ends_with_space = input.endswith(" ")
    if ends_with_space:
        input = input[:-1]
    cleaned_tokens = [clean(token) for token in input.split(" ")]
    return (cleaned_tokens, ends_with_space)
#Used to define potential mistakes for last letter of prefix to autocomplete
# Maps each lowercase letter to the keys physically adjacent to it on a
# QWERTY keyboard — the plausible single-key typos for that letter.
# NOTE: 'i' has no entry of its own, though it appears as a neighbor of
# other keys; lookups should handle a missing key.
NEARBY_KEYS = {
'a': 'qwsz',
'b': 'vghn',
'c': 'xdfv',
'd': 'erfcxs',
'e': 'rdsw',
'f': 'rtgvcd',
'g': 'tyhbvf',
'h': 'yujnbg',
'j': 'uikmnh',
'k': 'iolmj',
'l': 'opk',
'm': 'njk',
'n': 'bhjm',
'o': 'iklp',
'p': 'ol',
'q': 'wa',
'r': 'edft',
's': 'wedxza',
't': 'rfgy',
'u': 'yhji',
'v': 'cfgb',
'w': 'qase',
'x': 'zsdc',
'y': 'tghu',
'z': 'asx'
}