-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
180 lines (146 loc) · 5.39 KB
/
utils.py
File metadata and controls
180 lines (146 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import re
import copy
import jieba
#if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
# https://blog.csdn.net/qinbaby/article/details/23201883
def clean(sent):
    """Normalize sentence delimiters to commas and blank out two emoji.

    The substitutions run in order: the punctuation in the first character
    class becomes a comma, then the space character becomes a comma, then
    each of the two listed emoji becomes a single space. Because the emoji
    are handled last, the spaces they introduce are not turned into commas.

    Args:
        sent: Text to normalize.

    Returns:
        The normalized text.
    """
    substitutions = (
        ("[。;;!!??.]", ","),
        (' ', ","),
        ('😜', " "),
        ('👄', " "),
    )
    cleaned = sent
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def remove_html(string):
    """Remove HTML-like fragments and URLs from ``string``.

    The patterns are tried in order against the progressively cleaned text.
    For each pattern only the first regex match is located, but every literal
    occurrence of that matched text is then deleted.

    NOTE(review): the first pattern matches from the first ``<`` up to the
    end of that line, so everything following an opening angle bracket on
    the same line is dropped -- confirm this is intended.

    Args:
        string: Text to clean.

    Returns:
        The text with the matched fragments removed.
    """
    patterns = (
        '<(img|div)?.*>?',
        '<(img|div|span|url)[-A-Za-z0-9+&@#/%?=~_!:,.;"sr…… ]+',
        '<(div|ul|url|body|html|b|p|img|span|a)+.*/(div|ul|url|body|html|b|p|img|span|a)>',
        '(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]',
    )
    for regex in patterns:
        found = re.search(regex, string)
        if found is None:
            continue
        # Delete every literal occurrence of the first matched fragment.
        string = string.replace(found.group(0), '')
    return string
"""
Input:
origin_sent: sentence to be augmented,
word_dict: word - synonym index
thesaurus_all: synonym
Output:
aug_sentences: Augmented sentences (list)
This function takes the original sentence, segments it with jieba, and finds
the words that can be substituted together with their substitution choices.
Depending on the ratio between the number of words in the original sentence
and the number of substitutable words, the augmented sentences are generated
by substituting 1, 2, or 3 words.
"""
def aug_sent(origin_sent, word_dict, thesaurus_all, sub_threshold=10):
    """Generate augmented variants of a sentence by synonym substitution.

    The sentence is segmented with ``jieba.cut``. Every segment that is a key
    of ``word_dict`` is substitutable; its candidates are the comma-separated
    entries of ``thesaurus_all[word_dict[segment]]`` other than the segment
    itself.

    Args:
        origin_sent: Sentence to augment.
        word_dict: Maps a word to its index into ``thesaurus_all``.
        thesaurus_all: Indexable collection of comma-separated synonym
            strings.
        sub_threshold: Maximum number of distinct substitutable words; above
            this no augmentation is attempted.

    Returns:
        A list of augmented sentences. Empty when no word is substitutable or
        when the number of distinct substitutable words exceeds
        ``sub_threshold``.
    """
    tokens = list(jieba.cut(origin_sent))

    # Map each substitutable token to its synonyms, excluding the token itself.
    candidates = {}
    for token in tokens:
        if token not in word_dict:
            continue
        synonyms = thesaurus_all[word_dict[token]].split(',')
        candidates[token] = [syn for syn in synonyms if syn != token]

    n_candidates = len(candidates)
    if n_candidates > sub_threshold or n_candidates == 0:
        return []

    ratio = len(tokens) / float(n_candidates)
    # If (sentence word count / substitutable word count) > 3, at least three
    # words must be replaced for the result to count as a new sentence.
    if ratio > 3:
        return sub3(tokens, candidates)
    # If 3 >= ratio > 2, at least two words must be replaced.
    if ratio > 2:
        return sub2(tokens, candidates)
    return sub1(tokens, candidates)
"""
Input:
origin_sent_list: the original sentence segmented into a list of words
s: word that will be substituted
sub_list: substitution choices
Output:
new_sents: generated new sentences in list
"""
def sub(origin_sent_list, s, sub_list):
    """Produce one new word list per (occurrence of ``s``, replacement) pair.

    Args:
        origin_sent_list: Sentence as a list of words.
        s: Word to be replaced.
        sub_list: Replacement candidates for ``s``.

    Returns:
        A list of word lists. Each is a shallow copy of ``origin_sent_list``
        with exactly one occurrence of ``s`` replaced by one candidate.
        Empty when ``s`` does not occur or ``sub_list`` is empty.
    """
    variants = []
    for position, token in enumerate(origin_sent_list):
        if token != s:
            continue
        for replacement in sub_list:
            variant = copy.copy(origin_sent_list)
            variant[position] = replacement
            variants.append(variant)
    return variants
"""
substitute only one word with all choices
"""
# sub1(origin_sent_list, choices)
def sub1(origin_sent_list, choices):
    """Build sentences in which exactly one substitutable word is replaced.

    Args:
        origin_sent_list: Sentence as a list of words.
        choices: Maps each substitutable word to its list of replacements.

    Returns:
        A list of sentences (strings), one per combination of word
        occurrence and replacement, in the iteration order of ``choices``.
    """
    variants = []
    for word, replacements in choices.items():
        variants.extend(sub(origin_sent_list, word, replacements))
    return [''.join(tokens) for tokens in variants]
"""
substitute two words
"""
# sub2(origin_sent_list, choices)
def sub2(origin_sent_list, choices):
    """Build sentences in which two different substitutable words are replaced.

    Every unordered pair of keys in ``choices`` is considered: the first word
    is replaced with each of its candidates, then the second word is replaced
    in each of those intermediate sentences.

    Args:
        origin_sent_list: Sentence as a list of words.
        choices: Maps each substitutable word to its list of replacements.

    Returns:
        A list of sentences (strings). Empty when ``choices`` has fewer than
        two keys.
    """
    keys = list(choices)
    pair_variants = []
    for i, first in enumerate(keys):
        # Independent of the second word, so compute it once per first word.
        first_variants = sub(origin_sent_list, first, choices[first])
        # The slice runs to the end of the key list so that the last key is
        # reachable as the second word of a pair.
        for second in keys[i + 1:]:
            for sent in first_variants:
                pair_variants.extend(sub(sent, second, choices[second]))
    return [''.join(tokens) for tokens in pair_variants]
"""
substitute three words
"""
# sub3(origin_sent_list, choices)
def sub3(origin_sent_list, choices):
    """Build sentences in which three different substitutable words are replaced.

    Every unordered triple of keys in ``choices`` is considered exactly once:
    the first word is replaced, then the second word in each of those
    results, then the third word in each of those.

    Args:
        origin_sent_list: Sentence as a list of words.
        choices: Maps each substitutable word to its list of replacements.

    Returns:
        A list of sentences (strings). Empty when ``choices`` has fewer than
        three keys.
    """
    keys = list(choices)
    key_count = len(keys)
    triple_variants = []
    for i, first in enumerate(keys):
        first_variants = sub(origin_sent_list, first, choices[first])
        # The second word stops one short of the end so that at least one
        # key remains available as the third word.
        for j in range(i + 1, key_count - 1):
            second = keys[j]
            # Rebuilt for every (first, second) pair; carrying results over
            # from earlier pairs would emit duplicate and mismatched triples.
            second_variants = []
            for sent in first_variants:
                second_variants.extend(sub(sent, second, choices[second]))
            # The third word runs through the last key inclusive.
            for third in keys[j + 1:]:
                for sent in second_variants:
                    triple_variants.extend(sub(sent, third, choices[third]))
    return [''.join(tokens) for tokens in triple_variants]