-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtinynormalizer.py
More file actions
70 lines (53 loc) · 1.83 KB
/
tinynormalizer.py
File metadata and controls
70 lines (53 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A tiny tokenizer
for multi-lingual data
author: Barbara Plank
# Note: needs the regex package, as the package "re" does not capture indic vowel markers in \w
# this is a known bug in "re"; cf. http://stackoverflow.com/questions/12746458/
"""
import re
import regex
import sys
import argparse
# Module-level CLI definition; the arguments are parsed in main().
parser = argparse.ArgumentParser(description="""simple tokenizer, inspired by Christopher Pott's twitter tokenizer; expects one sentence per line""")
parser.add_argument("infile", help="file with one sentence per line")
class TinyNormalizer(object):
    """
    TinyNormalizer

    Maps Twitter-style tokens to placeholders: @usernames (including the
    ".@name" reply convention) become <USER>, and tokens beginning with
    "http" (any case) become <URL>. All other tokens pass through unchanged.
    """

    def normalize(self, word):
        """Return the placeholder for *word*, or *word* itself if none applies."""
        # URL check is case-insensitive; username check covers "@x" and ".@x".
        if word.lower().startswith("http"):
            return "<URL>"
        if word.startswith(("@", ".@")):
            return "<USER>"
        return word

    def tokenize(self, line):
        """
        return list of tokens
        """
        collapsed = regex.sub(r"\s+", " ", line)  # collapse runs of whitespace
        return [self.normalize(token) for token in collapsed.split()]
def main():
    """Normalize each line of the input file and print the result to stdout.

    Reads the file named by the CLI argument one sentence per line,
    tokenizes/normalizes it with TinyNormalizer, and writes one output
    line per input line. Lines that cannot be printed (e.g. stdout
    encoding errors) are reported to stderr instead of aborting the run.
    """
    args = parser.parse_args()
    tt = TinyNormalizer()
    # Context manager guarantees the file is closed; explicit UTF-8 so the
    # multi-lingual input does not depend on the locale's default encoding.
    with open(args.infile, encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            outline = " ".join(tt.tokenize(line))
            try:
                print(outline)
            except Exception:  # best-effort: report the offending line, keep going
                print("==== CHECK FILE! ====", args.infile, file=sys.stderr)
                print("+"*20, file=sys.stderr)
                print("in: >>{}<<".format(line), file=sys.stderr)
                print("out: >>{}<<".format(outline), file=sys.stderr)
if __name__ == "__main__":
    # The script uses Python-3-only syntax (print(..., file=...)); fail fast
    # on Python 2 with a diagnostic on stderr and a nonzero exit status
    # (bare exit() would report success to the shell).
    if sys.version_info < (3, 0):
        print("needs python 3", file=sys.stderr)
        sys.exit(1)
    main()