maple/maple.py at master · euirim/maple · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
"""Maple: automatically summarizes given text using
a modified version of the TextRank algorithm."""

import sys
import codecs
import pickle
import string

import nltk
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from tests.tests_visual import test_summarizer, tests_simple, tests_diverse
from tests.tests_field import generate_test_files
from tests import tests_alpha


def field_test():
    """Call the field-test ``generate_test_files`` with its hard-coded path."""
    selected_dir = "~/Documents/summ_test_files/selected"
    generate_test_files(selected_dir)

def alpha_test():
    """Call ``tests_alpha.generate_test_files`` with the ``"output"`` argument."""
    target = "output"
    tests_alpha.generate_test_files(target)


def test(simple=True):
    """Run one visual test suite, framed by a start and an end banner.

    simple: when truthy, ``tests_simple`` is run; otherwise
        ``tests_diverse`` is run.
    """
    # Pick the banner and the suite together; only the chosen suite name is
    # looked up, exactly as in a plain if/else.
    banner, run_suite = (
        ("********* MAPLE'S SIMPLE TESTS *********\n", tests_simple)
        if simple
        else ("********* MAPLE'S DIVERSE TESTS *********\n", tests_diverse)
    )

    print(banner)
    run_suite()

    print("********* TESTS COMPLETED *********")


def train(filename, stem=True):
    """
    Train a Punkt sentence tokenizer and a TF-IDF vectorizer on the given
    text file and pickle both into the ``data`` directory.

    filename: path of the text file (decoded as UTF-8) used as unsupervised
        training data.
    stem: when true, the text is word-tokenized, POS-tagged and lemmatized
        with WordNet before the vectorizer is fitted, and the results are
        written to ``data/punkt_stem.pk`` and ``data/tfidf_stem.pk``.
        When false, the raw text is used and the results are written to
        ``data/punkt.pk`` and ``data/tfidf.pk``.

    The sentence tokenizer is always trained on the raw (un-lemmatized)
    text. Progress messages are printed to stdout. Returns None.
    """
    # Read inside a context manager so the file handle is released even if
    # decoding fails part-way through.
    with codecs.open(filename, "rb", "utf8") as training_file:
        text = training_file.read()

    abbreviations = [
            "u.s.a", "fig", "gov", "sen", "jus", "jdg", "rep", "pres",
            "mr", "mrs", "ms", "h.r", "s.", "h.b", "s.b", "u.k", "u.n",
            "u.s.s.r", "u.s",
    ]

    print("TRAINING SENTENCE TOKENIZER...")
    pst = PunktSentenceTokenizer()
    pst.train(text.replace("\n\n", " "))
    # add extra abbreviations on top of whatever training discovered
    pst._params.abbrev_types.update(abbreviations)
    print("TRAINED ABBREVIATIONS: \n{}".format(pst._params.abbrev_types))

    # stemming
    if stem:
        wnl = WordNetLemmatizer()
        print("WORD TOKENIZING TEXT")
        tokens = nltk.word_tokenize(text)

        # pos tagging
        print("POS TAGGING TEXT...")
        tagged_tokens = pos_tag(tokens)

        print("STEMMING TRAINING TEXT...")
        for i, (word, tag) in enumerate(tagged_tokens):
            # Map the Penn Treebank tag onto a WordNet part of speech.
            # Singular proper nouns (NNP) are deliberately not in the noun
            # list, so they are left exactly as they appear in the text.
            position = None
            if tag in ("NN", "NNS", "NNPS"):
                position = wordnet.NOUN
            elif "JJ" in tag:
                position = wordnet.ADJ
            elif "VB" in tag:
                position = wordnet.VERB
            elif "RB" in tag:
                position = wordnet.ADV

            if position:
                tokens[i] = wnl.lemmatize(word, position)

            # sparse progress report for very large corpora
            if i % 1000000 == 0:
                print("TOKEN: {}".format(i))

        # Re-join the tokens into running text: tokens found in
        # string.punctuation are glued to the previous token, everything
        # else is preceded by a single space.
        text = "".join([("" if tok in string.punctuation else " ")+tok
                for tok in tokens])
        text = text.strip()

    print("TRAINING VECTORIZER...")
    tfv = TfidfVectorizer()
    # each sentence produced by the trained tokenizer is one document
    tfv.fit(pst.tokenize(text))

    # export trained tokenizer + vectorizer
    print("EXPORTING TRAINED TOKENIZER + VECTORIZER...")
    if stem:
        punkt_out_filename = "data/punkt_stem.pk"
        tfidf_out_filename = "data/tfidf_stem.pk"
    else:
        punkt_out_filename = "data/punkt.pk"
        tfidf_out_filename = "data/tfidf.pk"

    with open(punkt_out_filename, "wb") as pst_out:
        pickle.dump(pst, pst_out)
    with open(tfidf_out_filename, "wb") as tfv_out:
        pickle.dump(tfv, tfv_out)

    print("EXPORTING COMPLETED")


def main(argv):
    """Command-line entry point; returns the process exit status.

    argv: the command-line arguments without the program name. Accepted
        forms are ``-t <int>`` (run the test suites: non-zero selects the
        simple tests, zero the diverse tests) and
        ``<filename> <max_units> <-p or -s>``.

    Returns 0 on success and 1 when the arguments are missing or malformed
    or when help (``-h``) is requested; in those cases a message is printed.
    """
    usage = ("./maple.py (optional -t 1 or 0) <filename>"
             " <max_units> <units (-p or -s)>")

    # No arguments at all: show usage instead of failing on argv[0].
    if not argv:
        print(usage)
        return 1

    if argv[0] == "-t":
        # Only the flag parsing is guarded, so a failure inside the test
        # suites themselves is not misreported as a bad argument.
        try:
            simple = bool(int(argv[1]))
        except (IndexError, ValueError):
            print("Enter 1 (simple) or 0 (diverse) as second parameter"
                  " for testing.\n")
            return 1
        test(simple)
        return 0

    if len(argv) != 3 or argv[0] == "-h":
        print(usage)
        return 1

    # NOTE(review): argv[1] (max_units) and argv[2] (-p / -s) are only
    # checked for presence. The summarizer is called with the filename
    # alone, as before; confirm test_summarizer's signature before
    # forwarding the other two values.
    filename = argv[0]
    test_summarizer(filename)

    return 0


if __name__ == "__main__":
    # Drop the program name and use main()'s return value as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)