// SEMST2012/token_inventory.groovy at master · jimwhite/SEMST2012 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
// Root directory of the corpus data, resolved against the working directory.
data_dir = new File('data')

// Development and training files inside the SEM-2012-SharedTask-CD-SCO directory.
scope_dir = new File(data_dir, 'SEM-2012-SharedTask-CD-SCO')
dev_file = new File(scope_dir, 'SEM-2012-SharedTask-CD-SCO-dev.txt')
training_file = new File(scope_dir, 'SEM-2012-SharedTask-CD-SCO-training.txt')

// The last assignment wins: the script currently reads the wisteria2 evaluation file.
// Remove or comment out the second line to process the training file instead.
input_file = training_file
input_file = new File(data_dir, 'eval-2012-02-20/wisteria2.all')

// Base name for output files: the input name without a trailing ".txt".
// (A slashy string passed to String.minus is a literal, and minus removes the first
// occurrence anywhere in the name, so anchor the removal to the end explicitly.)
input_basename = input_file.name.replaceFirst(/\.txt$/, '')

//plain_text_file = new File(data_dir, 'dev_sentences.txt')
//
//pet_dir = new File(data_dir, "erg.1111.${plain_text_file.name - /.txt/}.12-02-12.pet")
//
//println pet_dir.name

//println pet_dir.listFiles().sort { it.name as Integer }

// Headline of a block in a PET export file, as matched by get_plain_text_from_export_file:
//   [n] (i of m) {k} `sentence text'
// Groups 1-4 capture the four numbers, group 5 the text between the back-quote and the quote.
headline_pattern = /^\[(\d+)\]\s+\((\d+)\s+of\s+(\d+)\)\s+\{(\d+)\}\s+`(.*)'$/

// Self-check: a known headline must match the whole pattern.
assert (/[1] (1 of 1) {1} `1 . The Singular Experience of Mr . John Scott Eccles'/ ==~ headline_pattern)

// First line of a DMRS XML block: optional whitespace, '<', then "dmrs" in any letter case.
dmrs_first_line_pattern = /\s*<\s*[dD][mM][rR][sS].*$/

// Self-check: an opening dmrs tag with attributes must match the whole pattern.
assert (/<dmrs cfrom='-1' cto='-1'>/ ==~ dmrs_first_line_pattern)

report_file = new File(input_basename + '-tokens.html')

// Write an HTML report of every token in input_file that consists neither solely of
// letters/hyphens nor solely of digits. Each table row shows the token, its number of
// occurrences and, for tokens with at most 40 occurrences, the distinct sentences it occurs in.
report_file.withPrintWriter { printer ->
    new groovy.xml.MarkupBuilder(printer).html {
        body {
            h1 input_file.name
            input_file.withReader { reader ->
                // BlankLineTerminatedReader is defined elsewhere in the project; presumably
                // next() advances to the following blank-line terminated block and
                // readLines() returns that block's lines -- confirm against the class.
                def delimitedReader = new BlankLineTerminatedReader(reader)

                def INTERWORDSEP = ' '

                // token -> one context string per occurrence. Duplicates are kept on purpose
                // so that size() is the occurrence count.
                def token_inventory = [:].withDefault { [] }

                while (delimitedReader.next()) {
                    List<String> lines = delimitedReader.readLines()

                    // A block without lines has no first word to label the sentence with;
                    // skip it instead of failing on words[0] below.
                    if (!lines) continue

                    // One map per tab-separated token line; columns from the 8th onwards are kept as labels.
                    List<Map> words = lines.collect { line ->
                        def columns = line.split(/\t/)
                        def (chap_name, sent_indx, tok_indx, word, lemma, pos, syntax) = columns
                        def labels = columns.size() < 8 ? [] : columns[7..-1].collect { it.trim() }
                        [chap_name:chap_name, sent_indx:sent_indx as Integer, tok_indx:tok_indx as Integer, word:word, lemma:lemma, pos:pos, syntax:syntax, labels:labels]
                    }

                    def plain_text = words.collect { it.word }.join(INTERWORDSEP)

                    // The context is identical for every token of the sentence, so build it once.
                    def context = "${words[0].chap_name} s${words[0].sent_indx} : $plain_text".toString()

                    // Record the sentence for every token that is not purely letters/hyphens or purely digits.
                    words.each { if (!(it.word =~ /^[-\p{L}]+$/ || it.word =~ /^\p{N}+$/)) token_inventory[it.word] << context }
                }

                if (token_inventory.size()) {
                    h3 'Tokens other than /^[-\\p{L}]+$/ || /^\\p{N}+$/'
                    table(border:1) {
                        // Rarest tokens first; ties are ordered by the token itself.
                        token_inventory.entrySet().sort { a, b -> a.value.size() <=> b.value.size() ?: a.key <=> b.key }.each { e ->
                            tr {
                                td(align:'center', e.key)
                                td(align:'right', e.value.size())
                                if (e.value.size() > 40) {
                                    // Too many occurrences to list: leave the sentence cell empty.
                                    td()
                                } else {
                                    td {
                                        // unique() de-duplicates in place; the count cell above has
                                        // already been emitted, so it still shows every occurrence.
                                        e.value.unique().each { p(it) }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}


// Returns the sentence text (group 5 of headline_pattern) from the first block of
// pet_export_file whose first line matches headline_pattern, or null if no block does.
String get_plain_text_from_export_file(File pet_export_file)
{
    def plain_text = null

    pet_export_file.withReader { reader ->
        def blocks = new BlankLineTerminatedReader(reader)

        while (blocks.hasNext()) {
            blocks.next()
            List<String> block = blocks.readLines()

            // Nothing to inspect in an empty block.
            if (block.size() == 0) continue

            def headline = block[0] =~ headline_pattern

            if (headline.matches()) {
                // Only the quoted text is needed; the numeric groups 1-4 are ignored.
                plain_text = headline.group(5)
                break
            }
        }
    }

    plain_text
}

// Returns the XmlSlurper parse of the first block of pet_export_file whose first line
// matches dmrs_first_line_pattern, or null if no block does.
def get_dmrs_from_export_file(File pet_export_file)
{
    def parsed_dmrs = null

    pet_export_file.withReader { reader ->
        def blocks = new BlankLineTerminatedReader(reader)

        while (blocks.hasNext()) {
            blocks.next()
            List<String> block = blocks.readLines()

            // Nothing to inspect in an empty block.
            if (block.size() == 0) continue

            if ((block[0] =~ dmrs_first_line_pattern).matches()) {
                // The block's lines, rejoined with newlines, are parsed as one XML document.
                parsed_dmrs = new XmlSlurper().parseText(block.join('\n'))
                break
            }
        }
    }

    parsed_dmrs
}

// Looks up col in every element of words (row[col]) and joins the values with joiner.
String column2string(List words, col, joiner = ' ')
{
    def cells = []

    for (row in words) {
        cells << row[col]
    }

    cells.join(joiner)
}

// Builds one s-expression string from the rows of words: in each row's tree column
// (7th value) the '*' placeholder is replaced by " (pos word) " -- or by
// " (pos (word lemma)) " when word and lemma differ -- and the results are concatenated.
String columns2sexpstring(List<String[]> words)
{
    def sexp = new StringBuilder(words.size() * 20)

    for (columns in words) {
        // Only word, lemma, pos and tree are used; the first three columns are ignored.
        def (chap, line, i, word, lemma, pos, tree) = columns
        def leaf = (word == lemma) ? word : "($word $lemma)"
        sexp.append(tree.replace('*', " ($pos ${leaf}) "))
    }

    sexp.toString()
}