forked from neubig/util-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcombine-predicate.pl
More file actions
executable file
·123 lines (114 loc) · 3.26 KB
/
combine-predicate.pl
File metadata and controls
executable file
·123 lines (114 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/perl
#
# What's this?
# ------------
#
# Combine segmented predicates in Japanese sentences into manageable
# units for ease of statistical machine translation tasks such as word
# alignment, rule extraction and other subsequent processes.
#
# Usage and Data Format
# ---------------------
#
# Currently, the input is assumed to be a space-separated sequence of triplets
# (word, part-of-speech, reading of the word). The three fields of each triplet
# must be delimited by a slash "/".
#
# Here is the sample input and output of this script.
#
# $ echo "これ/代名詞/これ は/助詞/は ペン/名詞/ぺん で/助動詞/で あ/動詞/あ る/語尾/る" | ./combine-predicate.pl
# word=これ pos=代名詞 pron=これ
# word=は pos=助詞 pron=は
# word=ペン pos=名詞 pron=ぺん
# word=で pos=助動詞 pron=で
# word=あ pos=動詞 pron=あ
# word=る pos=語尾 pron=る
# これ は ペン である
#
#
# This kind of input data is what KyTea (http://www.phontron.com/kytea/)
# produces as its output format. For example,
#
# $ echo "これはペンである" | kytea | ./combine-predicate.pl
# これ は ペン である
#
use strict;
use warnings;
use utf8;
use Getopt::Long;
use List::Util qw(sum min max shuffle);
# Read and write all three standard streams through the :utf8 layer; the
# patterns in this script are written as UTF-8 literals (see "use utf8").
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# --verb: when set, every token whose part-of-speech is 動詞 becomes a merge
# candidate (see iscombine), not only the short stems listed there.
my $VERB = 0;
my $usage = "Usage: $0 [--verb] [GLUE]\n";

# GetOptions returns false when it meets an option it does not know; stop
# instead of silently running with a half-parsed command line.
GetOptions(
    "verb" => \$VERB,
) or do {
    print STDERR $usage;
    exit 1;
};

# Optional single positional argument: the string inserted between the words
# that get merged. Defaults to the empty string (plain concatenation).
my $GLUE = "";
if (@ARGV == 1) {    # numeric comparison of the argument count
    $GLUE = $ARGV[0];
} elsif (@ARGV > 1) {
    print STDERR $usage;
    exit 1;
}
# Break one "word/pos/reading" token into its three fields.
#
# E.g., "これ/代名詞/これ" => ("これ", "代名詞", "これ"),
#       "ペン/名詞/ぺん"   => ("ペン", "名詞", "ぺん")
#
# The last two fields may not contain a slash; the word field is matched
# greedily, so any extra slashes end up inside the word.
# Dies with the offending token as the message when the token does not
# have this shape.
sub wpp {
    my ($triplet) = @_;
    if ($triplet =~ /^(.+)\/([^\/]+)\/([^\/]+)$/) {
        return ($1, $2, $3);
    }
    die $triplet;
}
# TODO: Document the rationale behind these heuristic rules.
#
# Decide whether a "word/pos/reading" token is a candidate for merging with
# its neighbours. Returns 1 when any rule below fires, 0 otherwise.
# Dies (via wpp) when the token is malformed.
sub iscombine {
    my ($token) = @_;
    my ($surface, $tag) = wpp($token);

    # With --verb, every token tagged 動詞 qualifies.
    return 1 if $VERB and ($tag =~ /^動詞$/);

    # Tokens tagged 語尾 or 助動詞 always qualify.
    return 1 if $tag =~ /^(語尾|助動詞)$/;

    # The words て / ば when tagged 助詞.
    return 1 if ($surface =~ /^(て|ば)$/) and ($tag =~ /^(助詞)$/);

    # The word な when tagged 形容詞.
    return 1 if ($surface =~ /^(な)$/) and ($tag =~ /^(形容詞)$/);

    # The one-character words さ / し / す / あ / い when tagged 動詞.
    return 1 if ($surface =~ /^(さ|し|す|あ|い)$/) and ($tag =~ /^(動詞)$/);

    return 0;
}
# Combine predicates, and return the sequence of words.
#
# Arguments:
#   $harr - array ref of words
#   $carr - array ref of 0/1 flags, parallel to $harr (1 = merge candidate)
# Returns: a reference to a new array of words; the inputs are not modified.
#
# A word is appended to the previous output word (with $GLUE in between)
# when both its own flag and the flag of the word right before it are 1.
#
# For example,
#   $harr = ["これ", "は", "ペン", "で", "あ", "る"]
#   $carr = [0, 0, 0, 1, 1, 1]
#   => ["これ", "は", "ペン", "である"]
#
# Note that the words ["で", "あ", "る"] were merged into
# "である".
sub combine {
    my ($harr, $carr) = @_;

    # Empty sentence: return an empty list. Seeding the result with
    # $$harr[0] here would produce [undef] and make the caller warn about
    # an uninitialized value when it prints the result.
    return [] unless @{$harr};

    my @newarr = ($$harr[0]);
    foreach my $i (1 .. $#{$harr}) {
        if (($$carr[$i] == 1) and ($$carr[$i-1] == 1)) {
            # Extend the current merged unit.
            $newarr[-1] .= $GLUE . $$harr[$i];
        } else {
            # Start a new output word.
            push @newarr, $$harr[$i];
        }
    }
    return \@newarr;
}
# Main loop: read one sentence per line from STDIN, merge its predicate
# tokens, and print the resulting words separated by single spaces.
while (my $line = <STDIN>) {
    chomp $line;

    # Turn backslash-escaped spaces into plain spaces before tokenizing.
    $line =~ s/\\ / /g;

    # Split on runs of ASCII spaces only. A line that starts with a space
    # makes split return an empty first field, which wpp would die on, so
    # empty fields are dropped.
    my @warr = grep { length } split(/ +/, $line);

    # Blank line in, blank line out; nothing to parse or merge.
    if (!@warr) {
        print "\n";
        next;
    }

    my @harr = map { (wpp($_))[0] } @warr;      # surface words only
    my @carr = map { iscombine($_) } @warr;     # 0/1 merge flags
    print join(" ", @{ combine(\@harr, \@carr) }), "\n";
}