forked from danlou/LMMS
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmwtok.py
More file actions
27 lines (22 loc) · 1001 Bytes
/
mwtok.py
File metadata and controls
27 lines (22 loc) · 1001 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def calc_idx_map_abs(tokens_mw):
    """Map every multiword token index to absolute whitespace token indices.

    Each entry of ``tokens_mw`` is split on whitespace and the resulting
    pieces are numbered consecutively across the whole input, so features
    attached to a multiword token can be matched with the whitespace tokens
    it spans.

    For example:
        calc_idx_map_abs(["single", "multi word", "token"])
    will return:
        [[0, [0]], [1, [1, 2]], [2, [3]]]

    :param tokens_mw: a list of multiword tokens
    :returns: a list of two-element lists; each holds the index of the
        multiword token followed by the list of indices of its whitespace
        tokens (empty when the token contains no non-whitespace text)
    :rtype: list
    """
    mapping = []
    offset = 0  # whitespace tokens consumed by all previous multiword tokens
    for mw_pos, mw_token in enumerate(tokens_mw):
        width = len(mw_token.split())
        mapping.append([mw_pos, list(range(offset, offset + width))])
        offset += width
    return mapping