from fastNLP.io import CSVLoader
from fastNLP import Vocabulary
from fastNLP import Const
import numpy as np
import fitlog
import pickle
import os
from fastNLP import cache_results
from torch import embedding
# from fastNLP.embeddings import StaticEmbedding
from fastNLP_module import StaticEmbedding
from utils import writeList2File
from fastNLP.io.loader import ConllLoader
from utils import get_bigrams
from functools import partial
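
# The helpers below are imported from utils. A minimal sketch of the assumed behaviour of
# get_bigrams (kept as a comment only; the real implementation lives in utils.py):
#
#     def get_bigrams(chars):
#         # pair every character with its successor; the last position is padded with '<end>'
#         return [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<end>'])]
#
# e.g. ['复', '旦', '大', '学'] -> ['复旦', '旦大', '大学', '学<end>'].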
@cache_results(_cache_fp='cache/ontonotes4ner',_refresh=False)
def load_ontonotes4ner(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True,train_clip=False,
char_min_freq=1,bigram_min_freq=1,only_train_min_freq=0):
train_path = os.path.join(path,'train.char.bmes{}'.format('_clip' if train_clip else ''))
dev_path = os.path.join(path,'dev.char.bmes')
test_path = os.path.join(path,'test.char.bmes')
loader = ConllLoader(['chars','target'])
train_bundle = loader.load(train_path)
dev_bundle = loader.load(dev_path)
test_bundle = loader.load(test_path)
datasets = dict()
datasets['train'] = train_bundle.datasets['train']
datasets['dev'] = dev_bundle.datasets['train']
datasets['test'] = test_bundle.datasets['train']
datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams')
datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['train'].add_seq_len('chars')
datasets['dev'].add_seq_len('chars')
datasets['test'].add_seq_len('chars')
char_vocab = Vocabulary()
bigram_vocab = Vocabulary()
label_vocab = Vocabulary()
print(datasets.keys())
print(len(datasets['dev']))
print(len(datasets['test']))
print(len(datasets['train']))
char_vocab.from_dataset(datasets['train'],field_name='chars',
no_create_entry_dataset=[datasets['dev'],datasets['test']])
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',
no_create_entry_dataset=[datasets['dev'],datasets['test']])
label_vocab.from_dataset(datasets['train'],field_name='target')
if index_token:
char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='chars',new_field_name='chars')
bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='bigrams',new_field_name='bigrams')
label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='target',new_field_name='target')
vocabs = {}
vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
embeddings = {}
if char_embedding_path is not None:
char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,
min_freq=char_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['char'] = char_embedding
if bigram_embedding_path is not None:
bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01,
min_freq=bigram_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['bigram'] = bigram_embedding
return datasets,vocabs,embeddings
@cache_results(_cache_fp='cache/resume_ner',_refresh=False)
def load_resume_ner(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True,
char_min_freq=1,bigram_min_freq=1,only_train_min_freq=0):
train_path = os.path.join(path,'train.char.bmes')
dev_path = os.path.join(path,'dev.char.bmes')
test_path = os.path.join(path,'test.char.bmes')
loader = ConllLoader(['chars','target'])
train_bundle = loader.load(train_path)
dev_bundle = loader.load(dev_path)
test_bundle = loader.load(test_path)
datasets = dict()
datasets['test'] = test_bundle.datasets['train']
datasets['train'] = train_bundle.datasets['train']
datasets['dev'] = dev_bundle.datasets['train']
    print(datasets)
    # apply_field() applies a function to one field of every instance and writes the result
    # into new_field_name; the calls below derive the 'bigrams' field from 'chars'.
datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams')
datasets['train'].add_seq_len('chars')
datasets['dev'].add_seq_len('chars')
datasets['test'].add_seq_len('chars')
char_vocab = Vocabulary()
bigram_vocab = Vocabulary()
label_vocab = Vocabulary()
print(datasets.keys())
print(len(datasets['dev']))
print(len(datasets['test']))
print(len(datasets['train']))
char_vocab.from_dataset(datasets['train'],field_name='chars',
no_create_entry_dataset=[datasets['dev'],datasets['test']] )
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',
no_create_entry_dataset=[datasets['dev'],datasets['test']])
label_vocab.from_dataset(datasets['train'],field_name='target')
if index_token:
char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='chars',new_field_name='chars')
bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='bigrams',new_field_name='bigrams')
label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='target',new_field_name='target')
vocabs = {}
vocabs['char'] = char_vocab
vocabs['label'] = label_vocab
vocabs['bigram'] = bigram_vocab
embeddings = {}
if char_embedding_path is not None:
char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,
min_freq=char_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['char'] = char_embedding
if bigram_embedding_path is not None:
bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01,
min_freq=bigram_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['bigram'] = bigram_embedding
return datasets,vocabs,embeddings
@cache_results(_cache_fp='need_to_defined_fp',_refresh=False)
def equip_chinese_ner_with_skip(datasets,vocabs,embeddings,w_list,word_embedding_path=None,
word_min_freq=1,only_train_min_freq=0):
from utils_ import Trie,get_skip_path
w_trie = Trie()
for w in w_list:
w_trie.insert(w)
# for k,v in datasets.items():
# v.apply_field(partial(get_skip_path,w_trie=w_trie),'chars','skips')
def skips2skips_l2r(chars,w_trie):
        '''
        :param chars: list of characters of one sentence
        :param w_trie: Trie built from the lexicon word list
        :return: skips_l2r, for each end position the list of [start, word] lexicon matches
        '''
# print(lexicons)
# print('******')
lexicons = get_skip_path(chars,w_trie=w_trie)
# max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0
result = [[] for _ in range(len(chars))]
for lex in lexicons:
s = lex[0]
e = lex[1]
w = lex[2]
result[e].append([s,w])
return result
def skips2skips_r2l(chars,w_trie):
        '''
        :param chars: list of characters of one sentence
        :param w_trie: Trie built from the lexicon word list
        :return: skips_r2l, for each start position the list of [end, word] lexicon matches
        '''
# print(lexicons)
# print('******')
lexicons = get_skip_path(chars,w_trie=w_trie)
# max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0
result = [[] for _ in range(len(chars))]
for lex in lexicons:
s = lex[0]
e = lex[1]
w = lex[2]
result[s].append([e,w])
return result
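
    # Illustrative example (a sketch assuming get_skip_path returns inclusive
    # [start, end, word] spans and the word list contains '南京' and '南京市'):
    #   chars = ['南', '京', '市', '长']
    #   skips2skips_l2r(chars, w_trie) -> [[], [[0, '南京']], [[0, '南京市']], []]
    #   skips2skips_r2l(chars, w_trie) -> [[[1, '南京'], [2, '南京市']], [], [], []]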
for k,v in datasets.items():
v.apply_field(partial(skips2skips_l2r,w_trie=w_trie),'chars','skips_l2r')
for k,v in datasets.items():
v.apply_field(partial(skips2skips_r2l,w_trie=w_trie),'chars','skips_r2l')
# print(v['skips_l2r'][0])
word_vocab = Vocabulary()
word_vocab.add_word_lst(w_list)
vocabs['word'] = word_vocab
for k,v in datasets.items():
v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source')
v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word')
for k,v in datasets.items():
v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source')
v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word')
for k,v in datasets.items():
v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count')
v.apply_field(lambda x:
list(map(lambda y:
list(map(lambda z:word_vocab.to_index(z),y)),x)),
'skips_l2r_word',new_field_name='skips_l2r_word')
v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back')
v.apply_field(lambda x:
list(map(lambda y:
list(map(lambda z:word_vocab.to_index(z),y)),x)),
'skips_r2l_word',new_field_name='skips_r2l_word')
if word_embedding_path is not None:
word_embedding = StaticEmbedding(word_vocab,word_embedding_path,word_dropout=0)
embeddings['word'] = word_embedding
vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
field_name='chars', new_field_name='chars')
vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
field_name='bigrams', new_field_name='bigrams')
vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
field_name='target', new_field_name='target')
return datasets,vocabs,embeddings
'''
description: yangjie_word_char_mix.txt stores an embedding for every word, which raises a few questions:
    01. Where does this embedding come from?
    02. If other words are used instead, how should their embeddings be generated?
    03. Texts from different domains call for different lexicons; building those is up to us.
param {type}
return {type}
'''
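
# Each line of the embedding file is assumed to be space separated, the word first and its
# vector components after, e.g. (values are placeholders):
#   中国 0.103 -0.271 0.534 ...
# load_yangjie_rich_pretrain_word_list keeps only the first token of every line.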
@cache_results(_cache_fp='cache/load_yangjie_rich_pretrain_word_list',_refresh=False)
def load_yangjie_rich_pretrain_word_list(embedding_path,drop_characters=True):
    # Read the pretrained embedding file (assumed to be utf-8); the first token of each line is the word.
    with open(embedding_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
w_list = []
for line in lines:
splited = line.strip().split(' ')
w = splited[0]
w_list.append(w)
if drop_characters:
w_list = list(filter(lambda x:len(x) != 1, w_list))
# writeList2File(w_list,"wordsList.txt")
return w_list
@cache_results(_cache_fp='cache/ontonotes4ner',_refresh=False)
def load_toy_ner(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True,train_clip=False):
train_path = os.path.join(path,'toy_train.bmes')
dev_path = os.path.join(path,'toy_dev.bmes')
test_path = os.path.join(path,'toy_test.bmes')
loader = ConllLoader(['chars','target'])
train_bundle = loader.load(train_path)
dev_bundle = loader.load(dev_path)
test_bundle = loader.load(test_path)
datasets = dict()
datasets['train'] = train_bundle.datasets['train']
datasets['dev'] = dev_bundle.datasets['train']
datasets['test'] = test_bundle.datasets['train']
datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams')
datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['train'].add_seq_len('chars')
datasets['dev'].add_seq_len('chars')
datasets['test'].add_seq_len('chars')
char_vocab = Vocabulary()
bigram_vocab = Vocabulary()
label_vocab = Vocabulary(padding=None,unknown=None)
print(datasets.keys())
print(len(datasets['dev']))
print(len(datasets['test']))
print(len(datasets['train']))
char_vocab.from_dataset(datasets['train'],field_name='chars',
no_create_entry_dataset=[datasets['dev'],datasets['test']] )
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',
no_create_entry_dataset=[datasets['dev'],datasets['test']])
label_vocab.from_dataset(datasets['train'],field_name='target')
if index_token:
char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='chars',new_field_name='chars')
bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='bigrams',new_field_name='bigrams')
label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
field_name='target',new_field_name='target')
vocabs = {}
vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
embeddings = {}
if char_embedding_path is not None:
char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,)
embeddings['char'] = char_embedding
if bigram_embedding_path is not None:
bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01)
embeddings['bigram'] = bigram_embedding
return datasets,vocabs,embeddings
@cache_results(_cache_fp='cache/msraner1',_refresh=False)
def load_msra_ner_1(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True,train_clip=False,
char_min_freq=1,bigram_min_freq=1,only_train_min_freq=0):
if train_clip:
train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
test_path = os.path.join(path, 'test.char.bmes_clip1')
else:
train_path = os.path.join(path,'train_dev.char.bmes')
test_path = os.path.join(path,'test.char.bmes')
loader = ConllLoader(['chars','target'])
train_bundle = loader.load(train_path)
test_bundle = loader.load(test_path)
datasets = dict()
datasets['train'] = train_bundle.datasets['train']
datasets['test'] = test_bundle.datasets['train']
datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams')
datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
datasets['train'].add_seq_len('chars')
datasets['test'].add_seq_len('chars')
char_vocab = Vocabulary()
bigram_vocab = Vocabulary()
label_vocab = Vocabulary()
print(datasets.keys())
# print(len(datasets['dev']))
print(len(datasets['test']))
print(len(datasets['train']))
char_vocab.from_dataset(datasets['train'],field_name='chars',
no_create_entry_dataset=[datasets['test']] )
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',
no_create_entry_dataset=[datasets['test']])
label_vocab.from_dataset(datasets['train'],field_name='target')
if index_token:
char_vocab.index_dataset(datasets['train'],datasets['test'],
field_name='chars',new_field_name='chars')
bigram_vocab.index_dataset(datasets['train'],datasets['test'],
field_name='bigrams',new_field_name='bigrams')
label_vocab.index_dataset(datasets['train'],datasets['test'],
field_name='target',new_field_name='target')
vocabs = {}
vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
embeddings = {}
if char_embedding_path is not None:
char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,
min_freq=char_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['char'] = char_embedding
if bigram_embedding_path is not None:
bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01,
min_freq=bigram_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['bigram'] = bigram_embedding
return datasets,vocabs,embeddings
'''
description:
    1. This function does three things:
        01. build the datasets
        02. build the vocabularies
        03. build the embeddings
    It relies on many functions and classes from fastNLP, so it helps to study that library first.
    2. Because of the @cache_results decorator, a debugger may never step into this function:
       once a cache file exists, the cached data is returned directly.
'''
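
# Hedged usage note: fastNLP's cache_results accepts call-time overrides such as _refresh and
# _cache_fp, so the cache can be bypassed or redirected without editing the decorator, e.g.
# (the path is a placeholder):
#   datasets, vocabs, embeddings = load_weibo_ner('data/weibo', _refresh=True)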
@cache_results(_cache_fp='cache/weiboNER_uni+bi', _refresh=False)
def load_weibo_ner(path,unigram_embedding_path=None,bigram_embedding_path=None,index_token=True,
char_min_freq=1,bigram_min_freq=1,only_train_min_freq=0,char_word_dropout=0.01):
    # step0. ============================= prepare the data, e.g. file paths
loader = ConllLoader(['chars','target'])
train_path = os.path.join(path,'weiboNER_2nd_conll.train_deseg')
dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')
paths = {}
paths['train'] = train_path
paths['dev'] = dev_path
paths['test'] = test_path
    # step1. ============================= build the datasets
    datasets = {}  # a dict; note that every value is an instance of fastNLP's DataSet class
    for k,v in paths.items():
        bundle = loader.load(v)
        # Why is the key always 'train'? When a single file is loaded, the bundle puts its
        # data under the 'train' key regardless of which split the file actually is.
        datasets[k] = bundle.datasets['train']
trainData = datasets['train']
print(type(trainData)) # <class 'fastNLP.core.dataset.DataSet'>
print(len(trainData)) # 1350
print(trainData)
"""
    The data in datasets['train'] looks like this:
+-----------------------------------------------------------+-----------------------------------------------------------+
| chars | target |
+-----------------------------------------------------------+-----------------------------------------------------------+
| ['科', '技', '全', '方', '位', '资', '讯', '智', '能',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ['对', ',', '输', '给', '一', '个', '女', '人', ',',... | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM... |
| ['今', '天', '下', '午', '起', '来', '看', '到', '外',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ['今', '年', '拜', '年', '不', '短', '信', ',', '就',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ['浑', '身', '酸', '疼', ',', '两', '腿', '无', '力',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ['明', '显', '紧', '张', '状', '态', '没', '出', '来',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ['三', '十', '年', '前', ',', '老', '爹', '带', '我',... | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
| ['好', '活', '动', '呀', ',', '给', '力', '的', '商',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ['人', '生', '如', '戏', ',', '导', '演', '是', '自',... | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
| ['听', '说', '小', '米', '开', '卖', '了', ',', '刚',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
| ... | ... |
+-----------------------------------------------------------+-----------------------------------------------------------+
    This is the DataSet type from fastNLP, Fudan University's open-source toolkit; detailed documentation: https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html
"""
for k,v in datasets.items():
print('{}:{}'.format(k,len(v)))
# print(*list(datasets.keys()))
    # step2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # Vocabulary usage is documented at:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
char_vocab = Vocabulary()
bigram_vocab = Vocabulary()
label_vocab = Vocabulary()
    # datasets holds exactly three key-value pairs: 'train', 'dev' and 'test'
for item in datasets.items():
print(item)
    for k,v in datasets.items():  # process every split
        # ignore the word segmentation tag attached to each character
        # apply_field() is a fastNLP DataSet method that transforms one field.
        # The two 'chars' arguments are field_name and new_field_name: the field is
        # rewritten in place, no new column is added.
        # The second call (get_bigrams, 'chars', 'bigrams') builds a new 'bigrams'
        # column from the 'chars' column.
        v.apply_field(lambda x: [w[0] for w in x],'chars','chars')
        v.apply_field(get_bigrams,'chars','bigrams')  # concatenates every two adjacent characters, i.e. the bigrams
    # datasets['train'] is a DataSet instance
char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']])
label_vocab.from_dataset(datasets['train'],field_name='target')
print('label_vocab:{}\n{}'.format(len(label_vocab),label_vocab.idx2word))
for k,v in datasets.items():
# v.set_pad_val('target',-100)
v.add_seq_len('chars',new_field_name='seq_len')
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']])
if index_token:
char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
bigram_vocab.index_dataset(*list(datasets.values()),field_name='bigrams',new_field_name='bigrams')
label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')
    # vocabs is organised the same way as datasets:
    # both are dicts whose keys map to the different kinds of data
vocabs['char'] = char_vocab
vocabs['label'] = label_vocab
vocabs['bigram'] = bigram_vocab
    # step3. ============================= build the embeddings
embeddings = {}
if unigram_embedding_path is not None:
unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
word_dropout=char_word_dropout,
min_freq=char_min_freq,only_train_min_freq=only_train_min_freq,)
embeddings['char'] = unigram_embedding
if bigram_embedding_path is not None:
bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
word_dropout=0.01,
min_freq=bigram_min_freq,only_train_min_freq=only_train_min_freq)
embeddings['bigram'] = bigram_embedding
return datasets, vocabs, embeddings
'''
description:
param {type} :
    01. index_token: whether to convert the token fields of each dataset into vocabulary indices
return {type}
'''
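
# e.g. with index_token=True the 'chars' field ['科', '技', ...] is replaced by the
# corresponding vocabulary indices such as [57, 212, ...] (values are illustrative only).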
@cache_results(_cache_fp='cache/tianChiNER_uni+bi', _refresh=False)
def load_tianchi_ner(path,
unigram_embedding_path=None, # yangjie_rich_pretrain_unigram_path
bigram_embedding_path=None,# yangjie_rich_pretrain_bigram_path
index_token=True,
char_min_freq=1,
bigram_min_freq=1,
only_train_min_freq=0,
char_word_dropout=0.01
):
    # step0. ============================= prepare the data, e.g. file paths
loader = ConllLoader(['chars','target'])
train_path = os.path.join(path,'tianchi.train')
dev_path = os.path.join(path, 'tianchi.dev')
test_path = os.path.join(path, 'tianchi.test')
paths = {}
paths['dev'] = dev_path
paths['train'] = train_path
paths['test'] = test_path
    # step1. ============================= build the datasets
    datasets = {}  # a dict; every value is an instance of fastNLP's DataSet class
for k,v in paths.items():
bundle = loader.load(v)
datasets[k] = bundle.datasets['train']
for k,v in datasets.items():
print('{}:{}'.format(k,len(v)))
    # step2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # Vocabulary usage is documented at:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
char_vocab = Vocabulary()
bigram_vocab = Vocabulary()
label_vocab = Vocabulary()
    # datasets holds exactly three key-value pairs: 'train', 'dev' and 'test'
for item in datasets.items():
print(item)
    for k,v in datasets.items():  # process every split
        # ignore the word segmentation tag attached to each character
        # apply_field() is a fastNLP DataSet method that transforms one field; the two
        # 'chars' arguments mean the field is rewritten in place, while the second call
        # (get_bigrams, 'chars', 'bigrams') creates a new 'bigrams' column from 'chars'.
        v.apply_field(lambda x: [w[0] for w in x],'chars','chars')
        v.apply_field(get_bigrams,'chars','bigrams')  # concatenates every two adjacent characters, i.e. the bigrams
    # datasets['train'] is a DataSet instance.
    # no_create_entry_dataset makes the vocabulary aware of dev/test tokens without treating
    # them as training entries, which tends to improve the final result.
    # Build the vocabularies from the training data:
char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']])
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']])
label_vocab.from_dataset(datasets['train'],field_name='target')
#char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=datasets['dev'])
#bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=datasets['dev'])
print('label_vocab:{}\n{}'.format(len(label_vocab),label_vocab.idx2word))
    for k,v in datasets.items():
        # apply len() to the 'chars' field of every instance and store the result in the new 'seq_len' field
        v.add_seq_len('chars',new_field_name='seq_len')
    # index_token controls whether each dataset column is converted to vocabulary indices.
    # *list(datasets.values()) unpacks the dict values so that all three splits are passed
    # as positional arguments to index_dataset.
if index_token:
char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
bigram_vocab.index_dataset(*list(datasets.values()),field_name='bigrams',new_field_name='bigrams')
label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')
    # vocabs is organised the same way as datasets:
    # both are dicts whose keys map to the different kinds of data
vocabs['char'] = char_vocab
vocabs['label'] = label_vocab
vocabs['bigram'] = bigram_vocab
    # step3. ============================= build the embeddings
    '''A few open questions:
    01. Aren't pretrained static embeddings said to lose contextual semantics? Why use them here?
    02. How does this embedding differ from the BertEmbedding used later?
    03. The exact role of StaticEmbedding() still needs studying.
    '''
embeddings = {}
if unigram_embedding_path is not None:
unigram_embedding = StaticEmbedding(char_vocab,
model_dir_or_name=unigram_embedding_path,
word_dropout=char_word_dropout,
min_freq=char_min_freq,
only_train_min_freq=only_train_min_freq)
        # unigram_embedding here is simply a StaticEmbedding instance
embeddings['char'] = unigram_embedding
if bigram_embedding_path is not None:
bigram_embedding = StaticEmbedding(bigram_vocab,
model_dir_or_name=bigram_embedding_path,
word_dropout=0.01,
min_freq=bigram_min_freq,
only_train_min_freq=only_train_min_freq)
embeddings['bigram'] = bigram_embedding
return datasets, vocabs, embeddings
if __name__ == '__main__':
path = 'data/pretrain/yangjie_word_char_mix.txt'
#load_yangjie_rich_pretrain_word_list(path)
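
    # Hedged end-to-end sketch (kept commented out; file locations are placeholders that
    # depend on the local data layout):
    # w_list = load_yangjie_rich_pretrain_word_list(path)
    # datasets, vocabs, embeddings = load_weibo_ner('data/weibo')
    # datasets, vocabs, embeddings = equip_chinese_ner_with_skip(
    #     datasets, vocabs, embeddings, w_list,
    #     word_embedding_path=path,
    #     _cache_fp='cache/weibo_with_skip')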