NeuralProgramInterfaces/construct_data.py at master · n8rob/NeuralProgramInterfaces · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#           Data that we can use to train           #
#               our classifier and NPI              #
#                                                   #
#          Fulda, Brown, Wingate, Robinson          #
#                       DRAGN                       #
#                    NPI Project                    #
#                       2020                        #

#             See file                              #

import numpy as np
import re
import pickle as pkl
import copy
import pdb
import gc
import random

from transformers import *
import run_generation as rg

import torch
import torch.nn.functional as F

from tqdm import trange
from tqdm import tqdm

import argparse
import os
import pdb
import copy as cp

# for NLP
import spacy
nlp = spacy.load("en_core_web_sm")

"""
The output of the GPT-2 model is a tuple of length 3
Last element of tuple is all_hidden_states, a list of length 25 (medium GPT-2) or 13 (small GPT-2)
Each element of all_hidden_states is a hidden-state tensor of size (1,seq_length,emb_dim), where emb_dim is 1024 (medium) or 768 (small)
For many many sentences:
    Take 20-word sentence
    For i in range(20):
        Pass sentence through GPT2
        Take first and last hidden states and concatenate them to BIG_ARRAY
        Get next predicted token from GPT2 and append to sentence
        Make sentence length 20 again by removing first token
Then BIG_ARRAY is size (1,800,emb_dim) (but we may want to reshape it to be (800, emb_dim, 1))

See terminal for all the code I typed to do this minus the loops
"""

if __name__ == "__main__":
    """
    Build a data set of language-model activations from a text corpus.

    For each usable line of the corpus:
        Tokenize the line and keep (at most) the last seq_len tokens
        Repeatedly:
            Pass the current window of tokens through the language model
            Collect the hidden states of the requested model layers
            Sample the next token, append it and slide the window forward
        Label the example by whether a target word showed up in the generated text
    The collected examples are pickled in chunks, one file per chunk.
    """
    # Command-line options as (flag, keyword arguments for add_argument),
    # registered with the parser in exactly this order.
    cli_options = [
        ("--mixed_sentence_file", {
            "default": "./smaller_wiki_books_reddit_shuffled.txt",
            "help": "corpus from which to pull sentences; may be mixed with sentences that display target behavior to increase likelihood of production from GPT-2 and possibly expedite data set production",
        }),
        ("--save-pkl", {
            "default": "data/sentence_arrays.pkl",
            "help": "base name of pickle files to write data set to",
        }),
        ("--target-words", {
            "default": "cat",
            "help": "words to target, separated by commas",
        }),
        ("--pretrained-model", {
            "default": "gpt2",
            "help": "pretrained model to use. For small GPT-2 use 'gpt2' and for medium GPT-2 use 'gpt2-medium'",
        }),
        ("--model-layers", {
            "default": "0,1,2,3,4,5,6,7,8,9,10,11,12",
            "help": "Which layers to extract from language model? layer indices separated by commas\nRecommended: if spacial restrictions allow, use all available layers for data set generation and extract the needed layers at training using the extract_needed_layers function",
        }),
        ("--seq-len", {
            "type": int,
            "default": 10,
            "help": "window size for inputs to lang model",
        }),
        ("--num-iters", {
            "type": int,
            "default": 10,
            "help": "number of times to run lang model forward pass (extracting layers each time)",
        }),
    ]

    parser = argparse.ArgumentParser()
    for flag, flag_kwargs in cli_options:
        parser.add_argument(flag, **flag_kwargs)

    args = parser.parse_args()

    # Shortcuts in arguments
    mixed_sentence_file = args.mixed_sentence_file
    pkl_name = args.save_pkl
    pretrained_models = [args.pretrained_model]
    # --target-words is documented as comma-separated, so split on commas
    #   rather than on whitespace; blanks around a term and empty entries are dropped
    term_list = [term.strip() for term in args.target_words.split(',') if term.strip()] # list of target words

    # Some very important constants
    TARG = ['target words'] # label categories (a single category holding every target word)
    WORDS = {'target words':term_list}
    PRED_INDICES = [int(pi) for pi in args.model_layers.split(',')] # indices of the hidden-state layers to keep

    # define variables for determining text processing

    # num_chunks will determine the maximum number of .pkl data files to be generated
    #   of course you can always kill the process once you feel you have enough data
    num_chunks=25 * len(pretrained_models) * len(PRED_INDICES)
    num_sentences_per_chunk=4000//len(PRED_INDICES) # a pkl file should only be so big for loading speed
    num_sentences = num_chunks * num_sentences_per_chunk
    sent_len=args.seq_len
    num_iters=args.num_iters
    max_iters=5 * num_iters # default 50
    assert max_iters >= num_iters
    # sampling parameters (top_k and top_p are handed to rg.top_k_top_p_filtering)
    top_k=1
    top_p=.9
    temperature=1
    # define how sentence label vectors shall be indexed
    FAKE_DATA_INDEX = 0
    UNK_LABEL_INDEX = -2#1 + (num_word_categories * num_in_word_category)
    GPT2_MODEL_INDEX = -1
    # optional_s: also accept the target words with an 's' appended
    OPTIONAL_S = True # n8 HACK

    # define how each data point in the data set will be indexed
    ORIG_ACTIV_INDEX = 0 # activation arrays concatenated
    ORIG_LABEL_INDEX = 1 # label of output text
    TARG_LABEL_INDEX = 2 # this entry no longer used in our implementation
    LANG_MODEL_INDEX = 3 # pretrained model name
    META_DATA_INDEX = 4 # relevant meta data including text tokens
    ORIG_TEXT_INDEX = 5 # input text that yields lang model output text
    PRED_TEXT_INDEX = 6 # this entry no longer used in our implementation
    TARG_TEXT_INDEX = 7 # target term/behavior
    GPT2_TEXT_INDEX = 8 # the text of what the lang model actually produced

    # params to inject the word randomly into inputs to encourage its output
    INJECT_WORDNESS = True
    INJECT_WORD_RAND_CHANGES = True # this one should likely be True if the first one is

    # Fix pkl_name: make sure the base name carries the .pkl extension
    if ".pkl" not in pkl_name:
        pkl_name = pkl_name + ".pkl"
    pkl_name_base = pkl_name

    # Create the tokenizer for the single requested checkpoint
    model_name = pretrained_models[0]
    if pretrained_models != [model_name]:
        raise NotImplementedError("Only the following tokenizers are supported: {}".format(model_name))
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # One label slot per target category plus one extra slot
    num_keywords = len(TARG)
    num_possible_labels = int(1 + num_keywords)

    # Only GPT-2 style checkpoints are handled
    if 'gpt2' not in model_name:
        raise NotImplementedError("model_name == {} not supported".format(model_name))
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = gpt2_tokenizer

    model.transformer.output_hidden_states = True # necessary to pull activation tensors

    # Run on the GPU when one is available, otherwise stay on the CPU
    if torch.cuda.is_available():
        model = model.cuda()
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    try:
        # BEGINNING ##############################################################################################################################

        print("and so it begins",flush=True)
        dataset = []

        word_to_toks = {}
        word_to_toks[TARG[0]] = []
        for word in WORDS[TARG[0]]:
            word = word.lower()
            gpt2_word_toks = []
            gpt2_word_toks.append(gpt2_tokenizer.encode(word)) # list of veriable len but likely len 1 for small words
            gpt2_word_toks.append(gpt2_tokenizer.encode("this is "+word)[2:]) # token differs in context
            if OPTIONAL_S: # for some verbs or nouns we may put an optional 's' on the end
                gpt2_word_toks.append(gpt2_tokenizer.encode(word+"s"))
                gpt2_word_toks.append(gpt2_tokenizer.encode("this is "+word+"s")[2:])
            word_to_toks[word] = gpt2_word_toks

            word_to_toks[TARG[0]] = word_to_toks[TARG[0]] + word_to_toks[word]

        # We want to count how many words we got :)
        word_counts = {}
        for word in TARG:
            word_counts[word] = 0
        word_counts['UNK'] = 0

        # And a few other things we need defined outside the loop
        pkl_counter = 0
        iterator = -1

        """Now we begin the loop of a lifetime...---...---...---...---...---...---...---...---...---...---"""

        with open(mixed_sentence_file,'r') as BIG_FILE:

            # One pass of this loop per corpus line: build the token window, then let the
            #   language model extend it token by token while the hidden states of the
            #   layers in PRED_INDICES are collected in big_array.
            for line in BIG_FILE:

                # clean line to some extent
                #   (due to possible differences in corpora that could tip off the classifier)
                line = line.lower().strip().strip('.').strip()
                num_words_in_line = len(line.split())
                if num_words_in_line > 100 or num_words_in_line < 4:
                    continue

                # Will we inject a word?
                if INJECT_WORD_RAND_CHANGES:
                    INJECT_WORDNESS = random.choice([True,False])

                iterator += 1

                append_to_dataset = True # naively assume we're gonna append ha ha so naive

                big_array = [] # list of hidden-state tensors, concatenated after the loop

                # Keep at most the last sent_len tokens of the line
                tokens = gpt2_tokenizer.encode(line)
                tokens = tokens[-sent_len:]
                num_tokens_needed = sent_len - len(tokens)
                # Built directly on `device` (no unconditional .cuda()), so a CPU-only run works too
                tokens = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
                all_text_tokens = cp.deepcopy(tokens)
                ogog_tokens = cp.deepcopy(tokens)

                # some constants to set first
                found_words_dict = {}
                for word in TARG:
                    found_words_dict[word] = False
                len_for_big_array = len(PRED_INDICES) * num_iters
                stop_itern = num_tokens_needed+max_iters

                word_found_already = False # This will tell us if we've found a word yet
                index_of_last_injection = -1 # This is for keeping track of word injection

                # We loop through multiple times now
                purely_generated_tokens = [] # haven't generated anything yet
                i = -1
                while True:
                    i += 1

                    # Now run the model; no gradients are ever needed here, so do not build the autograd graph
                    with torch.no_grad():
                        hidden_states, presents, all_hiddens = model(input_ids=tokens[:,-sent_len:]) # all_hiddens is a list of len
                                                                    # 25 or 13 with tensors of shape (gpt2 medium of small)
                                                                    # (1,sent_len,1024) or (1,sent_len,768)
                    # Add to big_array (only once the window has reached its full length)
                    if tokens.shape[1] >= sent_len:
                        for pi in PRED_INDICES:
                            big_array.append(all_hiddens[pi].data)

                    # Now we extract the new token and add it to the list of tokens
                    next_token_logits = hidden_states[0,-1,:]
                    filtered_logits = rg.top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                    next_token_list = next_token.tolist()
                    purely_generated_tokens = purely_generated_tokens + next_token_list

                    # check if the text generated so far contains a term we are looking for!!!
                    generated_string = tokenizer.decode(purely_generated_tokens)
                    for word in TARG:
                        for subword in WORDS[word]:
                            if subword in generated_string:
                                found_words_dict[word] = True
                                word_found_already = True
                                time_since_last_injection = i - index_of_last_injection # Let's make the target word be in the middle of the generated text
                                number_of_indices_more_we_need = num_iters - time_since_last_injection
                                stop_itern = i + number_of_indices_more_we_need # this will ensure there are no injections in what we call the generated tokens

                    # ...update list of tokens: grow the window until it is full, then slide it
                    if tokens.shape[1] < sent_len:
                        tokens = torch.cat((tokens,next_token.unsqueeze(0)),dim=1).to(device)
                    else:
                        tokens = torch.cat((tokens[:,1:],next_token.unsqueeze(0)),dim=1).to(device)

                    all_text_tokens = torch.cat((all_text_tokens,next_token.unsqueeze(0)),dim=1).to(device)

                    if len(big_array) >= len_for_big_array and i >= stop_itern and all_text_tokens.shape[1] >= num_iters+sent_len:
                        break # if we have a target word and enough arrays

                    if i >= num_tokens_needed+max_iters and not word_found_already:
                        break # give up: iteration budget spent without a target word

                    if INJECT_WORDNESS and i > num_tokens_needed and (i-num_tokens_needed)%num_iters == 0 and not word_found_already:
                        word = random.choice(TARG) # value not used below; call kept so the random stream is consumed as before
                        tokens_to_inject = random.choice(word_to_toks['target words'])
                        num_tokens_to_inject = len(tokens_to_inject)
                        tokens_to_inject = torch.tensor(tokens_to_inject, dtype=torch.long, device=device).unsqueeze(0)
                        # Now we want to change the sentence a bit
                        #    iterate through the tokens and if any of them are nouns, then replace
                        #    (the last noun found in the window is the one that gets replaced)
                        #    NOTE: it is also valid to replace something other than nouns if that seems more reasonable :)
                        idx_to_replace = None
                        window_ids = tokens[0].tolist() # computed once instead of once per token
                        for tok_idx, token_in_question in enumerate(window_ids):
                            word_in_question = tokenizer.decode([token_in_question]).strip()
                            assert isinstance(word_in_question, str)
                            # Now we want to see if it's a noun
                            doc = nlp(word_in_question)
                            pos = [token.pos_ for token in doc]
                            if pos == ['NOUN'] and len(word_in_question)>2: # we have a viable noun
                                # we need this conditions because spacy thinks single letters are nouns
                                idx_to_replace = tok_idx

                        if idx_to_replace is not None: # we actually found a NOUN to replace
                            # We knock off the first few tokens if num_tokens_to_inject > 1
                            tokens = torch.cat((tokens[:,num_tokens_to_inject-1:idx_to_replace], tokens_to_inject, tokens[:,idx_to_replace+1:]),dim=1)
                            tokens = tokens[:,-sent_len:] # make sure it is the right length

                            # Apply the same replacement to the full token history
                            idx_to_replace_in_all_toks = idx_to_replace - tokens.shape[1] + all_text_tokens.shape[1]
                            all_text_tokens = torch.cat((all_text_tokens[:,:idx_to_replace_in_all_toks], tokens_to_inject, all_text_tokens[:,idx_to_replace_in_all_toks+1:]),dim=1)
                            # keep track of when this happened
                            index_of_last_injection = i


                num_gpt2_iters_run = i+1
                # only the most recent len_for_big_array hidden-state tensors are kept
                big_array = big_array[-len_for_big_array:]

                # figure out true classification: one slot per entry of TARG plus a final "no target word" slot
                orig_classification = np.zeros(len(TARG)+1)

                # count words and see if we should append to dataset
                for i_word, word in enumerate(TARG):
                    if not found_words_dict[word]:
                        continue
                    # we found this word: so this is a term-positive labeled data point!
                    orig_classification[i_word] = 1.
                    word_counts[word] += 1
                    # stop appending once this category has more than num_sentences examples
                    if word_counts[word] > num_sentences:
                        append_to_dataset = False

                if not any(found_words_dict.values()): # means this is a term-negative labeled data point!
                    orig_classification[i_word+1] = 1.
                    word_counts['UNK'] += 1
                    most_common_target = max(word_counts[word] for word in TARG)
                    if word_counts['UNK'] > 1.2*most_common_target+1: # keep the UNKs down!!! We want balance!!!
                        append_to_dataset = False
                        word_counts['UNK'] -= 1 # so we actually won't count this one since we're not appending

                # What will we call "original text" and "generated text"

                # sanity check: the current window must be the tail of the full token history
                assert all_text_tokens.squeeze().tolist()[-sent_len:] == tokens.squeeze().tolist()

                # the sent_len tokens that sit num_iters positions before the end of the history
                orig_tokens = all_text_tokens[:,-sent_len-num_iters:-num_iters].squeeze().tolist()
                # the final window is what we deem the generated text
                gpt2_generated_tokens = tokens.squeeze().tolist()

                orig_text = gpt2_tokenizer.decode(orig_tokens)
                gpt2_generated_text = gpt2_tokenizer.decode(gpt2_generated_tokens)

                # big_array is a list of len(PRED_INDICES)*num_iters tensors; join them along dim 1 and move
                #   the leading axis to the end. Assuming each tensor is (1,sent_len,emb_dim), the result is
                #   (len(PRED_INDICES)*num_iters*sent_len, emb_dim, 1), with emb_dim 1024 or 768
                big_array = torch.cat(big_array, dim=1).permute(1,2,0).data.cpu().numpy()

                # We want to save this big_array in the data
                #ORIG_ACTIV_INDEX = 0
                #ORIG_LABEL_INDEX = 1
                #TARG_LABEL_INDEX = 2
                #LANG_MODEL_INDEX = 3
                #META_DATA_INDEX = 4
                #ORIG_TEXT_INDEX = 5
                #PRED_TEXT_INDEX = 6
                #TARG_TEXT_INDEX = 7
                #GPT2_TEXT_INDEX = 8
                if append_to_dataset:
                    datum = [
			        big_array, # ORIG ACTIV
			        orig_classification, # ORIG LABEL
			        None, # this no longer used
			        model_name, # LANG MODEL: model_name an abstraction for 'gpt2'
			        {'num_gpt2_iters':num_gpt2_iters_run,\
                                        'orig_tokens':orig_tokens,\
                                        'gpt2_generated_tokens':gpt2_generated_tokens}, # META DATA
			        orig_text, # ORIG TEXT (or what we're deeming 'originial text')
			        None, # PRED TEXT this literally won't exist until we have an NPI
			        TARG[0], # TARG TEXT
			        gpt2_generated_text # GPT2 TEXT: just generated right here by the GPT2 :D :D
				    ]
                    dataset.append(datum)

                # Check data len
                if len(dataset) >= num_sentences_per_chunk:
                    # Then we want to save it
                    # pkl stuff
                    pkl_name = pkl_name_base + "_" + str(pkl_counter)
                    with open(pkl_name,'wb') as f:
                        pkl.dump(dataset,f)

                    # More business to conduct every num_sentences_per_chunk data
                    del dataset
                    dataset = []
                    pkl_counter += 1

                # Now if we have all the pickles we need... we break! (bc we done yay)
                if pkl_counter == num_chunks:
                    break # :)

                # Now for helpful print statements
                if iterator % (num_sentences_per_chunk//(num_sentences_per_chunk//100)) == 0:
                    print("iterations: {}; target words data: {}/{}, generic data: {}/{}, pkls written: {}".format(iterator+1, word_counts['target words'], num_sentences//2, word_counts['UNK'], num_sentences//2, pkl_counter), \
                                flush=True)

    except:
        raise

    finally:
        torch.cuda.empty_cache()


    # shuffle now
    # random.shuffle(word_sent_list)
    torch.cuda.empty_cache()
    print(" ",flush=True)
    print("done")

"""
"""