Skip to content

Latest commit

 

History

History
25 lines (20 loc) · 615 Bytes

File metadata and controls

25 lines (20 loc) · 615 Bytes
def create_training_data(corpus_raw, WINDOW_SIZE=2):
    """Tokenize a raw corpus and build word <-> index lookup tables.

    The corpus is cut into sentences at every '.', each sentence is cut into
    whitespace-separated words, and every distinct word is given an integer
    index. The total word count and the vocabulary size are printed to
    stdout along the way.

    Args:
        corpus_raw: The text to process. It is split with a ``str``
            separator, so it must be a ``str``.
        WINDOW_SIZE: Not referenced by any of the statements shown here.

    NOTE(review): this section of the function contains no ``return``; if the
    definition continues beyond these lines, its return value and any use of
    ``WINDOW_SIZE`` should be documented there -- confirm.
    """
    # Naive sentence segmentation: every '.' is a boundary, including those
    # in abbreviations or decimal numbers. Text ending in '.' produces a
    # trailing empty piece.
    sentences_list = corpus_raw.split('.')

    sentences = []   # one list of words per sentence piece
    words_list = []  # every word occurrence, duplicates included
    for sent in sentences_list:
        # split() without arguments splits on runs of whitespace and drops
        # empty strings, so an empty or blank piece yields [].
        sent_words = sent.split()
        # Empty word lists are appended as well (no filtering here).
        sentences.append(sent_words)

        for word in sent_words:
            words_list.append(word)

    print("total words: %d" % len(words_list))

    # De-duplicate. From here on words_list is a set, not a list.
    words_list = set(words_list)
    vocab_size = len(words_list)
    print("vocabulary size: %d" % vocab_size)

    word2idx = dict()  # word -> index
    idx2word = dict()  # index -> word
    # NOTE(review): indices follow the set's iteration order, which is
    # arbitrary and, for strings, can change between interpreter runs (hash
    # randomization). The word -> index assignment is therefore not
    # reproducible unless the vocabulary is ordered first.
    for i, w in enumerate(words_list):
        word2idx[w] = i
        idx2word[i] = w