From 104838849455bdda5bc6de72e87ebd049a0e74fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Li=C3=A9vin?= Date: Fri, 23 Sep 2022 22:08:15 +0200 Subject: [PATCH 01/27] Transformers notebook --- .../5.3-Recurrent-Neural-Networks-Numpy.ipynb | 0 ..._EXE_deep_learning_with_transformers.ipynb | 15920 ++++++++++++++++ ...1_Recurrent_Neural_Networks_Nanograd.ipynb | 0 3 files changed, 15920 insertions(+) rename 5_Recurrent/OLD-5.1-Numpy-Recurrent-Neural-Networks.ipynb => 5_Transformers/5.3-Recurrent-Neural-Networks-Numpy.ipynb (100%) create mode 100644 5_Transformers/5_1_EXE_deep_learning_with_transformers.ipynb rename 5_Recurrent/5_1_EXE_Recurrent_Neural_Networks_Nanograd.ipynb => 5_Transformers/5_1_Recurrent_Neural_Networks_Nanograd.ipynb (100%) diff --git a/5_Recurrent/OLD-5.1-Numpy-Recurrent-Neural-Networks.ipynb b/5_Transformers/5.3-Recurrent-Neural-Networks-Numpy.ipynb similarity index 100% rename from 5_Recurrent/OLD-5.1-Numpy-Recurrent-Neural-Networks.ipynb rename to 5_Transformers/5.3-Recurrent-Neural-Networks-Numpy.ipynb diff --git a/5_Transformers/5_1_EXE_deep_learning_with_transformers.ipynb b/5_Transformers/5_1_EXE_deep_learning_with_transformers.ipynb new file mode 100644 index 0000000..4ab01a4 --- /dev/null +++ b/5_Transformers/5_1_EXE_deep_learning_with_transformers.ipynb @@ -0,0 +1,15920 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "yeoSXsVuzsTf" + }, + "source": [ + "# Week 5 - Deep learning with Transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vxq_lY1Wz2AK" + }, + "source": [ + "Some preliminary set-up." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 401, + "referenced_widgets": [ + "72691438bd3443e496380c56af496eaa", + "44408e7021a34ed7b5aeeed803f48f30", + "e54ba4ac5c6949d7bcb97eb9729d6ada", + "335a51f7dd5548c3947a6288495c1737", + "2bd6ec97d073442eb08d141c07fd199d", + "7ffad741bcf84ea59f3a41239cd92045", + "38e443a3d6124361b5fc1acaaeaeb93e", + "3b037f3257684895ba25778397aeee19", + "2fa2d24e07cf4c4cba4cd296ba9281b3", + "c278bbf65b6d43c0b82af9a3cab5ef9e", + "3f549e3d727043358247df4eddcc7c07" + ] + }, + "id": "rK63fnkquGHY", + "outputId": "1921863d-7063-46ab-ca22-dc0c11f592ea" + }, + "outputs": [], + "source": [ + "! pip install ipywidgets rich seaborn torch datasets transformers tokenizers sentencepiece sacremoses --quiet\n", + "\n", + "%matplotlib inline\n", + "\n", + "import os\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "\n", + "import torch\n", + "from torch import nn\n", + "import math\n", + "from functools import partial\n", + "from pathlib import Path\n", + "from tqdm import tqdm\n", + "import rich\n", + "from typing import List, Tuple, Optional, Dict, Any\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import transformers\n", + "import tokenizers\n", + "import datasets\n", + "import zipfile\n", + "from huggingface_hub import hf_hub_download\n", + "\n", + "sns.set()\n", + "\n", + "# define the device to use\n", + "DEVICE = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "rich.print(f\"Device: [red]{DEVICE}\")\n", + "\n", + "# control verbosity\n", + "transformers.logging.set_verbosity_error()\n", + "datasets.logging.set_verbosity_error()\n", + "\n", + "# define support functions\n", + "def load_glove_vectors(filename = \"glove.6B.300d.txt\") -> Tuple[List[str], torch.Tensor]:\n", + " \"\"\"Load the GloVe vectors. 
See: `https://github.com/stanfordnlp/GloVe`\"\"\"\n", + " path = Path(hf_hub_download(repo_id=\"stanfordnlp/glove\", filename=\"glove.6B.zip\"))\n", + " target_file = path.parent / filename\n", + " if not target_file.exists():\n", + " with zipfile.ZipFile(path, 'r') as zip_ref:\n", + " zip_ref.extractall(path.parent)\n", + "\n", + " if not target_file.exists():\n", + " print(f\"Available files:\")\n", + " for p in path.parent.iterdir():\n", + " print(p)\n", + " raise ValueError(f\"Target file `{target_file.name}` can't be found. Check if `{filename}` was properly downloaded.\")\n", + "\n", + " # parse the vocabulary and the vectors\n", + " vocabulary = []\n", + " vectors = []\n", + " with open(target_file, \"r\") as f:\n", + " for l in tqdm(f.readlines(), desc=f\"Parsing {target_file.name}...\" ):\n", + " word, *vector = l.split()\n", + " vocabulary.append(word)\n", + " vectors.append(torch.tensor([float(v) for v in vector]))\n", + " vectors = torch.stack(vectors)\n", + " return vocabulary, vectors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 105, + "referenced_widgets": [ + "864803498d9d485b854f6dc22bf8e7f7", + "e276dfc72b4f4b42a5a93241c7dcd0b4", + "30caa70b8e2b4ca28af34fb14cd0cf86", + "233c55aac32242f6be5085631989f7c8", + "c1d0a6be0e0b402696f8f759ecf692e2", + "09b9bc4a5757400fb352de17b2eb1cab", + "9ef088f4f6d244738be3a42da4396635", + "eb07e619043940ecb3dd33ec3f38b03c", + "d7522bc3874c4285b33ca65febd3f2d5", + "586aa53e10754cb8948e3636da7aef9c", + "e0ea063acfe245c783f720279a8004cd" + ] + }, + "id": "gtB1kzl_uGHm", + "outputId": "107a3b51-d3f5-4e5f-8894-626e95df4bc8" + }, + "outputs": [], + "source": [ + "# prepare data for the later cells\n", + "glove_vocabulary, glove_vectors = load_glove_vectors()\n", + "rich.print(f\"glove_vocabulary: type={type(glove_vocabulary)}, length={len(glove_vocabulary)}\")\n", + "rich.print(f\"glove_vectors: type={type(glove_vectors)}, 
shape={glove_vectors.shape}, dtype={glove_vectors.dtype}\")\n", + "\n", + "# add special tokens\n", + "special_tokens = ['<|start|>', '<|unknown|>', '<|pad|>']\n", + "glove_vocabulary = special_tokens + glove_vocabulary\n", + "glove_vectors = torch.cat([torch.randn_like(glove_vectors[:len(special_tokens)]), glove_vectors])\n", + "\n", + "# tokenizer for GloVe\n", + "glove_tokenizer = tokenizers.Tokenizer(tokenizers.models.WordLevel(vocab={v:i for i,v in enumerate(glove_vocabulary)}, unk_token=\"<|unknown|>\"))\n", + "glove_tokenizer.normalizer = tokenizers.normalizers.BertNormalizer(strip_accents=False)\n", + "glove_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eKoXNCEfuGHr" + }, + "source": [ + "# Language Modelling and Transformers\n", + "\n", + "___\n", + "## Content\n", + "\n", + "* I. Text to vectors\n", + "* II. Language models\n", + "* III. Attention mechanism\n", + "* IV. Transformers\n", + "* V. Applications of Transformer-based language models\n", + "\n", + "\n", + "___\n", + "## Introduction\n", + "\n", + "Since their introduction ([\"Attention is All You Need\", Vaswani et al. (2017)](https://arxiv.org/abs/1706.03762)), Transformers have overtaken the field of Machine Learning. Initially applied to translation tasks, Transformers pre-trained on vast amounts of unlabelled data such as BERT and GPT have become central components in most of the modern natural language processing (NLP) systems. Transformers power question answering (QA) models, machine translation services, search engines and chat bots. Independently of the language applications, the Transformer is also a versatile neural architecture and, therefore, has found success outside the field of NLP. Transformers are rapidly being adopted in image processing ([\"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale\", Dosovitskiy et al. 
(2021)](https://arxiv.org/abs/2010.11929)), in reinforcement learning ([\"A Generalist Agent\", Reed et al. (2022)](https://arxiv.org/abs/2205.06175)), video generation ([\"VideoGPT: Video Generation using VQ-VAE and Transformers\", Yan et al. (2021)](https://arxiv.org/abs/2104.10157)), and more. In the following sections, we will first introduce the basics of NLP (tokenization, token embeddings, language modelling) and then introduce the attention mechanism. In the second part, we will study the Transformer architecture and apply it to NLP tasks.\n", + "\n", + "___\n", + "## I. Text to vectors\n", + "\n", + "In the previous labs, we have applied deep learning to processing images encoded as RGB pixels. We found that processing arrays of RGB pixels using convolutional neural networks was effective. In NLP, other neural interfaces are required to enable plugging text into neural networks. Raw text cannot trivially be plugged into neural networks. In this section we show how to convert text units or *tokens* into vectors and introduce the notion of text vector spaces.\n", + "\n", + "### I.a. Tokenization\n", + "\n", + "In [alphabetic languages](https://en.wikipedia.org/wiki/List_of_writing_systems), text can be decomposed into various types of units or *tokens*: characters, syllables, words or even sentences. Each tokenization system comes with a vocabulary $\\mathcal{V}$ that references all known symbols. \n", + "\n", + "The choice of tokenizer is a tradeoff between the size of the vocabulary and the number of tokens required to encode a sentence. For instance, character-level tokenizers result in a smaller vocabulary size (only 128 characters when using ASCII encoding) than other tokenizers. Word-based tokenizers encode text using fewer tokens than the other tokenizers but require a much larger vocabulary, which still might miss words seen at test time. 
Sub-words tokenizers such as [WordPiece](https://arxiv.org/abs/2012.15524) and [byte-pair encoding (BPE)](https://arxiv.org/abs/1508.07909) are a tradeoff between character-level and word-level encoding. They have progressively taken over the field as they provide two main advantages: (i) good tradeoff between vocabulary size and encoding length, (ii) open-ended vocabulary. \n", + "\n", + "Below we tokenize one sentence using word-level, character-level and sub-word-level tokenizers. In each case, the output corresponds to a sequence of indexes corresponding to the position of the given token in the vocabulary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 837, + "referenced_widgets": [ + "f01a9c20f21547b6926e9eb44b8f43af", + "91759f59f8604393a0d72c623b1b97af", + "06a4618bda4949f2ace26450b1d8052e", + "55783e273f894123b18c1ab84fe996a1", + "8f96e5e63c61463181ad8daed8f104e0", + "8befeadf18114bdba529e65426de91e2", + "8ee9eb7d0c8c4f2998dc41835994e442", + "89d19eb4b1a243bf8318b037f3eb1eb2", + "4a41a1e705b549f8b094e5a1432ea389", + "f1e394f5978f44de9d7f539e78699582", + "d1f5bcde8d754af18fdd052c98a6d81b", + "9b39591d155d4d5b9bf7b9c37c928332", + "52a04a50c3bb491dabd849005adfe60b", + "1d9d2cf9cd59425497adf32e4dd59261", + "882871406b0043229ea41e2bc7e20cd9", + "2af41c4c23e24e3db73a9003a32d0097", + "ddd780b848994a3eaea4257b5b3f6dee", + "85fa69e6cb8c4b84b8bcb7c36bf2d8dd", + "9b601f47611b4281b2b2d1fcab39c38b", + "670ce1a5a4a3477e970db3210192ec4a", + "28ac8463e3f14e899b7e16553d991d41", + "71a9ceb690724a8ab688acbf67c68113", + "d667b4d49dfe49bebbf39dba10ada96d", + "ed1c5ae4218e48bc924fc615e416721f", + "0869911afc8549d9bd4ce0e7f97bdc0c", + "242c86c1e834490897262daa0ef5dfcd", + "389af45e82694958b597e0c25529f6c0", + "a303620e1ed142658667b32cee4b7e70", + "0330f9b958f34c46a34e62b47568fa77", + "4bde8be94bb945b7869e4fa261d686dd", + "9b6c07e98b4c4e95bdddc7dac23d69cf", + 
"971ee1ec3e25467fa861cf30207f25a7", + "15b3668ae54c4fb88b9d67d5756122c4", + "5fd3ce3c4f1e4751b6e1567b1d0cc447", + "c5bcb4f5e0b740f6adaaecbc9b38a8e0", + "36a31d0fa707478285f094582fb2b090", + "8b0b3d0a39df422ab85e98147252c37d", + "20d9a77f209d473bb1d2e751d706913e", + "8a2dcb7fe90040328cacd70146c48202", + "b4ef3d467ecb4d63a3a8e90b9db13fb3", + "b2966a02f9b74b7895c525b6fb21597d", + "67eb4a8763f648c1a4934e076c64bb2b", + "81bd92c195424af2b552038bb2240473", + "fee18278d97f4191a086db8dc185409b", + "b29bd3a6f57d414ea8c2625317daf851", + "2da10880d1da4b7eba255333d171d653", + "74d518f82ecb4021ade1a713de64104b", + "8b54d237a7cc43beaeb6a1edee13714e", + "148fd6dd13ca40199c14bc396a741d83", + "5cb53426cc29476dadb4c2fb88c5b5ad", + "3ce1683b100b4e249a4272220b0b6bed", + "583320fb40344b9bbb1887848173ca45", + "9b421d1eb87a4d9fbde251053fcf6c2c", + "bcd5a9c91e52470c9c285fda8a659a94", + "fa07548194e244b8b655381e073abb9d", + "9cc17f6d1a40454b8db2052160168a12", + "ec288e953b244616befab8429457c77b", + "c6b3661a3447492ab9e5b8d1b3aae05e", + "3586240e502f4255ab0d3d66e79b449e", + "64f9c055090a4861b33f8ed15a858daf", + "2d966f4595464c49be7f5b323cf54b1c", + "17d2e815f3ca46fa82167ccd1eb85b2a", + "f9ac22e242984baeaae8ddae3748003c", + "2de4643e661d448caaa7a1a7e660c6b0", + "588dfe8a747a4cd2a9ca35bba8898808", + "6d3fa316dfa24d20bda288ddfb71b4c8", + "31949d7fd38f4c20b8bb734dbf9aaa3b", + "636cfa48a2c647efa9ada29b42c37a35", + "bc150147b2c845f3aead1c4b1fa09c61", + "a0f74a508afc44ed9c618e1d181807a5", + "689de7097be44956ac7d9a5736b8e5bb", + "57c21a5483ec40a998a7ef66301a41ae", + "4dce807720b442788b75eecf25174e86", + "978de6521f3d4b8b9ac305b9db7a0ff1", + "bfa2bb61bc534200bdb61bc28bc09b53", + "2cb7d45a1a804c109e67e9a6c50976ee", + "2ba33d1e19ac48149fc2c78e1baba2a9", + "31e124e44206472ab49233418434b9ce", + "8d664b21ce974899bb6a7aaaea21a69e", + "e2036efdcaab46719ab2b57baf01e2a2", + "1a511d9f81cd434ea9a4a0baa1ad43f2", + "acfe14fcd0fc484b957fe97cfcb34c4c", + "a8650804aeaf4a37b9732b890791d06d", + 
"d3f574de1ff448178358f3a4ddf803c9", + "94d4edee3f7a406281f694241eb143ab", + "84ccee50a10d47f392d094a53c5223d9", + "5b85631ac5a446e5884020fb7ba4b96f", + "fc44ba4fc74f472a95cbeab1ea179b89", + "847410262bdd45919fdb4dbb18a5df2c", + "3828d19ebc43415f8ff87568418ec9b2", + "f79a56e0fe024b9682ec79d134201412", + "a5e970c0cf8a48ae8d2cb45058887d6b", + "aab826eacf3b429a8b67bb4c42698154", + "3369c01acc964d9b8b5738e59b05b6b4", + "15280581483b47f18aa57296f25d8334", + "bc51d76c3a854586b7b4832771a112bf", + "4429b3f77f2941468d72ba1b17821d9a", + "d8afe82dd0454aeeb34099fd21ccf726", + "04a785e5695545f8905306feee92b618", + "59add760ed794047bbb548f5c09e64ac", + "ff32978b71364a37a6724a5ee508bed3", + "43f85ac6c8ae4044a0e4443d7da79eb3", + "1e9772f564fa4d4390518664a1595197", + "aedd4dc023644d5cb76f33462ae965c7", + "6835c929cd844a7f882765f68bd8a7e2", + "1de84e67a3b34cddb1c73beae7888833", + "f3e74d9727564ed3ac609ebb3ec74737", + "16ebe063385841faba7f79ae57dad474", + "f34c4b62a07949b68758d6f6ffd1b31f", + "7ce42569074943d3af0ca12b2ac351d7", + "c11372c27bb34a03a26971a77791141a", + "65fd5c5e9b2c4cbdb4fe0c3314ae29bb", + "cefc16b8f9c94fc0aee7ba72a1c4aeb0", + "d35f83b7433e4f1bb857789c9738bf21", + "999d7a37dd8843b1aa9920c0da638c81", + "5c1987d5e4824dda8818f2d2187a0666", + "340c96ed160c42b191291545dfe6848c", + "8f1bf0c58d1e493a8a00e96ce09590da", + "a45abec670d242799e43d5badb75fa99", + "91e3ec446c5d4030853dca426e304f9c", + "718e86634d1343559659a545f36abb42" + ] + }, + "id": "MPEQDuq4uGHz", + "outputId": "4683777f-3247-482d-c852-f7903f307b1b" + }, + "outputs": [], + "source": [ + "# Example sentence with rare English words and non-english words\n", + "sentence = \"It is jubilating to see how élégant my horse has became\"\n", + "rich.print(f\"Input sentence: [bold blue]`{sentence}`\")\n", + "\n", + "# Define multiple tokenizers\n", + "tokenizer_ids = {\n", + " \"Word-level\": glove_tokenizer,\n", + " \"WordPiece\": \"bert-base-cased\",\n", + " \"BPE\": \"distilgpt2\",\n", + " \"Character-level\": 
\"google/byt5-small\",\n", + " }\n", + "\n", + "# iterate through the tokenizers and decode the input sentences\n", + "for tokenizer_name, tokenizer in tokenizer_ids.items():\n", + " # intialize the tokenizer (either)\n", + " if isinstance(tokenizer, str):\n", + " # init a `transformers.PreTrainedTokenizerFast`\n", + " tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer)\n", + " vocab_size = tokenizer.vocab_size\n", + " else:\n", + " # use the provided `tokenizers.Tokenizer``\n", + " vocab_size = tokenizer.get_vocab_size()\n", + "\n", + " # Tokenize\n", + " token_ids = tokenizer.encode(sentence, add_special_tokens=False)\n", + " if isinstance(token_ids, tokenizers.Encoding):\n", + " token_ids = token_ids.ids\n", + "\n", + " # Report\n", + " rich.print(f\"[red]{tokenizer_name}[/red]: sentence converted into {len(token_ids)} tokens (vocabulary: {vocab_size} tokens)\")\n", + " rich.print(f\"Tokens:\\n{[tokenizer.decode([t]) for t in token_ids]}\")\n", + " rich.print(f\"Token ids:\\n{[t for t in token_ids]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DNH3dQ3VuGH1" + }, + "source": [ + "### I.b Embeddings\n", + "\n", + "A tokenizer transforms fragments of text into a list of integers that index a vocabulary. We assign one vector of dimension $d$ to each item in the vocabulary of size $N_\\mathcal{V}$; this results in a matrix $E$ of dimension ${N_\\mathcal{V} \\times d}$. Converting a fragment of text into a sequence of vector representations can be done by tokenizing the text, and then looking up the embedding vector for each token, which is equivalent to *one-hot encoding* the tokens and performing a matrix multiplication using $E$. Given $\\mathbf{t}_1, \\ldots, \\mathbf{t}_L$ the sequence of one-hot encoded tokens, this is equivalent to\n", + "$$\n", + "\\mathbf{w}_i = E^\\top \\mathbf{t}_i .\n", + "$$\n", + "In the code below, we encode the sentence `Hello World!` using a WordPiece tokenizer and a set of embeddings of dimension `hdim`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 271, + "referenced_widgets": [ + "fae2e3a4035d403dad3baef4802a9258", + "bd052d320db8492c9b3aa1cf29e4a9cf", + "55ed66dd456e4f19ae98ce739da552b8", + "a4befd44d0b5402e8bcb61f64162ff43", + "f08a4af32b364cbaa774b179c87f737a", + "74acdd9bc09c438ab1f48c8c762c4a07", + "f8eaa261c4df44cea66453f6233555f9", + "c59172344f204280a756b106f69ed793", + "e35b435f72d4403e9ac00852fad7eef7", + "380c55c2680a4f85a81cfb266eace834", + "94ac791468ea4909a84c353d5bc7611d", + "713c08d937ca4e31a8b8f6942a487b7a", + "a4424cb16ec849ee96df431f52a57e29", + "5f1eef839eb24e25bbf667d0c23d2378", + "bad437a15b834e3e8ca6ca67b4860b7d", + "836bd677b7b040e2812fa3d3cf39b491", + "3e842b6e81c041419f630f7bbf69e0d2", + "b81956f3a32f4d8cb6dbe07a62867627", + "7ff7ecf2e5f44961becb04faa0a3048b", + "cad503bc0c3346e2abf5b25d355305d7", + "4772a52c663241ba9b5314d887fd47de", + "1c28b4c4a1bd44ddbed1028e3a755bb5", + "cdb2e3a16db04678a4a6d5d23931dacc", + "6c2c388a465f447a976d3427b59bad7a", + "b173b9397ed748b98aa18e9b4b739fca", + "4ffa8b39a34a4433bcce70f713bd7f2a", + "62f64323983d426c8762984fd25bb02e", + "09e77e9a34f24fedbdb90b2112f27347", + "5cef548a19104d2a910071036dccc588", + "b611fcfb99e84693954b41743fbf321a", + "8d2ad2b28f1644f09ff21d95df9bf0c0", + "3a7a4945d0f04ce5ac54621bf1d02698", + "af69668241644ef4b5cb06223a565d9f", + "fc631832218144c4b29f776ebba56193", + "0f938c22289d45d9bdd7929ed1a97626", + "b9474845c2d24c6f91420bb23ca0e149", + "2066c6d9fe2a4d6f876e81fbbf64e653", + "868d87d61432416fb23969b8405cc628", + "67e91a4c9376417dbb9916e83f5ccdd4", + "9ca45bbcbdbe4eab98cb67ed1249d8fb", + "88b39f446a4d46f5a5748d8bdf4311cf", + "7fdf1d094b4e4693acc53477fd2379c2", + "00046c6611c940058bac1aa75da16f95", + "08995f7f72f14dd8a22565ee4ae41215" + ] + }, + "id": "MKD5CuNLuGH2", + "outputId": "f0a4835d-f378-4048-a7c0-409198a91649" + }, + "outputs": [], + "source": [ + "hdim = 5 # embedding 
dimension\n", + "tokenizer = transformers.AutoTokenizer.from_pretrained(\"bert-base-uncased\") # tokenizer\n", + "sentence = \"Hello World!\" # input text\n", + "embeddings = torch.randn((tokenizer.vocab_size, hdim)) # embedding matrix\n", + "rich.print(f\"Embeddings (shape): {embeddings.shape}\")\n", + "token_ids = tokenizer.encode(sentence, add_special_tokens=False, return_tensors=\"pt\")[0]\n", + "rich.print(f\"Tokens ids (shape): {token_ids.shape}\")\n", + "vectors = torch.nn.functional.one_hot(token_ids, tokenizer.vocab_size).float() @ embeddings # equivalent to a `nn.Linear` layer\n", + "rich.print(f\"Vectors (shape): {vectors.shape}\")\n", + "rich.print(f\"List of tokens and their corresponding vectors:\")\n", + "for t,v in zip(token_ids, vectors):\n", + " token_info = f\"[blue]{tokenizer.decode(t):5}[/blue] (token id: {t:4})\"\n", + " rich.print(f\" * {token_info} -> {v}\")\n", + "\n", + "# NB: in practice, we use the simpler interface `torch.nn.Embedding``\n", + "# embeddings = torch.nn.Embedding(tokenizer.vocab_size, hdim)\n", + "# vectors = embeddings(token_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BZ9GcEgOuGH4" + }, + "source": [ + "### I.c Word vectors\n", + "\n", + "\"Word2vec:\n", + "\n", + "\n", + "Word2vec ([\"Efficient Estimation of Word Representations in Vector Space\", Mikolov et al. (2013)](https://arxiv.org/abs/1301.3781)) converts words into vector representations, which are learned using the Skip-Gram algorithm. Intuitively, The algorithm is based on the idea that words that appear together are related to each other.\n", + "\n", + "The word vector space allows to use the inner product to compare words, and arithmetic operations to manipulate word representations. 
For instance, in a well-defined word vector space, the concept \"king\" can be translated into \"queen\" by applying a linear transformation and the vector `vec(\"capital\") - vec(\"country\")` was found to correspond to the relative concept `\"capital city of a country\"` (see above illustration (*Image credits: https://www.tensorflow.org/tutorials/word2vec*)).\n", + "\n", + "\n", + "**Experiment** In the first cells, we have downloaded the [GloVe word vectors](https://github.com/stanfordnlp/GloVe) from [\"GloVe: Global Vectors for Word Representation\", Jeffrey Pennington et al. (2014)](https://aclanthology.org/D14-1162/). GloVe vectors are trained by factorizing global word co-occurrence statistics computed over a large collection of documents. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 431 + }, + "id": "NQHCpsoXuGH7", + "outputId": "6a26788a-0ab5-431f-bfb7-454b626d67d4" + }, + "outputs": [], + "source": [ + "def word2vec(\n", + " word: str,\n", + " vocabulary:List[str],\n", + " vectors: torch.Tensor\n", + " ) -> Optional[torch.Tensor]:\n", + " \"\"\"Convert a word into a vector\"\"\"\n", + " word = word.lower()\n", + " if word in vocabulary:\n", + " word_idx = vocabulary.index(word)\n", + " return vectors[word_idx]\n", + " else:\n", + " return None\n", + "\n", + "def vec2words(\n", + " vec: torch.Tensor,\n", + " k=5,\n", + " *,\n", + " vocabulary:List[str],\n", + " vectors: torch.Tensor,\n", + " exclude_vecs: List[torch.Tensor] = None,\n", + " ) -> Tuple[List[str], torch.Tensor]:\n", + " \"\"\"Retrieve the nearest word neighbours for an input vector\"\"\"\n", + "\n", + " # compute the similarity between `vec`and all the vectors in `glove_vectors`\n", + " similarity = vectors @ vec\n", + "\n", + " # potentially filter out some vocabulary entries\n", + " if exclude_vecs is not None and len(exclude_vecs):\n", + " mask = None\n", + " for e in exclude_vecs:\n", + " mask_ = (vectors == e[None, 
:]).all(dim=1)\n", + " if mask is None:\n", + " mask = mask_\n", + " else:\n", + " mask |= mask_\n", + " similarity.masked_fill_(mask=mask, value=-math.inf)\n", + "\n", + " # return the ids of the nearesrt neighbours given the similarity\n", + " nearest_neighbour_ids = torch.argsort(-similarity)[:k]\n", + "\n", + " # retrieve the corresponding words in the `vocabulary``\n", + " return [vocabulary[idx] for idx in nearest_neighbour_ids], similarity[nearest_neighbour_ids]\n", + "\n", + "# register the vocab and vectors args\n", + "glove_args = {'vocabulary':glove_vocabulary, 'vectors':glove_vectors}\n", + "\n", + "# Nearest neighbours\n", + "rich.print(\"[red]Nearest neighbour search:\")\n", + "for word in [\"king\", \"queen\", \"dog\", \"France\"]:\n", + " rich.print(f'Nearest neighbours of the word \"{word}\":')\n", + " word_vec = word2vec(word, **glove_args)\n", + " words, similarities = vec2words(word_vec, k=5, **glove_args, exclude_vecs=[word_vec])\n", + " rich.print(f\"Words: {words}\")\n", + " rich.print(f\"Similarities: {similarities}\")\n", + "\n", + "# Word analogies\n", + "rich.print(\"\\n[red]Vector arithmetic:\")\n", + "cases = [\n", + " [(\"+\", \"king\"), (\"-\", \"man\"), (\"+\", \"woman\")],\n", + " [(\"+\", \"denmark\"), (\"-\", \"france\"), (\"+\", \"paris\")],\n", + " [(\"+\", \"pakistan\"), (\"-\", \"belgium\"), (\"+\", \"brussels\")],\n", + "]\n", + "for operations in cases:\n", + " # current location in the vector space\n", + " location = 0\n", + " rich.print(f\"Vector Translation: [blue]0 {' '.join(f'{d} {v}' for d,v in operations)} = \")\n", + " for sign, word in operations:\n", + " # retrieve the `vec(word)``\n", + " vec = word2vec(word, **glove_args)\n", + " if vec is None:\n", + " raise ValueError(f\"Unknown word `{word}`\")\n", + "\n", + " # parse the direction (+/-)\n", + " direction = {\"+\": 1, \"-\": -1}[sign]\n", + "\n", + " # apply the vector transform to the current location\n", + " location += direction * vec\n", + "\n", + " # 
return the nearest neighbours of the end location\n", + " exclude_list = [word2vec(w, **glove_args) for _, w in operations]\n", + " words, similarities = vec2words(location, k=5, exclude_vecs=exclude_list, **glove_args)\n", + " rich.print(f\"Words: {words}\")\n", + " rich.print(f\"Similarities: {similarities}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0ZnUwuv0uGIA" + }, + "source": [ + "**Beyond word2vec** The Skip-Gram model makes it possible to learn meaningful word representations, and arithmetic in the resulting vector space allows manipulating concepts. Ultimately, we are interested in learning representations that represent larger text fragments such as sentences, paragraphs or documents. Doing so requires combining multiple vectors, which can be done by exploiting arithmetic in the vector space, or by combining word-vectors using deep neural networks, such as Transformers!\n", + "\n", + "___\n", + "## II. Language models\n", + "\n", + "We have seen how to encode text into sequences of tokens, seen how to convert tokens into vectors using a set of embeddings and experimented with a GloVe word vector space. In this section we will see how to model text at the sentence, paragraph or even document level using language models.\n", + "\n", + "### II.a Language Modelling\n", + "\n", + "*Figure: Left-to-right language models*\n", + "![Autoregressive left-to-right language model](https://github.com/vlievin/transformers-lab/blob/main/images/ar-lm.png?raw=1)\n", + "\n", + "**Autoregressive factorization** Language models aim at grasping the underlying linguistic structure of a fragment of text: whereas word vectors model words independently of each other, a language model tracks the grammatical and semantic relationships between word tokens. 
Given a piece of text encoded into tokens $\\mathbf{w}_{1:T} = [\\mathbf{w}_1, \\ldots, \\mathbf{w}_T]$, a *left-to-right* language model describes $\\mathbf{w}_{1:T}$ with the following factorization:\n", + "$$\n", + " p_\\theta(\\mathbf{w}_{1:T}) = \\prod_{t=1}^T p_\\theta(\\mathbf{w}_t \\mid \\mathbf{w}_{< t}) \\ ,\n", + "$$\n", + "where $\\mathbf{w}_{< t} = [\\mathbf{w}_1, \\ldots, \\mathbf{w}_{t-1}]$ denotes the tokens preceding $\\mathbf{w}_t$ (the left context).\n", + "\n", + "**Bidirectional factorization** Instead of conditioning only on the left context, each token can be predicted from both its left context $\\mathbf{w}_{< t}$ and its right context $\\mathbf{w}_{> t}$. This defines a bidirectional language model, which factorizes as\n", + "$$\n", + "L_\\theta(\\mathbf{w}_{1:T}) = \\prod_{t=1}^T p_\\theta(\\mathbf{w}_t \\mid \\mathbf{w}_{-t}) \\ ,\n", + "$$\n", + "where $\\mathbf{w}_{-t}$ represents the set of tokens $\\mathbf{w}_{1:T} \\backslash \\{ \\mathbf{w}_t \\}$. We call $L_\\theta$ a *pseudo*-likelihood because it does not form a valid distribution (because the graph formed by $\\mathbf{w}_{1:T}$ is not a directed acyclic graph (a DAG)). Bidirectional language models such as [ELMo (\"Deep contextualized word representations\", Peters et al. (2018))](https://arxiv.org/abs/1802.05365), learn token representations contextualized on the whole context.\n", + "\n", + "In the case of bidirectional language models, the context $\\mathbf{w}_{-t}$ corresponds to the whole sequence of tokens with the predicted element masked out. It is possible to generalize the bidirectional factorization to masking out one or more tokens. In that case, we consider a model $p_\\theta(\\mathbf{w}_m \\mid \\mathbf{w}_{-m})$ where $m$ is a set of indices of the tokens being predicted and $-m$ is the set of the other tokens. This is notably the approach adopted in [\"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\", Devlin et al. (2018)](https://arxiv.org/abs/1810.04805)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NQT5jAR5uGIF" + }, + "source": [ + "### II.b Recurrent Neural Networks\n", + "\n", + "*Figure: Left-to-right recurrent neural network. 
We highlight the information flowing from the context \"My horse is\" to the predicted word \"very\".*\n", + "![Recurrent Neural Network](https://github.com/vlievin/transformers-lab/blob/main/images/recurrent-lm-activated.png?raw=1)\n", + "\n", + "**Recurrent neural networks (RNNs)** implement a recursive function $f_\\theta$ using neural networks, which makes them a particularly good fit for sequential data. In the general setting, RNNs model the acquired knowledge at time $t$ using an additional variable $\\mathbf{h}_t$ of dimension $d_h$ (*hidden state*). The hidden state at step $t-1$ is updated with the information extracted from the observation $\\mathbf{w}_t$ using a function\n", + "$$\n", + "h_\\theta: (\\mathbf{w}_{t}, \\mathbf{h}_{t-1}) \\rightarrow \\mathbf{h}_{t} \\ ,\n", + "$$\n", + "which can be imlemented using an arbitrary neural network that takes the tuple $(\\mathbf{w}_{t}, \\mathbf{h}_t)$ as input and returns a new hidden state $\\mathbf{h}_{t+1}$. RRNs can be applied to parametrize language models by projecting the hidden state $\\mathbf{t}$ into the vocabulary space using a projection matrix $\\mathbf{F} \\in \\mathcal{R}^{V \\times d_h}$. This results in parameterizing the transition distribution as\n", + "$$\n", + "p_\\theta(\\cdot \\mid \\mathbf{w}_{