diff --git a/.gitignore b/.gitignore index 667f0ec..1b03ad5 100644 --- a/.gitignore +++ b/.gitignore @@ -20,5 +20,3 @@ Session_9/.ipynb_checkpoints/exercises_9_solutions-checkpoint.ipynb Session_2/.ipynb_checkpoints/exercises_2-checkpoint.ipynb Session_3/.ipynb_checkpoints/exercises_3-checkpoint.ipynb Session_3/.ipynb_checkpoints/Readme-checkpoint.md -.* -~* \ No newline at end of file diff --git a/README.md b/README.md index c1cf2d4..4177656 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,20 @@ # Class Schedule This repository contains the folders with the content of each class. Please check them for the references of each meeting. -The schedule goes as follows (**always 10-12AM in room R42.3.103**). +The schedule goes as follows (**always 10-12AM in room R42.3.103**): | Session | Date | Discussant | Topic | References | | ------------- | -------------|-------------|-------------| ----------------------------------------------------------------------------------------------------------------------------------- | | [Session 1](https://github.com/Python-do-ECARES/Classes/tree/master/Session_1) | 11/02/2020 | Glenn | Reproducible research, automation and introduction to the course | Slides / [Gentzkow and Shapiro](https://web.stanford.edu/~gentzkow/research/CodeAndData.pdf)| | [Session 2](https://github.com/Python-do-ECARES/Classes/tree/master/Session_2) | 18/02/2020 | Fabrizio | Jupyter (Anaconda), Git, GitHub, GitHub Desktop | Slides / [QuantEcon](https://python.quantecon.org/getting_started.html) | | [Session 3](https://github.com/Python-do-ECARES/Classes/tree/master/Session_3) | 25/02/2020 | Federico | Variables, numbers, strings, lists, functions, tuples, sets, dictionaries, lists comprehension, for and while loops, if statements, other logical operators | [Udemy - Section 1:"Intro to Python" and Section 2:"Python fundamentals"](https://www.udemy.com/course/the-complete-python-course/learn/lecture/9412506#overview), [QuantEcon - An Introductory 
Example](https://python.quantecon.org/python_by_example.html) and [Python Essentials](https://python.quantecon.org/python_essentials.html) | -| [Session 4](https://github.com/Python-do-ECARES/Classes/tree/master/Session_4) | 03/03/2020 | Alberto, Marco & Giampaolo | Loops and Functions (Part 2) | | -| [Session 5](https://github.com/Python-do-ECARES/Classes/tree/master/Session_5) | 10/03/2020 | Angela, Moritz & Thomas |APIs| [Udemy - Section 16:"Interacting with APIs with Python"](https://www.udemy.com/course/the-complete-python-course/learn/lecture/9412506#overview) | +| [Session 4](https://github.com/Python-do-ECARES/Classes/tree/master/Session_4) | 03/03/2020 | | Classes, handling errors (including try and except), files | [Udemy - Section 4:"Object-Oriented Programming with Python", Section 5:"Errors in Python", Section 6:"Files in Python"](https://www.udemy.com/course/the-complete-python-course/learn/lecture/9412506#overview), [QuantEcon - OOP I](https://python.quantecon.org/oop_intro.html), [Building Classes](https://python.quantecon.org/python_oop.html) and [Debugging](https://python.quantecon.org/debugging.html) | +| [Session 5](https://github.com/Python-do-ECARES/Classes/tree/master/Session_5) | 10/03/2020 | Moritz |APIs| [Udemy - Section 16:"Interacting with APIs with Python"](https://www.udemy.com/course/the-complete-python-course/learn/lecture/9412506#overview) | | [Session 6](https://github.com/Python-do-ECARES/Classes/tree/master/Session_6) | 17/03/2020 | Charles | Web Scraping, regular expressions, Selenium| [Udemy - Section 10:"Advanced Python Development", Section 11:"Web Scraping with Python", Section 12:"Browser Automation with Selenium"](https://www.udemy.com/course/the-complete-python-course/learn/lecture/9412506#overview) | | [Session 7](https://github.com/Python-do-ECARES/Classes/tree/master/Session_7) | 24/03/2020 | Cristina & Remi | Pandas | [QuantEcon - Pandas](https://python.quantecon.org/pandas.html) and [QuantEcon- 
DataScience](https://datascience.quantecon.org/pandas/) | | Homework | *Easter Break*| | | | -| [Session 8](https://github.com/Python-do-ECARES/Classes/tree/master/Session_8) | 21/04/2020 | Charles and Fabrizio | Discussion Easter projects | | -| [Session 9](https://github.com/Python-do-ECARES/Classes/tree/master/Session_9) | 28/04/2020 | Domenico | Numpy and Numba | [QuantEcon - Numpy](https://python.quantecon.org/numpy.html) and [QuantEcon - Numba](https://python.quantecon.org/numba.html) | | +| [Session 8](https://github.com/Python-do-ECARES/Classes/tree/master/Session_8) | 21/04/2020 | | Discussion Easter homework | | +| [Session 9](https://github.com/Python-do-ECARES/Classes/tree/master/Session_9) | 28/04/2020 | | Numpy and Numba | [QuantEcon - Numpy](https://python.quantecon.org/numpy.html) and [QuantEcon - Numba](https://python.quantecon.org/numba.html) | | | [Session 10](https://github.com/Python-do-ECARES/Classes/tree/master/Session_10) | 12/05/2020 | | Scipy | [QuantEcon - Scipy](https://python.quantecon.org/scipy.html) and [Scipy.optimize](https://docs.scipy.org/doc/scipy/reference/optimize.html) | | -| [Session 11](https://github.com/Python-do-ECARES/Classes/tree/master/Session_11) | 19/05/2020 | Yasmine | Visualisation (matplotlib) | [QuantEcon- Matplotlib](https://python.quantecon.org/matplotlib.html) | -| [Session 12](https://github.com/Python-do-ECARES/Classes/tree/master/Session_12) | 26/05/2020 | Glenn & Fabrizio | Intro to Machine Learning | [QuantEcon DataScience](https://datascience.quantecon.org/applications/ml_in_economics.html) and TBD | +| [Session 11](https://github.com/Python-do-ECARES/Classes/tree/master/Session_11) | 19/05/2020 | | Visualisation (matplotlib) | [QuantEcon- Matplotlib](https://python.quantecon.org/matplotlib.html) | +| [Session 12](https://github.com/Python-do-ECARES/Classes/tree/master/Session_12) | 26/05/2020 | | Intro to Machine Learning | [QuantEcon 
DataScience](https://datascience.quantecon.org/applications/ml_in_economics.html) and TBD | diff --git a/Session_11/Readme.md b/Session_11/Readme.md.txt similarity index 100% rename from Session_11/Readme.md rename to Session_11/Readme.md.txt diff --git a/Session_12/Readme.md b/Session_12/Readme.md.txt similarity index 100% rename from Session_12/Readme.md rename to Session_12/Readme.md.txt diff --git a/Session_3/Slides_3_solutions.ipynb b/Session_3/Slides_3_solutions.ipynb deleted file mode 100644 index 01b6ccd..0000000 --- a/Session_3/Slides_3_solutions.ipynb +++ /dev/null @@ -1,1609 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Session 3: introduction to Python\n", - "We will cover (more details about the topics below are available in the [Python Bible](https://docs.python.org/3.7/library/stdtypes.html)):\n", - "* Numeric types, *int* and *float*, *booleans* operators and *strings* \n", - "* Containers: *dictionaries*, *lists*, *sets*, *tuples*\n", - " * We will not cover other containers, you can find them [here](https://docs.python.org/3.7/library/collections.html)\n", - "* Introduction to functions, which we will cover more in-depth next week\n", - "* Loops\n", - "\n", - "First, let's see the solutions to the exercises" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "# RISE: slideshow using jupyter notebook\n", - "* Step 1: Follow the installation procedure in the [RISE webpage: Installation](https://rise.readthedocs.io/en/stable/installation.html)\n", - " * If you don't use conda, consider using virtual environments ([pipenv](https://pipenv-fork.readthedocs.io/en/latest/)) instead of pip\n", - "* Step 2: Write your code on jupyter lab or jupyter notebook (you can also use other editors, but you will have to organise the code after)\n", - "* Step 3: Open jupyter notebook\n", - "* Step 2: 
From the \"View\" tab select \"Cell Toolbar -> Slideshow\"\n", - "* Step 5: Organise the slide type for each block of content (details for this on the [RISE webpage: Usage](https://rise.readthedocs.io/en/stable/usage.html))\n", - "* Step 6: Use shortcuts or the \"Enter/Exit RISE slideshow\" button in your jupyter notebook to start the presentation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Exercise 1: Variables, Numbers and Strings\n", - "\n", - "You have a variable of type *int* and another one of type *float*.\n", - "* They are both numeric types\n", - "* *float* is used for floating-point numbers\n", - " * We insert decimal floating-point numbers, and the computer approximates them to base 2 fractions\n", - "* *int* is for integer numbers\n", - "* You can also write complex numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "a = 40\n", - "b = 4.0\n", - "print(type(a))\n", - "print(type(b))\n", - "# Otherwise, you can write print(type(40)) and print(type(4.0)) instead of assigning values to variables" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Convert the integer *a* to a *float*, and viceversa." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "a = float(a)\n", - "print(type(a))\n", - "b = int(b)\n", - "print(type(b))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "The function *int()* truncates *float* numbers, and truncating a number is different from rounding it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "print(b)\n", - "# Remember that b as float is 4.0\n", - "temp = 4.6\n", - "print(temp)\n", - "temp = int(temp)\n", - "print(temp)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Sum, subtract, multiply and divide the two variables above by one another. Print the output of each operation. Notice the type of the output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "c = a + b\n", - "print(c)\n", - "d = a - b\n", - "print(d)\n", - "e = a * b\n", - "print(e)\n", - "f = a / b\n", - "print(f)\n", - "# You can take an exponent using two asterisks\n", - "print(a ** b)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "Now, *a* is a *float* and *b* is an *integer*. Divide them by one another to get an *integer*. (Hint: a / b won't work. 
and \"/\" is different from \"//\" )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "a = int(a)\n", - "g = a // b\n", - "print(g)\n", - "print(type(g))\n", - "# The operation \"//\" is a floored quotient, meaning that it rounds the result to the greates integer below the floating result.\n", - "# For example, you can try to change the numerator to (a + 11) and check the output " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Let's now move onto **strings**. Create two strings, one with the text *Hello* and the other with *World*. Print them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "string_1 = \"Hello\"\n", - "string_2 = \"World\"\n", - "print(string_1)\n", - "print(string_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "# There are a lot of things that you can do using strings, just to mention a few of them:\n", - "# Concatenate strings\n", - "print(string_1 + string_2)\n", - "# The output does not include a space, you can write print(string_1 + \" \" + string_2) to include it, or\n", - "print(\" \".join([string_1, string_2]))\n", - "# Add a string multiple times to itself\n", - "print(string_1 * 4)\n", - "# Slice the string\n", - "print(string_1[0:2])\n", - "# Get the number of elements in the string\n", - "print(len(string_1))\n", - "temp_2=\"Hello!\"\n", - "print(len(temp_2))\n", - "# Check if a string is part of another string!\n", - "print(\"He\" in string_1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Below a brief introduction to formatted string literals:" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "temp_name = \"Fede\"\n", - "temp_age = 24\n", - "print(f\"Name: {temp_name}; age: {temp_age}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "This might look useless for now, but wait for it!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Exercise 2: Lists\n", - "\n", - "Create a list including *a*, *b*, and *\"Hello, World!\"*. Print the entire list and its third element separately. (Hint = the index of the first element in Python is 0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "list_1 = [a, b, string_1 + \", \" + string_2 +\"!\"]\n", - "print(list_1)\n", - "print(type(list_1))\n", - "print(list_1[2])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "Remove the element that is not a number, and replace it with an *int*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "list_1.remove(list_1[2])\n", - "list_1.append(8)\n", - "print(list_1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "Print the length of the list defined above." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "print(len(list_1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# Add a list to itself multiple times\n", - "print(f\"{list_1 * 3} \\nThe length of this list is: {len(list_1 * 3)}\")\n", - "# Check if an element is in a list or not\n", - "print(8.0 in list_1)\n", - "print(8 not in list_1)\n", - "# List slicing\n", - "print(list_1[0])\n", - "print(list_1[0:2])\n", - "# Append an item. Remember that if you try to append an item using temp_list[3].append(40) you get an error\n", - "# You can use temp_list = list_1[:] instead of the copy method\n", - "temp_list = list_1.copy()\n", - "temp_list.append(4)\n", - "# Check the occurrences of an element in a list\n", - "print(temp_list.count(4))\n", - "# Check the min in the list (can use max(temp_list) for the max)\n", - "print(min(temp_list))\n", - "# The remove method removes the first item in the list with the specified value\n", - "print(temp_list)\n", - "temp_list.remove(4)\n", - "print(temp_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "temp_list = list_1.copy()\n", - "temp_list.append(4)\n", - "# You can also use \".pop\" and del to remove items in a different way\n", - "print(temp_list.pop(1))\n", - "print(temp_list)\n", - "del temp_list[1]\n", - "print(temp_list)\n", - "# For delete you can also use list slicing with steps\n", - "temp_list2 = list(range(10))\n", - "print(temp_list2)\n", - "del temp_list2[0:10:2]\n", - "print(temp_list2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": 
[ - "# You can also create a list of lists (a matrix)\n", - "temp_list3 = [list_1] * 3\n", - "print(temp_list3)\n", - "# The first element of this list is a list. You can slice the first element of the first list\n", - "print(temp_list3[0])\n", - "print(temp_list3[0][0])\n", - "temp_list3.remove([40, 4, 8])\n", - "print(temp_list3)\n", - "temp_list3.append([20, 2, 4])\n", - "print(temp_list3)\n", - "print(temp_list3.pop(2))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "#### List comprehension" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "temp_list4 = [x for x in list_1]\n", - "print(temp_list4)\n", - "# This gives an outcome similar to the method .copy mentioned above\n", - "temp_list5 = [x for x in range(10)]\n", - "print(temp_list5)\n", - "temp_list6 = [x for x in temp_list5 if temp_list5[x] % 2 is 0]\n", - "print(temp_list6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "# We can use the list index in a f string\n", - "temp_list_name = [\"Fede\", \"Ale\"]\n", - "temp_list_age = [24, 27]\n", - "print(f\"Name: {temp_list_name[0]}; Age: {temp_list_age[0]} \\nName: {temp_list_name[1]}; Age: {temp_list_age[1]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Exercise 3: Tuples, sets and dictionaries\n", - "\n", - "Create a tuple and a dictionary including a, b, and \"Hello, World!\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tuple_1 = (a, b, string_1 + \", \" + string_2 +\"!\")\n", - "print(tuple_1)\n", - "dict_1 = {'int_1': a, 'int_2': b, 'string_1': string_1 + \", \" + string_2 +\"!\"}\n", - "print(dict_1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dictionary includes *key: value* pairs, and the keys within a dictionary must be unique. You can use strings or numbers as keys, but also tuples that contain strings, numbers, or other tuples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "The tuples are immutable objects, meaning that: \"Such an object cannot be altered. A new object has to be created if a different value has to be stored.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(tuple_1[0])\n", - "tuple_1[0] = 20" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "\n", - "Add another *int* to the tuple." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tuple_1 += (8,)\n", - "print(tuple_1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "You can include lists inside a tuple" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "temp_tuple1 = ([\"Fede\", \"Ale\"], [24, 27])\n", - "print(type(temp_tuple1))\n", - "temp_tuple1[1][0] = 42\n", - "print(temp_tuple1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "temp_tuple1[1] = [54, 57]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Print the value associated to the first key of the dictionary defined above." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(dict_1['int_1'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "temp_dict1 = dict_1.copy()\n", - "print(temp_dict1)\n", - "temp_dict1[\"goodbye_1\"] = \"Goodbye, World!\" \n", - "print(temp_dict1)\n", - "# To get the keys in different orders\n", - "print(f\"Not sorted:{list(temp_dict1)} \\nSorted:{sorted(temp_dict1)}\")\n", - "# You can check if a key is in the dictionary\n", - "print(\"goodbye_1\" in temp_dict1)\n", - "print(\"Goodbye, World!\" in temp_dict1[\"goodbye_1\"])\n", - "# You can delete keys and their associated values using del\n", - "del temp_dict1[\"goodbye_1\"]\n", - "print(\"goodbye_1\" in temp_dict1)\n", - "print(\"goodbye_1\" not in temp_dict1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "You can create a dictionary in different ways:" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# From a list of tuples\n", - "temp_dict2 = dict([(\"int_1\", 40), (\"int_2\", 4), (\"string_1\", \"Hello, World!\")])\n", - "print(temp_dict2)\n", - "# From a list of lists\n", - "temp_dict3 = dict([[\"int_1\", 40], [\"int_2\", 4], [\"string_1\", \"Hello, World!\"]])\n", - "print(temp_dict3)\n", - "temp_dict3[\"int_1\"] = 41\n", - "print(temp_dict3)\n", - "# Using a dict comprehension\n", - "temp_dict4 = {x: [y, y**2] for x, y in [[\"int_1\", 1], [\"int_2\", 2], [\"int_3\", 3]]}\n", - "print(temp_dict4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# With dict you need 2 arguments per element, but the second one containing values can include multiple elements\n", - "temp_dict5 = dict([(\"int_1\", (40, 41, 42)), (\"int_2\", (4, 5, 6)), (\"string_1\", \"Hello, World!\")])\n", - "print(temp_dict5)\n", - "# If the values for a key are in a tuple, you will not be able to change them, but you can change the tuple assigned to that key\n", - "temp_dict5[\"int_1\"] = (41, 42, 43)\n", - "print(temp_dict5)\n", - "# While you can do that using lists\n", - "temp_dict6 = dict([[\"int_1\", [40, 41, 42]], [\"int_2\", [4, 5, 6]], [\"string_1\", \"Hello, World!\"]])\n", - "print(temp_dict6)\n", - "temp_dict6[\"int_1\"][0] = 41\n", - "print(temp_dict6)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Sets\n", - "A set is an unordered collection with no duplicate elements" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "set_1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3}\n", - "set_2 = {0, 2, 4, 6, 10, 11, 12}\n", - "print(set_1)\n", - "print(type(set_1))\n", - "# You can print the element in set_1 that are not in set_2, 
you can alse use \"set_1 - set_2\" to do that)\n", - "print(set_1.difference(set_2))\n", - "# You can print the union of two sets (use \"set_1 | set_2\")\n", - "print(set_1.union(set_2))\n", - "# The intesection (use \"set_1 & set_2\" or set_1.intersection(set_2))\n", - "print(set_1 & set_2)\n", - "# The symmetric difference for elements in set_1 or set_2 but not in both (you can use \"set_1 ^ set_2\")\n", - "print(set_1.symmetric_difference(set_2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "# Check if an element is in a set\n", - "print(f\"Is 9 an element of set 1? {9 in set_1} \\nIs 9 an element of set 2? {9 in set_2}\")\n", - "# You can use set comprehension to create a set\n", - "set_3 = {x for x in range(10)}\n", - "print(set_1.symmetric_difference(set_3))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Exercise 4: Functions and logical statements\n", - "\n", - "Create a function that takes an *int*, a *float* and a *string* as inputs and returns a list containing the three of them." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "# The typing package is not necessary, but it helps in keeping track of the inputs' types and the type of the function output\n", - "\n", - "def func_1(int_1: int, float_1: float, str_1: str) -> list:\n", - " return [str(int_1), str(float_1), str_1]\n", - "\n", - "# Below the function using positional arguments, you can also use keyword arguments: func_1(int_1 = 4, float_1 = 3.0, str_1 = \"hello\")\n", - "# You can use keyword arguments after positional arguments (as long as they do not assign values to the same argument), but not the opposite\n", - "func_1(4, 3.0, 'hello')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "The functions start with a \"def\", then you have the name of the function and the inputs inside round brackets.\n", - "\n", - "After the semicolons you can write the function, remember that you need one indentation level for each function (or loop). You can use 4 spaces or a tab to do the indent.\n", - "\n", - "The return at the end of the function gives the output from that function. If return is not assigned, Python will give by default an output \"None\" (same happens if return is included but it does not have an expression argument defined)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### More fun with FUNctions\n", - "Remember to define the functions before they are called in the code. 
You can set default values for the parameters, and you have to order the parameters starting from the one that do not have default values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "def temp_func1(float_1: float, str_1: str, int_1: int = 4) -> list:\n", - " temp = temp_func2()\n", - " return [[str(int_1), str(float_1), str_1], temp]\n", - "\n", - "print(temp_func1(3.0, \"hello\"))\n", - "\n", - "def temp_func2(int_2: int = 8, float_2: float = 6.0, str_2: str = \"Hello\") -> list:\n", - " return [str(int_2), str(float_2), str_2]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "You can use unpacking argument for lists and tuples (using \"*\"), while double asterisks for dictionaries, to get the values used in a function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "def temp_func3(float_1: float, str_1: str, int_1: int = 4) -> list:\n", - " return [str(int_1), str(float_1), str_1]\n", - "\n", - "temp_tuple2 = (6.0, \"Hello\", 8)\n", - "print(temp_func3(*temp_tuple2))\n", - "\n", - "temp_dict7 = {\"int_1\": 8, \"float_1\": 6.0, \"str_1\": \"Hello\"}\n", - "print(temp_func3(**temp_dict7))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "# You can (and should) include a documentation inside your function\n", - "def temp_doc():\n", - " \"\"\" Usually the summary in the first line.\n", - " \n", - " Leave one black space between the summary and the rest of the text\n", - " \"\"\"\n", - " pass\n", - "\n", - "print(temp_doc.__doc__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - 
"Use a lambda function to achieve the same result. It is a short function (in one line) that take parameters and an expression; the latter after the semicolons." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "lambda_1 = lambda a, b, c : [str(a), str(b), c]\n", - "print(lambda_1(4, 3.0, 'hello'))\n", - "print(type(lambda_1(4, 3.0, 'hello')))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Let's move into something slightly more involving. This function checks the type of the input. \n", - "* The part of code in the \"if\" statement environment is evaluated if the initial condition is *True*.\n", - "* If the \"if\" condition evaluates to *False*, we jump to the elseif (or else) statement that follows the initial \"if\" statement. Elseif is short for else if.\n", - "* You can include else or elseif statements, they are not necessary in the code. In that case, when the if statement is true, the code will continue as if there is an else statement containing *pass*, meaning that it does nothing when the if statement evaluates to *False* " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "def func_2(x):\n", - " if type(x) == int:\n", - " print(f\"{x} is an integer\")\n", - " elif type(x) == float:\n", - " print(f\"{x} is a float\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Notice that the function only checks if the input is an *int* or a *float*. 
Add an extra condition to check if the input is a *string*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "def func_2(x):\n", - " if type(x) == int:\n", - " print(f\"{x} is an integer\")\n", - " elif type(x) == float:\n", - " print(f\"{x} is a float\")\n", - " elif type(x) == str:\n", - " print(f\"{x} is a string\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "func_2(\"hello\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Comparisons\n", - "The comparisons yield boolean values (*True* or *False*). We will discuss:\n", - "* Value comparisons, which compare the values of two objects\n", - " * In this group you can find equality comparison \"==\" and \"!=\"\n", - " * Order comparison, \"<\", \">\", \"<=\", \">=\"\n", - "* Membership test, we have seen them previously as \"x in y\" and \"z not in f\"\n", - "* Identity comparisons \"x is y\"and \"z is not f\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Value comparisons" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "# Comparison of numbers\n", - "print(5 == 5)\n", - "print(5 != 5)\n", - "print(7 < 10)\n", - "print(7 < 10 < 20)\n", - "# This is pretty much the same as \"7 < 20 and 20 < 10\".\n", - "# The first comparison evaluates \"True\", but the second is \"False\", so the presence of \"and\" gives a \"False\"\n", - "print(7 < 20 < 10)\n", - "# For strings the comparison is realised looking at the unicode of each character\n", - "print(\"hello\" < \"Hello\")\n", - "print(\"h unicode is\", ord('h'), \", H 
unicode is\", ord('H'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# You can compare collections if they have the same type (a list with a list, set with another set, etc.)\n", - "# Lexicographic comparison\n", - "print([1, 2, 3] < [1, 2, 4])\n", - "print([1, 2] < [1, 2, 4])\n", - "print([1, 3] < [1, 2, 4])\n", - "# For dictionaries you can only use equality comparison, it evaluates the equality between (key, value) pairs\n", - "temp_dict8 = {\"temp_1\": 1, \"temp_2\": 2}\n", - "temp_dict9 = {\"temp_1\": 1, \"temp_2\": 2}\n", - "print(temp_dict8 == temp_dict9)\n", - "# For sets we have order comparison operators to realise subset and superset tests\n", - "temp_set1 = {1, 2}\n", - "temp_set2 = {1, 2, 3}\n", - "print(temp_set1 == temp_set2)\n", - "print(temp_set1 < temp_set2)\n", - "print(temp_set1 > temp_set2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Identity comparison\n", - "It is not the same as equality comparisons as you can see below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "temp_var = [1, 2]\n", - "temp_var2 = [1, 2]\n", - "print(id(temp_var2))\n", - "print(id(temp_var))\n", - "print(temp_var is temp_var2)\n", - "print(temp_var == temp_var2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Boolean operations\n", - "The boolean values are *True* and *False*. 
But what exactly evaluates to these two values?\n", - "* Values for *False*: False, None, numeric zero of all types, empty strings and empty containers.\n", - "* Values for *True*: All the values that do not evaluate to *False*.\n", - "\n", - "We can use the following operator and expressions:\n", - "* The operator \"not\" yields *True* if its argument is false, *False* otherwise. In other words, it reverses the result.\n", - "* The expression *x and y* first evaluates x; if x is false, its value is returned; otherwise, y is evaluated and the resulting value is returned.\n", - "* The expression *x or y* first evaluates x; if x is true, its value is returned; otherwise, y is evaluated and the resulting value is returned." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "x = 42\n", - "y = 20\n", - "z = 22\n", - "print(not(x == 42 and y + z == x))\n", - "print(not(x == 42 and y + z != x))\n", - "print(x != 42 or y == 20)\n", - "print(bool([]))\n", - "print(bool([1]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Exercise 5: For and While Loops\n", - "\n", - "Python allows you to loop over several types of objects. A simple loop looks like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "for x in range(10):\n", - " print(x, end=\" \")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "The for statement iterates over the items of a sequence. In this case, the sequence of numbers generated by the *range(10)* function.\n", - "\n", - "You can notice that the for statement follows the order in which the items appear in the sequence." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Modify the script above to loop over a list containing both numbers (*int* and/or *float*) and *string*. Print the type of the input at each iteration." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "for x in [1, 3.0, \"hello\"]:\n", - " print(\"The input supplied is a \" + str(type(x)))\n", - " \n", - "# If you want a better look for your print output use the \"__name__\" dunder method, more regarding dunder below\n", - "for x in [1, 3.0, \"hello\"]:\n", - " print(\"The input supplied is a \" + str(type(x).__name__))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "The next chunk of code fills in an empty list with a sequence of integers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "list_1 = []\n", - "\n", - "for i in range(10):\n", - " list_1.append(i)\n", - " \n", - "print(list_1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "Produce the same result using a while loop instead (Hint: create a new list, call it *list_2*. Make sure to update the counter at each iteration of the loop)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "i = 0\n", - "list_2 = []\n", - "while i < 10:\n", - " list_2.append(i)\n", - " i += 1\n", - "\n", - "print(list_2)\n", - "print(list_1 == list_2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "The *while* loop executes the code inside its environment as long as the initial condition (in this case \"i < 10\") evaluates to *True*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "Sum the strings above to obtain *Hello, World!*. Print the result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "string_3 = string_1 + \", \" + string_2 +\"!\"\n", - "print(string_3)\n", - "# Otherwise, use:\n", - "# print(string_1 + \", \" + string_2 +\"!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# You can add more arguments in the for loop to unpack items\n", - "temp_list7 = [[\"Fede\", 24, \"ULB\"], [\"Ale\", 27, \"Sapienza\"]]\n", - "for x, y, z in temp_list7:\n", - " print(f\"Item in the list {x}, {y}, {z}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Break statement\n", - "The *break* statement exits the loop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "for i in range(10):\n", - " if i <= 7:\n", - " print(i, end=\" \")\n", - " else:\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - 
"slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "i = 0\n", - "while i < 10:\n", - " print(i, end=\" \")\n", - " i += 1\n", - " if i == 9:\n", - " break " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Continue statement\n", - "The *continue* statement restart the loop moving to the next iteration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "temp_list8 = list(range(10))\n", - "temp_list8[5] = \"five\"\n", - "print(temp_list8)\n", - "for i in temp_list8:\n", - " try:\n", - " print(i + 1, end=\" \")\n", - " except TypeError:\n", - " continue" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "temp_list8 = list(range(10))\n", - "temp_list8[5] = \"five\"\n", - "i = 0\n", - "while i < 10:\n", - " if type(temp_list8[i]) == int:\n", - " i += 1\n", - " continue\n", - " temp = temp_list8[i]\n", - " i += 1\n", - " print(temp, type(temp))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "### Else clause\n", - "The else clause is executed only if the for loop has been completed without breaks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "def example_1(container: list):\n", - " for i in container:\n", - " if type(i) == float:\n", - " print(\"This list contains at least one float\")\n", - " break\n", - " else:\n", - " print(\"This list does not contain a float\")\n", - " \n", - "temp_list9 = [1, 2, 3, 4]\n", - "temp_list10 = [1, 2, 3.0, 4]\n", - "example_1(temp_list9)\n", - "example_1(temp_list10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "def example_2(i: int, n: int, c: int):\n", - " initial_value = i\n", - " while i <= n:\n", - " i += 1\n", - " if i == (2/3)*c:\n", - " print(f\"The value {i} is 2/3 of {c}. Stop the loop.\")\n", - " break\n", - " else:\n", - " print(f\"2/3 of {c} is {(2/3)*c}; it is not part of the interval [{initial_value}, {n}]\")\n", - "\n", - "example_2(0, 10, 12)\n", - "example_2(0, 10, 30)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Classes in Python" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "class Savings:\n", - " \n", - " def __init__(self, w):\n", - " self.wealth = w\n", - " self.new_wealth = w\n", - " \n", - " def salary(self, x):\n", - " self.new_wealth += x\n", - " \n", - " def expenditures(self, **kwargs):\n", - " total_expenditures = 0\n", - " for key, value in kwargs.items():\n", - " total_expenditures += value\n", - " print(f\"You plan to spend {value} euro on {key}\")\n", - " new_wealth = self.new_wealth - total_expenditures\n", - " if new_wealth < 0:\n", - " print(\"Insufficient savings, try decreasing your expenditures\")\n", - " else:\n", - " self.new_wealth = new_wealth\n", - " print(f\"Your new wealth after the expenditures will be: {self.new_wealth} euro\")\n", - " if self.new_wealth < self.wealth:\n", - " print(f\"You will be below your initial wealth by {self.wealth - self.new_wealth} euro\")\n", - " elif self.new_wealth == self.wealth:\n", - " print(f\"Your new wealth will be the same as the initial one: {self.new_wealth} euro\")\n", - " else:\n", - " print(f\"You will be above your initial wealth by {self.new_wealth - self.wealth} euro\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": 
"subslide" - } - }, - "outputs": [], - "source": [ - "person = Savings(1000)\n", - "print(person.wealth)\n", - "person.salary(200)\n", - "print(person.new_wealth)\n", - "person.expenditures(chocolate = 50, beer = 75)\n", - "print(person.new_wealth)\n", - "person.expenditures(chocolate = 400, beer = 600)\n", - "print(person.new_wealth)\n", - "person. expenditures(chocolate = 100)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Debugger using jupyter notebook\n", - "In jupyter notebook you can use the *%debug* magic to launch the debugger. In jupyter lab you can have a better graphical interface for the debugger using the debugger extension." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "def example3(s1: set, s2: set):\n", - " s3 = s1 & s2\n", - " %debug\n", - " print(s3[1])\n", - " return s3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "temp_set1 = set(range(0 , 20, 2))\n", - "temp_set2 = set(range(0, 20, 4))\n", - "print(temp_set1)\n", - "print(temp_set2)\n", - "example3(temp_set1, temp_set2)" - ] - } - ], - "metadata": { - "celltoolbar": "Slideshow", - "file_extension": ".py", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - }, - "mimetype": "text/x-python", - "name": "python", - "npconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 3 - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/Session_4/README.md b/Session_4/README.md 
deleted file mode 100644 index 0ba1317..0000000 --- a/Session_4/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Session 4: More on Python's basics - -In this session, we will work more specifically on loops and functions. We have two files, both to be opened with Jupyter. - -The first one, the regular one (named `exercises_4.ipynb`), shows how to play around with lists of values to simulate a small investment strategy and perform non-parametric Monte-Carlo simulations. The second one, which is optional (and called `ad_fundum_the_programming_case.ipynb`), digs further into the topics of exception handling, recursive functions and Belgian student drinking culture. - -The solutions to both of them will be posted. diff --git a/Session_8/Readme.md b/Session_4/Readme.md.txt similarity index 100% rename from Session_8/Readme.md rename to Session_4/Readme.md.txt diff --git a/Session_4/ad_fundum_the_programming_case.ipynb b/Session_4/ad_fundum_the_programming_case.ipynb deleted file mode 100644 index 8d710b5..0000000 --- a/Session_4/ad_fundum_the_programming_case.ipynb +++ /dev/null @@ -1,276 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "# Ad fundum!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "This notebook has three purposes: \n", - "1. Learning to use exceptions as part of a normal program workflow\n", - "2. Getting acquainted with recursive functions\n", - "3. 
Introducing those of you who are not from Belgium to the tradition of Ad Fundum (I'm sure it exists in every country but the Belgian way is probably slightly different)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## The ULB way" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "In this notebook, we are going to build a small system that enables you to hone your ad fundum skills from the comfort of your own house. For that, we will use a set of functions that call one another and time the whole process.\n", - "\n", - "The ad fundum is a student... let's call it a ritual that goes more or less like this:\n", - "\n", - "* One person challenges another to an ad fundum, the other usually agrees. They go and pick up a beer or a beverage each. A referee is appointed (in our case, the referee is the program we write and the only adversary in our case is ourself).\n", - "* The referee asks the two contestants whether they are ready. Once it is the case, she pronounces the following formula: \"À main, à bouche, à cul, nom de dieu!\" (loosely translated, it means \"To the hand, to the mouth, to the bottom, for god's sake\"). Of course, I assume that in catholic universities, they drop the last part so our program should let the player decide if she wants to see this. Also, the word \"cul\" is somewhat vulgar in French and our program should be family friendly (with the difference that, if children want to play, they should drink a non-alcoholic beverage but the program cannot control that, now, can it?). The player should be able to decide if she wants to see the full word or the cleaner \"c\\*\\*\".\n", - "* Once the referee has pronounced the formula, both contestants have to chug their beer (or their beverage) as fast as possible. 
For some reason, bystanders usually shout \"et glou\" (which in English would sound something like \"hay glue\" and which is supposed to mimic the sound of somebody drinking) regularly, so we'll emulate that as well.\n", - "* Once the first of the players is done with her glass (or whatever container she's drinking from), she's declared the winner and goes on to enjoy the rest of the evening (or directly to do another ad fundum).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## The program" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "So basically, our program is a glorified timer: it writes the ritual formula, starts a timer and then repeatedly writes \"et glou\" on the screen until the player kills it by hand (meaning by clicking on the little \"stop\" sign at the top of the notebook). This is not something normal for the program: it is an exception and it can be captured in the logic of the code to treat it accordingly (by default, it kills the whole program but in our case, we just want to use it to stop the never-ending stream of \"et glou\").\n", - "\n", - "Before getting down to business, there are two functions that we should have a look at:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on built-in function sleep in module time:\n", - "\n", - "sleep(...)\n", - " sleep(seconds)\n", - " \n", - " Delay execution for a given number of seconds. The argument may be\n", - " a floating point number for subsecond precision.\n", - "\n" - ] - } - ], - "source": [ - "import time # Yes, in Python, you can import time itself! 
God-feeling, much?\n", - "\n", - "help(time.sleep)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on built-in function time in module time:\n", - "\n", - "time(...)\n", - " time() -> floating point number\n", - " \n", - " Return the current time in seconds since the Epoch.\n", - " Fractions of a second may be present if the system clock provides them.\n", - "\n" - ] - } - ], - "source": [ - "help(time.time)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This took 2.002575159072876 seconds\n" - ] - } - ], - "source": [ - "start = time.time()\n", - "time.sleep(2)\n", - "finish = time.time()\n", - "print(\"This took \", finish - start, \" seconds\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## The formula" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "The first step of this exercise is to create a function that put the initial formula. It should be a function that takes 2 arguments and returns nothing. 
The two arguments should be two Booleans: the first one, called *parental_version*, will be True if we must print \"c\\*\\*\" instead of \"cul\" and the other parameter, called *gods_sake*, will also be a Boolean and will be True if we have to take out the mention of God's sake.\n", - "\n", - "Write this function here below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Et glou" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now going to write a function that takes 1 argument (an integer), let's call it *x*, and does three things sequentially:\n", - "1. Print the infamous \"Et glou\" to the screen.\n", - "2. Sleep for *x* number of seconds\n", - "3. Call the function itself" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "![boum](https://i.ytimg.com/vi/LLpIMRowndg/maxresdefault.jpg)\n", - "\n", - "Whaaaat? A function that calls itself? Well, yes. It is called a [recursive function](http://algosaur.us/recursion/). Recursive functions are a fascinating topic and allow you to do many things. For example, certain languages do not have loops, as recursive functions can substitute for any type of loop. A classical example of the use of a recursive function is computing Fibonacci numbers.\n", - "\n", - "> \"To iterate is human, to recurse is divine\"\n", - "> *L Peter Deutsch*\n", - "\n", - "OK, enough about recursion (although if you're into it, there is plenty to do and be said on the topic!). We actually need to modify our function slightly: if we leave it as is, it will recurse until the end of time (or, in practice, until Python's recursion limit is hit or the memory of your computer runs out). We need to implement a way to kill this loop manually. 
As mentioned before, we will implement the end of this loop as an Exception Handling problem: if the user tries to kill the program (by clicking on the square above), the exception will be captured and the function will return None, which will break the cycle. To do so, wrap the whole body of your function in a *try* block and then in the *except* part, only capture the *KeyboardInterrupt* exception. If such an exception happens, just return *None*.\n", - "\n", - "Write the modified function below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Now it's time to tie everything together. Create a final function that takes three arguments (the *parental_version*, the *gods_sake* and the *time_sleeping*) and that does the following:\n", - "\n", - "1. Call the function that will print the ritual formula\n", - "2. Create a *start* variable that contains the time at which the challenge starts\n", - "3. Call the function that writes \"et glou\" recursively\n", - "4. Create an *end* variable containing the time after your recursive function ends\n", - "5. Print the value of *end* - *start*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "That's it, you can now train your ad fundum skills from home. Drink responsibly. Prosit!" 
- ] - } - ], - "metadata": { - "kernelspec": { - "argv": [ - "python", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "Python 3", - "env": null, - "interrupt_mode": "signal", - "language": "python", - "metadata": null, - "name": "python3" - }, - "name": "exercises_4_bis_using_exceptions.ipynb" - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Session_4/exercises_4.ipynb b/Session_4/exercises_4.ipynb deleted file mode 100644 index b8d7d50..0000000 --- a/Session_4/exercises_4.ipynb +++ /dev/null @@ -1,419 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "# It gets loopy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "In this notebook, we will work with stock prices. I'll get you started by writing some code that go and fetch the last quotes of a few American stocks. We will then write loops and functions to do two things:\n", - "\n", - "1. Evaluate the performance of an investing strategy on past data\n", - "2. Infer the possible performance of the same strategy on short-run future data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We start by importing the quotes. It is not crucial to understand the code here below as we will see what a call to an API is during a future session. It works broadly like this: we call a server that sends us back a kind of nested dictionaries and lists that contain the quotes and date of those quotes for the 5 largest tech firms (that are sometimes referred to as GAFAM)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from requests import get\n", - "stock_quotes = {}\n", - "tickers = {\"GOOG\" : \"Google\", \"AAPL\" : \"Apple\", \"FB\": \"Facebook\", \"AMZN\" : \"Amazon\", \"MSFT\": \"Microsoft\"}\n", - "for ticker in tickers:\n", - " prices_and_dates = {}\n", - " query = \"https://financialmodelingprep.com/api/v3/historical-price-full/\"+ticker+\"?serietype=line\"\n", - " histoire_des_prix = get(query).json()[\"historical\"]\n", - " prices_and_dates[\"quotes\"] = [cours[\"close\"] for cours in histoire_des_prix[-31:]]\n", - " prices_and_dates[\"dates\"] = [cours[\"date\"] for cours in histoire_des_prix[-31:]]\n", - " stock_quotes[ticker] = prices_and_dates" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Run the cell above: it will run for some time and then ends without showing anything. In fact, it loads a peculiar data structure in the memory of your computer. The structure consists in prices and dates contained inside a dictionary which is itself contained in a dictionary. It is kind of a lousy structure so far but we will see in next sessions (the pandas one) how to make it easier. For now, we have to rely on this awkward structure. Here are some example of how to use it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1404.32, 1419.83, 1429.73, 1439.23, 1430.88, 1439.2, 1451.7, 1480.39, 1484.4, 1485.95, 1486.65, 1466.71, 1433.9, 1452.56, 1458.63, 1455.84, 1434.23, 1485.94, 1447.07, 1448.23, 1476.23, 1479.23, 1508.68, 1508.79, 1518.27, 1514.66, 1520.74, 1519.67, 1526.69, 1518.15, 1485.11]\n" - ] - } - ], - "source": [ - "# Show all prices for Google\n", - "print(stock_quotes[\"GOOG\"][\"quotes\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "313.05\n" - ] - } - ], - "source": [ - "# Show the price of the Apple closing quote yesterday\n", - "print(stock_quotes[\"AAPL\"][\"quotes\"][-1])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2020-01-10\n" - ] - } - ], - "source": [ - "# What is the date of the third quote for the Microsoft data?\n", - "print(stock_quotes[\"MSFT\"][\"dates\"][2])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Let's work" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "OK, enough playing around! Now is your turn: Let's start by stretching our programming muscles. 
I'll write a line that will create a new variable called *quotes_amazon* and that is a list that contains the quotes of Amazon (in dollars) for all the data we have.\n", - "\n", - "Your job is to create a new list, called *quotes_amazon_euro* in which you will store the quotes but in Euros (for this exercise, it is enough to consider that converting to Euros just means multiplying by *0.9* - please applied trade economists, don't crucify me for this approximation)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "quotes_amazon = stock_quotes[\"AMZN\"][\"quotes\"]\n", - "\n", - "# Your code comes below this line\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Done? Good! Now for a harder problem: We will create a function that will compute the mean of any list that is passed (provided it contains integers or floats, of course). The function will work broadly like this: You initialize a counter at 0, you loop over the list and, at each iteration, you add the value at that position to the counter. Once the loop is over, you divide this by the length of the list to obtain the average.\n", - "\n", - "Oddly enough, Python doesn't ship with a built-in mean function (it was not initially a scientific computing language). 
Implementing it is therefore needed to perform the next steps (in future sessions, we will see some libraries, the most important being NumPy, that provide implementations of mean functions).\n", - "\n", - "Implement your function, call it my_mean and then see if it works on the list containing the prices of the Facebook stock (that will be stored in the variable *facebook_quotes*).\n", - "\n", - "*Don't forget that you must produce a **function**, it will be important because we will use it in the next part!*" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your function should produce the following result (plus ou minus numerical error):\t 215.19645161290322\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "facebook_quotes = stock_quotes[\"FB\"][\"quotes\"]\n", - "print(\"Your function should produce the following result (plus ou minus numerical error):\\t\"\n", - " , np.mean(facebook_quotes))\n", - "\n", - "# Your code comes here below\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Let's now build on top of what we did previously to build a function that computes the moving average. Let's start by using a fixed number of lags to be considered (you can always extend it later): We want to have a function that takes a list and produces a new list containing the average value of the three past values (you can either align the two lists by setting the first three values of your result to *None* or not, this will impact the way you will have to solve the next exercises, but both ways will be equally practical). Test it on the Google quotes that are in the variable *google_quotes*." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "google_quotes = stock_quotes[\"GOOG\"][\"quotes\"]\n", - "\n", - "# Your code comes here below\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Now that we have done it for a fixed number of lags, do it again but now allow for a second parameter to your function: the number of lags you want to consider for your moving average." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now going to compute the mean squared error. Write a function that sums over the squared differences between the actual value and your moving-average-based forecast and divides the result by the number of forecasts (so that different lags can be compared fairly). This will help us determine the optimal number of lags for our secret investment strategy at the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Our masterplan to get rich is getting together nicely...\n", - "\n", - "![coming_together](https://i1.wp.com/media1.tenor.com/images/a71c94c3aa7ad66a5051f81f48d14dd2/tenor.gif?w=688&ssl=1)\n", - "\n", - "The last step before creating the actual investment strategy: we are going to evaluate which lag produces the smallest mean squared error. For that, create a function that loops over the range between 2 and 6 and returns the optimal lag (the one with the smallest MSE)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now going to devise an incredibly dumb investment strategy: if your moving-average prediction (with the optimal lag - see, we are already [double-dipping](https://en.wikipedia.org/wiki/Circular_analysis) in the data...) is higher than the current price on a day, we invest all our accrued fortune (if we currently had no stock) or hold (if we had already some value invested), if the prediction is below, we sell our whole position (or do nothing if we had no money invested at the time).\n", - "\n", - "How much would we have today if we had started using this strategy on February 3rd and started out with 100 Euros?\n", - "\n", - "This is a significantly more complex problem than the ones above and you might want to split it into subproblems (for example, a good idea could be to devise a function that takes two values and returns the variation between those). There are many ways to solve it, find the one that makes more sense to you. The result should be the same for everyone, though." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Get rich (fast) or code tryin'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "The last step for today is to assess how much we will make in the near future. 
For this, we will use a small, non-parametric Monte-Carlo simulation.\n", - "\n", - "![monte_carlo](http://www.azquotes.com/picture-quotes/quote-in-mathematics-as-in-physics-so-much-depends-on-chance-on-a-propitious-moment-stanislaw-ulam-111-21-35.jpg)\n", - "\n", - "Here's how it will go: we are going to reuse the function that simulates the performance of our stupid investment strategy but on simulated data that will be produced in the following way: we are going to simulate 20 periods of data by resampling at random the past prices. This can be done using the function [random.choices(population, weights=None, \\*, cum_weights=None, k=1)](https://docs.python.org/3/library/random.html#random.choices): We are going to produce a list containing all the price variations of the quote. We then are going to produce a list of 20 values between 0 and the length of the list we just created and use those as variations over the next 20 days. We test our investment strategy against this \"future reality\" as if we started today with 100 Euros and the result is the final value we would have after 20 days. We have one potential result that we store in a list, called *results*. We repeat the process 5000 times (you can do it fewer times if your computer is rather slow). In the end, we will end up with 5000 potential results of our investment strategy over the next 20 days. I'll write the function that plots the distribution so that we can see visually what we can expect.\n", - "\n", - "Again, this whole procedure will probably require several functions. Try to break down the problem into a sequence of intermediate steps." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import random\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "The next cell will represent the distribution of your profits, only execute it once you have your 5000 results in the variable *results*." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import seaborn as sns\n", - "\n", - "sns.distplot(results)" - ] - } - ], - "metadata": { - "kernelspec": { - "argv": [ - "python", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "Python 3", - "env": null, - "interrupt_mode": "signal", - "language": "python", - "metadata": null, - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "name": "exercises_4.ipynb" - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Session_4/exercises_4_solutions.ipynb b/Session_4/exercises_4_solutions.ipynb deleted file mode 100644 index 5e6de28..0000000 --- a/Session_4/exercises_4_solutions.ipynb +++ /dev/null @@ -1,429 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# It gets loopy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook, we will work with stock prices. I'll get you started by writing some code that go and fetch the last quotes of a few American stocks. We will then write loops and functions to do two things:\n", - "\n", - "1. Evaluate the performance of an investing strategy on past data\n", - "2. Infer the possible performance of the same strategy on short-run future data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We start by importing the quotes. It is not crucial to understand the code here below as we will see what a call to an API is during a future session. 
It works broadly like this: we call a server that sends us back a nested structure of dictionaries and lists that contains the quotes and the dates of those quotes for the 5 largest tech firms (that are sometimes referred to as GAFAM)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from requests import get\n", - "stock_quotes = {}\n", - "tickers = {\"GOOG\" : \"Google\", \"AAPL\" : \"Apple\", \"FB\": \"Facebook\", \"AMZN\" : \"Amazon\", \"MSFT\": \"Microsoft\"}\n", - "for ticker in tickers:\n", - " prices_and_dates = {}\n", - " query = \"https://financialmodelingprep.com/api/v3/historical-price-full/\"+ticker+\"?serietype=line\"\n", - " histoire_des_prix = get(query).json()[\"historical\"]\n", - " prices_and_dates[\"quotes\"] = [cours[\"close\"] for cours in histoire_des_prix[-31:]]\n", - " prices_and_dates[\"dates\"] = [cours[\"date\"] for cours in histoire_des_prix[-31:]]\n", - " stock_quotes[ticker] = prices_and_dates" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the cell above: it will run for some time and then end without showing anything. In fact, it loads a peculiar data structure into the memory of your computer. The structure consists of prices and dates contained inside a dictionary which is itself contained in a dictionary. It is kind of a lousy structure so far but we will see in a later session (the pandas one) how to make it easier. For now, we have to rely on this awkward structure. Here are some examples of how to use it." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Show all prices for Google\n", - "print(stock_quotes[\"GOOG\"][\"quotes\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Show the price of the Apple closing quote yesterday\n", - "print(stock_quotes[\"AAPL\"][\"quotes\"][-1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# What is the date of the third quote for the Microsoft data?\n", - "print(stock_quotes[\"MSFT\"][\"dates\"][2])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Let's work" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "OK, enough playing around! Now is your turn: Let's start by stretching our programming muscles. I'll write a line that will create a new variable called *quotes_amazon* and that is a list that contains the quotes of Amazon (in dollars) for all the data we have.\n", - "\n", - "Your job is to create a new list, called *quotes_amazon_euro* in which you will store the quotes but in Euros (for this exercise, it is enough to consider that converting to Euros just means multiplying by *0.9* - please applied trade economists, don't crucify me for this approximation)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "quotes_amazon = stock_quotes[\"AMZN\"][\"quotes\"]\n", - "\n", - "# Your code comes below this line\n", - "print([quote * 0.9 for quote in quotes_amazon])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Done? Good! Now for a harder problem: We will create a function that will compute the mean of any list that is passed (provided it contains integers or floats, of course). 
The function will work broadly like this: You initialize a counter at 0, you loop over the list and, at each iteration, you add the value at that position to the counter. Once the loop is over, you divide this by the length of the list to obtain the average.\n", - "\n", - "Oddly enough, Python doesn't ship with a built-in mean function (it was not initially a scientific computing language). Implementing it is therefore needed to perform the next steps (in future session, we will see some libraries, the most important being Numpy, that provide implementation of means functions.\n", - "\n", - "Implement your function, call it my_mean and then see if it works on the list containing the prices of the Facebook stock (that will be stored in the variable *facebook_quotes*.\n", - "\n", - "*Don't forget that you must produce a **function**, it will be important because we will use it in the next part!*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "facebook_quotes = stock_quotes[\"FB\"][\"quotes\"]\n", - "print(\"Your function should produce the following result (plus ou minus numerical error):\\t\"\n", - " , np.mean(facebook_quotes))\n", - "\n", - "# Your code comes here below\n", - "def mean(numbers_to_average):\n", - " return sum(numbers_to_average)/len(numbers_to_average)\n", - "\n", - "print(mean(facebook_quotes))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now build on top of what we did previously to build a function that computes the mobile average. 
Let's start by using a fixed number of lags to be considered (you can always extend it later): We want to have a function that takes a list and produces a new list containing the average value of the three past values (you can either align the two lists by setting the first three values of your result to *None* or not, this will impact the way you will have to solve the next exercises, but both ways will be equally practical). Test it on the Google quotes that are in the variable *google_quotes*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "google_quotes = stock_quotes[\"GOOG\"][\"quotes\"]\n", - "\n", - "# Your code comes here below\n", - "def moving_average_3(list_to_treat):\n", - " result = [None, None, None]\n", - " for i in range(3,len(list_to_treat)):\n", - " result.append(round(mean(list_to_treat[i-3:i]),2))\n", - " return result\n", - " \n", - "print(moving_average_3(google_quotes))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have done it for a fixed number of lags, do it again but now allow for a second parameter to your function: the number of lags you want to consider for your moving average." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def moving_average(list_to_treat, order):\n", - " results = [None for i in range(order)]\n", - " for i in range(order, len(list_to_treat)):\n", - " results.append(round(mean(list_to_treat[i-order:i]),2))\n", - " return results\n", - "\n", - "print(moving_average(google_quotes, 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are now going to compute the mean squared error. Write a function that averages the squared differences between the actual value and your moving-average-based forecast. 
This will help us determining the optimal number of lags for our secret investment strategy at the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def mse(list_1, list_2):\n", - " if len(list_1) != len(list_2):\n", - " print(\"Both list must be of the same size\")\n", - " return None\n", - " accumulator = 0\n", - " count = 0\n", - " for i in range(len(list_1)):\n", - " if list_1[i] != None and list_2[i] != None:\n", - " accumulator += (list_1[i] - list_2[i])**2\n", - " count += 1\n", - " return accumulator/count\n", - "\n", - "print(mse([1, 2, 2, 5], [None, None, 4, 3]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our masterplan to get rich is getting together nicely...\n", - "\n", - "![coming_together](https://i1.wp.com/media1.tenor.com/images/a71c94c3aa7ad66a5051f81f48d14dd2/tenor.gif?w=688&ssl=1)\n", - "\n", - "The last step before creating the actual investement strategy: we are going to evaluate which lag produces the smallest mean square error. For that, create a function that loops over the range between 2 and 6 and return the optimal lag (the one with the smallest RMSE)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def optimal_order(a_list):\n", - " optimum = 0\n", - " minimum_mse = 1_000_000_000_000 # that's a comically large value\n", - " for i in range(2, 7):\n", - " MA = moving_average(a_list, i)\n", - " mse_for_this_lag = mse(a_list, MA)\n", - " if mse_for_this_lag < minimum_mse:\n", - " minimum_mse = mse_for_this_lag\n", - " optimum = i\n", - " return optimum\n", - "\n", - "print(optimal_order(google_quotes))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are now going to devise an incredibly dumb investment strategy: if your moving-average prediction (with the optimal lag - see, we are already [double-dipping](https://en.wikipedia.org/wiki/Circular_analysis) in the data...) is higher than the current price on a day, we invest all our accrued fortune (if we currently had no stock) or hold (if we had already some value invested), if the prediction is below, we sell our whole position (or do nothing if we had no money invested at the time).\n", - "\n", - "How much would we have today if we had started using this strategy on February 3rd and started out with 100 Euros?\n", - "\n", - "This is a significantly more complex problem than the ones above and you might want to split it into subproblems (for example, a good idea could be to devise a function that takes two values and returns the variation between those). There are many ways to solve it, find the one that makes more sense to you. The result should be the same for everyone, though." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "compute_variation = lambda day_before, current_day : round((current_day - day_before)/day_before, 4)\n", - "\n", - "update_value = lambda current_value, variation : current_value * (1 + variation)\n", - "\n", - "def choose_investment_decision(value_today, forecast_tomorrow, amount_invested_today, amount_in_cash_today):\n", - " new_amount_in_cash, new_amount_in_stocks = amount_in_cash_today, amount_invested_today # the baseline is that we keep the situation as is\n", - " if value_today >= forecast_tomorrow: # the stock is predicted to loose money\n", - " if amount_in_cash_today == 0: # We had money on the stock\n", - " new_amount_in_cash = amount_invested_today\n", - " new_amount_in_stocks = amount_in_cash_today\n", - " elif value_today < forecast_tomorrow: # The stock is predicted to make money\n", - " if amount_in_cash_today != 0: # Our money was in cash\n", - " new_amount_in_cash = amount_invested_today\n", - " new_amount_in_stocks = amount_in_cash_today\n", - " # Note: we can convert this large if by removing the elif and using boolean algebra (because the body is the same)\n", - " return new_amount_in_cash, new_amount_in_stocks\n", - "\n", - "def index_date(date_list, date):\n", - " for i in range(len(date_list)): \n", - " if date_list[i] == date:\n", - " return i\n", - " return -1\n", - "\n", - "def simulate_investment(stock, start_simulation, start_cash, end_simulation = -1):\n", - " if end_simulation == -1: end_simulation = len(stock)-1\n", - " cash_stocks = (start_cash, 0)\n", - " optimal_lag = optimal_order(stock)\n", - " forecast = moving_average(stock, optimal_lag)\n", - " for ii in range(start_simulation, end_simulation):\n", - " variation = compute_variation(stock[ii], stock[ii-1])\n", - " cash_stocks = (cash_stocks[0], update_value(cash_stocks[1], variation)) # The value of the stocks are updated\n", - " 
cash_stocks = choose_investment_decision(stock[ii], forecast[ii+1], cash_stocks[1], cash_stocks[0])\n", - " return sum(cash_stocks)\n", - "\n", - "stock_to_analyse = stock_quotes[\"AAPL\"]\n", - "index_start = index_date(stock_to_analyse[\"dates\"], '2020-02-03')\n", - "simulate_investment(stock_to_analyse[\"quotes\"], index_start, 100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get rich (fast) or code tryin'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The last step for today is to assess how much we will make in the near future. For this, we will use a small, non-parametric Monte-Carlo simulation.\n", - "\n", - "![monte_carlo](http://www.azquotes.com/picture-quotes/quote-in-mathematics-as-in-physics-so-much-depends-on-chance-on-a-propitious-moment-stanislaw-ulam-111-21-35.jpg)\n", - "\n", - "Here's how it will go: we are going to reuse the function that simulates the performance of our stupid investment strategy but on simulated data that will be produced in the following way: we are going to simulate 20 periods of data by resampling at random the past prices. This can be done using the function [random.choices(population, weights=None, \\*, cum_weights=None, k=1)](https://docs.python.org/3/library/random.html#random.choices): We are going to produce a list containing all the price variations of the quote. We then are going to produce a list of 20 values between 0 and the length of the list we just create and use those as variations over the next 20 days. We test our investment strategy again this \"future reality\" as if we started today with 100 Euros and the result is the final value we would have after 20 days. We have one potential result that we store in a list, called *results*. We repeat the process 5000 times (you can do it less time if your computer is rather slow). In the end, we will end up with 5000 potential results of our investment strategy over the 20 next days. 
I'll write the function that plots the distribution so that we can see visually what we can expect.\n", - "\n", - "Again, this whole procedure will probably require several functions. Try to break down the problem in a sequence of intermediate steps." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import random\n", - "\n", - "def produce_variations_lists(stock_prices):\n", - " variations = []\n", - " for ii in range(1,len(stock_prices)):\n", - " variations.append(compute_variation(stock_prices[ii - 1], stock_prices[ii]))\n", - " return variations\n", - "\n", - "def produce_simulation(list_prices, n_future_periods):\n", - " return random.choices(produce_variations_lists(list_prices), k=n_future_periods)\n", - "\n", - "def simulate_profit(stock_prices, n_future_steps, n_monte_carlo):\n", - " results = []\n", - " for i in range(n_monte_carlo):\n", - " variations = produce_simulation(stock_prices, n_future_steps)\n", - " length_original_list = len(stock_prices)\n", - " simulated_list = stock_prices[:]\n", - " for variation_that_day in variations:\n", - " simulated_list.append(simulated_list[-1]*(1 + variation_that_day))\n", - " results.append(simulate_investment(simulated_list, length_original_list, 100))\n", - " return results\n", - "\n", - "stock_to_analyse = stock_quotes[\"AAPL\"][\"quotes\"][:]\n", - "results = simulate_profit(stock_to_analyse, 20, 5000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next cell will represent the distribution of your profits, only execute it once you have your 5000 results in the variable *results*." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "\n", - "sns.distplot(results)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "name": "exercises_4.ipynb" - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Session_5/Ex 1 - UN Comtrade.ipynb b/Session_5/Ex 1 - UN Comtrade.ipynb deleted file mode 100644 index 01fb2e8..0000000 --- a/Session_5/Ex 1 - UN Comtrade.ipynb +++ /dev/null @@ -1,1036 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Downloading Trade Data with the Comtrade API\n", - "\n", - "Many institutions including the World Bank, the Fed, or the ECB now provide access to their databases through APIs (Application Programming Interfaces). In this exercise we will use the United Nations Comtrade API to download trade flows. \n", - "\n", - "The trade data accessible through the API can also be downloaded manually through drop-down menus on the [comtrade website](https://comtrade.un.org/data/). But if one is interested in making multiple downloads the API will come in pretty handy. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working directory\n", - "First, let us set up the working directory since we will download files from the UN Comtrade. If you are running this notebook from the session 5 folder of your fork, you should have it as your current directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/moritz/Documents/GitHub/Classes/Session_5\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "# The os.getcwd() returns a string, you can assign it to a variable if you need using var = os.getcwd(). \n", - "# Then, var will be assigned to that string.\n", - "print(os.getcwd())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, create a folder to store the data, call it `/Data` inside your working directory. " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "os.makedirs(\"Data\", exist_ok = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## API Documentation\n", - "\n", - "Every API comes with a documentation. To understand how to use the Comtrade API you **need** to look at the [UN Comtrade documentation here](https://comtrade.un.org/data/doc/api/) to get an idea of the parameters required to make a request. 
\n", - "\n", - "Let's start with a simple request using the `requests` package, we want: \n", - "- Commodities\n", - "- Annual frequency\n", - "- Year 2013\n", - "- HS Sector Classification\n", - "- UK to World\n", - "- Imports and exports \n", - "\n", - "Check out the url including these parameters!\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your request was successful\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "url = \"http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=HS&ps=2013&r=826&p=0&rg=all&cc=ALL&fmt=csv&head=M\"\n", - "\n", - "data_1 = requests.get(url)\n", - "\n", - "if data_1.status_code == 200:\n", - " print(\"Your request was successful\")\n", - "else:\n", - " print(f\"Error {data_1.status_code} on your request \")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some initial remarks: Above you might receive the \"ChunkedEncodingError\" which stops your code. If you receive this before the function \"bilateral_requests\" is defined, just run the block of code returning the error again. Instead, if you get it in the bilateral_requests call, or in the blocks of code after that, just ignore it and read the rest of the code without running it. We have not used exception handling to solve this problem on purpose, since we want to show some of the problems you might have using the UN Comtrade API. At the end of this notebook we mention this (and other problems) and possible ways to solve them, but we leave it as an exercise to include those solutions in your code.\n", - "\n", - "You don't need to always print the status code when you download data. The HTTP code 200 means that the request was successful and the object required has been returned. You can learn more about HTTP codes [here](https://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html). 
The use of HTTP codes helps when you want to use exception handling to deal with possible problems in retrieving data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you have obtained an object as result from your query (in this case the csv file), you might want to store the file somewhere. In this case, we will use the *Data* folder of the previous step." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/\n" - ] - } - ], - "source": [ - "# Use one of the variables below depending on your OS.\n", - "\n", - "# For Windows\n", - "#data_path = os.getcwd() + \"\\\\Data\\\\UK_world\"\n", - "\n", - "# For MacOS or Linux\n", - "data_path = os.getcwd() + \"/Data/\"\n", - "\n", - "# Function to write csv for reporter and partner data\n", - "def write(req, path, reporter = \"\", partner = \"\"):\n", - " \n", - " print(f\"Writing .csv file in {path}\")\n", - "\n", - " # The function open below just opens the file defined as path in write mode. \n", - " # Then, while this file is open, the following line will write the text content of the request \n", - " # to this file (after some manipulation using join and replace)\n", - " with open(path + reporter + \"_\" + partner + \".csv\", 'w', newline = \"\") as f:\n", - " # This will access the content of our request, and we already know that it is a csv file. \n", - " # It will write that file in the directory that we specify as path.\n", - " f.write(\"\".join(req.text.replace(\";\",\"\")))\n", - " print(f\"File .csv saved in {path}\")\n", - "\n", - " \n", - "# Execute the Function \n", - "write(data_1, data_path, \"UK\", \"World\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Good job! 
Now you should have the csv file inside the data folder. To start all over again, let's remove it:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "os.remove(data_path + \"_.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Country IDs\n", - "\n", - "The API uses numeric ISO codes for specific countries. In order to retrieve a list of country codes, the API allows the following call (see Documentation!): " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[{'id': '4', 'text': 'Afghanistan'}, {'id': '8', 'text': 'Albania'}, {'id': '12', 'text': 'Algeria'}, {'id': '20', 'text': 'Andorra'}, {'id': '24', 'text': 'Angola'}, {'id': '660', 'text': 'Anguilla'}, {'id': '28', 'text': 'Antigua and Barbuda'}, {'id': '32', 'text': 'Argentina'}, {'id': '51', 'text': 'Armenia'}]\n" - ] - } - ], - "source": [ - "# This is the url to the json file with the id-country pairs\n", - "url_country_values= \"https://comtrade.un.org/Data/cache/reporterAreas.json\"\n", - "\n", - "country_values = requests.get(url_country_values).json()[\"results\"]\n", - "\n", - "# Let's print the first 10 codes\n", - "print(country_values[1:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's get the format into a more convienent shape:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - } - ], - "source": [ - "# Object is of type\n", - "print(type(country_values))\n", - "# First item inside object is of type\n", - "print(type(country_values[1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "[('4', 
'Afghanistan'), ('8', 'Albania'), ('12', 'Algeria'), ('20', 'Andorra'), ('24', 'Angola'), ('660', 'Anguilla'), ('28', 'Antigua and Barbuda'), ('32', 'Argentina'), ('51', 'Armenia'), ('533', 'Aruba'), ('36', 'Australia'), ('40', 'Austria'), ('31', 'Azerbaijan'), ('44', 'Bahamas'), ('48', 'Bahrain'), ('50', 'Bangladesh'), ('52', 'Barbados'), ('112', 'Belarus'), ('56', 'Belgium'), ('58', 'Belgium-Luxembourg'), ('84', 'Belize'), ('204', 'Benin'), ('60', 'Bermuda'), ('64', 'Bhutan')]\n" - ] - } - ], - "source": [ - "# Below you can find two different ways to achieve the same result\n", - "\"\"\"\n", - "unpacked_id = []\n", - "unpacked_countries = []\n", - "for x in range(len(country_values)):\n", - " unpacked_id.append(country_values[x][\"id\"])\n", - " unpacked_countries.append(country_values[x][\"text\"])\n", - " \n", - "unpacked_values = list(zip(unpacked_id, unpacked_countries))\n", - "print(unpacked_values)\n", - "\"\"\"\n", - "unpacked_values = [(x, y) for entry in range(len(country_values)) for x, y in [(country_values[entry].get(\"id\"), country_values[entry].get(\"text\"))]]\n", - "\n", - "print(type(unpacked_values))\n", - "print(type(unpacked_values[1]))\n", - "print(unpacked_values[1:25])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we know the name of a country but not the id, which is what we need for the API request, we can construct a function that takes the name of the country as argument and return the associated id." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The country Belgium is in the list with id 56\n", - "56\n" - ] - } - ], - "source": [ - "def obtain_id(country_name):\n", - " for x in range(len(unpacked_values)):\n", - " if country_name in unpacked_values[x]:\n", - " print(f\"The country {country_name} is in the list with id {unpacked_values[x][0]}\")\n", - " i = unpacked_values[x][0]\n", - " return i\n", - " else:\n", - " print(f\"The country {country_name} is not on the list, check the exact name used by the UN comtrade for that country\")\n", - " \n", - "# Let's try it:\n", - "print(obtain_id(\"Belgium\"))\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## API Call\n", - "\n", - "Let's build another helper function. It creates new folders to store our data with the arguments:\n", - "- Frequency\n", - "- Sector classificiation\n", - "- Year\n", - "- Reporter\n", - "\n", - "Since we work on different OS, we will also add an argument that takes the string \"Windows\" or \"MacOS\"" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "def folder(frequency, classification, year, reporter, OS, month = \"\"):\n", - " if OS == \"Windows\":\n", - " path = os.getcwd() + \"\\\\Data\\\\\" + frequency + \"\\\\\" + classification + \"\\\\\" + year + month + \"\\\\\" + reporter\n", - " os.makedirs(path, exist_ok = True)\n", - " print(f\"The folder at {path} has been created.\")\n", - " return path + \"\\\\\"\n", - " elif OS == \"MacOS\":\n", - " path = os.getcwd() + \"/Data/\" + frequency + \"/\" + classification + \"/\" + year + month + \"/\" + reporter + \"/\"\n", - " os.makedirs(path, exist_ok = True)\n", - " print(f\"The folder at {path} has been created.\")\n", - " return path\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's 
creates the function with the actual call to the API. It should take as arguments the parameters required by the API. Use the name of a country instead of the id, which we can recover from the previous function. As arguments use:\n", - "\n", - "* \"frequency\" to which we will assign the value \"A\" or \"M\" to get the data frequency\n", - "* \"classification\" that takes the values \"HS\", \"H4\", etc. depending on the calssification that we want to use\n", - "* \"year\" for the data reference year\n", - "* \"reporter\" the reporter country, we will recover the id using the previous function\n", - "* \"partner\" same as reporter but for the trading partner\n", - "\n", - "For the other parameters in the URL fix the following values:\n", - "* Commodities (type=C)\n", - "* Obtaind data on imports and exports (rg=1,2)\n", - "* For all the classification codes within a classification (cc=all)\n", - "* The format returned is a csv file (fmt=csv)\n", - "\n", - "This function should return the object of the query (like we did with \"data_1 = requests.get(url)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are including a frequency argument in the function but we will always use the annual frequency data for this exercise. If you want to get monthly data you should adjust the function in the following way:\n", - "* If you did read the documentation of the API, you should have noticed that the format of the parameter at annual frequency is 2017, 2016, etc.\n", - "* Instead, for the monthly frequency you have 201701, 201702, etc. the second part is the month\n", - "* To obtain this parameter, you should add another argument (called \"month\") to the function. You will use the values 01, 02, 03, etc. for this parameter\n", - "* In the url_year part you should concatenate the year and month arguments to get the required values, i.e. 
201701.\n", - "* The only classification available for the monthly data is \"HS\"" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "def query_constructor_C(frequency, classification, year, reporter, partner, month = \"\"):\n", - " url_frequency = \"&freq=\" + frequency\n", - " url_classification = \"&px=\" + classification\n", - " url_year = \"&ps=\" + year + month\n", - " url_reporter = \"&r=\" + obtain_id(reporter)\n", - " url_partner = \"&p=\" + obtain_id(partner)\n", - " url_final = \"&rg=1,2&cc=ALL&fmt=csv&head=M\"\n", - " url = \"http://comtrade.un.org/api/get?max=100000&type=C\" + url_frequency + url_classification + url_year + url_reporter + url_partner + url_final\n", - " print(f\"The url for {classification} and trade flows between {reporter} and {partner} in {year + month} has been created. Processing request...\")\n", - " req = requests.get(url) \n", - " print(f\"The request for {classification}, {reporter}, {partner}, {year + month} has been completed. The HTTP code is: {req.status_code}\")\n", - " print(url)\n", - " return req" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let us try if the function is working properly. For now we will include it in a temporary function together with the folder function just to store the file. Then, we will check how to improve things." 
- ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "def bilateral_requests(frequency, classification, year, reporter, partner, OS, month = \"\"):\n", - " path = folder(frequency, classification, year, reporter, OS, month = month)\n", - " print(path)\n", - " req = query_constructor_C(frequency, classification, year, reporter, partner, month = month)\n", - " write(req, path, reporter, partner)\n", - " return req" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The country France is in the list with id 251\n", - "The country Germany is in the list with id 276\n", - "The url for H4 and trade flows between France and Germany in 2017 has been created. Processing request...\n", - "The request for H4, France, Germany, 2017 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2017&r=251&p=276&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Check your folder now, this should have created the csv file\n", - "bilateral_requests(\"A\", \"H4\", \"2017\", \"France\", \"Germany\", \"MacOS\")\n", - "# Let us try with monthly data\n", - "#bilateral_requests(\"M\", \"HS\", \"2017\", \"France\", \"Germany\", \"MacOs\", month = \"01\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Everything should be fine for now. Instead of bilateral data, try to get trade values between France and all its trading partners." - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The country France is in the list with id 251\n", - "The country All is in the list with id all\n", - "The url for H4 and trade flows between France and All in 2017 has been created. Processing request...\n", - "The request for H4, France, All, 2017 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2017&r=251&p=all&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bilateral_requests(\"A\", \"H4\", \"2017\", \"France\", \"All\", \"MacOS\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you check the last file, you will notice that the request was successful but we did not get the data since the number of observations is above the limit. Usually, it is better to receive an error when you make the request that exceed the limit instead of a successful request code. However, this is how the UN Comtrade deisgned its API. Anyway, we can take a step back to avoid this problem. If you did check the API documentation, the UN Comtrade has a separate file to check the data availability (it returns a json with multiple informations). Let us define a couple of functions to use this data availability request." 
- ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [], - "source": [ - "def query_availability_C(frequency, classification, year, reporter, partner = \"\", month = \"\"):\n", - " url_frequency = \"http://comtrade.un.org/api/refs/da/view?type=C&freq=\" + frequency\n", - " url_classification = \"&px=\" + classification\n", - " url_year = \"&ps=\" + year + month\n", - " url_reporter = \"&r=\" + obtain_id(reporter)\n", - " if partner == \"\":\n", - " url_partner = \"&p=\"\n", - " elif partner != \"\":\n", - " url_partner = \"&p=\" + obtain_id(partner)\n", - " url_final = \"&rg=1,2&cc=ALL\"\n", - " url = url_frequency + url_classification + url_year + url_reporter + url_partner + url_final\n", - " print(f\"The url for {classification} and trade flows between {reporter} and {partner} in {year + month} has been created. Processing data availability file...\")\n", - " req = requests.get(url).json()\n", - " print(f\"The json for {classification}, {reporter}, {partner}, {year + month} is now available. Now it is time to unpack it.\")\n", - " unpacked = [(x) for entry in range(len(req)) for x in req[entry].items()]\n", - " print(unpacked)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, start from our previous query on annual data between France and Germany." - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The country France is in the list with id 251\n", - "The country Germany is in the list with id 276\n", - "The url for H4 and trade flows between France and Germany in 2017 has been created. Processing data availability file...\n", - "The json for H4, France, Germany, 2017 is now available. 
Now it is time to unpack it.\n", - "[('type', 'COMMODITIES'), ('freq', 'ANNUAL'), ('px', 'H4'), ('r', '251'), ('rDesc', 'France'), ('ps', '2017'), ('TotalRecords', 684593), ('isOriginal', 0), ('publicationDate', '2018-08-24T00:00:00'), ('isPartnerDetail', 1)]\n" - ] - } - ], - "source": [ - "query_availability_C(\"A\", \"H4\", \"2017\", \"France\", partner = \"Germany\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, the data availabilty json is good if you have problems for observations between a reporter country and all its trade partners, while it is useless to solve the observation problem should it arise for bilateral flows (it should not since UN Comtrade increase the max size of the request). We did include a partner paramater but the request ignored it since it is not a paramenter of the data availability query. The request sent us the number of observations for all trade flows (including re-exports and re-imports) between the reporter country and all trade partners." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us check the availability for the monthly data just for fun" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The country France is in the list with id 251\n", - "The country All is in the list with id all\n", - "The url for HS and trade flows between France and All in 201701 has been created. Processing data availability file...\n", - "The json for HS, France, All, 201701 is now available. 
Now it is time to unpack it.\n", - "[('type', 'COMMODITIES'), ('freq', 'MONTHLY'), ('px', 'HS'), ('r', '251'), ('rDesc', 'France'), ('ps', '201701'), ('TotalRecords', 373901), ('isOriginal', 1), ('publicationDate', '2018-08-22T00:00:00'), ('isPartnerDetail', 1)]\n" - ] - } - ], - "source": [ - "query_availability_C(\"M\", \"HS\", \"2017\", \"France\", partner = \"All\", month = \"01\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Same problem as before, we only get aggregate observations for France." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Clean the Data folder by deleting the csv files. We could also write down a function to do that. You can do that as an exercise, you will need to look at some of the functions in the os library." - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [], - "source": [ - "# For Windows\n", - "#os.remove(os.getcwd() + \"\\\\Data\\\\A\\\\H4\\\\2017\\\\France\\\\France_Germany.csv\")\n", - "#os.remove(os.getcwd() + \"\\\\Data\\\\A\\\\H4\\\\2017\\\\France\\\\France_All.csv\")\n", - "#os.remove(os.getcwd() + \"\\\\Data\\\\M\\\\HS\\\\201701\\\\France\\\\France_Germany.csv\")\n", - "\n", - "# For MacOS\n", - "os.remove(os.getcwd() + \"/Data/A/H4/2017/France/France_Germany.csv\")\n", - "os.remove(os.getcwd() + \"/Data/A/H4/2017/France/France_All.csv\")\n", - "#os.remove(os.getcwd() + \"/Data/M/HS/201701/France/France_Germany.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will look at some of the problems that you might have using the UN Comtrade API without a license below. First, we will create a for loop to make requests for bilateral data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [], - "source": [ - "# We might include an elif to avoid the request for the \"All\" partner, but we do not know if, at least for some reporter country (maybe small countries) it works because there are less observations\n", - "def reporter_requests(frequency, classification, year, reporter, OS, month = \"\"):\n", - " index = 0\n", - " for x in range(len(unpacked_values)):\n", - " if reporter == unpacked_values[x][1]:\n", - " continue\n", - " else:\n", - " req = bilateral_requests(frequency, classification, year, reporter, unpacked_values[x][1], OS, month = month)\n", - " index += 1\n", - " if req.status_code != 200:\n", - " print(f\"The request was not successful for {frequency}, {classification}, {year + month}, {reporter}, {unpacked_values[x][1]}\")\n", - " break\n", - " elif index == 4:\n", - " print(f\"Since this is only an example, we stop at the index {index} since you might not want to download all the bilateral data for {reporter} in {year + month}\")\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The country France is in the list with id 251\n", - "The country All is in the list with id all\n", - "The url for H4 and trade flows between France and All in 2017 has been created. Processing request...\n", - "The request for H4, France, All, 2017 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2017&r=251&p=all&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The country France is in the list with id 251\n", - "The country Afghanistan is in the list with id 4\n", - "The url for H4 and trade flows between France and Afghanistan in 2017 has been created. Processing request...\n", - "The request for H4, France, Afghanistan, 2017 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2017&r=251&p=4&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The country France is in the list with id 251\n", - "The country Albania is in the list with id 8\n", - "The url for H4 and trade flows between France and Albania in 2017 has been created. Processing request...\n", - "The request for H4, France, Albania, 2017 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2017&r=251&p=8&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "The country France is in the list with id 251\n", - "The country Algeria is in the list with id 12\n", - "The url for H4 and trade flows between France and Algeria in 2017 has been created. Processing request...\n", - "The request for H4, France, Algeria, 2017 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2017&r=251&p=12&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2017/France/\n", - "Since this is only an example, we stop at the index 4 since you might not want to download all the bilateral data for France in 2017\n" - ] - } - ], - "source": [ - "reporter_requests(\"A\", \"H4\", \"2017\", \"France\", \"MacOS\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let us mention a couple of problems that you might find using the UN Comtrade API:\n", - "1. If you got an error at some point the loop will break. There are a couple of possible explanation for this:\n", - " * We know that the request for the parameters above is well defined (meaning that the data are available, believe me on this). However, as we have seen before, they might exceed the number of observations available to free users. 
We have seen that this still return an HTTP code equal to 200, so it does not break the loop (more on this below). More on how to deal with large number of observations below.\n", - " * Most likely, the problem is that free users can send 1 request per second. Since the files for the bilateral data are quite small, the loop might cycle requests really fast. In that case, you should get an HTTP code different from 200, breaking the loop. To avoid this, we can just import the time library and include a time.sleep(1) inside the loop.\n", - " * You exceed the number of requests (100) that free users can make to the API in a 60 minutes window. To keep track of this, we can add an index = 0 at the beginning of the function, and increase it by 1 with each iteration of the for loop. Then, just add an if statement that, when the index is close to 100, uses time.sleep() for a sufficient number of minutes to reset the counter. Inside the function, after the time.sleep(), you can reset the index and go back to the iterations of the loop. Otherwise, instead of the counter, you can use exception handling to tell the code to sleep once the API returns the error (using an HTTP code) associated to the user request limit. When you hit the limit you receive a 409 HTTP code, which would stop the code, and the first row of the last file donwloaded will tell you why you received the error and, if the error was cause by the requests limit per hour, the time in which you can get back to sending requests. You could import in Python the first row from that file to extract, using regular expressions, the time to resume the requests, and use it to restart the loop creating the requests. We will not provide the code to do that here since we already cover a lot of topics in this session. \n", - "2. The loop stops after you have already completed a certain amount of iterations. 
This might happen for multiple reasons such as a loss of internet connection which leads to an error when you try to make the request or you simply interrupt the code by hand to close the notebook. In that case, you do not want to download again files that you have already stored in your folder, since it will burn your number of available requests per hour. Below we address this problem using a function that tells you whether a file is already stored in your folder and, in that case, does not submit a request since you already have the file. Below we do not provide the code to deal handle the exception from the loss of internet connection during a request. Try to write it without our help!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below the function to check the existing files in your folder." - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [], - "source": [ - "def check_existence(frequency, classification, year, reporter, partner, OS, month = \"\"):\n", - " if OS == \"Windows\":\n", - " try:\n", - " if reporter + \"_\" + partner + \".csv\" in os.listdir(os.getcwd() + \"\\\\Data\\\\\" + frequency + \"\\\\\" + classification + \"\\\\\" + year + month + \"\\\\\" + reporter):\n", - " print (f\"File {reporter}_{partner}.csv already exists, skip to next iteration.\")\n", - " return True\n", - " except FileNotFoundError:\n", - " print (f\"The folder does not exist, implying that the file {reporter}_{partner}.csv does not exist, continue with this iteration.\")\n", - " elif OS == \"MacOS\":\n", - " try:\n", - " if reporter + \"_\" + partner + \".csv\" in os.listdir(os.getcwd() + \"/Data/\" + frequency + \"/\" + classification + \"/\" + year + month + \"/\" + reporter):\n", - " print (f\"File {reporter}_{partner}.csv already exists, skip to next iteration.\")\n", - " return True\n", - " except FileNotFoundError:\n", - " print (f\"The folder does not exist, implying that the file 
{reporter}_{partner}.csv does not exist, continue with this iteration.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Slightly modify the reporter_requests function defined above to include this" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [], - "source": [ - "def reporter_requests_v2(frequency, classification, year, reporter, OS, month = \"\"):\n", - " index = 0\n", - " for x in range(len(unpacked_values)):\n", - " if reporter == unpacked_values[x][1]:\n", - " continue\n", - " else:\n", - " existence = check_existence(frequency, classification, year, reporter, unpacked_values[x][1], OS, month = month)\n", - " if existence:\n", - " continue\n", - " else:\n", - " req = bilateral_requests(frequency, classification, year, reporter, unpacked_values[x][1], OS, month = month)\n", - " index += 1\n", - " if req.status_code != 200:\n", - " print(f\"The request was not successful for {frequency}, {classification}, {year + month}, {reporter}, {unpacked_values[x][1]}\")\n", - " break\n", - " elif index == 4:\n", - " print(f\"Since this is only an example, we stop at the index {index} since you might not want to download all the bilateral data for {reporter} in {year + month}\")\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The folder does not exist, implying that the file France_All.csv does not exist, continue with this iteration.\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The country France is in the list with id 251\n", - "The country All is in the list with id all\n", - "The url for H4 and trade flows between France and All in 2016 has been created. 
Processing request...\n", - "The request for H4, France, All, 2016 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2016&r=251&p=all&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The country France is in the list with id 251\n", - "The country Afghanistan is in the list with id 4\n", - "The url for H4 and trade flows between France and Afghanistan in 2016 has been created. Processing request...\n", - "The request for H4, France, Afghanistan, 2016 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2016&r=251&p=4&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The country France is in the list with id 251\n", - "The country Albania is in the list with id 8\n", - "The url for H4 and trade flows between France and Albania in 2016 has been created. Processing request...\n", - "The request for H4, France, Albania, 2016 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2016&r=251&p=8&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "The country France is in the list with id 251\n", - "The country Algeria is in the list with id 12\n", - "The url for H4 and trade flows between France and Algeria in 2016 has been created. Processing request...\n", - "The request for H4, France, Algeria, 2016 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=2016&r=251&p=12&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/2016/France/\n", - "Since this is only an example, we stop at the index 4 since you might not want to download all the bilateral data for France in 2016\n" - ] - } - ], - "source": [ - "reporter_requests_v2(\"A\", \"H4\", \"2016\", \"France\", \"MacOS\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see another problem with the API, let us make a request for data that are not in the database (the H4 classification is from 2012, so there are no data using this classification in 1992)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The folder does not exist, implying that the file France_All.csv does not exist, continue with this iteration.\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The country France is in the list with id 251\n", - "The country All is in the list with id all\n", - "The url for H4 and trade flows between France and All in 1992 has been created. Processing request...\n", - "The request for H4, France, All, 1992 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=1992&r=251&p=all&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The country France is in the list with id 251\n", - "The country Afghanistan is in the list with id 4\n", - "The url for H4 and trade flows between France and Afghanistan in 1992 has been created. Processing request...\n", - "The request for H4, France, Afghanistan, 1992 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=1992&r=251&p=4&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The country France is in the list with id 251\n", - "The country Albania is in the list with id 8\n", - "The url for H4 and trade flows between France and Albania in 1992 has been created. Processing request...\n", - "The request for H4, France, Albania, 1992 has been completed. The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=1992&r=251&p=8&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The folder at /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/ has been created.\n", - "/home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "The country France is in the list with id 251\n", - "The country Algeria is in the list with id 12\n", - "The url for H4 and trade flows between France and Algeria in 1992 has been created. Processing request...\n", - "The request for H4, France, Algeria, 1992 has been completed. 
The HTTP code is: 200\n", - "http://comtrade.un.org/api/get?max=100000&type=C&freq=A&px=H4&ps=1992&r=251&p=12&rg=1,2&cc=ALL&fmt=csv&head=M\n", - "Writing .csv file in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "File .csv saved in /home/moritz/Documents/GitHub/Classes/Session_5/Data/A/H4/1992/France/\n", - "Since this is only an example, we stop at the index 4 since you might not want to download all the bilateral data for France in 1992\n" - ] - } - ], - "source": [ - "reporter_requests_v2(\"A\", \"H4\", \"1992\", \"France\", \"MacOS\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you try to open one of the .csv file downloaded, you will notice that the request was successful but the first entry of the downloaded file tells you that the data are not available for that query." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check what we get from the data availability request if we use this classification and year" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The country France is in the list with id 251\n", - "The url for H4 and trade flows between France and in 1992 has been created. Processing data availability file...\n", - "The json for H4, France, , 1992 is now available. Now it is time to unpack it.\n", - "[]\n" - ] - } - ], - "source": [ - "query_availability_C(\"A\", \"H4\", \"1992\", \"France\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is empty, meaning that the data are not available for this combination of parameter. Let us discuss the problems of the current code and possible solutions. You can improve the code to include those solutions as an exercise." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exceeding number of observations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have seen that the query for the data availability does not really help to adrress this problem. We have also noticed that the request is successful but we get a file that in the first entry says \"Result too large: you do not have permissions to access such a large resultset.\".\n", - "1. Remember that our query asks for imports and exports at the same time. You can rewrite the functions to include trade_flows as argument (adjust also the functions to construct folders, etc. obvisouly). In this way, you reduce the number of observations per file, but increase the number of requests you have to make.\n", - "2. Include in our main function another function that opens each file that we download to read the first entry. If we get the string \"Results too large etc.\" we can then sae the parameters of this query to a list so that we know which query had the observations problem. Then, we will have to break down those query to find a request with fewer observations. For example, instead of downloading all the commodity codes at once, we split them up (look at the \"cc=\" parameters on the API documentation). Otherwise, split the trade flows as suggested in 1." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# More than 1 request per second" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The rate limit should return a 409 error (according to the documentation). This is the same code returned once you exceed 100 requests per hour (again, according to the documentation).\n", - "1. To avoid the more than 1 request per second problem you just need to include a time.sleep(1) between iterations of the requests. \n", - "2. 
Even if you have this problem, it is possible that UN Comtrade will return a .csv file containing the error as its first entry. As for the observations problem, you can just open the file and check the first entry to see if there is a problem. Then, just tell the loop to submit the query again if that was the problem listed in the file." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Usage limit, more than 100 requests per hour" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned above, this should get a 409 status code, so you can deal with it using exception handling and time.sleep(). As for the previous problems, you might get a file that reports the error in its first entry, including the time at which you can start submitting requests again. You can extract that and tell Python to restart the requests at that time. Otherwise, just include an index to keep track of how many requests you have made, to stop before the limit, and to tell the code to sleep." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data not available" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For this problem you can again use the first entry of the file downloaded, which will tell you that there is a problem of availability with the data. Otherwise, include at the start of the function the query for the availability of the data. If the json from that query is an empty list you know that the data are not available, so you can avoid making that request for the data to the API.
The latter approach is better since it should not use up your hourly request quota (the request for the data availability probably is not included in the usage limit)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Folder management" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are things that you can do to improve folder management of the data downloaded while keeping track of the requests made. You should add a function that deletes the files reporting errors (without data) after you open them to get information about the error contained and the parameters used for that request. In this way you can keep track of the parameters for which you do not have data due to errors. You should also store this information somewhere since it might be useful at some point.\n", - "\n", - "If you want to update your data files over time you can create a function that looks at the creation time of your .csv files and deletes them if they are older than your desired threshold. Then, you can download them again. Whether this is useful or not depends on how often UN Comtrade revises old data. For example, they might never change the content of files for trade flows between countries in 1973. In that case, deleting the old file and downloading it again is useless since there are no changes to the data. Instead, they might update trade flows data for the last 2-3 years, so you might want to update the files associated with this time window."
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/Session_5/Ex 2 - GOP Twitter.ipynb b/Session_5/Ex 2 - GOP Twitter.ipynb deleted file mode 100644 index 15bbaa6..0000000 --- a/Session_5/Ex 2 - GOP Twitter.ipynb +++ /dev/null @@ -1,696 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we are finally at the final project. As you know, we are going to extract\n", - "information on the prices of cryptocurrencies and see if the sentiment of tweets about these cryptocurrencies correlates with\n", - "the evolution of their price.\n", - "\n", - "Of course, we're not going to code everything from the beginning. We can reuse parts of the code that other people have made public to\n", - "be able to accelerate our development. These code parts are\n", - "usually called \"libraries\", and identifying which ones are useful\n", - "and how they work is often an important part of a project (especially for smaller projects).\n", - "\n", - "In this notebook, we'll see how the libraries you'll\n", - "need for the project work. Of course, we will only show here\n", - "certain functions that you will probably have to use.\n", - "This list is not exhaustive and you will still need to look for\n", - "information on libraries and functions on the internet.\n", - "Moreover, if you prefer to use other libraries for the project,\n", - "you're obviously free to do so.\n", - "\n", - "### Requests and APIs\n", - "\n", - "A good part of the project consists in finding information through\n", - "APIs.
In this section, we see what APIs are and how to\n", - "use them with the *requests* library.\n", - "\n", - "### What's an API?\n", - "\n", - "API is the acronym for Application Programming Interface. It is actually\n", - "a set of instructions that the designers of an application (whether local or online) made available to other programmers who would like to use their application through code-based interactions (i.e. without going through the GUI).\n", - "\n", - "There are many APIs for a whole bunch of sites or applications. For the project, you will need to use the Coinpaprika and Twitter APIs. To\n", - "use the first API, we can use a very powerful library\n", - "that just makes a call to the site and gets an answer. For the Twitter API, the fact that you have to identify yourself makes the process more\n", - "complex and, for this reason, we will use a library that manages\n", - "most of the authentication details for us.\n", - "\n", - "Here's a video that explains what a web API is (there are also\n", - "local APIs, but we won't see them in this course). *The company that produced this video sells integration solutions, so the last part of the video is largely an ad, but the video is\n", - "instructive if you've never used an API before.*\n", - "\n", - "http://www.youtube.com/watch?feature=player_embedded&v=s7wmiS2mSXY\n", - "\n", - "#### The Game of Thrones API\n", - "\n", - "To use an API, you usually need to know what you want to\n", - "retrieve. During this session, and because it's in season, we're going to\n", - "use an API that allows us to access information about [Game of Thrones characters and their houses](https://anapioficeandfire.com/). In this saga, characters from several families (called\n", - "houses) compete to become the Queen or King of the 7 Kingdoms, and\n", - "govern Westeros. Dragons are also part of the picture.
*If you have a worse way of describing it, let me know.*\n", - "\n", - "This kind of API may seem a bit pointless as it is. However, it is\n", - "possible to couple this API with others and to program a little to get [this kind of result](https://got.show/) - a predictive model giving the likelihood that each character will die in the last season (a project from the Munich Technical University).\n", - "\n", - "The first step, after identifying the API you want to use, is to read the documentation. Fortunately, in the case of the API for\n", - "GoT (we will use this abbreviation for Game of Thrones in the\n", - "rest of this notebook), it's pretty simple. The documentation\n", - "can be found [on this page](https://anapioficeandfire.com/Documentation).\n", - "\n", - "On this page you can find some important information:\n", - "\n", - "1. We will not be able to receive information on more than 50 characters or houses per API call.\n", - "2. There are two interesting \"features\" for us\n", - " - The one that lists the characters ([https://www.anapioficeandfire.com/api/characters](https://www.anapioficeandfire.com/api/characters) + options)\n", - " - The one that lists the houses ([https://www.anapioficeandfire.com/api/houses](https://www.anapioficeandfire.com/api/houses) + options)\n", - "\n", - "The Requests library\n", - "\n", - "To use simple APIs like this one (and the one from Coinpaprika), you can use the *requests* library. This library manages forwarding the request to the server and receiving the response, and additionally deals with formatting. Let's try to load the first page of results listing the GoT characters.
\n", - "\n", - "Let's start by importing the requests library.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will now write the string containing the address that the documentation gives as the one to get the character list. We store it in a variable and then pass it through the\n", - "string to the get() function from the requests library." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "request_string = \"https://www.anapioficeandfire.com/api/characters\"\n", - "response = requests.get(request_string)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, nothing is returned, but you can display the contents of the response." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'url': 'https://www.anapioficeandfire.com/api/characters/1',\n", - " 'name': '',\n", - " 'gender': 'Female',\n", - " 'culture': 'Braavosi',\n", - " 'born': '',\n", - " 'died': '',\n", - " 'titles': [''],\n", - " 'aliases': ['The Daughter of the Dusk'],\n", - " 'father': '',\n", - " 'mother': '',\n", - " 'spouse': '',\n", - " 'allegiances': [],\n", - " 'books': ['https://www.anapioficeandfire.com/api/books/5'],\n", - " 'povBooks': [],\n", - " 'tvSeries': [''],\n", - " 'playedBy': ['']},\n", - " {'url': 'https://www.anapioficeandfire.com/api/characters/2',\n", - " 'name': 'Walder',\n", - " 'gender': 'Male',\n", - " 'culture': '',\n", - " 'born': '',\n", - " 'died': '',\n", - " 'titles': [''],\n", - " 'aliases': ['Hodor'],\n", - " 'father': '',\n", - " 'mother': '',\n", - " 'spouse': '',\n", - " 'allegiances': ['https://www.anapioficeandfire.com/api/houses/362'],\n", - " 'books': ['https://www.anapioficeandfire.com/api/books/1',\n", - " 
'https://www.anapioficeandfire.com/api/books/2',\n", - " 'https://www.anapioficeandfire.com/api/books/3',\n", - " 'https://www.anapioficeandfire.com/api/books/5',\n", - " 'https://www.anapioficeandfire.com/api/books/8'],\n", - " 'povBooks': [],\n", - " 'tvSeries': ['Season 1', 'Season 2', 'Season 3', 'Season 4', 'Season 6'],\n", - " 'playedBy': ['Kristian Nairn']},\n", - " {'url': 'https://www.anapioficeandfire.com/api/characters/3',\n", - " 'name': '',\n", - " 'gender': 'Male',\n", - " 'culture': '',\n", - " 'born': '',\n", - " 'died': '',\n", - " 'titles': [''],\n", - " 'aliases': ['Lamprey'],\n", - " 'father': '',\n", - " 'mother': '',\n", - " 'spouse': '',\n", - " 'allegiances': ['https://www.anapioficeandfire.com/api/houses/15'],\n", - " 'books': ['https://www.anapioficeandfire.com/api/books/3'],\n", - " 'povBooks': [],\n", - " 'tvSeries': [''],\n", - " 'playedBy': ['']}]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "response.json()[0:3]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we must use the .json() method because response type objects are complex and contain several elements.\n", - "\n", - "*[{'url': 'https://www.anapioficeandfire.com/api/characters/1', 'name': '', 'gender': 'Female', 'culture': 'Braavosi', 'born': '', 'died': '', 'titles': [''], 'aliases': ['The Daughter of the Dusk'], 'father': '', 'mother': '', 'spouse': '', 'allegiances': [], 'books': ['https://www.anapioficeandfire.com/api/books/5'], 'povBooks': [], 'tvSeries': [''], 'playedBy': ['']}, {'url': 'https://www.anapioficeandfire.com/api/characters/2', 'name': 'Walder', 'gender': 'Male', 'culture': '', 'born': '', 'died': '', 'titles': [''], 'aliases': ['Hodor'], 'father': '', 'mother': '', 'spouse': '', 'allegiances': ['https://www.anapioficeandfire.com/api/houses/362'], 'books': ['https://www.anapioficeandfire.com/api/books/1', 'https://www.anapioficeandfire.com/api/books/2', 
'https://www.anapioficeandfire.com/api/books/3', 'https://www.anapioficeandfire.com/api/books/5', 'https://www.anapioficeandfire.com/api/books/8'], 'povBooks': [], 'tvSeries': ['Season 1', 'Season 2', 'Season 3', 'Season 4', 'Season 6'], 'playedBy': ['Kristian Nairn']}, {'url': 'https://www.anapioficeandfire.com/api/characters/3', 'name': '', 'gender': 'Male', 'culture': '', 'born': '', 'died': '', 'titles': [''], 'aliases': ['Lamprey'], 'father': '', 'mother': '', 'spouse': '', 'allegiances': ['https://www.anapioficeandfire.com/api/houses/15'], 'books': ['https://www.anapioficeandfire.com/api/books/3'], 'povBooks': [], 'tvSeries': [''], 'playedBy': ['']}]*\n", - "\n", - "As we can see, the function returned a list of 10 characters (which do not have names, this is due to the fact that, the characters being\n", - "ranked by alphabetical order, the characters whose names are\n", - "empty appear first. We'll exclude these characters a little\n", - "later).\n", - "\n", - "One problem with this result is that it only returns 10 characters. We want them all. So we'll have to create a loop that\n", - "continues to request additional pages until the answer is the\n", - "last. Since the logic is the same for the characters and the\n", - "houses, we're going to put that part of the code into a function." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'first_response' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m# If you don't understand the next line, it's not too bad. 
They are specific to the GoT API and are not really relevant to the course.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# If you want to understand why this is necessary, feel free to ask the assistant or send us a mail\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mlast_page\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfirst_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Link\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mii\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlast_page\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'first_response' is not defined" - ] - } - ], - "source": [ - "def get_GoT_info(root, start, end): # The argument is the root of the address which indicates whether we deal with the characters or the houses. \n", - " final_list = [] # Create an empty list \n", - " page_size = 50 # We take the maximum number of objects per call to avoid too many API calls. \n", - " options = {\"page\": \"1\", \"pageSize\": str(page_size)} # The options for the first page, we need them to know how many pages we should extract. \n", - " first_response = requests.get(root, params= options) # This line retrieves the first response. 
In this response, one of the fields in the \"headers\" contains the number of the last info page. \n", - "\n", - "# If you don't understand the next line, it's not too bad. They are specific to the GoT API and are not really relevant to the course. \n", - "# If you want to understand why this is necessary, feel free to ask the assistant or send us a mail \n", - "last_page = int(first_response.headers[\"Link\"] [-start:-end]) \n", - "\n", - "for ii in range(1,last_page+1): \n", - "\toptions = {\"page\": str(ii), \"pageSize\": str(page_size)} \n", - "\tresponse = requests.get(root, params=options) \n", - "\tfinal_list = final_list + response.json() \n", - "\treturn final_list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The function is a bit weird but it first checks the number of content pages available on the site (you never know they add, they might add characters or houses during the last episodes). Next, for each page, we ask the site to send us the list of the 50\n", - "characters or houses on this page. The characters or houses are\n", - "stored in a list (named *response*) and finally this list is added to the resulting list of all previous iterations.\n", - "\n", - "We can now create two lists. One containing all the dictionaries representing the characters and the other one containing the dictionaries\n", - "containing the houses." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "characters = get_GoT_info(\"https://www.anapioficeandfire.com/api/characters\", 27, 25) \n", - "# Numbers 27, 26 and 25 were determined by inspecting the chains manually\n", - "houses = get_GoT_info(\"https://www.anapioficeandfire.com/api/houses\", 26, 25) \n", - "# A good exercise (but one that requires a lot of additional study) would be to automatically determine the number to be extracted. This requires the use of regular expressions (which we will not see in this course)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So much for the Game of Thrones API. Now let's see a short example of how to use the Twitter API.\n", - "\n", - "The Twitter API\n", - "\n", - "Some APIs are freely available, such as the GoT API. You can easily query them with *requests*. For Twitter, it's a bit more complicated. You need credentials.\n", - "\n", - "You can create credentials quite simply: start by creating a\n", - "Twitter account (if no member of your group has one). Then go\n", - "to the [Twitter for developers](https://developer.twitter.com/) site. Then create a new application. This should give you access to 4 codes identifying your account and application. These codes are\n", - "meaningless sequences of letters and numbers.\n", - "\n", - "To use the Twitter API, we will use the [Python-Twitter](https://python-twitter.readthedocs.io/en/latest/) library. Using this library, you just have to give the credentials you received from Twitter and make requests." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "api_key = \"5oxVSi0q8FWGKSZ68D2iwnFLZ\"\n", - "api_secret_key = \"51esJc1fCtAWpgBPd6TVr4Lun5QbtjI6afOJ00uPBD1x74L8eL\"\n", - "access_token = \"4143588544-7mu5NDQ7x1xa1hoIsZn2sT93GI01zbBDVue634c\"\n", - "access_secret_token = \"qQqrfGwuEduRZ5uAqekq72IoqkdDbdfsZVjB4NiV17wSJ\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now create an *API* object from the library.
This object (we haven't covered objects in this class; they would probably be the next topic to study if you want to go further) allows us to make queries to Twitter, which can send us a copy of the tweets matching certain criteria.\n", - "\n", - "A detailed understanding of the library requires reading the\n", - "documentation in detail but, for simple queries, you can use\n", - "the following workflow: create an API object $\\rightarrow$ define a string representing the search $\\rightarrow$ send it to the *getSearch()* method of the API object $\\rightarrow$ process the resulting list." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "#import sys\n", - "#!{sys.executable} -m pip install twitter\n", - "\n", - "from twitter import *\n", - "\n", - "api = Twitter(\n", - " auth=OAuth(token=access_token, token_secret=access_secret_token, \n", - " consumer_key=api_key, consumer_secret=api_secret_key))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code creates the API object. We can now create the request. In this case, we'll just search for all the tweets regarding the houses of Stark, Lannister and Targaryen." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "main_houses = [\"Stark\", \"Lannister\", \"Targaryen\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we need to format a search string. To do this, you can use the search page in the following way: use the advanced search tool,\n", - "specifying the parameters you want. Then copy the parameter part of the URL."
- ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['HouseStark', 'HouseLannister', 'HouseTargaryen']\n" - ] - } - ], - "source": [ - "query_chain = [\"House\"+name for name in main_houses]\n", - "print(query_chain)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'statuses': [{'created_at': 'Mon Mar 09 06:29:26 +0000 2020', 'id': 1236901891394670594, 'id_str': '1236901891394670594', 'text': \"@bIuerosez Seconded. People lack the ability to see the nuanced characters. Some people are not all good. And they'… https://t.co/vWNVRRa1nU\", 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'bIuerosez', 'name': 'chi', 'id': 1159864331066052608, 'id_str': '1159864331066052608', 'indices': [0, 10]}], 'urls': [{'url': 'https://t.co/vWNVRRa1nU', 'expanded_url': 'https://twitter.com/i/web/status/1236901891394670594', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': 1236818877109149704, 'in_reply_to_status_id_str': '1236818877109149704', 'in_reply_to_user_id': 1159864331066052608, 'in_reply_to_user_id_str': '1159864331066052608', 'in_reply_to_screen_name': 'bIuerosez', 'user': {'id': 1051029304103096320, 'id_str': '1051029304103096320', 'name': 'Rhᥲᥱᥒყs Tᥲrgᥲrყᥱᥒ', 'screen_name': 'Rhaenys_IX', 'location': 'Ikeja, Nigeria', 'description': '🇧\\u200b🇺\\u200b🇷\\u200b🇳\\u200b🇪\\u200b🇩\\u200b 🇦\\u200b🇳\\u200b🇩\\u200b 🇺\\u200b🇳\\u200b🇧\\u200b🇴\\u200b🇼\\u200b🇪\\u200b🇩\\u200b', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 396, 'friends_count': 343, 
'listed_count': 0, 'created_at': 'Sat Oct 13 08:38:08 +0000 2018', 'favourites_count': 4633, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 3256, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': None, 'profile_background_image_url_https': None, 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1229689925001531393/MeUUKGMU_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1229689925001531393/MeUUKGMU_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1051029304103096320/1582054223', 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': True, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 20:54:56 +0000 2020', 'id': 1236757315497459718, 'id_str': '1236757315497459718', 'text': 'günde sadece 4 saat uyku\\ndört bir yanı hain dolu \\nyoruldun biliyoruz ama \\nbu dava sensiz olmaz daenerys… https://t.co/SzmYA4A4YD', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/SzmYA4A4YD', 'expanded_url': 'https://twitter.com/i/web/status/1236757315497459718', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [105, 128]}]}, 'metadata': {'result_type': 'recent', 'iso_language_code': 'tr'}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 
None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 314407493, 'id_str': '314407493', 'name': 'zeynep ünsal', 'screen_name': 'znsal', 'location': 'Ankara, Türkiye', 'description': 'passionate artist & potential architect in the cyber world', 'url': 'https://t.co/jv9NEU3fWY', 'entities': {'url': {'urls': [{'url': 'https://t.co/jv9NEU3fWY', 'expanded_url': 'http://instagram.com/unsalz/', 'display_url': 'instagram.com/unsalz/', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 475, 'friends_count': 134, 'listed_count': 3, 'created_at': 'Fri Jun 10 06:20:39 +0000 2011', 'favourites_count': 1887, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 1756, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '131516', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme17/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme17/bg.gif', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1236755439922094081/NDnh7zIC_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1236755439922094081/NDnh7zIC_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/314407493/1581453720', 'profile_link_color': '981CEB', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': 'C8CF02', 'profile_text_color': 'E6781E', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'regular'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 
12, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'tr'}, {'created_at': 'Sun Mar 08 19:24:02 +0000 2020', 'id': 1236734438140252160, 'id_str': '1236734438140252160', 'text': 'RT @AilikHeda_: Siccome mi sentivo abbastanza in vena ho scritto un nuovo capitolo che ha 3 pov!\\nhttps://t.co/Z79finLEmY\\n#GameOfThrones #Go…', 'truncated': False, 'entities': {'hashtags': [{'text': 'GameOfThrones', 'indices': [121, 135]}], 'symbols': [], 'user_mentions': [{'screen_name': 'AilikHeda_', 'name': 'ثریا', 'id': 1003252529403219968, 'id_str': '1003252529403219968', 'indices': [3, 14]}], 'urls': [{'url': 'https://t.co/Z79finLEmY', 'expanded_url': 'https://efpfanfic.net/viewstory.php?sid=3890200', 'display_url': 'efpfanfic.net/viewstory.php?…', 'indices': [97, 120]}]}, 'metadata': {'iso_language_code': 'it', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1003252529403219968, 'id_str': '1003252529403219968', 'name': 'ثریا', 'screen_name': 'AilikHeda_', 'location': 'lost in Westeros ~', 'description': \"|So che è orgogliosa. Perché no? Che altro le rimaneva, se non l'orgoglio? So che è forte. 
Come poteva essere altrimenti?| \\n|Jus drein jus daun| \\n~FAN ACCOUNT~\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 450, 'friends_count': 538, 'listed_count': 6, 'created_at': 'Sun Jun 03 12:30:17 +0000 2018', 'favourites_count': 10964, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 11959, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1236719448217866241/oyvFPnyo_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1236719448217866241/oyvFPnyo_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1003252529403219968/1583240442', 'profile_link_color': '1B95E0', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Sat Mar 07 20:05:28 +0000 2020', 'id': 1236382480418197504, 'id_str': '1236382480418197504', 'text': 'Siccome mi sentivo abbastanza in vena ho scritto un nuovo capitolo che ha 3 pov!\\nhttps://t.co/Z79finLEmY… https://t.co/D1efcXoFp3', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/Z79finLEmY', 'expanded_url': 'https://efpfanfic.net/viewstory.php?sid=3890200', 'display_url': 
'efpfanfic.net/viewstory.php?…', 'indices': [81, 104]}, {'url': 'https://t.co/D1efcXoFp3', 'expanded_url': 'https://twitter.com/i/web/status/1236382480418197504', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [106, 129]}]}, 'metadata': {'iso_language_code': 'it', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1003252529403219968, 'id_str': '1003252529403219968', 'name': 'ثریا', 'screen_name': 'AilikHeda_', 'location': 'lost in Westeros ~', 'description': \"|So che è orgogliosa. Perché no? Che altro le rimaneva, se non l'orgoglio? So che è forte. Come poteva essere altrimenti?| \\n|Jus drein jus daun| \\n~FAN ACCOUNT~\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 450, 'friends_count': 538, 'listed_count': 6, 'created_at': 'Sun Jun 03 12:30:17 +0000 2018', 'favourites_count': 10964, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 11959, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1236719448217866241/oyvFPnyo_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1236719448217866241/oyvFPnyo_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1003252529403219968/1583240442', 'profile_link_color': '1B95E0', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 
'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'it'}, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'it'}, {'created_at': 'Sun Mar 08 13:52:08 +0000 2020', 'id': 1236650913659379713, 'id_str': '1236650913659379713', 'text': 'LOVE YOUUUUU ALL TARGARYEN!!! \\nu made ur momma proud, as always\\n- your cute house head -\\n[ day 2 ]\\n@PlmAces… https://t.co/ruWCh6OR8X', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'PlmAces', 'name': 'PLM-ACES', 'id': 1167436768255172608, 'id_str': '1167436768255172608', 'indices': [99, 107]}], 'urls': [{'url': 'https://t.co/ruWCh6OR8X', 'expanded_url': 'https://twitter.com/i/web/status/1236650913659379713', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [109, 132]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1008623532, 'id_str': '1008623532', 'name': 'biancake 🌻', 'screen_name': 'iambiancafaye', 'location': 'Philippines', 'description': 'take the moment and make it perfect.', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 653, 'friends_count': 431, 'listed_count': 2, 'created_at': 'Thu Dec 13 11:48:23 +0000 2012', 'favourites_count': 13886, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 15606, 'lang': 
None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'ACDED6', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme18/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme18/bg.gif', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1191938071585009664/OQN3d_t__normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1191938071585009664/OQN3d_t__normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1008623532/1576898320', 'profile_link_color': '91D2FA', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'F6F6F6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 5, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 01:09:27 +0000 2020', 'id': 1236458980362326022, 'id_str': '1236458980362326022', 'text': 'ArtOfWarXWarOfArt \\n\\n#representltd #rephard #newera #neweracap #got #housetargaryen #dragons https://t.co/xFbUqfSdot', 'truncated': False, 'entities': {'hashtags': [{'text': 'representltd', 'indices': [20, 33]}, {'text': 'rephard', 'indices': [34, 42]}, {'text': 'newera', 'indices': [43, 50]}, {'text': 'neweracap', 'indices': [51, 61]}, {'text': 'got', 'indices': [62, 66]}, {'text': 'housetargaryen', 'indices': [67, 82]}, {'text': 'dragons', 'indices': [83, 91]}], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 1236458967905316865, 'id_str': '1236458967905316865', 'indices': [92, 115], 'media_url': 
'http://pbs.twimg.com/media/ESjI4VlXsAEgEq7.jpg', 'media_url_https': 'https://pbs.twimg.com/media/ESjI4VlXsAEgEq7.jpg', 'url': 'https://t.co/xFbUqfSdot', 'display_url': 'pic.twitter.com/xFbUqfSdot', 'expanded_url': 'https://twitter.com/unable08/status/1236458980362326022/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'large': {'w': 2048, 'h': 2048, 'resize': 'fit'}, 'medium': {'w': 1200, 'h': 1200, 'resize': 'fit'}, 'small': {'w': 680, 'h': 680, 'resize': 'fit'}}}]}, 'extended_entities': {'media': [{'id': 1236458967905316865, 'id_str': '1236458967905316865', 'indices': [92, 115], 'media_url': 'http://pbs.twimg.com/media/ESjI4VlXsAEgEq7.jpg', 'media_url_https': 'https://pbs.twimg.com/media/ESjI4VlXsAEgEq7.jpg', 'url': 'https://t.co/xFbUqfSdot', 'display_url': 'pic.twitter.com/xFbUqfSdot', 'expanded_url': 'https://twitter.com/unable08/status/1236458980362326022/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'large': {'w': 2048, 'h': 2048, 'resize': 'fit'}, 'medium': {'w': 1200, 'h': 1200, 'resize': 'fit'}, 'small': {'w': 680, 'h': 680, 'resize': 'fit'}}}]}, 'metadata': {'iso_language_code': 'und', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 46298804, 'id_str': '46298804', 'name': 'abe', 'screen_name': 'unable08', 'location': '', 'description': 'my crappy audio covers', 'url': 'https://t.co/rQLeFB0up9', 'entities': {'url': {'urls': [{'url': 'https://t.co/rQLeFB0up9', 'expanded_url': 'http://www.youtube.com/pacadoy08', 'display_url': 'youtube.com/pacadoy08', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 75, 'friends_count': 544, 'listed_count': 0, 'created_at': 'Thu Jun 11 03:02:45 +0000 2009', 'favourites_count': 17347, 'utc_offset': None, 'time_zone': None, 
'geo_enabled': True, 'verified': False, 'statuses_count': 794, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '1A1B1F', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme9/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme9/bg.gif', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/929414153403371521/n49-iwxj_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/929414153403371521/n49-iwxj_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/46298804/1390739435', 'profile_link_color': '2FC2EF', 'profile_sidebar_border_color': '181A1E', 'profile_sidebar_fill_color': '252429', 'profile_text_color': '666666', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'und'}, {'created_at': 'Sat Mar 07 20:05:28 +0000 2020', 'id': 1236382480418197504, 'id_str': '1236382480418197504', 'text': 'Siccome mi sentivo abbastanza in vena ho scritto un nuovo capitolo che ha 3 pov!\\nhttps://t.co/Z79finLEmY… https://t.co/D1efcXoFp3', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/Z79finLEmY', 'expanded_url': 'https://efpfanfic.net/viewstory.php?sid=3890200', 'display_url': 'efpfanfic.net/viewstory.php?…', 'indices': [81, 104]}, {'url': 'https://t.co/D1efcXoFp3', 'expanded_url': 'https://twitter.com/i/web/status/1236382480418197504', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [106, 129]}]}, 
'metadata': {'iso_language_code': 'it', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1003252529403219968, 'id_str': '1003252529403219968', 'name': 'ثریا', 'screen_name': 'AilikHeda_', 'location': 'lost in Westeros ~', 'description': \"|So che è orgogliosa. Perché no? Che altro le rimaneva, se non l'orgoglio? So che è forte. Come poteva essere altrimenti?| \\n|Jus drein jus daun| \\n~FAN ACCOUNT~\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 450, 'friends_count': 538, 'listed_count': 6, 'created_at': 'Sun Jun 03 12:30:17 +0000 2018', 'favourites_count': 10964, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 11959, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1236719448217866241/oyvFPnyo_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1236719448217866241/oyvFPnyo_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1003252529403219968/1583240442', 'profile_link_color': '1B95E0', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': 
None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'it'}, {'created_at': 'Sat Mar 07 03:37:48 +0000 2020', 'id': 1236133926441078784, 'id_str': '1236133926441078784', 'text': 'What to name blackened fossil of flying beasts? Easy. \\n#HouseTargaryen represent! \\n\\n#ASOIAF #GameofThrones \\nhttps://t.co/EUh2ZIEtyQ', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseTargaryen', 'indices': [55, 70]}, {'text': 'ASOIAF', 'indices': [84, 91]}, {'text': 'GameofThrones', 'indices': [92, 106]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/EUh2ZIEtyQ', 'expanded_url': 'http://bit.ly/2TvICGA', 'display_url': 'bit.ly/2TvICGA', 'indices': [108, 131]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1103695946, 'id_str': '1103695946', 'name': 'Westeros Philippines', 'screen_name': 'WesterosPH', 'location': 'Manila City, National Capital', 'description': \"This is a fanbase for all Filipinos devoted to GRRM's A Song Of Ice and Fire as well as HBO's Game Of Thrones.\\nFollowed by @GameOfThrones\", 'url': 'https://t.co/5azIXSfS1F', 'entities': {'url': {'urls': [{'url': 'https://t.co/5azIXSfS1F', 'expanded_url': 'https://www.facebook.com/groups/WesterosPhilippines/', 'display_url': 'facebook.com/groups/Westero…', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 1490, 'friends_count': 366, 'listed_count': 37, 'created_at': 'Sat Jan 19 13:27:36 +0000 2013', 'favourites_count': 13821, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 8430, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 
'profile_background_color': 'E1E6CF', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme9/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme9/bg.gif', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1113243610823020544/71nfiHa7_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1113243610823020544/71nfiHa7_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1103695946/1505445413', 'profile_link_color': '2FC2EF', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '252429', 'profile_text_color': '666666', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Fri Mar 06 21:44:21 +0000 2020', 'id': 1236044976468955136, 'id_str': '1236044976468955136', 'text': 'At least we have the prequel to look forward to. 
#remember #got #housetargaryen #backtowork https://t.co/JE4Zvrr6yN https://t.co/HSmB8vmtax', 'truncated': False, 'entities': {'hashtags': [{'text': 'remember', 'indices': [50, 59]}, {'text': 'got', 'indices': [60, 64]}, {'text': 'housetargaryen', 'indices': [65, 80]}, {'text': 'backtowork', 'indices': [81, 92]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/JE4Zvrr6yN', 'expanded_url': 'https://ift.tt/2xa7k6m', 'display_url': 'ift.tt/2xa7k6m', 'indices': [93, 116]}], 'media': [{'id': 1236044975051362306, 'id_str': '1236044975051362306', 'indices': [117, 140], 'media_url': 'http://pbs.twimg.com/media/ESdQWyBX0AIZraT.jpg', 'media_url_https': 'https://pbs.twimg.com/media/ESdQWyBX0AIZraT.jpg', 'url': 'https://t.co/HSmB8vmtax', 'display_url': 'pic.twitter.com/HSmB8vmtax', 'expanded_url': 'https://twitter.com/BruniWeb/status/1236044976468955136/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'small': {'w': 640, 'h': 640, 'resize': 'fit'}, 'medium': {'w': 640, 'h': 640, 'resize': 'fit'}, 'large': {'w': 640, 'h': 640, 'resize': 'fit'}}}]}, 'extended_entities': {'media': [{'id': 1236044975051362306, 'id_str': '1236044975051362306', 'indices': [117, 140], 'media_url': 'http://pbs.twimg.com/media/ESdQWyBX0AIZraT.jpg', 'media_url_https': 'https://pbs.twimg.com/media/ESdQWyBX0AIZraT.jpg', 'url': 'https://t.co/HSmB8vmtax', 'display_url': 'pic.twitter.com/HSmB8vmtax', 'expanded_url': 'https://twitter.com/BruniWeb/status/1236044976468955136/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'small': {'w': 640, 'h': 640, 'resize': 'fit'}, 'medium': {'w': 640, 'h': 640, 'resize': 'fit'}, 'large': {'w': 640, 'h': 640, 'resize': 'fit'}}}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'IFTTT', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': 
{'id': 755811438, 'id_str': '755811438', 'name': 'Fred M. Bruni II', 'screen_name': 'BruniWeb', 'location': 'USA & INTERNATIONAL ', 'description': 'I deeply enjoy uncomfortable biblical truths... and the rest of the Bible of course.', 'url': 'https://t.co/P6kdWNtNUO', 'entities': {'url': {'urls': [{'url': 'https://t.co/P6kdWNtNUO', 'expanded_url': 'http://digitalcreative.studio', 'display_url': 'digitalcreative.studio', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 594, 'friends_count': 1312, 'listed_count': 38, 'created_at': 'Mon Aug 13 20:52:23 +0000 2012', 'favourites_count': 113, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 1011, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1206775918959550465/xBhnyoZx_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1206775918959550465/xBhnyoZx_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/755811438/1576552775', 'profile_link_color': '16AAE0', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Fri Mar 
06 18:06:43 +0000 2020', 'id': 1235990206488158208, 'id_str': '1235990206488158208', 'text': 'Thereby the two rival branches of House Targaryen were united and two years of treachery and carnage were ended.… https://t.co/zBEctqv0fm', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/zBEctqv0fm', 'expanded_url': 'https://twitter.com/i/web/status/1235990206488158208', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [114, 137]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1235989932822405121, 'in_reply_to_status_id_str': '1235989932822405121', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 5, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Fri Mar 06 18:01:34 +0000 2020', 'id': 1235988908850151430, 'id_str': '1235988908850151430', 'text': 'They raised in its place the red dragon of the first Aegon, the banner that all the Targaryen kings had flow until… https://t.co/zxwuepXudV', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/zxwuepXudV', 'expanded_url': 
'https://twitter.com/i/web/status/1235988908850151430', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [116, 139]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1235988680113827845, 'in_reply_to_status_id_str': '1235988680113827845', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 
'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 2, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Fri Mar 06 16:19:55 +0000 2020', 'id': 1235963327924391936, 'id_str': '1235963327924391936', 'text': 'Uma coisa importante que aprendemos muito com a Dany é: sempre vai ter um macho querendo de comer e te subestimando… https://t.co/2pO7JEQdDR', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/2pO7JEQdDR', 'expanded_url': 'https://twitter.com/i/web/status/1235963327924391936', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'pt', 'result_type': 'recent'}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 214275534, 'id_str': '214275534', 'name': 'Jo Pais', 'screen_name': 'jopais_', 'location': 'Recife, Pernambuco', 'description': 'Senhora das Tempestades. Escritora de ranhuras. Fotógrafa de relatos estranhos. Um saco de longas histórias. 
House of DC.', 'url': 'https://t.co/HjAQUr5L9z', 'entities': {'url': {'urls': [{'url': 'https://t.co/HjAQUr5L9z', 'expanded_url': 'http://www.instagram.com/jopais_', 'display_url': 'instagram.com/jopais_', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 339, 'friends_count': 465, 'listed_count': 10, 'created_at': 'Thu Nov 11 00:41:29 +0000 2010', 'favourites_count': 4404, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 53564, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'FCFCFC', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme19/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme19/bg.gif', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1225456949686349828/q4AtpftN_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1225456949686349828/q4AtpftN_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/214275534/1578790927', 'profile_link_color': '000000', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': 'FAFAFA', 'profile_text_color': '333333', 'profile_use_background_image': False, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': True, 'lang': 'pt'}, {'created_at': 'Fri Mar 06 16:00:03 +0000 2020', 'id': 1235958329467420672, 'id_str': '1235958329467420672', 'text': '#DaenerysTargaryen #HouseTargaryen #GameOfThrones \\n\\nDaenerys Targaryen the queen of the seven kingdoms! 
THE QUEEN O… https://t.co/wYeimtE5jX', 'truncated': True, 'entities': {'hashtags': [{'text': 'DaenerysTargaryen', 'indices': [0, 18]}, {'text': 'HouseTargaryen', 'indices': [19, 34]}, {'text': 'GameOfThrones', 'indices': [35, 49]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/wYeimtE5jX', 'expanded_url': 'https://twitter.com/i/web/status/1235958329467420672', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1004037379437551617, 'id_str': '1004037379437551617', 'name': '🔥🐉 𝓦𝓸𝓵𝓯 𝓒𝓵𝓪𝓻𝓴𝓮 𝓣𝓪𝓻𝓰𝓪𝓻𝔂𝓮𝓷 🐉🔥', 'screen_name': 'WolfTargaryen_', 'location': '𝓓𝓻𝓪𝓰𝓸𝓷𝓼𝓽𝓸𝓷𝓮', 'description': '🔥𝓕𝓲𝓻𝓮 𝓪𝓷𝓭 𝓑𝓵𝓸𝓸𝓭!🔥\\n\\n🐉 𝓗𝓸𝓾𝓼𝓮 𝓣𝓪𝓻𝓰𝓪𝓻𝔂𝓮𝓷 🐉\\n\\n🏎 𝓕𝓸𝓻𝓶𝓾𝓵𝓪 1 𝓕𝓪𝓷 🏎\\n \\n(𝓕𝓪𝓷 𝓐𝓬𝓬𝓸𝓾𝓷𝓽)', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 448, 'friends_count': 397, 'listed_count': 4, 'created_at': 'Tue Jun 05 16:29:00 +0000 2018', 'favourites_count': 11171, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 12693, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1236493272035590145/K93mQHs7_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1236493272035590145/K93mQHs7_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1004037379437551617/1582270501', 'profile_link_color': '7FFF00', 
'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Fri Mar 06 04:45:58 +0000 2020', 'id': 1235788693014745089, 'id_str': '1235788693014745089', 'text': 'I LOVE GAME OF JONES AND I MISS GAME OF THRONES!! Haha #HouseSharks #HouseTargaryen 😜❤️👊🏻👊🏻👊🏻🔥 https://t.co/uFuiId0WBI', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseSharks', 'indices': [55, 67]}, {'text': 'HouseTargaryen', 'indices': [68, 83]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/uFuiId0WBI', 'expanded_url': 'https://twitter.com/sanjosesharks/status/1235787798562521091', 'display_url': 'twitter.com/sanjosesharks/…', 'indices': [95, 118]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1148248129, 'id_str': '1148248129', 'name': 'Brennan', 'screen_name': 'Bonespinpin', 'location': '', 'description': \"22 years old! 😜🤘🏻A big fan of hockey! San Jose Sharks! 🏒☺️ A big fan of BONES!! ❤️❤️ And I'm in love with IRON MAN! I love you 3000!! 
😍❤️ RDJ\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 236, 'friends_count': 1273, 'listed_count': 5, 'created_at': 'Mon Feb 04 15:04:00 +0000 2013', 'favourites_count': 30967, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 15140, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'FFF04D', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1097887591481921537/SxchYtt5_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1097887591481921537/SxchYtt5_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1148248129/1501029361', 'profile_link_color': '0099CC', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': True, 'quoted_status_id': 1235787798562521091, 'quoted_status_id_str': '1235787798562521091', 'quoted_status': {'created_at': 'Fri Mar 06 04:42:25 +0000 2020', 'id': 1235787798562521091, 'id_str': '1235787798562521091', 'text': 'Martin Jones save appreciation tweet. 
\\n\\n#SJSharks https://t.co/oUzQxnMehe', 'truncated': False, 'entities': {'hashtags': [{'text': 'SJSharks', 'indices': [40, 49]}], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 1235787378004045824, 'id_str': '1235787378004045824', 'indices': [50, 73], 'media_url': 'http://pbs.twimg.com/media/ESZmNZLU4AAyei-.jpg', 'media_url_https': 'https://pbs.twimg.com/media/ESZmNZLU4AAyei-.jpg', 'url': 'https://t.co/oUzQxnMehe', 'display_url': 'pic.twitter.com/oUzQxnMehe', 'expanded_url': 'https://twitter.com/SanJoseSharks/status/1235787798562521091/video/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'medium': {'w': 1200, 'h': 675, 'resize': 'fit'}, 'small': {'w': 680, 'h': 383, 'resize': 'fit'}, 'large': {'w': 1280, 'h': 720, 'resize': 'fit'}}}]}, 'extended_entities': {'media': [{'id': 1235787378004045824, 'id_str': '1235787378004045824', 'indices': [50, 73], 'media_url': 'http://pbs.twimg.com/media/ESZmNZLU4AAyei-.jpg', 'media_url_https': 'https://pbs.twimg.com/media/ESZmNZLU4AAyei-.jpg', 'url': 'https://t.co/oUzQxnMehe', 'display_url': 'pic.twitter.com/oUzQxnMehe', 'expanded_url': 'https://twitter.com/SanJoseSharks/status/1235787798562521091/video/1', 'type': 'video', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'medium': {'w': 1200, 'h': 675, 'resize': 'fit'}, 'small': {'w': 680, 'h': 383, 'resize': 'fit'}, 'large': {'w': 1280, 'h': 720, 'resize': 'fit'}}, 'video_info': {'aspect_ratio': [16, 9], 'duration_millis': 12646, 'variants': [{'content_type': 'application/x-mpegURL', 'url': 'https://video.twimg.com/amplify_video/1235787378004045824/pl/8_ItPNlvO_GvPR65.m3u8?tag=13'}, {'bitrate': 832000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/amplify_video/1235787378004045824/vid/640x360/Qrn_34ylf0dHhA1H.mp4?tag=13'}, {'bitrate': 288000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/amplify_video/1235787378004045824/vid/480x270/P3jTWzH-wJztuJ7F.mp4?tag=13'}, {'bitrate': 2176000, 
'content_type': 'video/mp4', 'url': 'https://video.twimg.com/amplify_video/1235787378004045824/vid/1280x720/bnCYx8ZcidTs8l7_.mp4?tag=13'}]}, 'additional_media_info': {'title': '', 'description': '', 'embeddable': True, 'monetizable': False}}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Media Studio', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 27961547, 'id_str': '27961547', 'name': 'San Jose Sharks', 'screen_name': 'SanJoseSharks', 'location': '#SharksTerritory', 'description': '\"\\'I just want to keep smiling and keep a positive mind\\' - Tomas Hertl\" - San Jose Sharks', 'url': 'https://t.co/wS3IYB9Wsq', 'entities': {'url': {'urls': [{'url': 'https://t.co/wS3IYB9Wsq', 'expanded_url': 'http://sjsharks.com', 'display_url': 'sjsharks.com', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 823133, 'friends_count': 482, 'listed_count': 5387, 'created_at': 'Tue Mar 31 20:57:57 +0000 2009', 'favourites_count': 6431, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': True, 'statuses_count': 57609, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'FFFFFF', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1236799341244739584/mS5KTBgb_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1236799341244739584/mS5KTBgb_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/27961547/1583551737', 'profile_link_color': '00788A', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': '000000', 
'profile_text_color': '00788B', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 36, 'favorite_count': 285, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Fri Mar 06 03:28:37 +0000 2020', 'id': 1235769225299529728, 'id_str': '1235769225299529728', 'text': 'RT @WatchersOTWall: Doing Justice to Helaena Targaryen and the Women of Westeros – a House of the Dragon video essay by @PLHalbur https://t…', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'WatchersOTWall', 'name': 'Watchers on the Wall', 'id': 2679863268, 'id_str': '2679863268', 'indices': [3, 18]}, {'screen_name': 'PLHalbur', 'name': 'Petra Halbur', 'id': 324909635, 'id_str': '324909635', 'indices': [120, 129]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 612018761, 'id_str': '612018761', 'name': 'Michelle Carter', 'screen_name': 'CarterMlc2139', 'location': '', 'description': '', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 57, 'friends_count': 390, 'listed_count': 0, 'created_at': 'Mon Jun 18 21:36:48 +0000 2012', 'favourites_count': 18218, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 8385, 'lang': None, 'contributors_enabled': False, 'is_translator': 
False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/2320597819/image_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/2320597819/image_normal.jpg', 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': True, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Tue Mar 03 15:27:52 +0000 2020', 'id': 1234863066656989184, 'id_str': '1234863066656989184', 'text': 'Doing Justice to Helaena Targaryen and the Women of Westeros – a House of the Dragon video essay by @PLHalbur… https://t.co/1kvVZAXQBo', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'PLHalbur', 'name': 'Petra Halbur', 'id': 324909635, 'id_str': '324909635', 'indices': [100, 109]}], 'urls': [{'url': 'https://t.co/1kvVZAXQBo', 'expanded_url': 'https://twitter.com/i/web/status/1234863066656989184', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [111, 134]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 2679863268, 'id_str': '2679863268', 'name': 'Watchers on the Wall', 'screen_name': 'WatchersOTWall', 'location': 'USA', 'description': 'A Game of 
Thrones Community for Breaking News, Casting, and Commentary.', 'url': 'https://t.co/8ViHtiyPql', 'entities': {'url': {'urls': [{'url': 'https://t.co/8ViHtiyPql', 'expanded_url': 'http://watchersonthewall.com/', 'display_url': 'watchersonthewall.com', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 67801, 'friends_count': 794, 'listed_count': 606, 'created_at': 'Fri Jul 25 15:45:36 +0000 2014', 'favourites_count': 10954, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 15472, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme14/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme14/bg.gif', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1213853941777219584/dEMFRDFY_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1213853941777219584/dEMFRDFY_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2679863268/1583716584', 'profile_link_color': '000000', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': 'EFEFEF', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 5, 'favorite_count': 36, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, 'is_quote_status': False, 'retweet_count': 5, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Fri Mar 06 02:22:42 +0000 2020', 'id': 
1235752637217284096, 'id_str': '1235752637217284096', 'text': 'ACABEI DE DESCOBRIR: GIZELLY É UMA TARGARYEN.\\nFOGO, AANGUE E LABAREDA-SE É O NOVO DRACARYS #BBB20 #GoT… https://t.co/R8ngVNVywL', 'truncated': True, 'entities': {'hashtags': [{'text': 'BBB20', 'indices': [91, 97]}, {'text': 'GoT', 'indices': [98, 102]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/R8ngVNVywL', 'expanded_url': 'https://twitter.com/i/web/status/1235752637217284096', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [104, 127]}]}, 'metadata': {'iso_language_code': 'pt', 'result_type': 'recent'}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 214275534, 'id_str': '214275534', 'name': 'Jo Pais', 'screen_name': 'jopais_', 'location': 'Recife, Pernambuco', 'description': 'Senhora das Tempestades. Escritora de ranhuras. Fotógrafa de relatos estranhos. Um saco de longas histórias. 
House of DC.', 'url': 'https://t.co/HjAQUr5L9z', 'entities': {'url': {'urls': [{'url': 'https://t.co/HjAQUr5L9z', 'expanded_url': 'http://www.instagram.com/jopais_', 'display_url': 'instagram.com/jopais_', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 339, 'friends_count': 465, 'listed_count': 10, 'created_at': 'Thu Nov 11 00:41:29 +0000 2010', 'favourites_count': 4404, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 53564, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'FCFCFC', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme19/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme19/bg.gif', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1225456949686349828/q4AtpftN_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1225456949686349828/q4AtpftN_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/214275534/1578790927', 'profile_link_color': '000000', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': 'FAFAFA', 'profile_text_color': '333333', 'profile_use_background_image': False, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'possibly_sensitive': True, 'lang': 'pt'}], 'search_metadata': {'completed_in': 0.046, 'max_id': 1236901891394670594, 'max_id_str': '1236901891394670594', 'next_results': '?max_id=1235752637217284095&q=HouseTargaryen&include_entities=1', 'query': 'HouseTargaryen', 'refresh_url': 
'?since_id=1236901891394670594&q=HouseTargaryen&include_entities=1', 'count': 15, 'since_id': 0, 'since_id_str': '0'}}\n" - ] - } - ], - "source": [ - "number_tweets = 99 # This is the maximum number of tweets that can be requested with the free version of API\n", - "tweets_on_houses = {} # We create an empty dictionary \n", - "\n", - "for ii in query_chain: \n", - " tweets = api.search.tweets(q=ii)\n", - " #tweets = api.GetSearch(raw_query=\"l=en&q=%23\"+ii+\"&count=\"+str(number_tweets)) \n", - " tweets_on_houses[ii] = tweets\n", - " \n", - "print(tweets) " - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'statuses': [{'created_at': 'Sun Mar 08 18:29:33 +0000 2020', 'id': 1236720730563399681, 'id_str': '1236720730563399681', 'text': 'RT @ASOIAFReadThru: Rule in the west had therefore passed to his widow, Lady Johanna, and her father, Roland Westerling, Lord of the Crag.…', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'ASOIAFReadThru', 'name': 'ASOIAF Tweet-Through', 'id': 2396358626, 'id_str': '2396358626', 'indices': [3, 18]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 875434881668526081, 'id_str': '875434881668526081', 'name': 'Miguel C. 
Martínez', 'screen_name': 'Micaerys', 'location': 'Marín, España', 'description': 'Escritor, youtuber, Padre de Dragones, Guardián de Wikis :v', 'url': 'https://t.co/2TOanfsHqn', 'entities': {'url': {'urls': [{'url': 'https://t.co/2TOanfsHqn', 'expanded_url': 'https://m.youtube.com/channel/UCoC3UzXi_Sv5wXEAIgOVqHw', 'display_url': 'm.youtube.com/channel/UCoC3U…', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 260, 'friends_count': 635, 'listed_count': 3, 'created_at': 'Thu Jun 15 19:28:35 +0000 2017', 'favourites_count': 23429, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 8027, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1120069602589933568/I71c45Cw_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1120069602589933568/I71c45Cw_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/875434881668526081/1533726511', 'profile_link_color': '77FFDD', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Sun Mar 08 18:24:44 +0000 2020', 'id': 1236719515733557248, 'id_str': '1236719515733557248', 'text': 'Rule in the west had therefore passed to his widow, Lady Johanna, and her father, 
Roland Westerling, Lord of the Cr… https://t.co/cy0IOBGMwH', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/cy0IOBGMwH', 'expanded_url': 'https://twitter.com/i/web/status/1236719515733557248', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236719265069387781, 'in_reply_to_status_id_str': '1236719265069387781', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 
'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 18:29:09 +0000 2020', 'id': 1236720626733285378, 'id_str': '1236720626733285378', 'text': 'She agreed as well to restore that portion of the royal treasury that Tyland Lannister had sent west for safekeepin… https://t.co/bHqPNK1DfD', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/bHqPNK1DfD', 'expanded_url': 'https://twitter.com/i/web/status/1236720626733285378', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236720301087559680, 'in_reply_to_status_id_str': '1236720301087559680', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 18:27:51 +0000 2020', 'id': 1236720301087559680, 'id_str': '1236720301087559680', 'text': 'She would deliver two daughters to the Red Keep, to serve as companions to the new queen (and as hostages to ensure… https://t.co/RWAR0pt8Fi', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/RWAR0pt8Fi', 'expanded_url': 
'https://twitter.com/i/web/status/1236720301087559680', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236720151602528257, 'in_reply_to_status_id_str': '1236720151602528257', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 
'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 18:26:03 +0000 2020', 'id': 1236719847876288513, 'id_str': '1236719847876288513', 'text': 'With the Red Kraken’s longships still menacing their coasts, the Lannisters were more concerned with defending Kayc… https://t.co/1bAZ7YjWE9', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/1bAZ7YjWE9', 'expanded_url': 'https://twitter.com/i/web/status/1236719847876288513', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236719515733557248, 'in_reply_to_status_id_str': '1236719515733557248', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 18:24:44 +0000 2020', 'id': 1236719515733557248, 'id_str': '1236719515733557248', 'text': 'Rule in the west had therefore passed to his widow, Lady Johanna, and her father, Roland Westerling, Lord of the Cr… https://t.co/cy0IOBGMwH', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/cy0IOBGMwH', 'expanded_url': 
'https://twitter.com/i/web/status/1236719515733557248', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236719265069387781, 'in_reply_to_status_id_str': '1236719265069387781', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 
'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 18:23:44 +0000 2020', 'id': 1236719265069387781, 'id_str': '1236719265069387781', 'text': 'Lord Jason Lannister had left six children when he died in battle: five daughters and one son, Loreon, a boy of four.\\n#HouseLannister', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseLannister', 'indices': [118, 133]}], 'symbols': [], 'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236719073670705155, 'in_reply_to_status_id_str': '1236719073670705155', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 18:22:58 +0000 2020', 'id': 1236719073670705155, 'id_str': '1236719073670705155', 'text': 'Casterly Rock was the first to respond.\\n#HouseLannister', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseLannister', 'indices': [40, 55]}], 'symbols': [], 'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': None, 
'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sun Mar 08 17:58:21 
+0000 2020', 'id': 1236712875458207750, 'id_str': '1236712875458207750', 'text': 'Others rumors claimed the Lannisters and the Hightowers were on the march, without truth. \\n#HouseLannister #HouseHightower', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseLannister', 'indices': [91, 106]}, {'text': 'HouseHightower', 'indices': [108, 123]}], 'symbols': [], 'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236712633019097090, 'in_reply_to_status_id_str': '1236712633019097090', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 22:46:16 +0000 2020', 'id': 1236422944471035905, 'id_str': '1236422944471035905', 'text': 'RT @ASOIAFReadThru: \"You speak of taking Storm’s End, Oldtown, and Casterly Rock, my lord, but the men who held those seats were slain in b…', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'ASOIAFReadThru', 'name': 'ASOIAF Tweet-Through', 'id': 2396358626, 'id_str': '2396358626', 
'indices': [3, 18]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter for Android', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 910528259049365505, 'id_str': '910528259049365505', 'name': 'Carlos_Martins_2017', 'screen_name': 'Joao_Pelixo', 'location': '', 'description': \"Politics, history and philosophy.\\nI love to buy new books even when I have some that I haven't read yet.\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 199, 'friends_count': 224, 'listed_count': 3, 'created_at': 'Wed Sep 20 15:37:08 +0000 2017', 'favourites_count': 27600, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 41127, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': None, 'profile_background_image_url_https': None, 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1075098095568982016/PSxLm2xV_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1075098095568982016/PSxLm2xV_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/910528259049365505/1537982366', 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': True, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Sat Mar 07 18:44:38 +0000 2020', 'id': 1236362138555494402, 'id_str': 
'1236362138555494402', 'text': '\"You speak of taking Storm’s End, Oldtown, and Casterly Rock, my lord, but the men who held those seats were slain… https://t.co/qnEwnTIH2S', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/qnEwnTIH2S', 'expanded_url': 'https://twitter.com/i/web/status/1236362138555494402', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [116, 139]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 
'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 12, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 19:32:21 +0000 2020', 'id': 1236374144347971586, 'id_str': '1236374144347971586', 'text': 'Paisas with beards #beardbros #PaisaPals #BestFriendsGang #HouseLannister #lannisterlions 🦁 @ Oakland, California https://t.co/hSUqrE0ea9', 'truncated': False, 'entities': {'hashtags': [{'text': 'beardbros', 'indices': [19, 29]}, {'text': 'PaisaPals', 'indices': [30, 40]}, {'text': 'BestFriendsGang', 'indices': [41, 57]}, {'text': 'HouseLannister', 'indices': [58, 73]}, {'text': 'lannisterlions', 'indices': [74, 89]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/hSUqrE0ea9', 'expanded_url': 'https://www.instagram.com/p/B9cbudHB7K8/?igshid=13ozaspleskma', 'display_url': 'instagram.com/p/B9cbudHB7K8/…', 'indices': [114, 137]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Instagram', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 30808871, 'id_str': '30808871', 'name': 'A-Lo', 'screen_name': 'Thunderlips187', 'location': 'Murda Meda, CA', 'description': 'Bulletproof Tiger. Cats. Tacos. Yoga. Tattoos. Trees. Bikes. Serenity. Nonsense. 
I never update my website', 'url': 'https://t.co/reoGmSJENh', 'entities': {'url': {'urls': [{'url': 'https://t.co/reoGmSJENh', 'expanded_url': 'http://angelobenedetto.com', 'display_url': 'angelobenedetto.com', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 379, 'friends_count': 561, 'listed_count': 12, 'created_at': 'Mon Apr 13 04:42:26 +0000 2009', 'favourites_count': 1167, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 15229, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '709397', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/661438836484902912/WwXkKoZy_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/661438836484902912/WwXkKoZy_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/30808871/1543411811', 'profile_link_color': '1B1E1A', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'A0C5C7', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': {'type': 'Point', 'coordinates': [37.8029, -122.2721]}, 'coordinates': {'type': 'Point', 'coordinates': [-122.2721, 37.8029]}, 'place': {'id': 'ab2f2fac83aa388d', 'url': 'https://api.twitter.com/1.1/geo/id/ab2f2fac83aa388d.json', 'place_type': 'city', 'name': 'Oakland', 'full_name': 'Oakland, CA', 'country_code': 'US', 'country': 'United States', 'contained_within': [], 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-122.34266, 37.699279], 
[-122.114711, 37.699279], [-122.114711, 37.8847092], [-122.34266, 37.8847092]]]}, 'attributes': {}}, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 19:03:01 +0000 2020', 'id': 1236366761823477760, 'id_str': '1236366761823477760', 'text': '\"Has Oldtown yielded? Has Casterly Rock returned the Crown’s gold?\"\\n#HouseHightower #HouseLannister', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseHightower', 'indices': [68, 83]}, {'text': 'HouseLannister', 'indices': [85, 100]}], 'symbols': [], 'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236366588447727618, 'in_reply_to_status_id_str': '1236366588447727618', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 1, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 18:44:59 +0000 2020', 'id': 1236362225633419271, 'id_str': '1236362225633419271', 'text': '\"Grant them honorable terms, and they will bend the knee.\"\\n#HouseBaratheon #HouseHightower \\n#HouseLannister', 'truncated': False, 'entities': {'hashtags': [{'text': 'HouseBaratheon', 'indices': [59, 74]}, {'text': 'HouseHightower', 'indices': [76, 91]}, {'text': 'HouseLannister', 'indices': [93, 108]}], 'symbols': [], 
'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236362138555494402, 'in_reply_to_status_id_str': '1236362138555494402', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 
'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 7, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 18:44:38 +0000 2020', 'id': 1236362138555494402, 'id_str': '1236362138555494402', 'text': '\"You speak of taking Storm’s End, Oldtown, and Casterly Rock, my lord, but the men who held those seats were slain… https://t.co/qnEwnTIH2S', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/qnEwnTIH2S', 'expanded_url': 'https://twitter.com/i/web/status/1236362138555494402', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [116, 139]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 12, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 18:33:04 +0000 2020', 'id': 1236359224592130051, 'id_str': '1236359224592130051', 'text': 'Kermit Tully pointed out that Storm’s End, Oldtown, and Casterly Rock were as strong as Stark’s own Winterfell (f n… https://t.co/WyYDtZanCj', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/WyYDtZanCj', 'expanded_url': 
'https://twitter.com/i/web/status/1236359224592130051', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236358915308302336, 'in_reply_to_status_id_str': '1236358915308302336', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 
'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 4, 'favorited': False, 'retweeted': False, 'lang': 'en'}, {'created_at': 'Sat Mar 07 18:30:45 +0000 2020', 'id': 1236358644398202881, 'id_str': '1236358644398202881', 'text': 'He would reduce Storm’s End first, then cross the Reach to take Oldtown. Once the Hightower had fallen, he would ta… https://t.co/hPxRnnXbEa', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/hPxRnnXbEa', 'expanded_url': 'https://twitter.com/i/web/status/1236358644398202881', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': 'Twitter Web App', 'in_reply_to_status_id': 1236358411463393280, 'in_reply_to_status_id_str': '1236358411463393280', 'in_reply_to_user_id': 2396358626, 'in_reply_to_user_id_str': '2396358626', 'in_reply_to_screen_name': 'ASOIAFReadThru', 'user': {'id': 2396358626, 'id_str': '2396358626', 'name': 'ASOIAF Tweet-Through', 'screen_name': 'ASOIAFReadThru', 'location': 'Fire and Blood', 'description': \"Live-tweeting quotes from George R. R. Martin's A Song of Ice and Fire series as I re-read it. It in no way belongs to me. 
SPOILERS\", 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 6098, 'friends_count': 1247, 'listed_count': 75, 'created_at': 'Tue Mar 18 15:05:40 +0000 2014', 'favourites_count': 80122, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 141308, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '3B94D9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/560626739559153664/SbqurU-g_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2396358626/1422498865', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 0, 'favorite_count': 4, 'favorited': False, 'retweeted': False, 'lang': 'en'}], 'search_metadata': {'completed_in': 0.049, 'max_id': 1236720730563399681, 'max_id_str': '1236720730563399681', 'next_results': '?max_id=1236358644398202880&q=HouseLannister&include_entities=1', 'query': 'HouseLannister', 'refresh_url': '?since_id=1236720730563399681&q=HouseLannister&include_entities=1', 'count': 15, 'since_id': 0, 'since_id_str': '0'}}\n" - ] - } - ], - "source": [ - "print(tweets_on_houses[\"HouseLannister\"]) # Frankly, 
Lannister's deserve the Iron throne. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*[Status(ID=1121038719195275265, ScreenName=YSoSeriousBoo, Created=Wed Apr 24 13:10:33 +0000 2019, Text='@Oreo @oreo #ForTheThrone #GOT \\n #GameOfCookies \\n\\n#WhiteWalker 7 Oreos \\n\\n#HouseLannister 6 Oreos\\n\\n#HouseDargaryen… https://t.co/zgva8sb28N'), Status(ID=1121012886875385859, ScreenName=DohertyCiaran, Created=Wed Apr 24 11:27:54 +0000 2019, Text='Second of Six #GlassOfThrones stained glass installations unveiled at #Belfast Waterfront Hall. Depicting… https://t.co/eVttTzPSQD'), Status(ID=1120944289301843968, ScreenName=TourismIreland, Created=Wed Apr 24 06:55:19 +0000 2019, Text='RT @VirginMediaNews: PICTURED: Another #GameOfThrones-themed stained glass window has been unveiled in #Belfast as part of @TourismIreland’…'), Status(ID=1120944181906690049, ScreenName=SineadGrace1, Created=Wed Apr 24 06:54:53 +0000 2019, Text='RT @VirginMediaNews: PICTURED: Another #GameOfThrones-themed stained glass window has been unveiled in #Belfast as part of @TourismIreland’…'), Status(ID=1120908255193071617, ScreenName=opeadigs, Created=Wed Apr 24 04:32:08 +0000 2019, Text='RT @PiggyBankNG: What house do you belong to, based on the feature you use the most?\\n\\n#HouseTargaryen #HouseGreyJoy #HouseStark #HouseLanni…')]*\n", - "\n", - "Nice!\n", - "\n", - "## Interact with the database\n", - "\n", - "In the first part of the project, you created a database.\n", - "This will be used to store all the information you will need for your analyses.\n", - "\n", - "In the case of our GoT analysis, we need to create the database\n", - "specifically to receive the information we extracted. However, the creation procedure of a database and inserting values into it is more or less the same from a Python point of view.\n", - "\n", - "Everything happens with *sqlite3* library. 
This library allows you to interact with a sqlite database by forwarding it strings containing the SQL statements we want to execute.\n", - "\n", - "Let's start by importing the library..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sqlite3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the tables\n", - "\n", - "The simplest way is to create a database (which can start as an empty file with a name ending in *.db* or *.sqlite*) in the folder where your Python code is located (whether it's a code in pure\" Python - a \".py\" - or a Jupyter notebook - a \".ipynb\").\n", - "\n", - "Once the file is created, you can use the sqlite3 library to handle it . The library documentation is available [here](https://docs.python.org/3/library/sqlite3.html).\n", - "\n", - "The first step is to link to the data. This is created by passing the database name as a string to the *connect()* method of the library. In our case, we created a database called \"GoT.db\" in the folder containing this notebook. So we create a connection." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn = sqlite3.connect(\"GoT.db\") # If the file doesn't exist, it is automatically created" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have a connection, we can use the *execute()*, passing as argument a string of characters containing the instruction you wish to execute. In our case, we want to create two tables: one table containing the houses and a table containing the characters.\n", - "\n", - "Some characters can have several allegiances, it is necessary therefore to create a join table (a house can have several members and a character may have pledged allegiance to more than one member. houses)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_houses = '''CREATE TABLE IF NOT EXISTS houses( url TEXT PRIMARY KEY UNIQUE NOT NULL, name TEXT NON NULL, symbol TEXT)'''\n", - "create_characters = '''CREATE TABLE IF NOT EXISTS characters( url TEXT PRIMARY KEY UNIQUE NOT NULL, name TEXT NON NULL, dead INT NON NULL DEFAULT 0)'''\n", - "create_allegeances = '''CREATE TABLE IF NOT EXISTS allegeances( url_character TEXT NOT NULL, url_house TEXT NOT NULL, PRIMARY KEY (url_character, url_house), FOREIGN KEY (url_character) REFERENCES characters(url), FOREIGN KEY (url_house) REFERENCES houses(url))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First we create the table containing the houses\n", - "\n", - "conn.execute(create_homes)\n", - "\n", - "Then the characters\n", - "\n", - "conn.execute(create_characters)\n", - "\n", - "And finally, the joining table\n", - "\n", - "conn.execute(create_allegeances)\n", - "\n", - "\n", - "\n", - "Fill in the tables\n", - "\n", - "Now, it's time to build tuples representing the data that we wish to insert into our tables. We do this through loops and comprehensions." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inputs_houses = [(house[\"url\"], house[\"name\"], house[\"coatOfArms\"]) for house in houses]\n", - "inputs_personnages = [(perso[\"url\"], perso[\"name\"], perso[\"died\"] == \"\") for perso in personnages if perso[\"name\"] != \"\"] # If the character has a date of death, the third element of the tuple equals Falseinputs_allegeances = [] \n", - "# Exercise: try to transform this loop into comprehensions for perso in characters: if perso[\"name\"] != \"\": for house in perso[\"allegiances\"]: inputs_allegeances = inputs_allegeances +[(perso[\"url\"], maison)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After these few manipulations, we are now ready to fill our database! By reading the documentation for the sqlite3 module, we can realize that, if we have a tuple list containing\n", - "the values that we want to insert in the database, we can use the executemany() statement on the connection to insert them as one statement in the table." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.executemany('''INSERT INTO maisons VALUES (?, ?, ?)''', inputs_houses)\n", - "conn.executemany('''INSERT INTO personnages VALUES (?, ?, ?)''', inputs_characters)\n", - "conn.executemany('''INSERT INTO allegeances VALUES (?, ?)''', inputs_allegeances)\n", - "conn.commit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There you go. We now have the information in the database. Notice that the boolean value indicating whether the person is still has been automatically converted to 0 (for False) and 1 (for true).\n", - "\n", - "Querying the database\n", - "\n", - "The peculiarity of the *sqlite3* library is that; the response of an *execute()* statement is always a pointer (i.e. kind of a link to the responses returned). 
This means that to obtain a table with\n", - "responses to a SELECT instruction, the function method *.fetchall()* must be applied on the pointer to convert all the rows of the results in a table of tuples. Let's illustrate this\n", - "with an example: we will use the SELECT instruction to find the url (which is the primary key) of the Stark (of Winterfell) houses, Lannister (of Casterly Rock) and Targaryen (of King's Landing). This will be useful to record the tweets we have extracted." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "main_houses_query = '''SELECT name, url FROM houses WHERE name LIKE \"%Stark of Winterfell\" OR name LIKE \"%Lannister of Casterly Rock\" OR name LIKE \"%Targaryen of King's Landing\"'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "res = conn.execute(main_houses_query)urls_main_houses = res.fetchall()main_houses = {main_house[0]: main_house[1] for main_house in urls_main_houses}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next (and last step for today) is to pass the Tweets we retrieved from the Sentiment Analysis Library and to insert them in a new database table.\n", - "\n", - "## Sentiment analysis\n", - "\n", - "We can now pass on the tweets we've collected in a\n", - "Sentiment analysis bookstore. Sentiment analysis is a task\n", - "extremely complex and still evolving. 
Indeed, many\n", - "of clues to discover the meaning of a sentence (this includes by\n", - "example, the use of punctuation, certain key words, certain\n", - "emoji and sometimes complex probabilistic models - think by\n", - "example to tweets containing sarcastic remarks, the most\n", - "complex to interpret).\n", - "\n", - "Fortunately, there are, once again, libraries that allow to interpret the strings passed to them and to extract from them\n", - "sentiment indicators.\n", - "\n", - "One such bookstore is [Vader (Valencia Aware Dictionary and sEntiment Reasoner)](https://github.com/cjhutto/vaderSentiment). This library can be installed by the usual means (using PiPy for example, or conda).\n", - "\n", - "The library is provided with some datasets so that you can\n", - "can convince you of the strength of the concepts behind it. Once convinced, it's time to apply the library to the Tweets that\n", - "we extracted." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzeranalyser = SentimentIntensityAnalyzer() \n", - "# We create an object that will analyze tweetstweets_texts = [(house, tweet.text) for house, tweets in tweets_on_houses.items() for tweet in tweets]tweets_texts = list(set(tweets_texts)) # This is a little trick to remove duplicate lines (there may be retweets that we are not interested in here)abrevs = {\"HouseLannister\": \"House Lannister of Casterly Rock\", \"HouseStark\": \"House Stark of Winterfell\", \"HouseTargaryen\": \"House Targaryen of King's Landing\"}tweets_with_primary_key = [(houses_importantes[abrevs[tweet[0]]], tweet[1]) for tweet in tweets_texts]load_for_database = []for tweet in tweets_with_primary_key: # Here, we use a loop because the algorithm that determines the sentiment can take a long time to run: we may prefer to run it only once per tweet sentiment = analyze.polarity_scores(tweet[1]) # We pass 
the text to the library, it returns a dictionary containing 3 keys: \"pos\", \"neg\", \"neu\" and \"compound\" res = (tweet[0], tweet[1], sentiment[\"compound\"], sentiment[\"pos\"], sentiment[\"neu\"], sentiment[\"neg\"]) load_for_database = load_for_database + [res]load_for_database[0:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*[('https://www.anapioficeandfire.com/api/houses/229', 'Exclusive #GameofThrones Silver #IronThrone Tankard\\n\\nOrder Now {https://t.co/xWIe02Nfnd}\\n\\n#affiliate… https://t.co/R0c2OvGKf4', 0.128, 0.143, 0.857, 0.0), ('https://www.anapioficeandfire.com/api/houses/229', '#HouseLannister 🦁 https://t.co/2NIYvQ5PUx', 0.0, 0.0, 1.0, 0.0), ('https://www.anapioficeandfire.com/api/houses/378', 'Esqueci a poha da #Gameofthrones #GOT #JonSnow #DaenerysTargaryen #ForTheThrone #HouseTargaryen', 0.0, 0.0, 1.0, 0.0), ('https://www.anapioficeandfire.com/api/houses/362', '@Maisie_Williams Best scene ever. Of all the series. I’ve been waiting since Arya met Gendry. It was all I ever hop… https://t.co/ISQ37QN5sb', 0.6369, 0.174, 0.826, 0.0), ('https://www.anapioficeandfire.com/api/houses/378', 'Will the real John Snow ❄️ please stand up @GameOfThrones #GamefThrones #HouseStark #HouseTargaryen https://t.co/EZZbxokFvF', 0.3182, 0.15, 0.85, 0.0)]*\n", - "\n", - "Sentiment analysis returns 3 values: \"proportions of beliefs\" that the message is positive, negative or neutral. The sum of these three\n", - "In addition to these three values, there is a fourth, the composite index. This \"summarizes\" the other three into a value that is unconstrained (i.e. it can be\n", - "positive or negative). A value of the composite indicator close to 0\n", - "indicates a rather neutral tweet, a positive value indicates a tweet\n", - "generally positive and a negative value a tweet generally\n", - "negative. 
It is this value that will be mainly useful in the case of\n", - "analysis of the results (taking, for example, the average of all tweets on a house before and after an episode).\n", - "\n", - "OK, as you can probably see (it depends on when you run this code), the algorithm is not flawless (especially since, in the\n", - "of the show, talking about death, destruction, etc. is not\n", - "necessarily negative, whereas in general, a tweet talking about these\n", - "topics is of some concern). However, for this example, as we\n", - "are not too concerned about the analysis, it will not pose too much of a problem.\n", - "problem (the graphs for the previous course will be just enough for the\n", - "sad).\n", - "\n", - "Note that, if we wanted to track the interest of the twittosphere for each house per day, we could run some cells of this notebook but there are some others that should be avoided. A large part of the work consists in taking and adapting the contents of this notebook for\n", - "extract the interesting parts for your project.\n", - "\n", - "Well, now, it's a matter of creating a table in our database where we'll store the tweets and their analysis. We'll just go in enter the tuples we created earlier. We'll add a column to represent the day (in fact, the exact time) we added the tweets. This will allow us to analyze the average feeling about each of the houses this week and after.\n", - "next week's episode to see if there's any developments." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "table_tweets_creation = '''CREATE TABLE IF NOT EXISTS tweets( house TEXT NOT NULL, content TEXT NOT NULL, composite_index NUMERIC, positive_index NUMERIC, neutral_index NUMERIC, negative_index NUMERIC, insert_when TEXT DEFAULT (date('now', 'localtime')), PRIMARY KEY (house, content), FOREIGN KEY (house) REFERENCES houses(url))'''conn.execute(creation_table_tweets)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note the line *\"insert_when TEXT DEFAULT (date('now', 'localtime')), \"*.\n", - "It indicates that, if we don't specify a date for the insertion, we will just add the string corresponding to the date of the system on which the script is being run (this implies that, if we run this script at 2:00 a.m. Belgian time. simultaneously on two computers, one in Brussels and one in Buenos Aires, we will not have the same line in the databases).\n", - "\n", - "That's it, we can finally insert our tweets and their information in the database. This will conclude the presentation of libraries needed to complete the second part of the work." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "insert_into_tweets = '''INSERT INTO tweets(house, content, composite_index, positive_index, neutral_index, negative_index) VALUES (?,?,?,?,?,?)'''conn.executemany(insert_into_tweets, load_for_database)conn.commit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, since we do not provide a tuple of the table size (the table has 7 columns but we only provide 6 values because we want to make the database add today's date), it is\n", - "It is necessary to name the columns in which the 6 values should be inserted.\n", - "\n", - "The *conn.commit()* writes the changes to the database. 
It's worth to ensure that any changes contained in the notebook have been well written. Do not hesitate to look in the database of\n", - "given to see what the code in this book has produced." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Session_5/Readme.md b/Session_5/Readme.md deleted file mode 100644 index 6326abb..0000000 --- a/Session_5/Readme.md +++ /dev/null @@ -1,12 +0,0 @@ -**APIs with Python** - - -In Slides -1. Introduction to APIs -2. JSON -3. Hardcoding access with `requests` -4. Popular pre-written modules for geocoding and mapping - -External scripts -1. Trade data with Comtrade API (kudos Federico) -2. 
The Twitter API (kudos Charles) diff --git a/Session_5/Readme.md.txt b/Session_5/Readme.md.txt new file mode 100644 index 0000000..304360c --- /dev/null +++ b/Session_5/Readme.md.txt @@ -0,0 +1 @@ +Readme diff --git a/Session_5/Session 5 - APIs Intro.ipynb b/Session_5/Session 5 - APIs Intro.ipynb deleted file mode 100644 index ded1522..0000000 --- a/Session_5/Session 5 - APIs Intro.ipynb +++ /dev/null @@ -1,644 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Python and APIs\n", - "Angela, Moritz & Thomas" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Outline of Session 5\n", - "\n", - "In these slides\n", - "* Introduction \n", - "* JSON\n", - "* Hardcoding access with `requests`\n", - "* Popular pre-written modules for geocoding and mapping\n", - "\n", - "External scripts\n", - "* Trade data with Comtrade API (kudos Federico)\n", - "* The Twitter API (kudos Charles)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# What is an API?\n", - "\n", - "## API: Application programming interface\n", - "\n", - "_\"a set of clearly defined methods of communication between various software components.\"_\n", - "\n", - "\n", - "- Pre-internet days: Extension of software beyond its usual capabilities\n", - "- Nowadays: Interface by web service providers for you to connect/retrieve with your own application (i.e. 
without going through GUI)\n", - "- You send data <-> You get data back\n", - "- Most APIs have a similar structure: the REST architecture (REpresentational State Transfer)\n", - "- Inititatives like the Linux Foundation's [OpenAPI](https://www.openapis.org/) develop these types of standards\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Illustration: General Set-up\n", - "\n", - "\n", - "\n", - "Source: [MuleSoft Videos](https://www.youtube.com/embed/s7wmiS2mSXY)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Illustration: Making a request\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Illustration: Returning Content\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Elements of an API\n", - "\n", - "* A protocol (eg: https)\n", - "* A server (eg: httpbin.org)\n", - "* A method name / location (eg: /get)\n", - "* A set of arguments (eg: hello=world and foo=bar)\n", - "\n", - "➥ https://httpbin.org/get?hello=world&foo=bar\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# The Request\n", - "\n", - "* You need to read the documentation (e.g. 
[Eurostat API documentation](https://ec.europa.eu/eurostat/web/json-and-unicode-web-services/getting-started/rest-request))\n", - "* You need to specify the URL (if it is a remote API)\n", - "* You need to import the `requests` module in Python" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# The returning object\n", - "\n", - "* Usually it is encoded in JSON \n", - "* The other popular formats for data structure are XML (mainly old stuff) and CSV (spreadsheet compatible)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# What is a JSON?\n", - "* JSON = [(JavaScript Object Notation)](https://www.json.org/)\n", - "* It is one of the most popular format for data in the world\n", - "* It looks like a Python dictionary, except for the fact that:\n", - " - JSON is a string (it is inside a text file)\n", - " - JSON must use double quotation mark\n", - " - in JavaScript it is defined as Object\n", - "* In Python there is a built-in module called `json` module \n", - "* It follows this strcture {\"key\" : \"value\"}; see example below:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'friends': [{'name': 'Jose', 'degree': 'Applied Computing'},\n", - " {'name': 'Rolf', 'degree': 'Computer Science'},\n", - " {'name': 'Anna', 'degree': 'Physics'}]}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "{\n", - " \"friends\": [\n", - " {\n", - " \"name\": \"Jose\",\n", - " \"degree\": \"Applied Computing\"\n", - " },\n", - " {\n", - " \"name\": \"Rolf\",\n", - " \"degree\": \"Computer Science\"\n", - " },\n", - " {\n", - " \"name\": \"Anna\",\n", - " \"degree\": \"Physics\"\n", - " }\n", - " ]\n", - "}" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# What can I do with APIs? \n", - "\n", - "- Retrieve [World Bank Dev Indicators](https://datahelpdesk.worldbank.org/knowledgebase/articles/898581-api-basic-call-structures)\n", - "- Track stock prices with [OpenFIGI](https://www.openfigi.com/api)\n", - "- Geocode an address with [Here Maps](https://developer.here.com/)\n", - "- Convert fiat currency with [opencurrency](https://openexchangerates.org/) or crypto with [alternative.me](https://alternative.me/crypto/api/)\n", - "- Send tweets with [Twitter](https://developer.twitter.com/)\n", - "- Download images from Mars or the Moon from [NASA](https://api.nasa.gov/)\n", - "- ..." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# The Starwars Character API\n", - "\n", - "- Let's get data on Starwars characters! 
\n", - "- We find [the documentation](https://swapi.co/documentation) \n", - "- We import the `requests` module \n", - "- We set the base URL of the API: `https://swapi.co/api/`\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "url = \"https://swapi.co/api/\"\n", - "\n", - "# Launch request\n", - "base_req = requests.get(url)\n", - "print(base_req)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true, - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'people': 'https://swapi.co/api/people/', 'planets': 'https://swapi.co/api/planets/', 'films': 'https://swapi.co/api/films/', 'species': 'https://swapi.co/api/species/', 'vehicles': 'https://swapi.co/api/vehicles/', 'starships': 'https://swapi.co/api/starships/'}\n" - ] - } - ], - "source": [ - "# Explore json content of object\n", - "print(base_req.json())\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Making sense of the answer\n", - "\n", - "- The call to the basic URL returns various subfields\n", - "`\n", - "{\n", - " 'people': 'https://swapi.co/api/people/', \n", - " 'planets': 'https://swapi.co/api/planets/', \n", - " 'films': 'https://swapi.co/api/films/', \n", - " 'species': 'https://swapi.co/api/species/', \n", - " 'vehicles': 'https://swapi.co/api/vehicles/', \n", - " 'starships': 'https://swapi.co/api/starships/'\n", - " }\n", - "`\n", - "- The documentation adds: `/people/:id/ -- get a specific people resource`\n", - "- Let's request info on the first id of the subfield `people`\n", - "- We can request further info on a sub-subfield (e.g. 
`starships`)\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'name': 'Luke Skywalker', 'height': '172', 'mass': '77', 'hair_color': 'blond', 'skin_color': 'fair', 'eye_color': 'blue', 'birth_year': '19BBY', 'gender': 'male', 'homeworld': 'https://swapi.co/api/planets/1/', 'films': ['https://swapi.co/api/films/2/', 'https://swapi.co/api/films/6/', 'https://swapi.co/api/films/3/', 'https://swapi.co/api/films/1/', 'https://swapi.co/api/films/7/'], 'species': ['https://swapi.co/api/species/1/'], 'vehicles': ['https://swapi.co/api/vehicles/14/', 'https://swapi.co/api/vehicles/30/'], 'starships': ['https://swapi.co/api/starships/12/', 'https://swapi.co/api/starships/22/'], 'created': '2014-12-09T13:50:51.644000Z', 'edited': '2014-12-20T21:17:56.891000Z', 'url': 'https://swapi.co/api/people/1/'}\n" - ] - } - ], - "source": [ - "# Let's try\n", - "url = \"https://swapi.co/api/people/1\"\n", - "\n", - "# Launch request\n", - "req_1 = requests.get(url)\n", - "\n", - "# Extract json\n", - "req_1_js = req_1.json()\n", - "print(req_1_js)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['https://swapi.co/api/starships/12/', 'https://swapi.co/api/starships/22/']\n", - "{'name': 'X-wing', 'model': 'T-65 X-wing', 'manufacturer': 'Incom Corporation', 'cost_in_credits': '149999', 'length': '12.5', 'max_atmosphering_speed': '1050', 'crew': '1', 'passengers': '0', 'cargo_capacity': '110', 'consumables': '1 week', 'hyperdrive_rating': '1.0', 'MGLT': '100', 'starship_class': 'Starfighter', 'pilots': ['https://swapi.co/api/people/1/', 'https://swapi.co/api/people/9/', 'https://swapi.co/api/people/18/', 'https://swapi.co/api/people/19/'], 
'films': ['https://swapi.co/api/films/2/', 'https://swapi.co/api/films/3/', 'https://swapi.co/api/films/1/'], 'created': '2014-12-12T11:19:05.340000Z', 'edited': '2014-12-22T17:35:44.491233Z', 'url': 'https://swapi.co/api/starships/12/'}\n" - ] - } - ], - "source": [ - "# Let's extract Luke's starships\n", - "ships = req_1_js.get(\"starships\")\n", - "print(ships)\n", - "\n", - "# And get entry of first starship\n", - "ship = requests.get(ships[0]).json()\n", - "print(ship)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Geocoding\n", - "\n", - "- Most map services like Google Maps, Open Street Maps, Bing, etc. have APIs\n", - "- Let's check out Open Street Maps' (OSM) API called [Nominatim](https://developer.mapquest.com/)\n", - "- We will use the API through the Python module `geopy`\n", - "- To install new modules check out this [tutorial](https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/)\n", - "- But we still need to sign up with our email to obtain a key for Nominatim\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Retriving a location\n", - "\n", - "* We install `geopy`\n", - "* We sign up and obtain the key (in this example we use Moritz's key)\n", - "* We search the location of two popular places among ULB students: the ULB and the bar Tavernier\n", - "* We store and print out their coordinates" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Université Libre de Bruxelles (Campus du Solbosch), Square Albert Devèze - Albert Devèzesquare, Ixelles - Elsene, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, 1050, België - Belgique - Belgien\n", - 
"Tavernier, 445, Chaussée de Boondael - Boondaalse Steenweg, Ixelles - Elsene, Région de Bruxelles-Capitale - Brussels Hoofdstedelijk Gewest, 1050, België - Belgique - Belgien\n", - "[50.8134537, 4.381308819663028]\n", - "[50.816496, 4.388727177209292]\n" - ] - } - ], - "source": [ - "#import sys \n", - "#!{sys.executable} -m pip install geopy\n", - "\n", - "from geopy.geocoders import Nominatim\n", - "\n", - "# Replace the user agent key with your own key\n", - "geolocator = Nominatim(user_agent=\"jjG4qnPniTAGpG7O0q8XcMhARm0Pxcln\")\n", - "location_ULB = geolocator.geocode(\"Universite libre de Bruxelles\")\n", - "\n", - "location_tav = geolocator.geocode(\"Bar Tavernier Ixelles\")\n", - "\n", - "\n", - "# Did it work?\n", - "print(location_ULB)\n", - "print(location_tav)\n", - "# Like a charm...\n", - "\n", - "# Store coordinates in tuple\n", - "ulb_coords = [location_ULB.latitude, location_ULB.longitude]\n", - "print(ulb_coords)\n", - "\n", - "tav_coords = [location_tav.latitude, location_tav.longitude]\n", - "print(tav_coords)\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Mapping\n", - "\n", - "- Sadly, not everybody can associate a place to its coordinates in their mind :-P \n", - "- With the free [Leaflet API](https://leafletjs.com/reference-1.6.0.html) we can pin point them on a map\n", - "- Unfortunately written in Java Script\n", - "- But no need to learn, use \"wrapper\" Python Module `Folium`\n", - "- Example: Map of environmental violations of shale gas drilling in Pennsyilvania ([website](http://stateimpact.npr.org/pennsylvania/drilling/violations/))\n", - "\n", - " \"Map\"\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Producing a map\n", - "\n", - "* We install `Folium`\n", - "* We create a map for the ULB, 
specifying the location and a zoom level\n", - "* We display the map" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [], - "source": [ - "# Install Python module Folium, a wrapper of the Leaflet API\n", - "#import sys\n", - "#!{sys.executable} -m pip install folium\n", - "\n", - "import folium\n", - "from folium.plugins import MarkerCluster\n", - "import pandas as pd\n", - "\n", - "#Create the map\n", - "my_map = folium.Map(location = ulb_coords, zoom_start = 14)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Display map\n", - "my_map\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Adding markers to the map\n", - "\n", - "* It is still not clear where the ULB and Tavernier are located: we need some markers!\n", - "* We pin point them on the map by adding some representative icons ;-) " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Let's add markers\n", - "\n", - "\n", - "folium.Marker(ulb_coords, popup = 'ULB', icon = folium.Icon(icon='book')).add_to(my_map)\n", - "folium.Marker(tav_coords, popup = 'STATA User Meeting', icon = folium.Icon(icon='glass', color='red')).add_to(my_map)\n", - "\n", - "my_map\n" - ] - } - ], - "metadata": { - "celltoolbar": "Slideshow", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Session_5/Slide1.png b/Session_5/Slide1.png deleted file mode 100644 index 44150db..0000000 Binary files a/Session_5/Slide1.png and /dev/null differ diff --git a/Session_5/Slide2.png b/Session_5/Slide2.png deleted file mode 100644 index e130366..0000000 Binary files a/Session_5/Slide2.png and /dev/null differ diff --git a/Session_5/Slide3.png b/Session_5/Slide3.png deleted file mode 100644 index 8b56235..0000000 Binary files a/Session_5/Slide3.png and /dev/null differ diff --git a/Session_5/map.png b/Session_5/map.png deleted file mode 100644 index fe73330..0000000 Binary files a/Session_5/map.png and /dev/null differ diff --git a/Session_6/README.md b/Session_6/README.md deleted file mode 100644 index 87a4f10..0000000 --- a/Session_6/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Session 6: Webscraping -This week exercises are about retrieving data from HTML pages. It is a discipline that requires perseverance and ingenuity as you will often resort to dirty tricks and need a bit of luck to do what you want. - -The session will be divided into two parts: - -1. 
The regular exercises during which we'll scrape Wikipedia and Wikiquote pages to retrieve info on the laureates of the Nobel Memorial Prize in Economic Science. For this, you can use the library `requests`, together with `BeautifulSoup` or any library you are comfortable with. -2. A case that I'll present in class (or at least the start of it, as I'm still working on it) solving a real research problem. I'll showcase `selenium` in this context. - -Regarding the solution, I will make no attempt at all at presenting the best way to do it. I'll just show one way to retrieve the data we want, given the environment we operate in. You will see that the final results are not complete nor entirely correct. If you find a solution that is *good enough*, this will already be plenty. - -See you in class! diff --git a/Session_6/Readme.md.txt b/Session_6/Readme.md.txt new file mode 100644 index 0000000..304360c --- /dev/null +++ b/Session_6/Readme.md.txt @@ -0,0 +1 @@ +Readme diff --git a/Session_6/exercises_6.ipynb b/Session_6/exercises_6.ipynb deleted file mode 100644 index b49ca17..0000000 --- a/Session_6/exercises_6.ipynb +++ /dev/null @@ -1,239 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "# Webscraping" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "In this session, we will do some exercises about webscraping. Webscraping is the evil twin of API calls, seen last week. At the core of webscraping, there is a game: Usually, the website operators wants to prevent you from scraping her site as she would lose the ad revenues. 
It is therefore usually a fairly complex work where you have to outsmart somebody who makes part of her livelihood out of the data provided on the site.\n", - "\n", - "Moreover, scrapers are tipically much faster than humans and put therefore much more strain on the servers than the normal human user and require therefore more computing power on the part of the data provider. This long and boring introduction to say: please scrape responsibly." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Nobel purpose" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We know that the Nobel Prize in economics does not exist. The closest thing we have is the Nobel Memorial Prize in Economic Science. The first one was awarded in 1969, so it's fairly recent. This means that many of the work by the laureates are easily available online. In this week's exercises we will do two things:\n", - "\n", - "1. Scrape Wikipedia to retrieve informations about the location of the Alma Mater(s) of each Nobel Memorial Prize in Economic Sciences laureate. We want to know specifically how many Nobel Memorial Prize winner have never worked in an university in the United States.\n", - "2. Retrieve quotes from those people in an automated fashion so that we can pick one at random in a \"quote of the day\" spirit.\n", - "\n", - "Wikipedia is a great source for this endeavour as the website curators are notably favorable towards scrapers and make little effort to try to deter them. It is therefore a great use case for a first foray into the topic." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "### Retrieving the name and wikipedia page of all the laureate" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "There is a list of all the laureates available [here](https://en.wikipedia.org/wiki/List_of_Nobel_Memorial_Prize_laureates_in_Economics). The URL is \"https://en.wikipedia.org/wiki/List_of_Nobel_Memorial_Prize_laureates_in_Economics\". Let's just use requests and Beautiful Soup to retrieve all the names and wikipedia page of the laureates." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Now that we have a list with the names and wikipedia pages of all laureates, we need to get inside each of those in order to get the name of their alma mater. The issue is that several of the laureates have several alma mater. Spend a little time looking at several of the pages of winners to see how this is reflected in the HTML.\n", - "\n", - "In this case, extracting the information might be wrapped inside a function that will be applied on the value of the dictionary created at the step before. Please use regular expression to identify the words \"Alma Mater\" (with and without capital letters for each word)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "And now, repeat the same to identify in which country this university is located. No need to use regular expressions here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now ready to put everything together. Put everything together to modify the structure in which you store the name of the laureates to contain also the countries of their alma mater." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now ready to perform our analysis. Note however that what we have done could have been done in an more structured fashion had we used *pandas*, the library that enables working with dataframes.\n", - "\n", - "Moreover, you will see that some of the results are not correct and/or available (for example, Esther Duflo has no \"Alma Mater\" listed on her page (because in her case, it is called \"Education\") and some universities, such as the University of Leningrad, have no location). This is fine for now but a more comprehensive solution should determine something to work with that." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Voices from the past" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now going to collect all the quotes avaiable on Wikiquotes from the Nobel Memorial Prize for Economic Science laureates. In order to do so, we will pass through the list of laureates pages. If there is a link to a quotes page, we will scrape all the quotes there. 
If not, we will simply ignore it.\n", - "\n", - "When doing so, it usually makes sense to start by writing the code (in a function) that will scrape the quotes and then write the loop that pass on each laureate and identify if it is relevant to scrape it. Write the function that collects all the quotes from a wikiquote page. \n", - "\n", - "You will see that, for some authors, there are also quotes about the person (rather than quotes from the person). In this simple example, it is OK to collect them as well although in a real setting, we probably will want to avoid keeping those.\n", - "\n", - "If you need a page to serve as example, here is [Ragnar Frisch's wikiquotes page](https://en.wikiquote.org/wiki/Ragnar_Frisch)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "We are now going to write the part that goes through the pages of all laureates, checks whether they have a Wikiquote page (the link is in the left sidebar) and then retrieve the quotes. We can, as we did previously, put the quotes in a dictionary of lists. Try avoiding having lists of size 0." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Now that all the quotes are collected, we just use a standard random number generator to select one at random. You can either select a quote from a big \"bag\" containing all quotes but I would rather first choose a laureate and then a quote in order to avoid favouring the laureates who were more prolix (or who have a larger fan-base)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "argv": [ - "python", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "Python 3", - "env": null, - "interrupt_mode": "signal", - "language": "python", - "metadata": null, - "name": "python3" - }, - "name": "Exercises.ipynb" - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Session_7/1_intro.ipynb b/Session_7/1_intro.ipynb new file mode 100644 index 0000000..43fe219 --- /dev/null +++ b/Session_7/1_intro.ipynb @@ -0,0 +1,2385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [Python Fundamentals](https://datascience.quantecon.org/../python_fundamentals/index.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- Understand the core pandas objects (`Series` and `DataFrame`) \n", + "- Index into particular elements of a Series and DataFrame \n", + "- Understand what `.dtype`/`.dtypes` do \n", + "- Make basic visualizations \n", + "\n", + "\n", + "**Data**\n", + "\n", + "- US regional unemployment data from Bureau of Labor Statistics " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [Introduction](#Introduction) \n", + " - [pandas](#pandas) \n", + " - [Series](#Series) \n", + " - [DataFrame](#DataFrame) \n", + " - [Data Types](#Data-Types) \n", + " - [Changing DataFrames](#Changing-DataFrames) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: quandl in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (1.12.0)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + 
"Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n" + ] + } + ], + "source": [ + "# Uncomment following line to install on colab\n", + "! pip install qeds \n", + "#qeds = quantecon data science" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas\n", + "\n", + "This lecture begins the material on `pandas`.\n", + "\n", + "To start, we will import the pandas package and give it the alias\n", + "`pd`, which is conventional practice." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Don't worry about this line for now!\n", + "%matplotlib inline\n", + "# activate plot theme\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, knowing which pandas version we are\n", + "using is helpful.\n", + "\n", + "We can check this by running the code below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.25.1'" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Series\n", + "\n", + "The first main pandas type we will introduce is called Series.\n", + "\n", + "A Series is a single column of data, with row labels for each\n", + "observation.\n", + "\n", + "pandas refers to the row labels as the *index* of the Series.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/intro_files/PandasSeries.png](https://datascience.quantecon.org/assets/_static/intro_files/PandasSeries.png) \n", + "Below, we create a Series which contains the US unemployment rate every\n", + "other year starting in 1995." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "values = [5.6, 5.3, 4.3, 4.2, 5.8, 5.3, 4.6, 7.8, 9.1, 8., 5.7]\n", + "years = list(range(1995, 2017, 2))\n", + "\n", + "unemp = pd.Series(data=values, index=years, name=\"Unemployment\")" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.6\n", + "1997 5.3\n", + "1999 4.3\n", + "2001 4.2\n", + "2003 5.8\n", + "2005 5.3\n", + "2007 4.6\n", + "2009 7.8\n", + "2011 9.1\n", + "2013 8.0\n", + "2015 5.7\n", + "Name: Unemployment, dtype: float64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can look at the index and values in our Series." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([1995, 1997, 1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015], dtype='int64')" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.index" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([5.6, 5.3, 4.3, 4.2, 5.8, 5.3, 4.6, 7.8, 9.1, 8. , 5.7])" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What Can We Do with a Series object?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `.head` and `.tail`\n", + "\n", + "Often, our data will have many rows, and we won’t want to display it all\n", + "at once.\n", + "\n", + "The methods `.head` and `.tail` show rows at the beginning and end\n", + "of our Series, respectively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.6\n", + "1997 5.3\n", + "1999 4.3\n", + "2001 4.2\n", + "2003 5.8\n", + "Name: Unemployment, dtype: float64" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.head() #default 5, but we can put in brackets other number" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2007 4.6\n", + "2009 7.8\n", + "2011 9.1\n", + "2013 8.0\n", + "2015 5.7\n", + "Name: Unemployment, dtype: float64" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.tail() #default 5, but we can put in brackets other number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Basic Plotting\n", + "\n", + "We can also plot data using the `.plot` method." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "unemp.plot() #check if there's a way to do the graph for a period of time, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ">**Note**\n", + ">\n", + ">This is why we needed the `%matplotlib inline` — it tells the notebook\n", + "to display figures inside the notebook itself. Also, pandas has much greater visualization functionality than this, but we will study that later on." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Unique Values\n", + "\n", + "Though it doesn’t make sense in this data set, we may want to find the\n", + "unique values in a Series – which can be done with the `.unique` method." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([5.6, 5.3, 4.3, 4.2, 5.8, 4.6, 7.8, 9.1, 8. , 5.7])" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Indexing\n", + "\n", + "Sometimes, we will want to select particular elements from a Series.\n", + "\n", + "We can do this using `.loc[index_items]`; where `index_items` is\n", + "an item from the index, or a list of items in the index.\n", + "\n", + "We will see this more in-depth in a coming lecture, but for now, we\n", + "demonstrate how to select one or multiple elements of the Series." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5.6" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.loc[1995]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.6\n", + "2005 5.3\n", + "2015 5.7\n", + "Name: Unemployment, dtype: float64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.loc[[1995, 2005, 2015]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrame\n", + "\n", + "A DataFrame is how pandas stores one or more columns of data.\n", + "\n", + "We can think of a DataFrame as multiple Series stacked side by side as\n", + "columns.\n", + "\n", + "This is similar to a sheet in an Excel workbook or a table in a SQL\n", + "database.\n", + "\n", + "In addition to row labels (an index), DataFrames also have column labels.\n", + "\n", + "We refer to these column labels as the columns or column names.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/intro_files/PandasDataFrame.png](https://datascience.quantecon.org/assets/_static/intro_files/PandasDataFrame.png) \n", + "Below, we create a DataFrame that contains the unemployment rate every\n", + "other year by region of the US starting in 1995." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNational
19955.94.55.36.65.6
19975.64.35.26.05.3
19994.43.64.25.24.3
20013.84.04.04.64.2
20035.85.75.76.55.8
20054.95.75.25.55.3
20074.34.94.34.54.6
20097.18.17.68.67.8
20118.38.79.110.79.1
20137.97.47.48.58.0
20155.75.15.56.15.7
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National\n", + "1995 5.9 4.5 5.3 6.6 5.6\n", + "1997 5.6 4.3 5.2 6.0 5.3\n", + "1999 4.4 3.6 4.2 5.2 4.3\n", + "2001 3.8 4.0 4.0 4.6 4.2\n", + "2003 5.8 5.7 5.7 6.5 5.8\n", + "2005 4.9 5.7 5.2 5.5 5.3\n", + "2007 4.3 4.9 4.3 4.5 4.6\n", + "2009 7.1 8.1 7.6 8.6 7.8\n", + "2011 8.3 8.7 9.1 10.7 9.1\n", + "2013 7.9 7.4 7.4 8.5 8.0\n", + "2015 5.7 5.1 5.5 6.1 5.7" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {\n", + " \"NorthEast\": [5.9, 5.6, 4.4, 3.8, 5.8, 4.9, 4.3, 7.1, 8.3, 7.9, 5.7],\n", + " \"MidWest\": [4.5, 4.3, 3.6, 4. , 5.7, 5.7, 4.9, 8.1, 8.7, 7.4, 5.1],\n", + " \"South\": [5.3, 5.2, 4.2, 4. , 5.7, 5.2, 4.3, 7.6, 9.1, 7.4, 5.5],\n", + " \"West\": [6.6, 6., 5.2, 4.6, 6.5, 5.5, 4.5, 8.6, 10.7, 8.5, 6.1],\n", + " \"National\": [5.6, 5.3, 4.3, 4.2, 5.8, 5.3, 4.6, 7.8, 9.1, 8., 5.7]\n", + "}\n", + "\n", + "unemp_region = pd.DataFrame(data, index=years)\n", + "unemp_region" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can retrieve the index and the DataFrame values as we\n", + "did with a Series." + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([1995, 1997, 1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015], dtype='int64')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.index" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 5.9, 4.5, 5.3, 6.6, 5.6],\n", + " [ 5.6, 4.3, 5.2, 6. , 5.3],\n", + " [ 4.4, 3.6, 4.2, 5.2, 4.3],\n", + " [ 3.8, 4. , 4. 
, 4.6, 4.2],\n", + " [ 5.8, 5.7, 5.7, 6.5, 5.8],\n", + " [ 4.9, 5.7, 5.2, 5.5, 5.3],\n", + " [ 4.3, 4.9, 4.3, 4.5, 4.6],\n", + " [ 7.1, 8.1, 7.6, 8.6, 7.8],\n", + " [ 8.3, 8.7, 9.1, 10.7, 9.1],\n", + " [ 7.9, 7.4, 7.4, 8.5, 8. ],\n", + " [ 5.7, 5.1, 5.5, 6.1, 5.7]])" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What Can We Do with a DataFrame?\n", + "\n", + "Pretty much everything we can do with a Series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `.head` and `.tail`\n", + "\n", + "As with Series, we can use `.head` and `.tail` to show only the\n", + "first or last `n` rows." + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNational
19955.94.55.36.65.6
19975.64.35.26.05.3
19994.43.64.25.24.3
20013.84.04.04.64.2
20035.85.75.76.55.8
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National\n", + "1995 5.9 4.5 5.3 6.6 5.6\n", + "1997 5.6 4.3 5.2 6.0 5.3\n", + "1999 4.4 3.6 4.2 5.2 4.3\n", + "2001 3.8 4.0 4.0 4.6 4.2\n", + "2003 5.8 5.7 5.7 6.5 5.8" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNational
20118.38.79.110.79.1
20137.97.47.48.58.0
20155.75.15.56.15.7
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National\n", + "2011 8.3 8.7 9.1 10.7 9.1\n", + "2013 7.9 7.4 7.4 8.5 8.0\n", + "2015 5.7 5.1 5.5 6.1 5.7" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.tail(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Plotting\n", + "\n", + "We can generate plots with the `.plot` method.\n", + "\n", + "Notice we now have a separate line for each column of data." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "unemp_region.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Indexing\n", + "\n", + "We can also do indexing using `.loc`.\n", + "\n", + "This is slightly more advanced than before because we can choose\n", + "subsets of both row and columns." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5.9" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.loc[1995, \"NorthEast\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.3\n", + "2005 5.2\n", + "Name: South, dtype: float64" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.loc[[1995, 2005], \"South\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "NorthEast 5.9\n", + "National 5.6\n", + "Name: 1995, dtype: float64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.loc[1995, [\"NorthEast\", \"National\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.9\n", + "1997 5.6\n", + "1999 4.4\n", + "2001 3.8\n", + "2003 5.8\n", + "2005 4.9\n", + "2007 4.3\n", + "2009 7.1\n", + "2011 8.3\n", + "2013 7.9\n", + "2015 5.7\n", + "Name: NorthEast, dtype: float64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.loc[:, \"NorthEast\"]" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 67, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 4.5\n", + "1997 4.3\n", + "1999 3.6\n", + "2001 4.0\n", + "2003 5.7\n", + "2005 5.7\n", + "2007 4.9\n", + "2009 8.1\n", + "2011 8.7\n", + "2013 7.4\n", + "2015 5.1\n", + "Name: MidWest, dtype: float64" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# `[string]` with no `.loc` extracts a whole column\n", + "unemp_region[\"MidWest\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computations with Columns\n", + "\n", + "pandas can do various computations and mathematical operations on\n", + "columns.\n", + "\n", + "Let’s take a look at a few of them." + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 0.066\n", + "1997 0.060\n", + "1999 0.052\n", + "2001 0.046\n", + "2003 0.065\n", + "2005 0.055\n", + "2007 0.045\n", + "2009 0.086\n", + "2011 0.107\n", + "2013 0.085\n", + "2015 0.061\n", + "Name: West, dtype: float64" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Divide by 100 to move from percent units to a rate\n", + "unemp_region[\"West\"] / 100" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10.7" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find maximum\n", + "unemp_region[\"West\"].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 2.1\n", + "1997 1.7\n", + "1999 1.6\n", + "2001 0.6\n", + "2003 0.8\n", + "2005 -0.2\n", + "2007 -0.4\n", + "2009 0.5\n", + "2011 2.0\n", + "2013 
1.1\n", + "2015 1.0\n", + "dtype: float64" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the difference between two columns\n", + "# Notice that pandas applies `-` to _all rows_ at once\n", + "# We'll see more of this throughout these materials\n", + "unemp_region[\"West\"] - unemp_region[\"MidWest\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9006381255384481" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find correlation between two columns\n", + "unemp_region.West.corr(unemp_region[\"MidWest\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNational
NorthEast1.0000000.8756540.9644150.9678750.976016
MidWest0.8756541.0000000.9513790.9006380.952389
South0.9644150.9513791.0000000.9872590.995030
West0.9678750.9006380.9872591.0000000.981308
National0.9760160.9523890.9950300.9813081.000000
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National\n", + "NorthEast 1.000000 0.875654 0.964415 0.967875 0.976016\n", + "MidWest 0.875654 1.000000 0.951379 0.900638 0.952389\n", + "South 0.964415 0.951379 1.000000 0.987259 0.995030\n", + "West 0.967875 0.900638 0.987259 1.000000 0.981308\n", + "National 0.976016 0.952389 0.995030 0.981308 1.000000" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find correlation between all column pairs\n", + "unemp_region.corr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Types\n", + "\n", + "We asked you to run the commands `unemp.dtype` and\n", + "`unemp_region.dtypes` and think about the outputs.\n", + "\n", + "You might have guessed that they return the type of the values inside\n", + "each column.\n", + "\n", + "Occasionally, you might need to investigate what types you have in your\n", + "DataFrame when an operation isn’t behaving as expected." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "NorthEast float64\n", + "MidWest float64\n", + "South float64\n", + "West float64\n", + "National float64\n", + "dtype: object" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataFrames will only distinguish between a few types.\n", + "\n", + "- Booleans (`bool`) \n", + "- Floating point numbers (`float64`) \n", + "- Integers (`int64`) \n", + "- Dates (`datetime64[ns]`) — we will learn this soon \n", + "- Categorical data (`category`) \n", + "- Everything else, including strings (`object`) \n", + "\n", + "\n", + "In the future, we will often refer to the type of data stored in a\n", + "column as its `dtype`.\n", + "\n", + "Let’s look at an example of when having an incorrect `dtype` can\n", + "cause problems.\n", + "\n", + "Suppose that, when we imported the data, the `South` column was\n", + "interpreted as a string." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "NorthEast float64\n", + "MidWest float64\n", + "South object\n", + "West float64\n", + "National float64\n", + "dtype: object" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_unemp = unemp_region.copy()\n", + "str_unemp[\"South\"] = str_unemp[\"South\"].astype(str)\n", + "str_unemp.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Everything *looks* ok…" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNational
19955.94.55.36.65.6
19975.64.35.26.05.3
19994.43.64.25.24.3
20013.84.04.04.64.2
20035.85.75.76.55.8
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National\n", + "1995 5.9 4.5 5.3 6.6 5.6\n", + "1997 5.6 4.3 5.2 6.0 5.3\n", + "1999 4.4 3.6 4.2 5.2 4.3\n", + "2001 3.8 4.0 4.0 4.6 4.2\n", + "2003 5.8 5.7 5.7 6.5 5.8" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_unemp.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But if we try to do something like compute the sum of all the columns,\n", + "we get unexpected results…" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "NorthEast 63.7\n", + "MidWest 62\n", + "South 5.35.24.24.05.75.24.37.69.17.45.5\n", + "West 72.8\n", + "National 65.7\n", + "dtype: object" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_unemp.sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This happened because `.sum` effectively calls `+` on all rows in\n", + "each column.\n", + "\n", + "Recall that when we apply `+` to two strings, the result is the two\n", + "strings concatenated.\n", + "\n", + "So, in this case, we saw that the entries in all rows of the South\n", + "column were stitched together into one long string." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Changing DataFrames\n", + "\n", + "We can change the data inside of a DataFrame in various ways:\n", + "\n", + "- Adding new columns \n", + "- Changing index labels or column names \n", + "- Altering existing data (e.g. doing some arithmetic or making a column\n", + " of strings lowercase) \n", + "\n", + "\n", + "Some of these “mutations” will be topics of future lectures, so we will\n", + "only briefly discuss a few of the things we can do below." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating New Columns\n", + "\n", + "We can create new data by assigning values to a column similar to how\n", + "we assign values to a variable.\n", + "\n", + "In pandas, we create a new column of a DataFrame by writing:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "df[\"New Column Name\"] = new_values\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we create an unweighted mean of the unemployment rate across the\n", + "four regions of the US — notice that this differs from the national\n", + "unemployment rate." + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "unemp_region[\"UnweightedMean\"] = (unemp_region[\"NorthEast\"] +\n", + " unemp_region[\"MidWest\"] +\n", + " unemp_region[\"South\"] +\n", + " unemp_region[\"West\"])/4" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNationalUnweightedMean
19955.94.55.36.65.65.575
19975.64.35.26.05.35.275
19994.43.64.25.24.34.350
20013.84.04.04.64.24.100
20035.85.75.76.55.85.925
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National UnweightedMean\n", + "1995 5.9 4.5 5.3 6.6 5.6 5.575\n", + "1997 5.6 4.3 5.2 6.0 5.3 5.275\n", + "1999 4.4 3.6 4.2 5.2 4.3 4.350\n", + "2001 3.8 4.0 4.0 4.6 4.2 4.100\n", + "2003 5.8 5.7 5.7 6.5 5.8 5.925" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Changing Values\n", + "\n", + "Changing the values inside of a DataFrame should be done sparingly.\n", + "\n", + "However, it can be done by assigning a value to a location in the\n", + "DataFrame.\n", + "\n", + "`df.loc[index, column] = value`" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "unemp_region.loc[1995, \"UnweightedMean\"] = 0.0" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNationalUnweightedMean
19955.94.55.36.65.60.000
19975.64.35.26.05.35.275
19994.43.64.25.24.34.350
20013.84.04.04.64.24.100
20035.85.75.76.55.85.925
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National UnweightedMean\n", + "1995 5.9 4.5 5.3 6.6 5.6 0.000\n", + "1997 5.6 4.3 5.2 6.0 5.3 5.275\n", + "1999 4.4 3.6 4.2 5.2 4.3 4.350\n", + "2001 3.8 4.0 4.0 4.6 4.2 4.100\n", + "2003 5.8 5.7 5.7 6.5 5.8 5.925" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Renaming Columns\n", + "\n", + "We can also rename the columns of a DataFrame, which is helpful because the names that sometimes come with datasets are\n", + "unbearable…\n", + "\n", + "For example, the original name for the North East unemployment rate\n", + "given by the Bureau of Labor Statistics was `LASRD910000000000003`…\n", + "\n", + "They have their reasons for using these names, but it can make our job\n", + "difficult since we often need to type it repeatedly.\n", + "\n", + "We can rename columns by passing a dictionary to the `rename` method.\n", + "\n", + "This dictionary contains the old names as the keys and new names as the\n", + "values.\n", + "\n", + "See the example below." + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NEMWSWNationalUnweightedMean
19955.94.55.36.65.60.000
19975.64.35.26.05.35.275
19994.43.64.25.24.34.350
20013.84.04.04.64.24.100
20035.85.75.76.55.85.925
20054.95.75.25.55.35.325
20074.34.94.34.54.64.500
20097.18.17.68.67.87.850
20118.38.79.110.79.19.200
20137.97.47.48.58.07.800
20155.75.15.56.15.75.600
\n", + "
" + ], + "text/plain": [ + " NE MW S W National UnweightedMean\n", + "1995 5.9 4.5 5.3 6.6 5.6 0.000\n", + "1997 5.6 4.3 5.2 6.0 5.3 5.275\n", + "1999 4.4 3.6 4.2 5.2 4.3 4.350\n", + "2001 3.8 4.0 4.0 4.6 4.2 4.100\n", + "2003 5.8 5.7 5.7 6.5 5.8 5.925\n", + "2005 4.9 5.7 5.2 5.5 5.3 5.325\n", + "2007 4.3 4.9 4.3 4.5 4.6 4.500\n", + "2009 7.1 8.1 7.6 8.6 7.8 7.850\n", + "2011 8.3 8.7 9.1 10.7 9.1 9.200\n", + "2013 7.9 7.4 7.4 8.5 8.0 7.800\n", + "2015 5.7 5.1 5.5 6.1 5.7 5.600" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "names = {\"NorthEast\": \"NE\",\n", + " \"MidWest\": \"MW\",\n", + " \"South\": \"S\",\n", + " \"West\": \"W\"}\n", + "unemp_region.rename(columns=names)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNationalUnweightedMean
19955.94.55.36.65.60.000
19975.64.35.26.05.35.275
19994.43.64.25.24.34.350
20013.84.04.04.64.24.100
20035.85.75.76.55.85.925
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National UnweightedMean\n", + "1995 5.9 4.5 5.3 6.6 5.6 0.000\n", + "1997 5.6 4.3 5.2 6.0 5.3 5.275\n", + "1999 4.4 3.6 4.2 5.2 4.3 4.350\n", + "2001 3.8 4.0 4.0 4.6 4.2 4.100\n", + "2003 5.8 5.7 5.7 6.5 5.8 5.925" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We renamed our columns… Why does the DataFrame still show the old\n", + "column names?\n", + "\n", + "Many pandas operations create a copy of your data by\n", + "default to protect your data and prevent you from overwriting\n", + "information you meant to keep.\n", + "\n", + "We can make these operations permanent by either:\n", + "\n", + "1. Assigning the output back to the variable name\n", + " `df = df.rename(columns=rename_dict)` \n", + "1. Looking into whether the method has an `inplace` option. For\n", + " example, `df.rename(columns=rename_dict, inplace=True)` \n", + "\n", + "\n", + "Setting `inplace=True` will sometimes make your code faster\n", + "(e.g. if you have a very large DataFrame and you don’t want to copy all\n", + "the data), but that doesn’t always happen.\n", + "\n", + "We recommend using the first option until you get comfortable with\n", + "pandas because operations that don’t alter your data are (usually)\n", + "safer." + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NEMWSWNationalUnweightedMean
19955.94.55.36.65.60.000
19975.64.35.26.05.35.275
19994.43.64.25.24.34.350
20013.84.04.04.64.24.100
20035.85.75.76.55.85.925
\n", + "
" + ], + "text/plain": [ + " NE MW S W National UnweightedMean\n", + "1995 5.9 4.5 5.3 6.6 5.6 0.000\n", + "1997 5.6 4.3 5.2 6.0 5.3 5.275\n", + "1999 4.4 3.6 4.2 5.2 4.3 4.350\n", + "2001 3.8 4.0 4.0 4.6 4.2 4.100\n", + "2003 5.8 5.7 5.7 6.5 5.8 5.925" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "names = {\"NorthEast\": \"NE\",\n", + " \"MidWest\": \"MW\",\n", + " \"South\": \"S\",\n", + " \"West\": \"W\"}\n", + "\n", + "unemp_shortname = unemp_region.rename(columns=names)\n", + "unemp_shortname.head()" + ] + } + ], + "metadata": { + "date": 1584040761.4786024, + "filename": "intro.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "Introduction" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/1_intro_exercises.ipynb b/Session_7/1_intro_exercises.ipynb new file mode 100644 index 0000000..d4137da --- /dev/null +++ b/Session_7/1_intro_exercises.ipynb @@ -0,0 +1,682 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: pyarrow in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: six>=1.0.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow->qeds) (1.12.0)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.8.0)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) 
(7.2.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n" + ] + } + ], + "source": [ + "! pip install qeds\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "\n", + "\n", + "**For the purpose of these exercises, we create:**\n", + "\n", + "- **a Series which contains the US unemployment rate every other year starting in 1995**\n", + "\n", + "- **a DataFrame that contains the unemployment rate every other year by region of the US starting in 1995**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.6\n", + "1997 5.3\n", + "1999 4.3\n", + "2001 4.2\n", + "2003 5.8\n", + "2005 5.3\n", + "2007 4.6\n", + "2009 7.8\n", + "2011 9.1\n", + "2013 8.0\n", + "2015 5.7\n", + "Name: Unemployment, dtype: float64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Series which contains the US unemployment rate every other year starting in 1995\n", + "values = [5.6, 5.3, 4.3, 4.2, 5.8, 5.3, 4.6, 7.8, 9.1, 8., 5.7]\n", + "years = list(range(1995, 2017, 2))\n", + "\n", + "unemp = pd.Series(data=values, index=years, name=\"Unemployment\")\n", + "unemp" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastMidWestSouthWestNational
19955.94.55.36.65.6
19975.64.35.26.05.3
19994.43.64.25.24.3
20013.84.04.04.64.2
20035.85.75.76.55.8
20054.95.75.25.55.3
20074.34.94.34.54.6
20097.18.17.68.67.8
20118.38.79.110.79.1
20137.97.47.48.58.0
20155.75.15.56.15.7
\n", + "
" + ], + "text/plain": [ + " NorthEast MidWest South West National\n", + "1995 5.9 4.5 5.3 6.6 5.6\n", + "1997 5.6 4.3 5.2 6.0 5.3\n", + "1999 4.4 3.6 4.2 5.2 4.3\n", + "2001 3.8 4.0 4.0 4.6 4.2\n", + "2003 5.8 5.7 5.7 6.5 5.8\n", + "2005 4.9 5.7 5.2 5.5 5.3\n", + "2007 4.3 4.9 4.3 4.5 4.6\n", + "2009 7.1 8.1 7.6 8.6 7.8\n", + "2011 8.3 8.7 9.1 10.7 9.1\n", + "2013 7.9 7.4 7.4 8.5 8.0\n", + "2015 5.7 5.1 5.5 6.1 5.7" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#DataFrame that contains the unemployment rate every other year by region of the US starting in 1995\n", + "data = {\n", + " \"NorthEast\": [5.9, 5.6, 4.4, 3.8, 5.8, 4.9, 4.3, 7.1, 8.3, 7.9, 5.7],\n", + " \"MidWest\": [4.5, 4.3, 3.6, 4. , 5.7, 5.7, 4.9, 8.1, 8.7, 7.4, 5.1],\n", + " \"South\": [5.3, 5.2, 4.2, 4. , 5.7, 5.2, 4.3, 7.6, 9.1, 7.4, 5.5],\n", + " \"West\": [6.6, 6., 5.2, 4.6, 6.5, 5.5, 4.5, 8.6, 10.7, 8.5, 6.1],\n", + " \"National\": [5.6, 5.3, 4.3, 4.2, 5.8, 5.3, 4.6, 7.8, 9.1, 8., 5.7]\n", + "}\n", + "\n", + "unemp_region = pd.DataFrame(data, index=years)\n", + "unemp_region" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "1. **Display only the first 2 elements of the Series using the `.head` method.**" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1995 5.6\n", + "1997 5.3\n", + "Name: Unemployment, dtype: float64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# To display only the two first lines, we use the .head function with \"2\" as an argument.\n", + "unemp.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
**Using the `plot` method, make a bar plot.** " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The following makes one bar for each year in the Series.\n", + "unemp.plot.bar()\n", + "# Note that, by default, the index is taken as the x variable here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Use `.loc` to select the lowest/highest unemployment rate shown in the Series.** " + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4.2" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#lowest unemployment rate\n", + "unemp.loc[unemp.idxmin()]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9.1" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#highest unemployment rate\n", + "unemp.loc[unemp.idxmax()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. **Run the code `unemp.dtype` below. What does it give you? Where do you think it comes from?**" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.dtype\n", + "#we get that the dtype is float64\n", + "#this tells us the type of data stored in the series " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Now we create a DataFrame that contains the unemployment rate every other year by region of the US starting in 1995.**\n", + "\n", + "1.
**Use introspection (or google-fu) to find a way to obtain a list with all of the column names in `unemp_region`.**" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['NorthEast', 'MidWest', 'South', 'West', 'National']" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(unemp_region)\n", + "\n", + "#to get the index, we can do:\n", + "#list(unemp_region.index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Using the `plot` method, make a bar plot. What does it look like now?** " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Note that, by default, the index is taken as the x variable here.\n", + "unemp_region.plot.bar()\n", + "\n", + "#We create a column entitled UnweightedMean to play around with other plotting characteristics\n", + "unemp_region[\"UnweightedMean\"] = (unemp_region[\"NorthEast\"] +\n", + " unemp_region[\"MidWest\"] +\n", + " unemp_region[\"South\"] +\n", + " unemp_region[\"West\"])/4\n", + "\n", + "# The following makes a bar for each year, and this bar is the unweighted mean\n", + "unemp_region.plot.bar(y='UnweightedMean')\n", + "# The following makes a bar for each year, but plots the value \"National\"\n", + "unemp_region.plot.bar(y='National')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Use `.loc` to select the unemployment data for the `NorthEast` and `West` for the years 1995, 2005, 2011, and 2015.** " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NorthEastWest
19955.96.6
20054.95.5
20118.310.7
20155.76.1
\n", + "
" + ], + "text/plain": [ + " NorthEast West\n", + "1995 5.9 6.6\n", + "2005 4.9 5.5\n", + "2011 8.3 10.7\n", + "2015 5.7 6.1" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .loc \"accesses a group of rows and columns by label(s)...\" Arguments are labels\n", + "unemp_region.loc[[1995, 2005, 2011, 2015], [\"NorthEast\", \"West\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. **Run the code `unemp_region.dtypes` below. What does it give you? How does this compare with `unemp.dtype`?**" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NorthEast float64\n", + "MidWest float64\n", + "South float64\n", + "West float64\n", + "National float64\n", + "UnweightedMean float64\n", + "dtype: object" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_region.dtypes" + ] + } + ], + "metadata": { + "date": 1584040761.4786024, + "filename": "intro.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "Introduction" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Session_7/2_basics.ipynb b/Session_7/2_basics.ipynb new file mode 100644 index 0000000..3c305c7 --- /dev/null +++ b/Session_7/2_basics.ipynb @@ -0,0 +1,3308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Functionality\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [pandas Intro](https://datascience.quantecon.org/intro.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- Be familiar with `datetime` \n", + 
"- Use built-in aggregation functions and be able to create your own and\n", + " apply them using `agg` \n", + "- Use built-in Series transformation functions and be able to create your\n", + " own and apply them using `apply` \n", + "- Use built-in scalar transformation functions and be able to create your\n", + " own and apply them using `applymap` \n", + "- Be able to select subsets of the DataFrame using boolean selection \n", + "- Know what the “want operator” is and how to apply it \n", + "\n", + "\n", + "**Data**\n", + "\n", + "- US state unemployment data from Bureau of Labor Statistics " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [Basic Functionality](#Basic-Functionality) \n", + " - [State Unemployment Data](#State-Unemployment-Data) \n", + " - [Dates in pandas](#Dates-in-pandas) \n", + " - [DataFrame Aggregations](#DataFrame-Aggregations) \n", + " - [Transforms](#Transforms) \n", + " - [Boolean Selection](#Boolean-Selection) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: plotly in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (1.12.0)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from 
numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n" + ] + } + ], + "source": [ + "# Uncomment following line to install on colab\n", + "! pip install qeds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## State Unemployment Data\n", + "\n", + "In this lecture, we will use unemployment data by state at a monthly\n", + "frequency." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.25.1'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "%matplotlib inline\n", + "# activate plot theme\n", + "import qeds\n", + "qeds.themes.mpl_style();\n", + "\n", + "pd.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we will download the data directly from a url and read it into a pandas DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "## Load up the data -- this will take a couple seconds\n", + "url = \"https://datascience.quantecon.org/assets/data/state_unemployment.csv\"\n", + "unemp_raw = pd.read_csv(url, parse_dates=[\"Date\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The pandas `read_csv` will determine most datatypes of the underlying columns. The\n", + "exception here is that we need to give pandas a hint so it can load up the `Date` column as a Python datetime type: the `parse_dates=[\"Date\"]`.\n", + "\n", + "We can see the basic structure of the downloaded data by getting the first 5 rows, which directly matches\n", + "the underlying CSV file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatestateLaborForceUnemploymentRate
02000-01-01Alabama2142945.04.7
12000-01-01Alaska319059.06.3
22000-01-01Arizona2499980.04.1
32000-01-01Arkansas1264619.04.4
42000-01-01California16680246.05.0
\n", + "
" + ], + "text/plain": [ + " Date state LaborForce UnemploymentRate\n", + "0 2000-01-01 Alabama 2142945.0 4.7\n", + "1 2000-01-01 Alaska 319059.0 6.3\n", + "2 2000-01-01 Arizona 2499980.0 4.1\n", + "3 2000-01-01 Arkansas 1264619.0 4.4\n", + "4 2000-01-01 California 16680246.0 5.0" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_raw.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that a row has a date, state, labor force size, and unemployment rate.\n", + "\n", + "For our analysis, we want to look at the unemployment rate across different states over time, which\n", + "requires a transformation of the data similar to an Excel pivot-table." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateAlabamaAlaskaArizonaArkansasCaliforniaColoradoConnecticutDelawareFloridaGeorgia...South DakotaTennesseeTexasUtahVermontVirginiaWashingtonWest VirginiaWisconsinWyoming
Date
2000-01-014.76.34.14.45.02.82.83.53.73.7...2.43.74.63.12.72.64.95.83.24.1
2000-02-014.76.34.14.35.02.82.73.63.73.6...2.43.74.63.12.62.54.95.63.23.9
2000-03-014.66.34.04.35.02.72.63.63.73.6...2.43.84.53.12.62.45.05.53.33.9
2000-04-014.66.34.04.35.12.72.53.73.73.7...2.43.84.43.12.72.45.05.43.43.8
2000-05-014.56.34.04.25.12.72.43.73.73.7...2.43.94.33.22.72.35.15.43.53.8
\n", + "

5 rows × 50 columns

\n", + "
" + ], + "text/plain": [ + "state Alabama Alaska Arizona Arkansas California Colorado \\\n", + "Date \n", + "2000-01-01 4.7 6.3 4.1 4.4 5.0 2.8 \n", + "2000-02-01 4.7 6.3 4.1 4.3 5.0 2.8 \n", + "2000-03-01 4.6 6.3 4.0 4.3 5.0 2.7 \n", + "2000-04-01 4.6 6.3 4.0 4.3 5.1 2.7 \n", + "2000-05-01 4.5 6.3 4.0 4.2 5.1 2.7 \n", + "\n", + "state Connecticut Delaware Florida Georgia ... South Dakota \\\n", + "Date ... \n", + "2000-01-01 2.8 3.5 3.7 3.7 ... 2.4 \n", + "2000-02-01 2.7 3.6 3.7 3.6 ... 2.4 \n", + "2000-03-01 2.6 3.6 3.7 3.6 ... 2.4 \n", + "2000-04-01 2.5 3.7 3.7 3.7 ... 2.4 \n", + "2000-05-01 2.4 3.7 3.7 3.7 ... 2.4 \n", + "\n", + "state Tennessee Texas Utah Vermont Virginia Washington \\\n", + "Date \n", + "2000-01-01 3.7 4.6 3.1 2.7 2.6 4.9 \n", + "2000-02-01 3.7 4.6 3.1 2.6 2.5 4.9 \n", + "2000-03-01 3.8 4.5 3.1 2.6 2.4 5.0 \n", + "2000-04-01 3.8 4.4 3.1 2.7 2.4 5.0 \n", + "2000-05-01 3.9 4.3 3.2 2.7 2.3 5.1 \n", + "\n", + "state West Virginia Wisconsin Wyoming \n", + "Date \n", + "2000-01-01 5.8 3.2 4.1 \n", + "2000-02-01 5.6 3.2 3.9 \n", + "2000-03-01 5.5 3.3 3.9 \n", + "2000-04-01 5.4 3.4 3.8 \n", + "2000-05-01 5.4 3.5 3.8 \n", + "\n", + "[5 rows x 50 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Don't worry about the details here quite yet\n", + "unemp_all = (\n", + " unemp_raw\n", + " .reset_index()\n", + " .pivot_table(index=\"Date\", columns=\"state\", values=\"UnemploymentRate\")\n", + ")\n", + "unemp_all.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can filter it to look at a subset of the columns (i.e. “state” in this case)." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "states = [\n", + " \"Arizona\", \"California\", \"Florida\", \"Illinois\",\n", + " \"Michigan\", \"New York\", \"Texas\"\n", + "]\n", + "unemp = unemp_all[states]\n", + "unemp.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When plotting, a DataFrame knows the column and index names." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "unemp.plot(figsize=(8, 6))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dates in pandas\n", + "\n", + "You might have noticed that our index now has a nice format for the\n", + "dates (`YYYY-MM-DD`) rather than just a year.\n", + "\n", + "This is because the `dtype` of the index is a variant of `datetime`." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',\n", + " '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',\n", + " '2000-09-01', '2000-10-01',\n", + " ...\n", + " '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',\n", + " '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',\n", + " '2017-11-01', '2017-12-01'],\n", + " dtype='datetime64[ns]', name='Date', length=216, freq=None)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can index into a DataFrame with a `DatetimeIndex` using string\n", + "representations of dates.\n", + "\n", + "For example" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona 4.1\n", + "California 5.0\n", + "Florida 3.7\n", + "Illinois 4.2\n", + "Michigan 3.3\n", + "New York 4.7\n", + "Texas 4.6\n", + "Name: 2000-01-01 00:00:00, dtype: float64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Data corresponding to a single date\n", + "unemp.loc[\"01/01/2000\", :]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
2000-06-014.05.13.84.33.74.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3\n", + "2000-06-01 4.0 5.1 3.8 4.3 3.7 4.6 4.3" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Data for all days between New Years Day and June first in the year 2000\n", + "unemp.loc[\"01/01/2000\":\"06/01/2000\", :]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will learn more about what pandas can do with dates and times in an\n", + "upcoming lecture on time series data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrame Aggregations\n", + "\n", + "Let’s talk about *aggregations*.\n", + "\n", + "Loosely speaking, an aggregation is an operation that combines multiple\n", + "values into a single value.\n", + "\n", + "For example, computing the mean of three numbers (for example\n", + "`[0, 1, 2]`) returns a single number (1).\n", + "\n", + "We will use aggregations extensively as we analyze and manipulate our data.\n", + "\n", + "Thankfully, pandas makes this easy!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Built-in Aggregations\n", + "\n", + "pandas already has some of the most frequently used aggregations.\n", + "\n", + "For example:\n", + "\n", + "- Mean (`mean`) \n", + "- Variance (`var`) \n", + "- Standard deviation (`std`) \n", + "- Minimum (`min`) \n", + "- Median (`median`) \n", + "- Maximum (`max`) \n", + "- etc… \n", + "\n", + "\n", + ">**Note**\n", + ">\n", + ">When looking for common operations, using “tab completion” goes a long way." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona 6.301389\n", + "California 7.299074\n", + "Florida 6.048611\n", + "Illinois 6.822685\n", + "Michigan 7.492593\n", + "New York 6.102315\n", + "Texas 5.695370\n", + "dtype: float64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As seen above, the aggregation’s default is to aggregate each column.\n", + "\n", + "However, by using the `axis` keyword argument, you can do aggregations by\n", + "row as well." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 0.352381\n", + "2000-02-01 0.384762\n", + "2000-03-01 0.364762\n", + "2000-04-01 0.353333\n", + "2000-05-01 0.294762\n", + "dtype: float64" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.var(axis=1).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Writing Your Own Aggregation\n", + "\n", + "The built-in aggregations will get us pretty far in our analysis, but\n", + "sometimes we need more flexibility.\n", + "\n", + "We can have pandas perform custom aggregations by following these two\n", + "steps:\n", + "\n", + "1. Write a Python function that takes a `Series` as an input and\n", + " outputs a single value. \n", + "1. Call the `agg` method with our new function as an argument. \n", + "\n", + "\n", + "For example, below, we will classify states as “low unemployment” or\n", + "“high unemployment” based on whether their mean unemployment level is\n", + "above or below 6.5." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "#\n", + "# Step 1: We write the (aggregation) function that we'd like to use\n", + "#\n", + "def high_or_low(s):\n", + " \"\"\"\n", + " This function takes a pandas Series object and returns high\n", + " if the mean is above 6.5 and low if the mean is below 6.5\n", + " \"\"\"\n", + " if s.mean() < 6.5:\n", + " out = \"Low\"\n", + " else:\n", + " out = \"High\"\n", + "\n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona Low\n", + "California High\n", + "Florida Low\n", + "Illinois High\n", + "Michigan High\n", + "New York Low\n", + "Texas Low\n", + "dtype: object" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#\n", + "# Step 2: Apply it via the agg method.\n", + "#\n", + "unemp.agg(high_or_low)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 Low\n", + "2000-02-01 Low\n", + "2000-03-01 Low\n", + "2000-04-01 Low\n", + "2000-05-01 Low\n", + "dtype: object" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# How does this differ from unemp.agg(high_or_low)?\n", + "unemp.agg(high_or_low, axis=1).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that `agg` can also accept multiple functions at once." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
min3.64.53.14.23.24.23.9
max10.912.311.311.314.68.98.3
high_or_lowLowHighLowHighHighLowLow
\n", + "
" + ], + "text/plain": [ + " Arizona California Florida Illinois Michigan New York Texas\n", + "min 3.6 4.5 3.1 4.2 3.2 4.2 3.9\n", + "max 10.9 12.3 11.3 11.3 14.6 8.9 8.3\n", + "high_or_low Low High Low High High Low Low" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.agg([min, max, high_or_low])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transforms\n", + "\n", + "Many analytical operations do not necessarily involve an aggregation.\n", + "\n", + "The output of a function applied to a Series might need to be a new\n", + "Series.\n", + "\n", + "Some examples:\n", + "\n", + "- Compute the percentage change in unemployment from month to month. \n", + "- Calculate the cumulative sum of elements in each column. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Built-in Transforms\n", + "\n", + "pandas comes with many transform functions including:\n", + "\n", + "- Cumulative sum/max/min/product (`cum(sum|min|max|prod)`) \n", + "- Difference (`diff`) \n", + "- Elementwise addition/subtraction/multiplication/division (`+`, `-`, `*`, `/`) \n", + "- Percent change (`pct_change`) \n", + "- Number of occurrences of each distinct value (`value_counts`) \n", + "- Absolute value (`abs`) \n", + "\n", + "\n", + "Again, tab completion is helpful when trying to find these functions." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-01NaNNaNNaNNaNNaNNaNNaN
2000-02-010.000000.000.00.00000-0.0303030.0000000.000000
2000-03-01-0.024390.000.00.023810.000000-0.021277-0.021739
2000-04-010.000000.020.00.000000.0312500.000000-0.022222
2000-05-010.000000.000.00.000000.0606060.000000-0.022727
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York \\\n", + "Date \n", + "2000-01-01 NaN NaN NaN NaN NaN NaN \n", + "2000-02-01 0.00000 0.00 0.0 0.00000 -0.030303 0.000000 \n", + "2000-03-01 -0.02439 0.00 0.0 0.02381 0.000000 -0.021277 \n", + "2000-04-01 0.00000 0.02 0.0 0.00000 0.031250 0.000000 \n", + "2000-05-01 0.00000 0.00 0.0 0.00000 0.060606 0.000000 \n", + "\n", + "state Texas \n", + "Date \n", + "2000-01-01 NaN \n", + "2000-02-01 0.000000 \n", + "2000-03-01 -0.021739 \n", + "2000-04-01 -0.022222 \n", + "2000-05-01 -0.022727 " + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.pct_change().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-01NaNNaNNaNNaNNaNNaNNaN
2000-02-010.00.00.00.0-0.10.00.0
2000-03-01-0.10.00.00.10.0-0.1-0.1
2000-04-010.00.10.00.00.10.0-0.1
2000-05-010.00.00.00.00.20.0-0.1
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 NaN NaN NaN NaN NaN NaN NaN\n", + "2000-02-01 0.0 0.0 0.0 0.0 -0.1 0.0 0.0\n", + "2000-03-01 -0.1 0.0 0.0 0.1 0.0 -0.1 -0.1\n", + "2000-04-01 0.0 0.1 0.0 0.0 0.1 0.0 -0.1\n", + "2000-05-01 0.0 0.0 0.0 0.0 0.2 0.0 -0.1" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp.diff().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transforms can be split into several main categories:\n", + "\n", + "1. *Series transforms*: functions that take in one Series and produce another Series. The index of the input and output does not need to be the same. \n", + "1. *Scalar transforms*: functions that take a single value and produce a single value. An example is the `abs` method, or adding a constant to each value of a Series. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom Series Transforms\n", + "\n", + "pandas also simplifies applying custom Series transforms to a Series or the\n", + "columns of a DataFrame. The steps are:\n", + "\n", + "1. Write a Python function that takes a Series and outputs a new Series. \n", + "1. Pass our new function as an argument to the `apply` method (alternatively, the `transform` method). \n", + "\n", + "\n", + "As an example, we will standardize our unemployment data to have mean 0\n", + "and standard deviation 1.\n", + "\n", + "After doing this, we can use an aggregation to determine at which date the\n", + "unemployment rate is most different from “normal times” for each state."
+ ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "#\n", + "# Step 1: We write the Series transform function that we'd like to use\n", + "#\n", + "def standardize_data(x):\n", + " \"\"\"\n", + " Changes the data in a Series to become mean 0 with standard deviation 1\n", + " \"\"\"\n", + " mu = x.mean()\n", + " std = x.std()\n", + "\n", + " return (x - mu)/std" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-01-1.076861-0.935545-0.976846-1.337203-1.605740-0.925962-0.849345
2000-02-01-1.076861-0.935545-0.976846-1.337203-1.644039-0.925962-0.849345
2000-03-01-1.125778-0.935545-0.976846-1.286217-1.644039-0.991993-0.926885
2000-04-01-1.125778-0.894853-0.976846-1.286217-1.605740-0.991993-1.004424
2000-05-01-1.125778-0.894853-0.976846-1.286217-1.529141-0.991993-1.081964
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York \\\n", + "Date \n", + "2000-01-01 -1.076861 -0.935545 -0.976846 -1.337203 -1.605740 -0.925962 \n", + "2000-02-01 -1.076861 -0.935545 -0.976846 -1.337203 -1.644039 -0.925962 \n", + "2000-03-01 -1.125778 -0.935545 -0.976846 -1.286217 -1.644039 -0.991993 \n", + "2000-04-01 -1.125778 -0.894853 -0.976846 -1.286217 -1.605740 -0.991993 \n", + "2000-05-01 -1.125778 -0.894853 -0.976846 -1.286217 -1.529141 -0.991993 \n", + "\n", + "state Texas \n", + "Date \n", + "2000-01-01 -0.849345 \n", + "2000-02-01 -0.849345 \n", + "2000-03-01 -0.926885 \n", + "2000-04-01 -1.004424 \n", + "2000-05-01 -1.081964 " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#\n", + "# Step 2: Apply our function via the apply method.\n", + "#\n", + "std_unemp = unemp.apply(standardize_data)\n", + "std_unemp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-011.0768610.9355450.9768461.3372031.6057400.9259620.849345
2000-02-011.0768610.9355450.9768461.3372031.6440390.9259620.849345
2000-03-011.1257780.9355450.9768461.2862171.6440390.9919930.926885
2000-04-011.1257780.8948530.9768461.2862171.6057400.9919931.004424
2000-05-011.1257780.8948530.9768461.2862171.5291410.9919931.081964
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York \\\n", + "Date \n", + "2000-01-01 1.076861 0.935545 0.976846 1.337203 1.605740 0.925962 \n", + "2000-02-01 1.076861 0.935545 0.976846 1.337203 1.644039 0.925962 \n", + "2000-03-01 1.125778 0.935545 0.976846 1.286217 1.644039 0.991993 \n", + "2000-04-01 1.125778 0.894853 0.976846 1.286217 1.605740 0.991993 \n", + "2000-05-01 1.125778 0.894853 0.976846 1.286217 1.529141 0.991993 \n", + "\n", + "state Texas \n", + "Date \n", + "2000-01-01 0.849345 \n", + "2000-02-01 0.849345 \n", + "2000-03-01 0.926885 \n", + "2000-04-01 1.004424 \n", + "2000-05-01 1.081964 " + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Takes the absolute value of all elements of a function\n", + "abs_std_unemp = std_unemp.abs()\n", + "\n", + "abs_std_unemp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona 2009-11-01\n", + "California 2010-03-01\n", + "Florida 2010-01-01\n", + "Illinois 2009-12-01\n", + "Michigan 2009-06-01\n", + "New York 2009-11-01\n", + "Texas 2009-08-01\n", + "dtype: datetime64[ns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find the date when unemployment was \"most different from normal\" for each State\n", + "def idxmax(x):\n", + " # idxmax of Series will return index of maximal value\n", + " return x.idxmax()\n", + "\n", + "abs_std_unemp.agg(idxmax)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom Scalar Transforms\n", + "\n", + "As you may have predicted, we can also apply custom scalar transforms to our\n", + "pandas data.\n", + "\n", + "To do this, we use the following pattern:\n", + "\n", + "1. Define a Python function that takes in a scalar and produces a scalar. \n", + "1. 
Pass this function as an argument to the `applymap` DataFrame method (or the `map` method for a Series). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Boolean Selection\n", + "\n", + "We have seen how we can select subsets of data by referring to the index\n", + "or column names.\n", + "\n", + "However, we often want to select based on conditions met by\n", + "the data itself.\n", + "\n", + "Some examples are:\n", + "\n", + "- Restrict analysis to all individuals older than 18. \n", + "- Look at data that corresponds to particular time periods. \n", + "- Analyze only data that corresponds to a recession. \n", + "- Obtain data for a specific product or customer ID. \n", + "\n", + "\n", + "We will be able to do this by using a Series or list of boolean values\n", + "to index into a Series or DataFrame.\n", + "\n", + "Let’s look at some examples." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_small = unemp.head() # Create smaller data so we can see what's happening\n", + "unemp_small" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# list of booleans selects rows\n", + "unemp_small.loc[[True, True, True, False, False]]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-03-014.05.03.74.33.24.64.5
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# second argument selects columns, the ``:`` means \"all\".\n", + "# here we use it to select all columns\n", + "unemp_small.loc[[True, False, True, False, True], :]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaNew YorkTexas
Date
2000-01-014.14.74.6
2000-02-014.14.74.6
2000-03-014.04.64.5
\n", + "
" + ], + "text/plain": [ + "state Arizona New York Texas\n", + "Date \n", + "2000-01-01 4.1 4.7 4.6\n", + "2000-02-01 4.1 4.7 4.6\n", + "2000-03-01 4.0 4.6 4.5" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# can use booleans to select both rows and columns\n", + "unemp_small.loc[[True, True, True, False, False], [True, False, False, False, False, True, True]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Boolean DataFrames/Series\n", + "\n", + "We can use [conditional statements](https://datascience.quantecon.org/python_fundamentals/control_flow.html) to\n", + "construct Series of booleans from our data." + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 False\n", + "2000-02-01 False\n", + "2000-03-01 False\n", + "2000-04-01 True\n", + "2000-05-01 True\n", + "Name: Texas, dtype: bool" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_small[\"Texas\"] < 4.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have our Series of bools, we can use it to extract subsets of\n", + "rows from our DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_small.loc[unemp_small[\"Texas\"] < 4.5]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 True\n", + "2000-02-01 True\n", + "2000-03-01 True\n", + "2000-04-01 True\n", + "2000-05-01 True\n", + "dtype: bool" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_small[\"New York\"] > unemp_small[\"Texas\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "big_NY = unemp_small[\"New York\"] > unemp_small[\"Texas\"]\n", + "unemp_small.loc[big_NY]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Multiple Conditions\n", + "\n", + "In the boolean section of the [basics lecture](https://datascience.quantecon.org/../python_fundamentals/basics.html), we saw\n", + "that we can use the words `and` and `or` to combine multiple booleans into\n", + "a single bool.\n", + "\n", + "Recall:\n", + "\n", + "- `True and False -> False` \n", + "- `True and True -> True` \n", + "- `False and False -> False` \n", + "- `True or False -> True` \n", + "- `True or True -> True` \n", + "- `False or False -> False` \n", + "\n", + "\n", + "We can do something similar in pandas, but instead of\n", + "`bool1 and bool2` we write:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "(bool_series1) & (bool_series2)\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Likewise, instead of `bool1 or bool2` we write:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "(bool_series1) | (bool_series2)\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 False\n", + "2000-02-01 False\n", + "2000-03-01 True\n", + "2000-04-01 True\n", + "2000-05-01 True\n", + "dtype: bool" + ] + }, 
+ "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "small_NYTX = (unemp_small[\"Texas\"] < 4.7) & (unemp_small[\"New York\"] < 4.7)\n", + "small_NYTX" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_small[small_NYTX]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `isin`\n", + "\n", + "Sometimes, we will want to check whether a data point takes on one of\n", + "several fixed values.\n", + "\n", + "We could do this by writing `(df[\"x\"] == val_1) | (df[\"x\"] == val_2)`\n", + "(like we did above), but there is a better way: the `.isin` method" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 True\n", + "2000-02-01 True\n", + "2000-03-01 True\n", + "2000-04-01 True\n", + "2000-05-01 False\n", + "Name: Michigan, dtype: bool" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unemp_small[\"Michigan\"].isin([3.3, 3.2])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now select full rows where this Series is True\n", + "unemp_small.loc[unemp_small[\"Michigan\"].isin([3.3, 3.2])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `.any` and `.all`\n", + "\n", + "Recall from the boolean section of the [basics lecture](https://datascience.quantecon.org/../python_fundamentals/basics.html)\n", + "that the Python functions `any` and `all` are aggregation functions that\n", + "take a collection of booleans and return a single boolean.\n", + "\n", + "`any` returns True whenever at least one of the inputs is True while\n", + "`all` is True only when all the inputs are `True`.\n", + "\n", + "Series and DataFrames with `dtype` bool have `.any` and `.all`\n", + "methods that apply this logic to pandas objects.\n", + "\n", + "Let’s use these methods to count how many months all the states in our\n", + "sample had high unemployment.\n", + "\n", + "As we work through this example, consider the [“want\n", + "operator”](http://albertjmenkveld.com/2014/07/07/endogeneous-price-dispersion/), a helpful\n", + "concept from Nobel Laureate [Tom\n", + "Sargent](http://www.tomsargent.com) for clearly stating the goal of our analysis and\n", + "determining the steps necessary to reach the goal.\n", + "\n", + "We always begin by writing `Want:` followed by what we want to\n", + "accomplish.\n", + "\n", + "In this case, we would write:\n", + "\n", + "> Want: Count the number of months in which all states in our sample\n", + "had unemployment above 6.5%\n", + "\n", + "\n", + "After identifying the **want**, we work *backwards* 
to identify the\n", + "steps necessary to accomplish our goal.\n", + "\n", + "So, starting from the result, we have:\n", + "\n", + "1. Sum the number of `True` values in a Series indicating dates for\n", + " which all states had high unemployment. \n", + "1. Build the Series used in the last step by using the `.all` method\n", + " on a DataFrame containing booleans indicating whether each state had\n", + " high unemployment at each date. \n", + "1. Build the DataFrame used in the previous step using a `>`\n", + " comparison. \n", + "\n", + "\n", + "Now that we have a clear plan, let’s follow through and *apply* the want\n", + "operator:" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-01FalseFalseFalseFalseFalseFalseFalse
2000-02-01FalseFalseFalseFalseFalseFalseFalse
2000-03-01FalseFalseFalseFalseFalseFalseFalse
2000-04-01FalseFalseFalseFalseFalseFalseFalse
2000-05-01FalseFalseFalseFalseFalseFalseFalse
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 False False False False False False False\n", + "2000-02-01 False False False False False False False\n", + "2000-03-01 False False False False False False False\n", + "2000-04-01 False False False False False False False\n", + "2000-05-01 False False False False False False False" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 3: construct the DataFrame of bools\n", + "high = unemp > 6.5\n", + "high.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 False\n", + "2000-02-01 False\n", + "2000-03-01 False\n", + "2000-04-01 False\n", + "2000-05-01 False\n", + "dtype: bool" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 2: use the .all method on axis=1 to get the dates where all states have a True\n", + "all_high = high.all(axis=1)\n", + "all_high.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Out of 216 months, 41 had high unemployment across all states\n" + ] + } + ], + "source": [ + "# Step 1: Call .sum to add up the number of True values in `all_high`\n", + "# (note that True == 1 and False == 0 in Python, so .sum will count Trues)\n", + "msg = \"Out of {} months, {} had high unemployment across all states\"\n", + "print(msg.format(len(all_high), all_high.sum()))" + ] + } + ], + "metadata": { + "date": 1584040758.8912327, + "filename": "basics.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 
+ }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "Basic Functionality" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/2_basics_exercises.ipynb b/Session_7/2_basics_exercises.ipynb new file mode 100644 index 0000000..62eae16 --- /dev/null +++ b/Session_7/2_basics_exercises.ipynb @@ -0,0 +1,1452 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: matplotlib in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2.8.0)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.12.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from 
quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n" + ] + } + ], + "source": [ + "! 
pip install qeds\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Functionality - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**For these exercises we use unemployment data by state at a monthly frequency.**\n", + "\n", + "- **We download the data directly from a url and read it into a pandas DataFrame.**\n", + "\n", + "- **Since we want to look at the unemployment rate across different states over time, we proceed to a transformation of the data similar to an Excel pivot-table.**" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "#Download the data directly from a url and read it into a pandas DataFrame.\n", + "url = \"https://datascience.quantecon.org/assets/data/state_unemployment.csv\"\n", + "unemp_raw = pd.read_csv(url, parse_dates=[\"Date\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateAlabamaAlaskaArizonaArkansasCaliforniaColoradoConnecticutDelawareFloridaGeorgia...South DakotaTennesseeTexasUtahVermontVirginiaWashingtonWest VirginiaWisconsinWyoming
Date
2000-01-014.76.34.14.45.02.82.83.53.73.7...2.43.74.63.12.72.64.95.83.24.1
2000-02-014.76.34.14.35.02.82.73.63.73.6...2.43.74.63.12.62.54.95.63.23.9
2000-03-014.66.34.04.35.02.72.63.63.73.6...2.43.84.53.12.62.45.05.53.33.9
2000-04-014.66.34.04.35.12.72.53.73.73.7...2.43.84.43.12.72.45.05.43.43.8
2000-05-014.56.34.04.25.12.72.43.73.73.7...2.43.94.33.22.72.35.15.43.53.8
\n", + "

5 rows × 50 columns

\n", + "
" + ], + "text/plain": [ + "state Alabama Alaska Arizona Arkansas California Colorado \\\n", + "Date \n", + "2000-01-01 4.7 6.3 4.1 4.4 5.0 2.8 \n", + "2000-02-01 4.7 6.3 4.1 4.3 5.0 2.8 \n", + "2000-03-01 4.6 6.3 4.0 4.3 5.0 2.7 \n", + "2000-04-01 4.6 6.3 4.0 4.3 5.1 2.7 \n", + "2000-05-01 4.5 6.3 4.0 4.2 5.1 2.7 \n", + "\n", + "state Connecticut Delaware Florida Georgia ... South Dakota \\\n", + "Date ... \n", + "2000-01-01 2.8 3.5 3.7 3.7 ... 2.4 \n", + "2000-02-01 2.7 3.6 3.7 3.6 ... 2.4 \n", + "2000-03-01 2.6 3.6 3.7 3.6 ... 2.4 \n", + "2000-04-01 2.5 3.7 3.7 3.7 ... 2.4 \n", + "2000-05-01 2.4 3.7 3.7 3.7 ... 2.4 \n", + "\n", + "state Tennessee Texas Utah Vermont Virginia Washington \\\n", + "Date \n", + "2000-01-01 3.7 4.6 3.1 2.7 2.6 4.9 \n", + "2000-02-01 3.7 4.6 3.1 2.6 2.5 4.9 \n", + "2000-03-01 3.8 4.5 3.1 2.6 2.4 5.0 \n", + "2000-04-01 3.8 4.4 3.1 2.7 2.4 5.0 \n", + "2000-05-01 3.9 4.3 3.2 2.7 2.3 5.1 \n", + "\n", + "state West Virginia Wisconsin Wyoming \n", + "Date \n", + "2000-01-01 5.8 3.2 4.1 \n", + "2000-02-01 5.6 3.2 3.9 \n", + "2000-03-01 5.5 3.3 3.9 \n", + "2000-04-01 5.4 3.4 3.8 \n", + "2000-05-01 5.4 3.5 3.8 \n", + "\n", + "[5 rows x 50 columns]" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Transformation of the data similar to an Excel pivot-table\n", + "unemp_all = (\n", + " unemp_raw\n", + " .reset_index()\n", + " .pivot_table(index=\"Date\", columns=\"state\", values=\"UnemploymentRate\")\n", + ")\n", + "unemp_all.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-014.15.03.74.23.34.74.6
2000-02-014.15.03.74.23.24.74.6
2000-03-014.05.03.74.33.24.64.5
2000-04-014.05.13.74.33.34.64.4
2000-05-014.05.13.74.33.54.64.3
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 4.1 5.0 3.7 4.2 3.3 4.7 4.6\n", + "2000-02-01 4.1 5.0 3.7 4.2 3.2 4.7 4.6\n", + "2000-03-01 4.0 5.0 3.7 4.3 3.2 4.6 4.5\n", + "2000-04-01 4.0 5.1 3.7 4.3 3.3 4.6 4.4\n", + "2000-05-01 4.0 5.1 3.7 4.3 3.5 4.6 4.3" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#subset of the columns \n", + "states = [\n", + " \"Arizona\", \"California\", \"Florida\", \"Illinois\",\n", + " \"Michigan\", \"New York\", \"Texas\"\n", + "]\n", + "unemp = unemp_all[states]\n", + "unemp.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Looking at the unemp DataFrame above, can you identify the index? The columns? You can use the cell below to verify your visual intuition.**" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',\n", + " '2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01',\n", + " '2000-09-01', '2000-10-01',\n", + " ...\n", + " '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',\n", + " '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',\n", + " '2017-11-01', '2017-12-01'],\n", + " dtype='datetime64[ns]', name='Date', length=216, freq=None)" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#index\n", + "unemp.index" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Arizona', 'California', 'Florida', 'Illinois', 'Michigan', 'New York',\n", + " 'Texas'],\n", + " dtype='object', name='state')" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "#columns\n", + "unemp.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2\n", + "\n", + "**Do the following exercises in separate code cells below:**\n", + "\n", + "1. **At each date, what is the minimum unemployment rate across all states in our sample?**" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona 3.6\n", + "California 4.5\n", + "Florida 3.1\n", + "Illinois 4.2\n", + "Michigan 3.2\n", + "New York 4.2\n", + "Texas 3.9\n", + "dtype: float64" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# min unemployment rate by state\n", + "unemp.min()\n", + "\n", + "#alternatively, we can use unemp.agg(min)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Date\n", + "2000-01-01 3.3\n", + "2000-02-01 3.2\n", + "2000-03-01 3.2\n", + "2000-04-01 3.3\n", + "2000-05-01 3.5\n", + " ... \n", + "2017-08-01 4.0\n", + "2017-09-01 3.9\n", + "2017-10-01 3.9\n", + "2017-11-01 3.9\n", + "2017-12-01 3.9\n", + "Length: 216, dtype: float64" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The argument axis=1 allows to search within rows. This gives the min overall for a given date\n", + "unemp.min(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
**What was the median unemployment rate in each state?**" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona 5.80\n", + "California 6.50\n", + "Florida 5.35\n", + "Illinois 6.15\n", + "Michigan 7.00\n", + "New York 5.70\n", + "Texas 5.40\n", + "dtype: float64" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# median unemployment rate by state\n", + "unemp.median()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **What was the maximum unemployment rate across the states in our sample? What state did it happen in? In what month/year was this achieved?**\n", + " Hint 1: What Python type (not dtype) is returned by the aggregation?\n", + " Hint 2: Read documentation for the method idxmax" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14.6" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# max unemployment rate across all states and Year\n", + "unemp.max().max()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Michigan'" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type(unemp)\n", + "unemp.max().idxmax()\n", + "\n", + "#if we use unemp.max() we get the max value for each state\n", + "#but we want to know more: \n", + "    #we want the state that had the max of all max unemployment rates and the month/year in which it has been achieved" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2009-06-01 00:00:00')" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "unemp.max(axis=1).idxmax()  \n", + "\n", + "#we can check whether Michigan indeed registered the highest value at this date by inspecting the row unemp.loc[\"06/01/2009\",:]\n", + "#(for example, unemp.loc[\"06/01/2009\",:].idxmax() should return 'Michigan')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. **Classify each state as high or low volatility based on whether the variance of their unemployment is above or below 4.**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Arizona High\n", + "California High\n", + "Florida High\n", + "Illinois Low\n", + "Michigan High\n", + "New York Low\n", + "Texas Low\n", + "dtype: object" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# high or low volatility\n", + "\n", + "# 1. we create our aggregation function\n", + "\n", + "def high_or_low_vol(v):\n", + "    if v.var() > 4:\n", + "        out = \"High\"\n", + "    else:\n", + "        out = \"Low\"\n", + "    \n", + "    return out\n", + "\n", + "# 2. we apply the function via the agg method\n", + "\n", + "unemp.agg(high_or_low_vol)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Imagine that we want to determine whether unemployment was high (> 6.5), medium (4.5 < x <= 6.5), or low (<= 4.5) for each state and each month.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
**Write a Python function that takes a single number as an input and outputs a single string noting if that number is high, medium, or low.** " + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "# Part 1: Write a Python function to classify unemployment levels.\n", + "\n", + "def unemployment_levels(ul):\n", + " if ul > 6.5:\n", + " return \"high\"\n", + " elif 4.5 < ul <= 6.5:\n", + " #elif ul > 4.5: #equivalent to the line above, because ul <= 6.5 already holds in this branch\n", + " return \"medium\"\n", + " else:\n", + " return \"low\"\n", + " \n", + " return ul" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Pass your function to `applymap` (quiz: why `applymap` and not `agg` or `apply`?) and save the result in a new DataFrame called `unemp_bins`.** " + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
Date
2000-01-01lowmediumlowlowlowmediummedium
2000-02-01lowmediumlowlowlowmediummedium
2000-03-01lowmediumlowlowlowmediumlow
2000-04-01lowmediumlowlowlowmediumlow
2000-05-01lowmediumlowlowlowmediumlow
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "Date \n", + "2000-01-01 low medium low low low medium medium\n", + "2000-02-01 low medium low low low medium medium\n", + "2000-03-01 low medium low low low medium low\n", + "2000-04-01 low medium low low low medium low\n", + "2000-05-01 low medium low low low medium low" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Part 2: Pass your function from part 1 to applymap\n", + "#Apply map allows to apply a function that takes a scalar and returns a scalar to a dataframe\n", + "\n", + "unemp_bins = unemp.applymap(unemployment_levels) #replace this comment with your code!!\n", + "unemp_bins.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " 3. **(Challenging) Use another transform on `unemp_bins` to count how many times each state had each of the three classifications.**\n", + " - Hint 1: Will this value counting function be a Series or scalar transform? \n", + " - Hint 2: Try googling \"pandas count unique value\" or something similar to find the right transform. \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
high7510668911426551
low4446919172258
medium971067910657129107
\n", + "
" + ], + "text/plain": [ + "state Arizona California Florida Illinois Michigan New York Texas\n", + "high 75 106 68 91 142 65 51\n", + "low 44 4 69 19 17 22 58\n", + "medium 97 106 79 106 57 129 107" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Part 3: Count the number of times each state had each classification.\n", + "\n", + "# we count the number of times each state had each classification. \n", + "def count_bins(cb):\n", + " var = cb.value_counts() \n", + " return var\n", + "\n", + "unemp_count = unemp_bins.apply(count_bins)\n", + "unemp_count" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ArizonaCaliforniaFloridaIllinoisMichiganNew YorkTexas
low44.04.069.019.017.022.058.0
medium97.0106.079.0106.057.0129.0107.0
high75.0106.068.091.0142.065.051.0
\n", + "
" + ], + "text/plain": [ + " Arizona California Florida Illinois Michigan New York Texas\n", + "low 44.0 4.0 69.0 19.0 17.0 22.0 58.0\n", + "medium 97.0 106.0 79.0 106.0 57.0 129.0 107.0\n", + "high 75.0 106.0 68.0 91.0 142.0 65.0 51.0" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# alternative solution: value_counts is a series function (columns) so I loop over columns to have the frequencies for each states, then append each series\n", + "unemp_freq_cat = pd.DataFrame()\n", + "for iter in unemp_bins.columns:\n", + " series=unemp_bins[iter].value_counts()\n", + " unemp_freq_cat=unemp_freq_cat.append(series)\n", + "unemp_freq_cat=unemp_freq_cat[['low', 'medium', 'high']]\n", + "# We can transpose the table to make it more readable, matter of preferences\n", + "unemp_freq_catT=unemp_freq_cat.T\n", + "unemp_freq_catT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. **Construct a horizontal bar chart of the number of occurrences of each level with one bar per state and classification (21 total bars).** " + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjMAAAHcCAYAAAAjqCsIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdeViU9f7/8dewgwICpojiSoRbaqFptmh2lFTSMjXLXdPKsvS0SHVyq1xKraPYcancsjpquYRbrpnmKa0UFXdLVBRTVlkUmN8f/pxvBCIozHDPPB/X1XWY+/7Mfb/fYx1e3vfn/owpMzPTLAAAAINysnUBAAAAt4IwAwAADI0wAwAADI0wAwAADI0wY0CJiYlKTEy0dRkAAJQLLrYuADcvKyvL1iVYTVpamry9vW1dhtU4Ur+O1KtEv/aOfsuOh4fHdfdxZQYAABgaYQYAABgaYQYAABgaYQYAABgaYQYAABgaYQYAABgaj2YDAOxaRkaGUlJSlJubW+bnysvLU0pKSpmfp7worX6dnZ3l6+srLy+vm3o/YQYAYLcyMjKUnJysgIAAubm5yWQylen5cnNz5ezsXKbnKE9Ko1+z2azLly/rwoULknRTgYbbTAAAu5WSkqKAgAC5u7uXeZDBzTGZTHJ3d1dAQMBNX+UhzAAA7FZubq7c3NxsXQaKwc3N7aZvBRJmAAB2jSsyxnArf06EGQAAYGiEGQAAYGg8zQQAcDh3DFtr1fMdio4o0fjRo0crODhYgwcPvqn9henevbtGjhypVq1alagWIyDMAABgMK+88opcXPgVfg2fhIH5xrW1dQlWkx74mXzjHrV1GVbjSP06Uq8S/ZaGlPqbS/V4RuTt7W3rEsoVwgwAAOWQ2WzW4sWLtWHDBplMJj344IPq3bu3nJycCtxmSk5O1n/+8x/t3btXvr6+6tGjh1atWqWWLVuqR48elmOmp6drypQp+vXXX+Xr66uePXvqgQcesFWLpYYJwAAAlEPbtm2Tk5OT3nnnHQ0aNEgxMTHasWNHoWNnzJihP//8U6NHj9Zrr72m77//XufPny8wbunSpWrevLnef/993XvvvZo5c2ah44yGMAMAQDlUo0YNPfnkkwoKCtK9996rhg0bKjY2tsC406dPa8+ePRoyZIjuuOMO1alTR8OGDVN2dnaBsQ888IAeeOABVatWTU8++aScnZ0VFxdnjXbKFGEGAIByqFatWvle+/v7F7rc/5kzZ2QymVSvXj3LtsqVK8vf37/IYzo7O8vHx8cuvhiTMAMAQDlU2Bc4ms3mYm0r7jFNJlOJ3l9eEWYAADCw6tWry2w26/jx45ZtFy5c0MWLF21YlXURZgAAMLDq1aurSZMmmj17tg4fPqwTJ04oOjraob4pnEezAQAOp6Qr8hZXbm5uobeHytqwYcM0a9YsjRkzRj4+PurZs6cSExPl6upq9VpsgTADAEA5M3bs2ALbXnjhhevu9/Pz06hRoyyvU1NTNWvWLAUGBlq2LVmypMAxZ86cWRrl2hxhxsB2j29k6xKsJmCiC/0aXMji6YVuN6elOdSKrvSLshAbG6usrCzVrFlTKSkp+uKLL+Tj46OmTZvaujSrIMwAAGBwubm5+uKLL5SYmCg3NzfdfvvtGjt2rDw8PGxdmlXYdZiJjIwscv9DDz2kESNGWKkaAADKRtOmTR3mKkxh7DrMLFiwwPLzTz/9pBkzZuTb5ubmZouyAABAKbLrMOPn52f5uWLFigW2XZOYmKhPP/1Uv/32m5ycnFS/fn0988wzCgwMVF5ent544w15enpq9OjRkqRLly5p+PDhuvfeezVo0CBduXJF0dHRio2NVXJysipXrqxHHnlEXbp0sTwWd+zYMc2dO1fHjh2TJFWrVk1DhgxRw4YNy/pjAADArtl1mCmOjIwMRUVFqVmzZpo4caKcnZ21ZMkSvf3224qOjparq6tGjhyp4cOHa9WqVYqMjNT
MmTNVoUIF9e3bV9LVe5VVq1ZVp06d5O3trUOHDik6OlqVKlVSmzZtJEmTJk1SgwYNNGzYMDk5OenEiROFPjK3du1arVu3rsiao6KiJEkBE18v3Q+jHHPyqUi/BpeWllbodrPZfN199oh+rSsvL0+5ublWPae1z2drpdlvXl7edf99KWr+j8OHmc2bN8vDwyPfI2/Dhw/XU089pV9++UX33HOPqlSpomHDhunDDz9UUlKSdu7cqWnTplnCiIeHh3r16mV5f2BgoA4fPqytW7eqTZs2MpvN+vPPP3X33XerRo0akqSgoKBC64mIiFBERNHrHyQmJkqSLoyadEu9G0nAxNfp1+Cu9zRTWlqavL29rVyN7dCvdaWkpFh13RdbrTNjK6Xdr5OT0039++LwYebYsWM6deqUunfvnm97dna2EhISLK/vv/9+7dy5U0uWLNGQIUNUs2bNfONXrVqljRs36vz587p8+bJycnIswcVkMqlLly6aMmWK1q9frzvvvFOtW7e+bqABAADF5/BhJi8vT3fccYdefvnlAvt8fHwsP2dlZenYsWNycnLKF3IkaePGjZo3b54GDx6s0NBQeXp6asWKFdqzZ49lTL9+/dSuXTvt2rVLv/76qxYvXqyXXnrJchsKAADcHIcPM/Xq1dPPP/+sSpUqycvL67rj5syZIycnJ40dO1ZjxoxReHi47rrrLknSgQMH1LBhQz3yyCOW8WfOnClwjBo1aqhGjRrq2rWrPvzwQ3333XeEGQCwgaNPvWjV813vNuv1zJgxQ2lpaZY5kiiaw3/RZLt27eTp6al3331X+/fv19mzZxUbG6vZs2db5qb8+OOP2rRpk/75z3+qadOmeuKJJ/Thhx8qJSVF0tUv+Tp8+LB+/fVXnT59WosWLdLhw4ct57h06ZJmzZql2NhYJSYmKi4uTgcPHlRwcLBNegYAwJ44/JUZLy8vTZo0SfPmzdN7772nzMxM+fv7q0mTJvLy8tLFixc1Y8YM9e7dW/Xq1ZMk9erVS7/99pumT5+ut956S507d9Yff/yhSZOuTti877771LlzZ23fvl2S5OzsrJSUFE2dOlXJycny8fHRPffco/79+99S7Z38Ot3S+41ktZMH/VpZWX0RHwCUNocJM61bt9aqVasK3RcQEKB//vOf133v559/nu+1s7OzPvjgA8trNze3QlcS7tOnj6SrTzu99tprN1M2AMDBXblyRYsWLdL27duVkZGh2rVrq0+fPqpfv76kq8t13HPPPeratask6aOPPtIPP/yg2bNny8/PT9nZ2erXr5/GjBmjsLAwW7ZSZhz+NhMAAOXZwoULtWPHDj333HOaPHmyatasqXfffVdJSUmSpIYNG2rfvn2W8QcOHJC3t7cOHDggSTp48KBcXFwUEhJik/qtgTADAEA5lZWVpfXr16t3796WtcqeeeYZVapUSWvXrpV0NcwcPHhQubm5SkhIUEZGhh5++GFLwNm/f79CQ0Pl4mK/N2MIMwAAlFPnzp1Tbm6u7rjjDss2Z2dnhYaG6tSpU5KksLAw5eTk6OjRo9q/f7/q16+vO++8U/v375f0f0/c2jPCDAAA5ZTZbJYky/f8/dW1bZ6enqpTp47279+v/fv3q2HDhgoNDdWff/6phIQEHT16lDADAABsIzAwUC4uLjp48KBlW25urg4fPmxZZV76v3kz167CuLm5KSQkRMuWLbP7+TKSAz3NBACA0Xh4eKh9+/b6/PPP5e3trSpVqujbb79VcnKyOnToYBnXsGFDrVq1Su7u7qpTp45l29dff62GDRva9XwZiTADAHBAJV2Rt7jK4osme/fuLUmaOXOmLl26pDp16ujNN9+Un5+fZUxYWJhMJpPq169vOX+jRo20dOlSu7/FJBFmAAAod1544QXLz66urhowYIAGDBhw3fGenp768ssv821r2LChlixZUmY1lieEGQNzpBVa09LS6BcAUCgmAAMAAEMjzAAAAEMjzAAAAEM
jzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAENj0TwAgMPxjWtr1fOl1N9s1fMV18qVK7V27VrNnDlTkvTf//5XO3fu1NSpU21cWckQZgAAgCQpMjJSjzzyiK3LKDHCDAAAkHT1O56MiDADAEA5M3r0aFWvXl3u7u7avHmznJyc1K1bN7Vv317z58/Xtm3b5OnpqV69eunBBx+UJF24cEELFizQnj17JEmhoaEaMGCAqlWrZjnuihUrtGrVKmVlZemee+5R1apV853377eZZsyYobS0NEVFRd1wTP369fXtt9/q8uXLat++vZ566iktXbpU69atk8lkUufOndW1a9cy+byYAAwAQDn0ww8/yNPTUxMmTNBjjz2mefPmafLkyapWrZomTpyoNm3a6D//+Y8uXryo7OxsjR07Vq6urho7dqzeffdd+fn5ady4ccrOzpYk7dixQ1988YV69uypyZMnKygoSN9++22p1BoXF6fExESNHTtWQ4YM0cqVK/Xee+/pypUrGj9+vHr06KHPP/9cx44dK5Xz/R1hBgCAcqhGjRrq0aOHqlWrps6dO8vb21vOzs7q1KmTqlWrpieeeEJms1mHDh3S9u3bZTabNWzYMNWqVUvVq1fXkCFDlJWVpd27d0uSYmJi1KZNG/3jH/9QUFCQunXrppCQkFKp1cvLS4MGDVL16tV13333qU6dOkpKStLTTz+toKAgtW/fXrfddpv2799fKuf7O24zAQBQDtWqVcvys8lkkq+vr2rWrGnZ5uLioooVKyo1NVXx8fFKTExUnz598h3j8uXLOnfunCTp9OnTateuXb79oaGhOnv27C3XWqNGDTk7O1te+/r6qkKFCvnG+Pr6KiUl5ZbPVRjCDAAA5dBfw4F0NdC4uBT8tZ2Xl6e8vDzVrl1bL7/8coH9FStWvOkanJycZDab823LyckpVq2Fbfv7sUoLt5kAADC4unXr6uzZs/Lx8VG1atXy/ePt7S1Jql69ug4fPpzvfX9//Xc+Pj5KTk7Ot+33338v1dpLA2EGAACDu//+++Xr66tJkyZp//79OnfunA4cOKD58+crISFBktSxY0dt3bpVGzZsUEJCgr755hsdPXq0yOM2atRIJ06c0KZNm5SQkKAVK1bo0KFD1mipRLjNBABwOGW1Im9ubm6B2yvW4O7urnHjxunzzz/X1KlTlZGRIT8/PzVq1Mgyd6V169ZKTEzUF198oezsbIWHh6tz587asmXLdY/btGlTde/e3fKe+++/Xx06dNCuXbus1FnxmDIzM8vmBhbKTGJioqSrl/8cRVpamuVSqSNwpH4dqVeJfq3t1KlTqlGjhtXOZ6swYyul3W9Rf14eHh7XfR+3mQAAgKERZgAAgKERZgAAgKERZgAAgKERZgAAdq2sFmpD6bqVPyfCDADAbjk7O+vy5cu2LgPFcPny5Zt+MoowAwCwW76+vrpw4YKys7O5QlNOmc1mZWdn68KFC/L19b2pY7BoHgDAbnl5eUmSLl68qNzc3DI/X15enpycHOc6QWn16+zsrEqVKln+vEqKMAMAsGteXl43/UuypGy9SKC1lZd+HSc+AgAAu8SVGQPzjWtr6xKsJj3wM/nGPWrrMqzGkfp1pF4l+rV31uy3rL5fyoi4MgMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNFYANbPf4RrYuwWoCJroU2m/I4uk2qKbsmdPSHGZ1T0fqVaJfe+do/ZYXXJmxgv79+2vFihW2LgMAALtk0zAzbdo0RUZG6quvvsq3PTY2VpGRkUpJSbFKHQsXLlS/fv2Unp6eb/vJkyf1+OOPa9u2bVapAwAAlJzNr8y4ublp2bJlVgsuhenVq5cqVaqk//znP5Ztubm5mjZtmlq1aqX777//po575cqV0ioRAABch83DTOPGjVW1alV9+eWXRY47efKkxo4dqx49eqh37956//3
3lZSUJEmKj49XZGSk5XVWVpYee+wxjR492vL+devWaciQIYUe28XFRSNHjtSPP/6o7du3S5KWLFmiixcv6rnnnrOMO3HihN58801169ZNTz31lD766CNlZGRY9k+ZMkXvvPOO/vvf/6pfv34aNGhQoefbuHGjevTooV27dhXjEwIAAEWxeZgxmUzq16+f1q5dq4SEhELHXLx4UaNGjVKtWrU0ZcoUjR8/XpmZmRo/frzy8vIUHBysSpUqKTY2VpIUFxcnLy8vxcXFKTc3V9LVW1eNGze+bh21atVSnz599PHHH+uXX37RV199pZdeekkVK1aUJGVmZurtt99WhQoVNGXKFEVFRWn//v2aPj3/BNS9e/fq1KlTGjdunMaPH1/gPMuXL9fcuXM1evRohYeH39RnBgAA/k+5eJopPDxc9evX18KFC/Xaa68V2L969WrVqVNH/fv3t2wbOXKkevXqpaNHjyo0NFSNGjXS3r179cADDyg2NlatW7fW7t27deTIEYWFhWnfvn353l+YLl266H//+5/Gjh2riIgI3XXXXZZ9mzdvVk5OjkaOHCkPDw9J0vPPP69//etf6tevnwIDAyVJ7u7uevHFF+Xq6lrg+AsWLNCGDRv03nvvqU6dOoXWsHbtWq1bt67IOqOioiRJARNfL3KcPXHyqVhov2lpaTaopuyZzWa77e3vHKlXiX7tHf2WnWu/ewtTLsKMdPWJn1dffVVHjhwpsO/YsWPav3+/unfvXmBfQkKCQkND1bhxY61cuVKStG/fPkVGRio7O1uxsbHy8fHRhQsXirwyI129StSzZ0/961//0pNPPplvX3x8vOrUqZPvw6xfv74k6dSpU5YwU7t27UKDzPLly5WVlaWpU6eqWrVq160hIiJCERERRdaZmJgoSbowalKR4+xJwMTXC+3XXh/NTktLk7e3t63LsApH6lWiX3tHv7ZRbsJMaGioWrVqpXnz5hUIEnl5eQoPD9fAgQMLvK9SpUqSrs69+fjjj3XmzBkdOXJEjRs3VnZ2trZu3SofHx9Vq1ZNAQEBN6zD2dlZkuTkVPAOnMlkKvK1dPXKTGEaNGigXbt2adu2berRo8cN6wAAAMVj8zkzf9W3b18dOHBAu3fvzre9Xr16OnnypKpUqaKgoKB8/3h5eUmSZd7MV199pWrVqqlSpUpq3Lix4uLi9Ouvv97wqsyNBAcH6/jx48rKyrJsO3DggGXfjYSGhmrs2LFatmyZlixZcku1AACA/1OuwkxQUJA6dOigVatW5dveqVMnZWRkaPLkyTp06JDOnj2r3377TTNmzMj3NFGjRo20ZcsW3XnnnZKkqlWrytfXVz/++OMth5m2bdvKxcVF06ZN0x9//KHY2FjNnDlT9913n6pWrVqsY4SFhWns2LFaunQpgQYAgFJSbm4zXfPkk09q06ZN+bYFBARo8uTJmj9/vkaPHq0rV67otttuU7NmzfLNT7nzzjv1ww8/5AsujRo10qZNm245zHh6emrcuHGaO3euRo4cKTc3N7Vs2VLPPPNMiY5zLdC8/fbbklToPKDi6uTX6abfazSrnTwK73fYWusXYwWro8IVPmq7rcuwilvp9VB00fPLADgGU2ZmptnWRaBkrk0Abh61w8aVWM/qqHB1nOA46/I4Ur+30qsRw0x5mTBpLfRr36zZb1FPM5Wr20wAAAAlRZgBAACGRpgBAACGRpgBAACGRpgBAACGRpgBAACGRpgBAACGRpgBAACGxqJ5BnRt0TwfHx8bV2I9LERlvxypV4l+7R39lh0WzQMAAHaLMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAy
NMAMAAAyNMAMAAAyNMAMAAAyNMAMAAAzNxdYF4Ob5xrW1dQlWkx74mXzjHrV1GVZjL/2m1N9s6xIAOACuzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMz7NcZLF68WNu3b1d0dHSxxkdGRmrUqFFq3br1LY0pT3aPb2TrEkpFyOLpNxxjTktzqKXxHa1fALgV5erKzLRp0xQZGal///vfBfZ99tlnioyM1NixYyVJjz32mCZMmFCq51+wYIFatGhRqscEAABlq1yFGUmqXLmyfvjhB2VlZVm25ebmavPmzbrtttss2zw9PeXj41Oq5/bz85Orq2upHhMAAJStcnebqXbt2rp48aJ++OEHPfzww5Kkn3/+Wa6urmrUqJFSU1MlFX6baePGjfrmm290+vRpVaxYUXfddZdGjBhh2Z+WlqaJEydq165dqlSpkp5++mm1bft/3zz999tMhw4d0syZMxUfH6/g4GD16dNHY8eO1XvvvafGjRsrNzdX0dHR2rNnj5KTkxUQEKAOHTrosccek5PT1Zw4bdo0paamqlmzZlq2bJmys7PVsmVLPfvss/Lw8CjzzxMAAHtX7sKMJLVv317fffedJcxs2LBBDz/8sM6ePXvd96xZs0Zz5sxR3759FR4erqysLO3duzffmC+//FL9+vVT37599d133+nf//63GjZsqCpVqhQ4XmZmpsaNG6emTZtq5MiRunjxoubMmZNvjNlslr+/v15//XX5+vrq8OHDio6Olre3t9q3b28Zd+DAAfn7++udd97R+fPnNXnyZFWvXl3du3e/lY8JAAConIaZBx98UJ9++qnOnDkjT09P7d69W0OGDNHnn39+3fd89dVXevTRR9W1a1fLtpCQkHxj2rZta7kS07t3b61cuVL79+8vNMxs2bJFeXl5Gj58uNzd3VWrVi316NFDU6ZMsYxxcXFR7969La+rVq2qY8eO6fvvv88XZry8vPT888/L2dlZwcHBat26tfbs2VNomFm7dq3WrVtX5OcTFRUlSQqY+HqR44wiLS3thmPMZnOxxtkLR+rXkXqV6Nfe0W/ZKepuRrkMMxUrVlTLli313XffqUKFCmrcuHGhgeOa5ORkXbhwQU2aNCnyuLVr17b87OzsLF9fX6WkpBQ69tSpU6pVq5bc3d0t2+64444C49asWaP169crMTFRly9fVk5OToFag4OD5ezsbHnt7++vw4cPF3reiIgIRUREFNlHYmKiJOnCqElFjjOK4jzNlJaWJm9vbytUUz44Ur+O1KtEv/aOfm2jXIYZSfrHP/6hadOmydPTU08//XSRY81mc7GO6eKSv12TyaS8vLybrnHbtm2aM2eOBg4cqLCwMHl5eSkmJkY7d+7MN+6vQaY0zgsAAP5PuXua6ZomTZrIxcVFqampatmyZZFj/fz8FBAQoD179pTa+WvUqKE//vhD2dnZlm1/v5py4MABhYaGqnPnzgoJCVFQUFCR83oAAEDpK7dhxmQyafr06Zo7d26xHpfu0aOHVq5cqeXLl+v06dM6fvy4vvnmm5s+f5s2beTk5KQZM2bo5MmT+u2337RkyZJ8Y4KCgnT8+HHt2rVLZ86c0Zdffql9+/bd9DkBAEDJldvbTNLVibPF1bFjR7m4uGj58uWaP3++KlasqPDw8Js+t6enp/71r3/p448/1ksvvaSaNWuqV69emjhxotzc3CRdnd9y4sQJffDBB5Kke++9V127dtWGDRtu+rwl0cmvk1XOU+aGrb3hkNVR4Qoftd0KxZQPjtSvI/Uq0a+9o9+rDkUXPfeztJkyMzOLN+EE2rlzp9577z0tXLhQvr6+Nqvj2gTg5lE7bFaDta2OClfHCbtsXYbVOFK/jtSrRL/2jn6vKoswY7inmcqLjRs3KjAwUJUrV9Yff/yhOXPmqEWLFjYNMgAAID/CTBGSk5O1ePFiXbx4UX5+fgo
PD1f//v1tXRYAAPgLwkwRunXrpm7dutm6DAAAUIRy+zQTAABAcRBmAACAoRFmAACAoRFmAACAoRFmAACAobFongFdWzTPx8fHxpVYT3n5ZlZrcaR+HalXiX7tHf2WnaIWzePKDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDTCDAAAMDSXkr4hNzdX27Zt0969e5WcnKw+ffqoTp06Sk9P16+//qqGDRvK39+/LGoFAAAooERh5tKlS3r77bd15MgReXh4KDs7W126dJEkeXp66pNPPtFDDz2kvn37lkmxAAAAf1eiMDN//nydPHlSo0ePVkhIiPr06WPZ5+zsrHvvvVe7d+8mzFiJb1xbW5dgNemBn8k37lFbl2E1jtSvI/Uq0e/NSqm/uRSqgb0q0ZyZnTt3qnPnzrr77rtlMpkK7A8KClJiYmKpFQcAAHAjJQoz6enpCgwMvO5+s9msK1eu3HJRAAAAxVWiMFOlShWdPHnyuvv37dun6tWr33JRAAAAxVWiMNOmTRutW7dO+/bts2y7drtp1apV2rlzp9q1a1e6FQIAABShRBOAn3jiCR06dEhvvvmmqlevLpPJpNmzZystLU1JSUlq2bKlIiMjy6pWAACAAkoUZlxcXDR69Ght2bJF27dvl3R13Zm6devqgQceUJs2bQqdGAwAAFBWSrxonnT1dlObNm1KuRQAAICSK9GcmcGDB+t///vfdff/9NNPGjx48C0XBQAAUFwlCjOJiYnKzMy87v6srCydP3/+losCAAAorhJ/0WRRc2JOnz4tT0/PWyoIAACgJG44Z2bjxo3atGmT5fVXX32l9evXFxiXnp6u33//Xc2bNy/dCm/R4sWLtX37dkVHRxfrdWk4d+6cBg8erKlTp+r2228vteP+3e7xjcrs2OVByOLplp/NaWkOtZy5I/XrSL1K9P7rgkIAACAASURBVAuUhRuGmaysLF28eNHy+tKlSzKbzfnGmEwmubu7q0OHDurVq1fpV1lC06ZNU2pqqkaPHn3DsY899pg6d+5cquevXLmyFixYIB8fn1I9LgAAKOiGYaZTp07q1KmTJGnQoEEaMmSI7rnnnjIvzFo8PT1L/daYs7Oz/Pz8SvWYAACgcCV6NPuTTz4pqzps5u+3ma5d1WnWrJmWLVum7OxstWzZUs8++6w8PDwkSVeuXNG8efP0/fff69KlS6pbt64GDBighg0bSip4myknJ0effPKJduzYodTUVFWqVEkPPvig+vfvb6u2AQCwGze1zowkZWRkKCMjQ3l5eQX2ValS5ZaKsrUDBw7I399f77zzjs6fP6/JkyerevXq6t69uyTps88+0w8//KDhw4crMDBQy5cv15gxYzRr1iz5+/sXON61r3p49dVXVaVKFV24cEGnT5+2dlsAANilEoeZdevW6ZtvvlFCQsJ1x6xYseKWirI1Ly8vPf/883J2dlZwcLBat26tPXv2qHv37srKytKaNWv04osvWiY7P//889q7d69iYmLUp0+fAsdLTExUUFCQGjZsKJPJpCpVqqh+/fqFnnvt2rVat25dkfVFRUVJkgImvn6LnZZvaWlplp/NZnO+1/bOkfp1pF4l+rV39Ft2rt0dKUyJwsx3332n6OhoNWnSRO3atdPChQvVpUsXubq6auPGjfLz8yv1ybS2EBwcLGdnZ8trf39/HT58WJKUkJCgnJycfGHE2dlZYWFhio+PL/R47dq109tvv62hQ4eqWbNmCg8P19133y0np4JPxkdERCgiIqLI+hITEyVJF0ZNKnFvRvLXp5nS0tLk7e1tw2qsy5H6daReJfq
1d/RrGyVaZ2blypVq3Lixxo8frw4dOkiSwsPD1bdvX0VHRys9PV0ZGRllUqg1/TXISFef1vr77bSSfAdVSEiI5s6dq759+8psNmvatGn617/+VegtOgAAUDIlCjNnzpxRq1atrr7x/19VyMnJkSRVrFhR7du3V0xMTCmXWL5Uq1ZNLi4uOnDggGVbbm6uDh48qJo1a173fV5eXrrvvvv0/PPPa/To0dq7d2+Rt+oAAEDxlOg2k4eHh2WNGU9PTzk5OSkpKcmy39vbW3/++WfpVljOeHh4qGPHjpo/f758fHxUtWpVrVixQsnJyerYsWOh71m+fLn8/PxUt25dOTs7a+vWrfLy8lJAQICVqwcAwP6UKMxUr15df/zxh6Srt2Lq1KmjTZs2qU2bNsrLy9PmzZtVtWrVMim0PLn2SPVHH32k9PR01atXT2PGjCn0SSbpavD7+uuvLVdi6tatqzFjxhQ5mak4Ovl1uqX3l3vD1lp+XB0VrvBR221YjHU5Ur+O1KtUPvo9FF30vDzAaEyZmZnmGw+76uuvv9bKlSs1e/Zsubm5aefOnZowYYLc3NxkMpmUlZWlESNGqG3btmVZs8O7NgG4edQOG1diPaujwtVxwi5bl2E1jtSvI/UqlY9+rRlmyssEUWuh37JTak8zPf7443r88cctr1u2bKkJEyZo+/btcnJyUosWLdS4ceObrxQAAKCEbnrRvGsaNGigBg0alEYtAAAAJVaip5m6dOmiLVu2XHf/tm3b1KVLl1utCQAAoNhKFGb+/m3Zf8e6KQAAwNpKFGakoheLO3z4sCpWrHhLBQEAAJTEDefMrFy5UitXrrS8njt3rhYuXFhg3KVLl5SRkaE2bdqUaoEAAABFuWGY8fHxUfXq1SVdfSTYz89Pfn5++caYTCZ5eHgoJCTELr6bCQAAGMcNw0ybNm0sV1veeOMN9ezZU02aNCnrugAAAIqlRIvmxcbG6sSJE3r00Uct27Zs2aIvv/xS6enpeuCBBzR48OBCvw0apefaonk+Pj42rsR6WIjKfjlSrxL92jv6LTtFLZpXotTx5ZdfKi4uzvI6Pj5eH330kUwmk0JCQhQTE6NVq1bdfKUAAAAlVKIwc/LkSd1xxx2W11u3bpW7u7umTJmiMWPGqE2bNtqwYUOpFwkAAHA9JQozGRkZ+R693r17t5o2bSovLy9JV1cDPnfuXOlWCAAAUIQShRk/Pz+dPHlSknThwgUdP35czZo1s+zPzMxkvgwAALCqEn03U6tWrRQTE6MrV67oyJEjcnV1VYsWLSz7T5w4ocDAwFIvEgAA4HpKFGaefvppJScna8uWLfLy8tJLL71kWXMmIyNDO3bsUKdOncqkUAAAgMKUKMx4eHjon//853X3zZs3T+7u7qVSGAAAQHGUKMwUxcnJSRUqVCitwwEAABQLs3UBAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChEWYAAIChudi6ANw837i2ti7BatIDP5Nv3KO2LsNqHKlfR+g1pf5mW5cA2DWuzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAEMjzAAAAENziK8zmDZtmlJTUzV69OhbOs6GDRs0a9YsLVmy5Lpjvv76a8XExOiTTz65pXMVx+7xjcr8HOVFwEQX+rVT1uw1ZPF0q5wHgHXZTZiZNm2aNm3aVGD7Rx99VGrnuP/++xUeHl5qxwMAALfObsKMJDVt2lQjR47Mt83Hx6dUjp2TkyN3d3e5u7uXyvEAAEDpsKsw4+LiIj8/vxuOu3LliubNm6fvv/9ely5dUt26dTVgwAA1bNhQkhQbG6s33nhDo0eP1uL
Fi3XixAlFRUUpNTW1wG2mZcuWafny5crKylKrVq0UGBiY71yHDx/WwoULdezYMeXk5Kh27doaOHCgwsLCSrd5AAAclENOAP7ss8+0bds2DR8+XB999JFq1aqlMWPG6OLFi/nGzZs3T71799bHH3+sO+64o8Bxtm3bpkWLFumpp57Shx9+qOrVq2v58uX5xmRmZqpt27aaNGmSpkyZorp162rMmDFKSUkp0x4BAHAUdnVl5pdfflH37t0trxs0aKCxY8fmG5OVlaU1a9boxRdfVPPmzSVJzz//vPbu3auYmBj16dPHMrZXr1666667rnu+lStX6qGHHtIjjzwiSerZs6diY2OVkJBgGdOkSZN87xk6dKh27NihX375RW3bti1wzLVr12rdunVF9hkVFSVJCpj4epHj7ImTT0X6tVPW7DUtLc0q5ymK2WwuF3VYC/3aN2v26+Hhcd19dhVmGjVqpGHDhlleFza/JSEhQTk5Oapfv75lm7Ozs8LCwhQfH59v7O23317k+eLj49W+fft828LCwvKFmeTkZC1atEixsbFKTk5WXl6eLl++rPPnzxd6zIiICEVERBR53sTEREnShVGTihxnTwImvk6/dsqavZaHp5nS0tLk7e1t6zKshn7tW3np167CjJubm4KCgoo11mQy3XBMaUz2nTZtmpKTkzV48GBVqVJFrq6ueuutt5STk3PLxwYAAA44Z6ZatWpycXHRgQMHLNtyc3N18OBB1axZs0THCg4O1qFDh/Jt+/vruLg4RUZGqnnz5qpVq5Y8PT2VlJR08w0AAIB87OrKTHF4eHioY8eOmj9/vnx8fFS1alWtWLFCycnJ6tixY4mO9eijj2rq1Km6/fbb1bhxY23fvl2HDh3Kd8ktKChImzdvVmhoqLKysjRv3jy5uDjcxw4AQJlxyN+q/fv3l3R1Qb309HTVq1dPY8aMkb+/f4mOc//99+vs2bNauHChsrOz1aJFC3Xt2lUbN260jHnppZc0Y8YMjRgxQv7+/urVq1epPcnUya9TqRzHCFY7edCvnSrrXg9FFz0HDYDxmTIzM822LgIlc20CcPOoHTauxHpWR4Wr44Rdti7Dahyp37LutbyFmfIyYdJa6Ne+WbPfop5mcrg5MwAAwL4QZgAAgKERZgAAgKERZgAAgKERZgAAgKERZgAAgKERZgAAgKERZgAAgKERZgAAgKGxArABXVsB2MfHx8aVWA+ratovR+pVol97R79lhxWAAQCA3SLMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQyPMAAAAQ3OxdQG4eb5xbW1dgtWkB34m37hHbV2G1ThSv47Uq0S/Nyul/uZSqAb2iiszAADA0AgzAADA0AgzAADA0AgzAADA0AgzAADA0AgzAADA0AgzAADA0AgzAADA0AgzAADA0EyZmZlmWxdxK2JjY/XGG29o0aJF8vX1LfBaknbu3KlPP/1U586dU5s2bTRixIgyq+fcuXMaPHiwpk6dqttvv71MzpGYmHj1f599s9jvCVk8vUxqsZa0tDR5e3vbugyrcaR+HalXiX7tHf2WHQ8Pj+vus/nXGSQlJWnJkiX6+eef9eeff8rHx0e1a9dWZGSkwsPDS3y8sLAwLViwQD4+PpZt06dP1z/+8Q9FRkYW+WGUhsqVKxc4PwAAKDs2DTPnzp3Ta6+9Jk9PT/Xt21d16tSR2WzWnj17FB0drc8++6zEx3R1dZWfn5/ldXp6ulJTU3XXXXcpICDgpmu9cuWKXF1dbzjO2dk53/kBAEDZsmmY+fjjjyVJ06ZNk6enp2V7cHC
w2rRpI0lavny5Nm7cqISEBFWoUEF33323Bg4cqIoVKxZ6zL/eZjp58qTeeOMNSdKbb169JfPee++pcePG2rFjhxYvXqzTp0+rUqVKioiIUI8ePWQymSRJgwYNUrt27XT+/Hn9+OOPatq0qQYMGKDBgwdr1KhRWrt2rQ4cOKCqVavqmWeeUbNmzSQVvM2Um5ur6Oho7dmzR8nJyQoICFCHDh302GOPycmJKUsAANwqm/02TUtL0y+//KJOnTrlCzLXXAsrJpNJgwcPVnR0tF555RUdOXJEs2bNKtY5wsLCFB0dLUmKiorSggULFBYWpqNHj2rSpElq1aqVZsyYoX79+mnp0qX69ttv871/+fLlqlGjhqZOnaq+fftati9cuFCRkZGaPn26br/9dr3//vvKzMwstAaz2Sx/f3+9/vrrmjlzpvr06aMlS5Zow4YNxeoBAAAUzWZXZhISEmQ2mxUcHFzkuC5dulh+rlq1qvr376933nlHI0aMuOGVDVdXV8skYG9vb8vtn+XLl6tRo0Z6+umnJUnVq1fXmTNntGzZMkVGRlre36hRI3Xr1s3y+ty5c5aaWrRoIUnq27evNm3apOPHj6thw4YFanBxcVHv3r3z9XDs2DF9//33at++fYHxa9eu1bp164rsKyoqSpIUMPH1Isf9VVpaWrHHlkdms9nwPZSEI/XrSL1K9Gvv6LfslMsJwGZz8R6i2rNnj5YuXar4+HhlZGQoNzdXOTk5SkpKuuk5MPHx8WrevHm+bQ0aNNAXX3yhjIwMeXl5SdJ1n0aqU6eO5Wd/f39JUkpKynXPt2bNGq1fv16JiYm6fPmycnJyVKVKlULHRkREKCIiosj6rz3NdGHUpCLH/RVPMxmLI/XrSL1K9Gvv6Nc2bBZmgoKCZDKZFB8fr1atWhU6JjExUePGjVP79u319NNPy9vbW8eOHdP777+vnJycMq/R3d290O3Ozs6Wn6/NsbleONu2bZvmzJmjgQMHKiwsTF5eXoqJidHOnTtLv2AAAByQzcKMt7e3mjVrppiYGEVGRhaYN5Oenq4jR44oJydHgwcPtgSIn3/++ZbPHRwcrLi4uHzbDhw4oMqVK1uuypSWAwcOKDQ0VJ07d7ZsO3v2bKmeAwAAR2bTx2mee+45mc1mjRgxQj/88INOnTql+Ph4rV69Wi+++KKCgoKUl5enlStX6uzZs9q6datWrFhxy+ft2rWr9u3bZ3maacuWLVq+fLkef/zxUugqv6CgIB0/fly7du3SmTNn9OWXX2rfvn2lfh4AAByVTR/NDgwM1IcffqglS5Zo3rx5unDhgmXRvGHDhqlOnTp65plntGzZMi1atEhhYWEaOHCgJk+efEvnDQkJ0euvv67FixdryZIlqlSpkrp165bv6klpiYiI0IkTJ/TBBx9Iku6991517dq1VJ5m6uTXqfiDh6295fPZ0uqocIWP2m7rMqzGkfp1pF4l+r1Zh6KLnksIx2b4rzNwRNcmADeP2mHjSqxndVS4Ok7YZesyrMaR+nWkXiX6vVlGCTPlZUKstZSXrzNg1TYAAGBohBkAAGBohBkAAGBohBkAAGBohBkAAGBohBkAAGBohBkAAGBohBkAAGBohBkAAGBorABsQNdWAPbx8bFxJdbDqpr2y5F6lejX3tFv2WEFYAAAYLcIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNAIMwAAwNBcbF0Abp5vXFtbl2A16YGfyTfuUVuXYTWO1K8j9SrZV78p9TfbugRAEldmAACAwRFmAACAoRFmAACAoRFmAACAoRFmAACAoRF
mAACAoRFmAACAoRFmAACAoRFmAACAobECsIHtHt/I1iVYTcBEF/otQsji6WVYTdkyp6U51EqyjtYvYA0OdWUmNjZWkZGRSklJsXUpAACglBg6zBw7dkxdunTRa6+9VqzxYWFhWrBggXx8fMq4MgAAYC2GDjPr1q1Tx44d9ccffyg+Pr7IsTk5OXJ1dZWfn59MJpOVKgQAAGXNsHNmsrOz9f3332vChAnKzs7W+vXrNWjQIEnSuXPnNHjwYL3yyitav369Dh48qAEDBqhWrVp64403tGjRIvn6+mrQoEFKTEwscOy5c+eqatWqSkxM1Jw5c7Rnzx5JUtOmTTVkyBBVrlxZkrR48WJt375dPXv21MKFC5WSkqI777xTL774onx9fSVJhw8f1sKFC3Xs2DHl5OSodu3aGjhwoMLCwqz0SQEAYN9MmZmZZlsXcTM2bdqkb775RtOnT1dsbKwmTZqkefPmycXFxRJmqlSpooEDB6pevXpycXFRQkJCvjCTkpKivLw8yzGnT5+uhIQEffjhh3Jzc9PLL78sNzc3PfPMMzKZTJo1a5Zyc3M1depUmUwmLV68WN98842aNm2qp556StnZ2Zo8ebLuuusuvfDCC5KkPXv26MKFC7r99tslSTExMdqyZYtmzZplCTx/tXbtWq1bt67I3qOioiRJucmppfVxlntOPhWVl5pu6zKspqT9uvgV/HfJKMxms0NdLaVf+0a/Zee222677j7DXplZv3692rZtK0lq1KiR3N3d9b///U+tW7e2jOncuXO+1wkJCfmO8dcwsXTpUh08eFBTpkyRu7u7fv31V/3++++aPXu2qlatKkl65ZVXNGTIEO3Zs0dNmzaVJOXm5urll19WhQoVJEkdOnTQxo0bLcdt0qRJvnMOHTpUO3bs0C+//GKp/68iIiIUERFRZO/XriZdGDWpyHH2JGDi6/RbBCM/zZSWliZvb29bl2E19Gvf6Nc2DBlmzpw5o7i4OL366quSJJPJpAcffFDr16/PF15CQkKKdbyffvpJixcv1tixY1WtWjVJUnx8vPz9/S1BRpICAwPl7++vkydPWsJMlSpVLEFGkgICApScnGx5nZycrEWLFik2NlbJycnKy8vT5cuXdf78+Zv/AAAAgIUhw8z69euVl5engQMHFtj315Dg4eFxw2P98ccf+uCDDzR06FA1bty4WOf/6yU1Z2fnAvvN5v+7czdt2jQlJydbbnu5urrqrbfeUk5OTrHOBQAAima4MJObm6tNmzapb9++atGiRb59U6dO1YYNG/TQQw8V61gpKSkaP368OnTooA4dOuTbFxwcrIsXL+rcuXOWqzNnz57VxYsXVbNmzWLXGxcXpyFDhqh58+aSpKSkJCUlJRX7/QAAoGiGCzM///yzUlNT1aFDhwLrxdx///1as2ZNoXNRCjNhwgT5+/ura9eu+QKGj4+PmjZtqtq1a2vKlCkaMmSIzGazZs2apXr16unOO+8sdr1BQUHavHmzQkNDlZWVZZmkXBo6+XUqleMYwWonD/otyrC1ZVdMGVsdFa7wUdttXUaJHIouel4bAOsyXJj57rvv1Lhx40IXvrvvvvs0f/58/fbbb8U61v79+yVJ/fv3z7f92qPZb775pmbPnq033nhD0tXJvEOHDi3RzO2XXnpJM2bM0IgRI+Tv769evXqxAjEAAKXIsI9mO7JrTzM1j9ph40qsZ3VUuDpO2GXrMqzGkfo1Yq+3cmWmvDz9YS30a9+s2W9R82ANvQIwAAAAYQYAABgaYQYAABgaYQYAABgaYQYAABgaYQYAABgaYQYAABgaYQYAABgai+YZ0LVF8wpbBdlesRCV/XKkXiX6tXf0W3ZYNA8AANgtwgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0wgwAADA0vjXbgK59azYAAI6kSpUqhW7nygwAADA0rsw
Y1IgRIzRt2jRbl2E19Gu/HKlXiX7tHf3aBldmAACAoRFmAACAoRFmAACAoRFmAACAoRFmAACAoRFmAACAoRFmAACAoRFmAACAoTm/9dZbY2xdBG5OSEiIrUuwKvq1X47Uq0S/9o5+rY8VgAEAgKFxmwkAABgaYQYAABgaYQYAABgaYQYAABiai60LQMnExMTo66+/VlJSkmrWrKlnnnlGDRs2tHVZt2zJkiXasWOHTp8+LVdXV91xxx3q16+fatWqZRljNpv1xRdfaN26dUpPT1doaKieffbZfGOM6L///a8WLlyoTp066dlnn5Vkn71evHhR8+fP165du5SZmanAwEA999xzaty4sST76jk3N1dffPGFNm/erKSkJPn5+alNmzZ66qmn5OzsLMnY/e7bt0/ffPONjh49qosXL+qll17Sww8/bNlfnN7S09M1a9Ys/fTTT5KkFi1aaOjQoapYsaLV+ylKUb3m5ORo0aJF2r17txISEuTl5aXGjRurX79+qlKliuUYV65c0aeffqqtW7fq8uXLatKkiZ577jlVrlzZVm1d143+bP9qxowZWrdunQYMGKDHH3/cst0W/XJlxkC2bdumOXPmqEePHvroo49Uv359jRkzRomJibYu7ZbFxsaqU6dOev/99/Xuu+/K2dlZb731ltLS0ixjli1bpuXLl2vIkCGaOnWqfH199fbbbysjI8OGld+agwcPat26dapdu3a+7fbWa3p6ul577TWZzWaNHj1aM2fO1NChQ1WpUiXLGHvqedmyZYqJidHQoUP18ccfa8iQIYqJidGSJUvyjTFqv1lZWapVq5aGDBkiNze3AvuL09v777+vY8eOacyYMRo7dqyOHTumqVOnWrONYimq1+zsbB07dkw9evTQhx9+qLfeekt//vmnxowZo9zcXMu4OXPmaMeOHXr11Vc1ceJEZWRkaNy4cfnGlBc3+rO9Zvv27Tpy5Ij8/f0L7LNFv4QZA1m+fLnatWunDh06KDg4WEOHDpWfn5/WrFlj69Ju2bhx4/Twww+rVq1aql27tkaOHKnU1FTFxcVJuvo3vZUrV6pbt25q3bq1atWqpREjRigzM1Nbt261cfU359KlS5oyZYqGDx+e72+j9tjr119/LX9/f40cOVKhoaEKDAxUkyZNFBwcLMn+eo6Li1OLFi3UokULVa1aVffcc4/uueceHTp0SJLx+w0PD1ffvn3VunVrOTnl/zVSnN7i4+P1yy+/6IUXXlD9+vUVFhamYcOG6eeff9apU6ds0dJ1FdVrhQoVNH78eN1///2qUaOGQkNDNWzYMMXHxys+Pl7S1f/Ov/vuOw0YMEDNmjVTSEiIRo4cqd9//1179uyxRUtFKqrfaxITEzV79my98sorcnHJf4PHVv0SZgziypUrOnr0qJo1a5Zve7NmzSy/8O1JZmam8vLyVKFCBUnSuXPnlJSUlK9/d3d3NWzYUAcPHrRVmbdkxowZat26tZo0aZJvuz32unPnToWGhmrSpEnq3bu3hg8frm+//VZm89Vlruyt5wYNGmjv3r2WX2gnT57U3r17FR4eLsn++v2r4vR28OBBeXp6qn79+pYxDRo0kIeHh+H7v3b16dpfUI4ePaqcnJx8n8dtt92mGjVqGPL/u3Nzc/X++++rZ8+elr+M/JWt+mXOjEGkpqYqLy8v32V5SapUqVK5TPe3avbs2apbt67CwsIkSUlJSZJUaP8XLlywen23at26dUpISNDIkSML7LO3XiXp7NmzWr16tbp06aInnnhCJ06c0KxZsyRJnTt3truen3jiCWVmZmrYsGFycnJSbm6uevTooU6dOkmyzz/ja4rTW1JSknx8fGQymSz7TSaTfH19Le83omtzRVq0aGGZH5KUlCQnJyf5+PjkG+vn52fIXj///HN5e3urY8eOhe63Vb+EGYP563/89mru3LmKi4vTpEmTLJMlr/l7/2az2XCfyalTp7RgwQJNnDhRrq6u1x1nD71
eYzabFRISon79+kmS6tWrpzNnzigmJkadO3e2jLOXnrdt26bNmzfrlVdek6YedwAABHNJREFUUc2aNXX8+HHNmTNHVatWVfv27S3j7KXfwtyot8L6vHalzohyc3M1ZcoUpaen66233rrheCP+WcfGxmrjxo3697//XeL3lnW/hBmD8PHxkZOTU4Fkm5ycXOBvQEY2Z84cbdu2Te+++64CAwMt2/38/CRdTf233XabZXtKSorh+j948KBSU1P1wgsvWLbl5eVp//79WrNmjaKjoyXZR6/X+Pn5FbgkXaNGDZ0/f96yX7Kfnj/77DM99thjeuCBByRJtWvX1vnz57V06VK1b9/e7vr9q+L05ufnp5SUlHy/4Mxms1JTUy3vN5Jrt15+//13TZgwId9VCT8/P+Xl5Sk1NVW+vr6W7cnJyYZ7EjU2NlZJSUnq27evZVteXp7mz5+vlStXat68eTbrlzBjEK6urgoJCdFvv/2m++67z7L9t99+07333mvDykrP7NmztW3b/2vvDkLZD+M4jr+nlY3fyi+xsmkOJuSkFiWjlcQaTmI32cXRRSGOSw5yWUpppRwmB+XATfofJAdKaqV2kHGglZSxpP9Bfn8z8f//HeZZ39fxqdXz6ant+3ue5/fdL8LhcM4Pn91uR9d1jo+PqaurAyCTyXB6esrIyEg+pvvfWltbcbvdWWOLi4tUVVUxODiIw+EomKyvGhoaSCaTWWOXl5fG66uFtL7w8pbL+8uTRUVFPD8/A4WX962/yVZfX086nSYejxv3ZuLxOA8PD8bRsiqenp6Yn5/n/PyccDicU4zV1tZiNps5Ojqis7MTgJubGy4uLrLuDKmgt7eXtra2rLHZ2Vm8Xi/d3d1A/vJKMaOQgYEBFhYWcLvdNDY2sr29TSqVoqenJ99T+7alpSV2d3eZnp5G0zRjB8pisWC1WjGZTPT19bG+vo7T6cThcBCLxbBarXR0dOR59v9G07ScXhoWiwWbzWb04SiUrK/6+/uZmJggFovR3t5OIpFga2vLeMIrpPUF8Hg8bGxsYLfbjWOmzc1NfD4foH7edDrN1dUV8PJkfn19TSKRQNM0Kisrv8xWXV1Nc3MzkUjE2KGMRCJ4PB6cTmfecn3ks6zl5eXMzc1xdnbGzMwMJpPJ+O4qKSmhuLiY0tJSurq6iEajlJWVYbPZWFlZoaamJufy/0/w1dq+3zk0m83oum6sW77yyr9mK+a1aV4qlcLlchEKhWhqasr3tL4tEAh8OD48PEwwGAT+NOLa2dkxGnGNjY0p0WTsK5OTk7hcrpymeYWU9fDwkNXVVZLJJBUVFfj9fgKBQNYxQ6Fkvr+/Z21tjf39fW5vb9F1Ha/Xy9DQkNG7Q+W8JycnTE1N5Yz7fD7Gx8f/Ktvd3R3Ly8scHBwA0NLS8iOb5n2WNRgMEgqFPvzc22ZzmUyGaDTK3t4ej4+PRhO5t8dwP8VXa/ve6Ogofr8/q2lePvJKMSOEEEIIpUmfGSGEEEIoTYoZIYQQQihNihkhhBBCKE2KGSGEEEIoTYoZIYQQQihNihkhhBBCKE2KGSGEEEIoTYoZIYQQQijtN4e90jHYZ7EqAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "## then make a horizontal bar chart here\n", + "unemp_count.T.plot.barh(figsize=(8,8))\n", + "unemp_count.plot.barh(figsize=(8,8))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "5. **(Challenging) Repeat the previous step, but count how many states had each classification in each month. Which month had the most states with high unemployment? What about medium and low?** " + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The month with high unemployment is 2009-04-01 00:00:00.\n", + "The month with medium unemployment is 2001-09-01 00:00:00.\n", + "The month with low unemployment is 2000-08-01 00:00:00.\n" + ] + } + ], + "source": [ + "# Part 5: Apply the same transform from part 3, but to each date instead of to each state.\n", + "\n", + "#a solution based on this lecture has not been found :p\n", + "#alternative solution: we transpose the previous table in order to be able to apply the same function as before \n", + " #for more information on reshaping the DataFrames, check Lecture 6\n", + "unemp_binsT = unemp_bins.T\n", + "unemp_binsT\n", + "\n", + "#we call the function previously created for the transposed DataFrame\n", + "unemp_countT = unemp_binsT.apply(count_bins)\n", + "unemp_countT.T.head()\n", + "\n", + "#we create a for loop to answer the question\n", + "times = [\"high\", \"medium\", \"low\"]\n", + "\n", + "for x in times:\n", + " m=unemp_countT.T[x].idxmax()\n", + " \n", + " print(f\"The month with {x} unemployment is {m}.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
**For a single state of your choice, determine what the mean unemployment is during \"Low\", \"Medium\", and \"High\" unemployment times (recall your `unemp_bins` DataFrame from the exercise above).**\n", + " - Think about how you would do this for all the states in our sample and write your thoughts... We will soon learn tools that will *greatly* simplify operations like this that operate on distinct *groups* of data at a time. " + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "During high unemployment times, the mean unemployment in Arizona is 8.73333333333333.\n", + "During medium unemployment times, the mean unemployment in Arizona is 5.436082474226807.\n", + "During low unemployment times, the mean unemployment in Arizona is 4.0636363636363635.\n" + ] + } + ], + "source": [ + "#The analysis is performed on Arizona\n", + "\n", + "times = [\"high\", \"medium\", \"low\"]\n", + "\n", + "for x in times:\n", + " Arizona_x = unemp[\"Arizona\"].loc[unemp_bins[\"Arizona\"]==x].mean()\n", + "\n", + " print(f\"During {x} unemployment times, the mean unemployment in Arizona is {Arizona_x}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Arizona_Bins\n", + "high 8.733333\n", + "low 4.063636\n", + "medium 5.436082\n", + "Name: Arizona_Values, dtype: float64" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Alternative solution:\n", + "\n", + "# I chose Arizona. 
I extract both values and bin series, rename them and concatenate them into a new data frame.\n", + "Arizona_bins=pd.Series(unemp_bins['Arizona'], name='Arizona_Bins')\n", + "Arizona_val=pd.Series(unemp['Arizona'], name='Arizona_Values')\n", + "Arizona_valbins = pd.concat([Arizona_bins, Arizona_val], axis=1)\n", + "Arizona_valbins.head()\n", + "Arizona_valbins.groupby('Arizona_Bins')['Arizona_Values'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Arizona - high: 8.73333333333333.\n", + "Arizona - medium: 5.436082474226807.\n", + "Arizona - low: 4.0636363636363635.\n", + "California - high: 9.249056603773582.\n", + "California - medium: 5.454716981132074.\n", + "California - low: 4.5.\n", + "Florida - high: 9.155882352941177.\n", + "Florida - medium: 5.331645569620254.\n", + "Florida - low: 3.807246376811594.\n", + "Illinois - high: 8.740659340659343.\n", + "Illinois - medium: 5.618867924528302.\n", + "Illinois - low: 4.35263157894737.\n", + "Michigan - high: 8.76267605633803.\n", + "Michigan - medium: 5.4070175438596495.\n", + "Michigan - low: 3.8764705882352946.\n", + "New York - high: 8.16923076923077.\n", + "New York - medium: 5.3612403100775206.\n", + "New York - low: 4.340909090909091.\n", + "Texas - high: 7.552941176470585.\n", + "Texas - medium: 5.567289719626171.\n", + "Texas - low: 4.298275862068967.\n" + ] + } + ], + "source": [ + "#Now, we perform the same analysis on all states\n", + "\n", + "states = [\"Arizona\", \"California\", \"Florida\", \"Illinois\",\n", + " \"Michigan\", \"New York\", \"Texas\"]\n", + "\n", + "for y in states:\n", + " for x in times:\n", + " y_x = unemp[y].loc[unemp_bins[y]==x].mean()\n", + "\n", + " print(f\"{y} - {x}: {y_x}.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
**Which state in our sample performs the best during \"bad times?\" To determine this, compute the mean unemployment for each state only for months in which the mean unemployment rate in our sample is greater than 7.** " + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "Texas 7.972727\n", + "dtype: float64" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Between bracket, the condition that the value is greater than 7\n", + "df=unemp[unemp>7].mean()\n", + "df=df.loc[df==min(df)]\n", + "df\n", + "# There must be a way to be more concise" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Arizona: 9.038461538461535\n", + "California: 10.234210526315788\n", + "Florida: 9.568965517241377\n", + "Illinois: 9.423529411764704\n", + "Michigan: 9.465384615384618\n", + "New York: 8.286666666666667\n", + "Texas: 7.972727272727272\n" + ] + } + ], + "source": [ + "#Alternative solution\n", + "def unemployment_levels2(ul2):\n", + " if ul2 > 7:\n", + " return \"High\"\n", + " else:\n", + " return \"Low\"\n", + " \n", + " return ul2\n", + "\n", + "unemp_bins2 = unemp.applymap(unemployment_levels2)\n", + "\n", + "for s in states:\n", + " s_high = unemp[s].loc[unemp_bins2[s]==\"High\"].mean()\n", + " \n", + " print(f\"{s}: {s_high}\")\n", + " \n", + "#The 3 states that perform the best during high unemployment times (>7) are, in ascending order, Texas, New York, Arizona" + ] + } + ], + "metadata": { + "date": 1584040758.8912327, + "filename": "basics.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "Basic Functionality" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Session_7/3_the_index.ipynb b/Session_7/3_the_index.ipynb new file mode 100644 index 0000000..ebcf492 --- /dev/null +++ b/Session_7/3_the_index.ipynb @@ -0,0 +1,3919 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Index\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [Introduction to pandas](https://datascience.quantecon.org/intro.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- Understand how the index is used to align data \n", + "- Know how to set and reset the index \n", + "- Understand how to select subsets of data by slicing on index and columns \n", + "- Understand that for DataFrames, the column names also align data " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [The Index](#The-Index) \n", + " - [So What is this Index?](#So-What-is-this-Index?) 
\n", + " - [Setting the Index](#Setting-the-Index) \n", + " - [Re-setting the Index](#Re-setting-the-Index) \n", + " - [Choose the Index Carefully](#Choose-the-Index-Carefully) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: matplotlib in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (1.12.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from 
requests->qeds) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n" + ] + } + ], + "source": [ + "# Uncomment following line to install on colab\n", + "! 
pip install qeds" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## So What is this Index?\n", + "\n", + "Every Series or DataFrame has an index.\n", + "\n", + "We told you that the index was the “row labels” for the data.\n", + "\n", + "This is true, but an index in pandas does much more than label the rows.\n", + "\n", + "The purpose of this lecture is to understand the importance of the index.\n", + "\n", + "The [pandas\n", + "documentation](https://pandas.pydata.org/pandas-docs/stable/dsintro.html)\n", + "says\n", + "\n", + "> Data alignment is intrinsic. The link between labels and data will\n", + "not be broken unless done so explicitly by you.\n", + "\n", + "\n", + "In practice, the index and column names are used to make sure the data is\n", + "properly aligned when operating on multiple DataFrames.\n", + "\n", + "This is a somewhat abstract concept that is best understood by\n", + "example…\n", + "\n", + "Let’s begin by loading some data on GDP components that we collected from\n", + "the World Bank’s World Development Indicators Dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 72 entries, 0 to 71\n", + "Data columns (total 7 columns):\n", + "country 72 non-null object\n", + "year 72 non-null int64\n", + "GovExpend 72 non-null float64\n", + "Consumption 72 non-null float64\n", + "Exports 72 non-null float64\n", + "Imports 72 non-null float64\n", + "GDP 72 non-null float64\n", + "dtypes: float64(5), int64(1), object(1)\n", + "memory usage: 4.1+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDP
0Canada20170.3726651.0954750.5828310.6000311.868164
1Canada20160.3648991.0584260.5763940.5757751.814016
2Canada20150.3583031.0352080.5688590.5757931.794270
3Canada20140.3534851.0119880.5503230.5723441.782252
4Canada20130.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports GDP\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "1 Canada 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2 Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "3 Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "4 Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://datascience.quantecon.org/assets/data/wdi_data.csv\"\n", + "df = pd.read_csv(url)\n", + "df.info()\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We’ll also extract a couple smaller DataFrames we can use in examples." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDP
0Canada20170.3726651.0954750.5828310.6000311.868164
1Canada20160.3648991.0584260.5763940.5757751.814016
2Canada20150.3583031.0352080.5688590.5757931.794270
3Canada20140.3534851.0119880.5503230.5723441.782252
4Canada20130.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports GDP\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "1 Canada 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2 Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "3 Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "4 Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_small = df.head(5)\n", + "df_small" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDP
0Canada20170.3726651.0954750.5828310.6000311.868164
3Canada20140.3534851.0119880.5503230.5723441.782252
2Canada20150.3583031.0352080.5688590.5757931.794270
4Canada20130.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports GDP\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "3 Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "2 Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "4 Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_tiny = df.iloc[[0, 3, 2, 4], :]\n", + "df_tiny" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ImportsExports
00.6000310.582831
10.5757750.576394
20.5757930.568859
30.5723440.550323
40.5586360.518040
\n", + "
" + ], + "text/plain": [ + " Imports Exports\n", + "0 0.600031 0.582831\n", + "1 0.575775 0.576394\n", + "2 0.575793 0.568859\n", + "3 0.572344 0.550323\n", + "4 0.558636 0.518040" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "im_ex = df_small[[\"Imports\", \"Exports\"]]\n", + "im_ex_copy = im_ex.copy()\n", + "im_ex_copy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Observe what happens when we evaluate `im_ex + im_ex_copy`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ImportsExports
01.2000631.165661
11.1515501.152787
21.1515851.137718
31.1446881.100646
41.1172721.036081
\n", + "
" + ], + "text/plain": [ + " Imports Exports\n", + "0 1.200063 1.165661\n", + "1 1.151550 1.152787\n", + "2 1.151585 1.137718\n", + "3 1.144688 1.100646\n", + "4 1.117272 1.036081" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "im_ex + im_ex_copy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that this operated *elementwise*, meaning that the `+`\n", + "operation was applied to each element of `im_ex` and the corresponding\n", + "element of `im_ex_copy`.\n", + "\n", + "Let’s take a closer look at `df_tiny`:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDP
0Canada20170.3726651.0954750.5828310.6000311.868164
3Canada20140.3534851.0119880.5503230.5723441.782252
2Canada20150.3583031.0352080.5688590.5757931.794270
4Canada20130.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports GDP\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "3 Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "2 Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "4 Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_tiny" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Relative to `im_ex` notice a few things:\n", + "\n", + "- The row labeled `1` appears in `im_ex` but not `df_tiny`. \n", + "- For row labels that appear in both, they are not in the same position\n", + " within each DataFrame. \n", + "- Certain columns appear only in `df_tiny`. \n", + "- The `Imports` and `Exports` columns are the 6th and 5th columns of\n", + " `df_tiny` and the 1st and 2nd of `im_ex`, respectively. \n", + "\n", + "\n", + "Now, let’s see what happens when we try `df_tiny + im_ex`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConsumptionExportsGDPGovExpendImportscountryyear
0NaN1.165661NaNNaN1.200063NaNNaN
1NaNNaNNaNNaNNaNNaNNaN
2NaN1.137718NaNNaN1.151585NaNNaN
3NaN1.100646NaNNaN1.144688NaNNaN
4NaN1.036081NaNNaN1.117272NaNNaN
\n", + "
" + ], + "text/plain": [ + " Consumption Exports GDP GovExpend Imports country year\n", + "0 NaN 1.165661 NaN NaN 1.200063 NaN NaN\n", + "1 NaN NaN NaN NaN NaN NaN NaN\n", + "2 NaN 1.137718 NaN NaN 1.151585 NaN NaN\n", + "3 NaN 1.100646 NaN NaN 1.144688 NaN NaN\n", + "4 NaN 1.036081 NaN NaN 1.117272 NaN NaN" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "im_ex_tiny = df_tiny + im_ex\n", + "im_ex_tiny" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Whoa, a lot happened! Let’s break it down." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Automatic Alignment\n", + "\n", + "For all (row, column) combinations that appear in both DataFrames (e.g.\n", + "rows `[1, 3]` and columns `[Imports, Exports]`), the value of `im_ex_tiny`\n", + "is equal to `df_tiny.loc[row, col] + im_ex.loc[row, col]`.\n", + "\n", + "This happened even though the rows and columns were not in the same\n", + "order.\n", + "\n", + "We refer to this as pandas *aligning* the data for us.\n", + "\n", + "To see how awesome this is, think about how to do something similar in\n", + "Excel:\n", + "\n", + "- `df_tiny` and `im_ex` would be in different sheets. \n", + "- The index and column names would be the first column and row in each\n", + " sheet. \n", + "- We would have a third sheet to hold the sum. \n", + "- For each label in the first row and column of *either* the `df_tiny`\n", + " sheet or the `im_ex` sheet we would have to do a `IFELSE` to check\n", + " if the label exists in the other sheet and then a `VLOOKUP` to\n", + " extract the value. \n", + "\n", + "\n", + "In pandas, this happens automatically, behind the scenes, and *very\n", + "quickly*." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handling Missing Data\n", + "\n", + "For all elements in row `1` or columns\n", + "`[\"country\", \"year\", \"GovExpend\", \"Consumption\", \"GDP\"]`,\n", + "the value in `im_ex_tiny` is `NaN`.\n", + "\n", + "This is how pandas represents *missing data*.\n", + "\n", + "So, when pandas was trying to look up the values in `df_tiny` and `im_ex`, it could\n", + "only find a value in one DataFrame: the other value was missing.\n", + "\n", + "When pandas tries to add a number to something that is missing, it says\n", + "that the result is missing (spelled `NaN`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting the Index\n", + "\n", + "For a DataFrame `df`, the `df.set_index` method allows us to use one\n", + "(or more) of the DataFrame’s columns as the index.\n", + "\n", + "Here’s an example." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryGovExpendConsumptionExportsImportsGDP
year
2017Canada0.3726651.0954750.5828310.6000311.868164
2016Canada0.3648991.0584260.5763940.5757751.814016
2015Canada0.3583031.0352080.5688590.5757931.794270
2014Canada0.3534851.0119880.5503230.5723441.782252
2013Canada0.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " country GovExpend Consumption Exports Imports GDP\n", + "year \n", + "2017 Canada 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "2016 Canada 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2015 Canada 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "2014 Canada 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "2013 Canada 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# first, create the DataFrame\n", + "df_year = df.set_index([\"year\"])\n", + "df_year.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the year is on the index, we can use `.loc` to extract all the\n", + "data for a specific year." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryGovExpendConsumptionExportsImportsGDP
year
2010Canada0.3473320.9219520.4699490.5003411.613543
2010Germany0.6533861.9154811.4437351.2661263.417095
2010United Kingdom0.5211461.5985630.6908240.7450652.452900
2010United States2.51014310.1858361.8462802.36018314.992053
\n", + "
" + ], + "text/plain": [ + " country GovExpend Consumption Exports Imports GDP\n", + "year \n", + "2010 Canada 0.347332 0.921952 0.469949 0.500341 1.613543\n", + "2010 Germany 0.653386 1.915481 1.443735 1.266126 3.417095\n", + "2010 United Kingdom 0.521146 1.598563 0.690824 0.745065 2.452900\n", + "2010 United States 2.510143 10.185836 1.846280 2.360183 14.992053" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_year.loc[2010]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This would be helpful, for example, if we wanted to compute the difference\n", + "in the average of all our variables from one year to the next." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "GovExpend 0.033317\n", + "Consumption -0.042998\n", + "Exports -0.121425\n", + "Imports -0.140042\n", + "GDP -0.182610\n", + "dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_year.loc[2009].mean() - df_year.loc[2008].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that pandas did a few things for us.\n", + "\n", + "- After computing `.mean()`, the row labels (index) were the former column names. \n", + "- These column names were used to align data when we wanted asked pandas to\n", + " compute the difference. 
\n", + "\n", + "\n", + "Suppose that someone asked you, “What was the GDP in the US in 2010?”\n", + "\n", + "To compute that using `df_year` you might do something like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "14.992052727" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_year.loc[df_year[\"country\"] == \"United States\", \"GDP\"].loc[2010]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That was a lot of work!\n", + "\n", + "Now, suppose that after seeing you extract that data, your friend asks you\n", + "“What about GDP in Germany and the UK in 2010?”\n", + "\n", + "To answer that question, you might write." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "year\n", + "2010 3.417095\n", + "2010 2.452900\n", + "Name: GDP, dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_year.loc[df_year[\"country\"].isin([\"United Kingdom\", \"Germany\"]), \"GDP\"].loc[2010]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that this code is similar to the code above, but now provides a result\n", + "that is ambiguous.\n", + "\n", + "The two elements in the series both have the label 2010.\n", + "\n", + "How do we know which is which?\n", + "\n", + "We might think that the first value corresponds to the United Kingdom because\n", + "that is what we listed first in the call to `isin`, but we would be wrong!\n", + "\n", + "Let’s check." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryGovExpendConsumptionExportsImportsGDP
year
2010Canada0.3473320.9219520.4699490.5003411.613543
2010Germany0.6533861.9154811.4437351.2661263.417095
2010United Kingdom0.5211461.5985630.6908240.7450652.452900
2010United States2.51014310.1858361.8462802.36018314.992053
\n", + "
" + ], + "text/plain": [ + " country GovExpend Consumption Exports Imports GDP\n", + "year \n", + "2010 Canada 0.347332 0.921952 0.469949 0.500341 1.613543\n", + "2010 Germany 0.653386 1.915481 1.443735 1.266126 3.417095\n", + "2010 United Kingdom 0.521146 1.598563 0.690824 0.745065 2.452900\n", + "2010 United States 2.510143 10.185836 1.846280 2.360183 14.992053" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_year.loc[2010]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setting just the year as index has one more potential issue: we will\n", + "get data alignment only on the year, which may not be sufficient.\n", + "\n", + "To demonstrate this point, suppose now you are asked to use our WDI dataset\n", + "to compute an approximation for net exports and investment in 2009.\n", + "\n", + "As a seasoned economist, you would remember the expenditure formula for GDP is\n", + "written\n", + "\n", + "$$\n", + "GDP = Consumption + Investment + GovExpend + Net Exports\n", + "$$\n", + "\n", + "which we can rearrange to compute investment as a function of the variables in\n", + "our DataFrame…\n", + "\n", + "$$\n", + "Investment = GDP - Consumption - GovExpend - Net Exports\n", + "$$\n", + "\n", + "Note that we can compute NetExports as `Exports - Imports`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "year\n", + "2017 -0.017201\n", + "2016 0.000619\n", + "2015 -0.006934\n", + "2014 -0.022021\n", + "2013 -0.040596\n", + "2012 -0.041787\n", + "2011 -0.035878\n", + "2010 -0.030393\n", + "2009 0.000896\n", + "2008 0.004068\n", + "2007 0.032451\n", + "2006 0.053530\n", + "2005 0.072729\n", + "2004 0.091902\n", + "2003 0.097794\n", + "2002 0.121850\n", + "2001 0.122673\n", + "2000 0.118702\n", + "2017 0.264214\n", + "dtype: float64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nx = df_year[\"Exports\"] - df_year[\"Imports\"]\n", + "nx.head(19)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, suppose that we accidentally had a bug in our code that swapped\n", + "the data for Canada and Germany’s net exports in 2017.\n", + "\n", + ">**Note**\n", + ">\n", + ">This example is contrived, but if you were getting unclean data from\n", + "some resource or doing more complicated operations, this type of mistake\n", + "becomes increasingly likely." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "year\n", + "2017 0.264214\n", + "2016 0.000619\n", + "2015 -0.006934\n", + "2014 -0.022021\n", + "2013 -0.040596\n", + "2012 -0.041787\n", + "2011 -0.035878\n", + "2010 -0.030393\n", + "2009 0.000896\n", + "2008 0.004068\n", + "2007 0.032451\n", + "2006 0.053530\n", + "2005 0.072729\n", + "2004 0.091902\n", + "2003 0.097794\n", + "2002 0.121850\n", + "2001 0.122673\n", + "2000 0.118702\n", + "2017 -0.017201\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ca17 = nx.iloc[[0]]\n", + "g17 = nx.iloc[[18]]\n", + "nx.iloc[[0]] = g17\n", + "nx.iloc[[18]] = ca17\n", + "\n", + "nx.head(19)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that if we now add `nx` to the DataFrame and compute investment\n", + "pandas doesn’t complain." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryGovExpendConsumptionExportsImportsGDPNetExportsInvestment
year
2017Canada0.3726651.0954750.5828310.6000311.8681640.2642140.135811
2016Canada0.3648991.0584260.5763940.5757751.8140160.0006190.390072
2015Canada0.3583031.0352080.5688590.5757931.794270-0.0069340.407692
2014Canada0.3534851.0119880.5503230.5723441.782252-0.0220210.438800
2013Canada0.3515410.9864000.5180400.5586361.732714-0.0405960.435369
2012Canada0.3543420.9612260.5059690.5477561.693428-0.0417870.419647
2011Canada0.3518870.9431450.4923490.5282271.664240-0.0358780.405086
2010Canada0.3473320.9219520.4699490.5003411.613543-0.0303930.374652
2009Canada0.3396860.8900780.4406920.4397961.5652910.0008960.334631
2008Canada0.3307660.8896020.5063500.5022811.6128620.0040680.388425
2007Canada0.3187770.8640120.5304530.4980021.5968760.0324510.381636
2006Canada0.3113820.8276430.5244610.4709311.5646080.0535300.372053
2005Canada0.3030430.7943900.5199500.4472221.5246080.0727290.354447
2004Canada0.2998540.7643570.5086570.4167541.4773170.0919020.321203
2003Canada0.2943350.7417960.4819930.3841991.4330890.0977940.299164
2002Canada0.2860940.7219740.4904650.3686151.4077250.1218500.277806
2001Canada0.2797670.6942300.4846960.3620231.3665900.1226730.269921
2000Canada0.2705530.6777130.4995260.3808231.3428050.1187020.275837
2017Germany0.7455792.1120091.9305631.6663483.883870-0.0172011.043482
\n", + "
" + ], + "text/plain": [ + " country GovExpend Consumption Exports Imports GDP \\\n", + "year \n", + "2017 Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "2016 Canada 0.364899 1.058426 0.576394 0.575775 1.814016 \n", + "2015 Canada 0.358303 1.035208 0.568859 0.575793 1.794270 \n", + "2014 Canada 0.353485 1.011988 0.550323 0.572344 1.782252 \n", + "2013 Canada 0.351541 0.986400 0.518040 0.558636 1.732714 \n", + "2012 Canada 0.354342 0.961226 0.505969 0.547756 1.693428 \n", + "2011 Canada 0.351887 0.943145 0.492349 0.528227 1.664240 \n", + "2010 Canada 0.347332 0.921952 0.469949 0.500341 1.613543 \n", + "2009 Canada 0.339686 0.890078 0.440692 0.439796 1.565291 \n", + "2008 Canada 0.330766 0.889602 0.506350 0.502281 1.612862 \n", + "2007 Canada 0.318777 0.864012 0.530453 0.498002 1.596876 \n", + "2006 Canada 0.311382 0.827643 0.524461 0.470931 1.564608 \n", + "2005 Canada 0.303043 0.794390 0.519950 0.447222 1.524608 \n", + "2004 Canada 0.299854 0.764357 0.508657 0.416754 1.477317 \n", + "2003 Canada 0.294335 0.741796 0.481993 0.384199 1.433089 \n", + "2002 Canada 0.286094 0.721974 0.490465 0.368615 1.407725 \n", + "2001 Canada 0.279767 0.694230 0.484696 0.362023 1.366590 \n", + "2000 Canada 0.270553 0.677713 0.499526 0.380823 1.342805 \n", + "2017 Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "\n", + " NetExports Investment \n", + "year \n", + "2017 0.264214 0.135811 \n", + "2016 0.000619 0.390072 \n", + "2015 -0.006934 0.407692 \n", + "2014 -0.022021 0.438800 \n", + "2013 -0.040596 0.435369 \n", + "2012 -0.041787 0.419647 \n", + "2011 -0.035878 0.405086 \n", + "2010 -0.030393 0.374652 \n", + "2009 0.000896 0.334631 \n", + "2008 0.004068 0.388425 \n", + "2007 0.032451 0.381636 \n", + "2006 0.053530 0.372053 \n", + "2005 0.072729 0.354447 \n", + "2004 0.091902 0.321203 \n", + "2003 0.097794 0.299164 \n", + "2002 0.121850 0.277806 \n", + "2001 0.122673 0.269921 \n", + "2000 0.118702 0.275837 \n", + "2017 -0.017201 1.043482 " + ] + }, + 
"execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_year[\"NetExports\"] = nx\n", + "df_year[\"Investment\"] = df_year.eval(\"GDP - Consumption - GovExpend - NetExports\")\n", + "df_year.head(19)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because we didn’t also have data alignment on the country, we would have understated Canada’s investment by 281 billion USD and overstated Germany’s by the\n", + "same amount.\n", + "\n", + "To make these types of operations easier, we need to include both the year\n", + "and country in the index…" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting a Hierarchical Index\n", + "\n", + "Including multiple columns in the index is advantageous in some situations.\n", + "\n", + "These situations might include:\n", + "\n", + "- When we need more than one piece of information (column) to identify an\n", + " observation (as in the Germany and UK GDP example above) \n", + "- When we need data-alignment by more than one column \n", + "\n", + "\n", + "To achieve multiple columns in the index, we pass a list of multiple column\n", + "names to `set_index`." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20170.3726651.0954750.5828310.6000311.868164
20160.3648991.0584260.5763940.5757751.814016
20150.3583031.0352080.5688590.5757931.794270
20140.3534851.0119880.5503230.5723441.782252
20130.3515410.9864000.5180400.5586361.732714
20120.3543420.9612260.5059690.5477561.693428
20110.3518870.9431450.4923490.5282271.664240
20100.3473320.9219520.4699490.5003411.613543
20090.3396860.8900780.4406920.4397961.565291
20080.3307660.8896020.5063500.5022811.612862
20070.3187770.8640120.5304530.4980021.596876
20060.3113820.8276430.5244610.4709311.564608
20050.3030430.7943900.5199500.4472221.524608
20040.2998540.7643570.5086570.4167541.477317
20030.2943350.7417960.4819930.3841991.433089
20020.2860940.7219740.4904650.3686151.407725
20010.2797670.6942300.4846960.3620231.366590
20000.2705530.6777130.4995260.3808231.342805
Germany20170.7455792.1120091.9305631.6663483.883870
20160.7340142.0756151.8449491.5894953.801859
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + " 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + " 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + " 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + " 2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + " 2012 0.354342 0.961226 0.505969 0.547756 1.693428\n", + " 2011 0.351887 0.943145 0.492349 0.528227 1.664240\n", + " 2010 0.347332 0.921952 0.469949 0.500341 1.613543\n", + " 2009 0.339686 0.890078 0.440692 0.439796 1.565291\n", + " 2008 0.330766 0.889602 0.506350 0.502281 1.612862\n", + " 2007 0.318777 0.864012 0.530453 0.498002 1.596876\n", + " 2006 0.311382 0.827643 0.524461 0.470931 1.564608\n", + " 2005 0.303043 0.794390 0.519950 0.447222 1.524608\n", + " 2004 0.299854 0.764357 0.508657 0.416754 1.477317\n", + " 2003 0.294335 0.741796 0.481993 0.384199 1.433089\n", + " 2002 0.286094 0.721974 0.490465 0.368615 1.407725\n", + " 2001 0.279767 0.694230 0.484696 0.362023 1.366590\n", + " 2000 0.270553 0.677713 0.499526 0.380823 1.342805\n", + "Germany 2017 0.745579 2.112009 1.930563 1.666348 3.883870\n", + " 2016 0.734014 2.075615 1.844949 1.589495 3.801859" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi = df.set_index([\"country\", \"year\"])\n", + "wdi.head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that in the display above, the row labels seem to have two\n", + "*levels* now.\n", + "\n", + "The *outer* (or left-most) level is named `country` and the *inner* (or\n", + "right-most) level is named `year`.\n", + "\n", + "When a DataFrame’s index has multiple levels, we (and the pandas documentation)\n", + "refer to the DataFrame as having a hierarchical index." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Slicing a Hierarchical Index\n", + "\n", + "Now, we can answer our friend’s questions in a much more straightforward way." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "14.992052727" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[(\"United States\", 2010), \"GDP\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "country year\n", + "Germany 2010 3.417095\n", + "United Kingdom 2010 2.452900\n", + "Name: GDP, dtype: float64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[([\"United Kingdom\", \"Germany\"], 2010), \"GDP\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As shown above, we can use `wdi.loc` to extract different slices of our\n", + "national accounts data.\n", + "\n", + "The rules for using `.loc` with a hierarchically-indexed DataFrame are\n", + "similar to the ones we’ve learned for standard DataFrames, but they are a bit\n", + "more elaborate as we now have more structure to our data.\n", + "\n", + "**Slicing rules**\n", + "\n", + "pandas slicing reacts differently to `list`s and `tuple`s.\n", + "\n", + "It does this to provide more flexibility to select the\n", + "data you want.\n", + "\n", + "`list` in row slicing will be an “or” operation, where it chooses rows\n", + "based on whether the index value corresponds to any element of the list.\n", + "\n", + "`tuple` in row slicing will be used to denote a single hierarchical\n", + "index and must include a value for each level.\n", + "\n", + "**Row slicing examples**\n", + "\n", + "1. 
`wdi.loc[\"United States\"]`: all rows where the *outer-most* index value is\n", + " equal to `\"United States\"` \n", + "1. `wdi.loc[(\"United States\", 2010)]`: all rows where the *outer-most* index value\n", + " is equal to `\"United States\"` and the second level is equal to `2010` \n", + "1. `wdi.loc[[\"United States\", \"Canada\"]]`: all rows where the *outer-most* index is\n", + " either `\"United States\"` or `\"Canada\"` \n", + "1. `wdi.loc[([\"United States\", \"Canada\"], [2010, 2011]), :]`: all rows where the\n", + " *outer-most* index is either `\"United States\"` or `\"Canada\"` AND where the\n", + " second level index is either `2010` or `2011` \n", + "1. `wdi.loc[[(\"United States\", 2010), (\"Canada\", 2011)], :]`: all rows where the\n", + " two hierarchical indices are either `(\"United States\", 2010)` or\n", + " `(\"Canada\", 2011)` \n", + "\n", + "\n", + "We can also restrict `.loc` to extract certain columns by doing:\n", + "\n", + "1. `wdi.loc[rows, \"GDP\"]`: return the rows specified by rows (see rules\n", + " above) and only the column named `GDP` (returned object will be a\n", + " Series) \n", + "1. `wdi.loc[rows, [\"GDP\", \"Consumption\"]]`: return the rows specified by rows\n", + " (see rules above) and only columns `GDP` and `Consumption` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alignment with `MultiIndex`\n", + "\n", + "The data alignment features we talked about above also apply to a\n", + "`MultiIndex` DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `pd.IndexSlice`\n", + "\n", + "When we want to extract rows for a few values of the outer index and all\n", + "values for an inner index level, we can use the convenient\n", + "`df.loc[[id11, id22]]` shorthand.\n", + "\n", + "We can use this notation to extract all the data for the United States and\n", + "Canada." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20170.3726651.0954750.5828310.6000311.868164
20160.3648991.0584260.5763940.5757751.814016
20150.3583031.0352080.5688590.5757931.794270
20140.3534851.0119880.5503230.5723441.782252
20130.3515410.9864000.5180400.5586361.732714
20120.3543420.9612260.5059690.5477561.693428
20110.3518870.9431450.4923490.5282271.664240
20100.3473320.9219520.4699490.5003411.613543
20090.3396860.8900780.4406920.4397961.565291
20080.3307660.8896020.5063500.5022811.612862
20070.3187770.8640120.5304530.4980021.596876
20060.3113820.8276430.5244610.4709311.564608
20050.3030430.7943900.5199500.4472221.524608
20040.2998540.7643570.5086570.4167541.477317
20030.2943350.7417960.4819930.3841991.433089
20020.2860940.7219740.4904650.3686151.407725
20010.2797670.6942300.4846960.3620231.366590
20000.2705530.6777130.4995260.3808231.342805
United States20172.40574312.0192662.2870713.06995417.348627
20162.40798111.7221332.2199372.93600416.972348
20152.37313011.4098002.2222282.88133716.710459
20142.33407111.0006192.2095552.73222816.242526
20132.35338110.6872142.1186392.60019815.853796
20122.39887310.5340422.0455092.56067715.567038
20112.43437810.3780601.9780832.49319415.224555
20102.51014310.1858361.8462802.36018314.992053
20092.50739010.0106871.6464322.08629914.617299
20082.40777110.1378471.7973472.40034914.997756
20072.35198710.1593871.7010962.45501615.018268
20062.3149579.9385031.5649202.39518914.741688
20052.2870229.6430981.4312052.24624614.332500
20042.2679999.3114311.3359782.10858513.846058
20032.2335198.9747081.2181991.89282513.339312
20022.1931888.6983061.1921801.80410512.968263
20012.1120388.4804611.2132531.74079712.746262
20002.0405008.2720971.2877391.79099512.620268
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + " 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + " 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + " 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + " 2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + " 2012 0.354342 0.961226 0.505969 0.547756 1.693428\n", + " 2011 0.351887 0.943145 0.492349 0.528227 1.664240\n", + " 2010 0.347332 0.921952 0.469949 0.500341 1.613543\n", + " 2009 0.339686 0.890078 0.440692 0.439796 1.565291\n", + " 2008 0.330766 0.889602 0.506350 0.502281 1.612862\n", + " 2007 0.318777 0.864012 0.530453 0.498002 1.596876\n", + " 2006 0.311382 0.827643 0.524461 0.470931 1.564608\n", + " 2005 0.303043 0.794390 0.519950 0.447222 1.524608\n", + " 2004 0.299854 0.764357 0.508657 0.416754 1.477317\n", + " 2003 0.294335 0.741796 0.481993 0.384199 1.433089\n", + " 2002 0.286094 0.721974 0.490465 0.368615 1.407725\n", + " 2001 0.279767 0.694230 0.484696 0.362023 1.366590\n", + " 2000 0.270553 0.677713 0.499526 0.380823 1.342805\n", + "United States 2017 2.405743 12.019266 2.287071 3.069954 17.348627\n", + " 2016 2.407981 11.722133 2.219937 2.936004 16.972348\n", + " 2015 2.373130 11.409800 2.222228 2.881337 16.710459\n", + " 2014 2.334071 11.000619 2.209555 2.732228 16.242526\n", + " 2013 2.353381 10.687214 2.118639 2.600198 15.853796\n", + " 2012 2.398873 10.534042 2.045509 2.560677 15.567038\n", + " 2011 2.434378 10.378060 1.978083 2.493194 15.224555\n", + " 2010 2.510143 10.185836 1.846280 2.360183 14.992053\n", + " 2009 2.507390 10.010687 1.646432 2.086299 14.617299\n", + " 2008 2.407771 10.137847 1.797347 2.400349 14.997756\n", + " 2007 2.351987 10.159387 1.701096 2.455016 15.018268\n", + " 2006 2.314957 9.938503 1.564920 2.395189 14.741688\n", + " 2005 2.287022 9.643098 1.431205 2.246246 14.332500\n", + " 2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + " 
2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + " 2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + " 2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + " 2000 2.040500 8.272097 1.287739 1.790995 12.620268" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[[\"United States\", \"Canada\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, suppose we wanted to extract the data for all countries, but only the\n", + "years 2005, 2007, and 2009.\n", + "\n", + "We cannot do this using `wdi.loc` because the year is on the second level,\n", + "not outer-most level of our index.\n", + "\n", + "To get around this limitation, we can use the `pd.IndexSlice` helper.\n", + "\n", + "Here’s an example." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20090.3396860.8900780.4406920.4397961.565291
20070.3187770.8640120.5304530.4980021.596876
20050.3030430.7943900.5199500.4472221.524608
Germany20090.6450231.9083931.2605251.1219143.283144
20070.6056241.8942191.4424361.2138353.441356
20050.5911841.8662531.1752001.0280943.213777
United Kingdom20090.5197161.5871520.6538300.6890112.411632
20070.5045491.6447890.7102000.7676992.527327
20050.4908061.5789140.6400880.7159512.403352
United States20092.50739010.0106871.6464322.08629914.617299
20072.35198710.1593871.7010962.45501615.018268
20052.2870229.6430981.4312052.24624614.332500
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2009 0.339686 0.890078 0.440692 0.439796 1.565291\n", + " 2007 0.318777 0.864012 0.530453 0.498002 1.596876\n", + " 2005 0.303043 0.794390 0.519950 0.447222 1.524608\n", + "Germany 2009 0.645023 1.908393 1.260525 1.121914 3.283144\n", + " 2007 0.605624 1.894219 1.442436 1.213835 3.441356\n", + " 2005 0.591184 1.866253 1.175200 1.028094 3.213777\n", + "United Kingdom 2009 0.519716 1.587152 0.653830 0.689011 2.411632\n", + " 2007 0.504549 1.644789 0.710200 0.767699 2.527327\n", + " 2005 0.490806 1.578914 0.640088 0.715951 2.403352\n", + "United States 2009 2.507390 10.010687 1.646432 2.086299 14.617299\n", + " 2007 2.351987 10.159387 1.701096 2.455016 15.018268\n", + " 2005 2.287022 9.643098 1.431205 2.246246 14.332500" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[pd.IndexSlice[:, [2005, 2007, 2009]], :]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the `:` in the first part of `[:, [2005, 2007, 2009]]`\n", + "instructed pandas to give us rows for all values of the outer-most index\n", + "level and that the `:` just before `]` said grab all the columns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-index Columns\n", + "\n", + "The functionality of `MultiIndex` also applies to the column names.\n", + "\n", + "Let’s see how it works." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryCanada...United States
year2017201620152014201320122011201020092008...2009200820072006200520042003200220012000
GovExpend0.3726650.3648990.3583030.3534850.3515410.3543420.3518870.3473320.3396860.330766...2.5073902.4077712.3519872.3149572.2870222.2679992.2335192.1931882.1120382.040500
Consumption1.0954751.0584261.0352081.0119880.9864000.9612260.9431450.9219520.8900780.889602...10.01068710.13784710.1593879.9385039.6430989.3114318.9747088.6983068.4804618.272097
Exports0.5828310.5763940.5688590.5503230.5180400.5059690.4923490.4699490.4406920.506350...1.6464321.7973471.7010961.5649201.4312051.3359781.2181991.1921801.2132531.287739
Imports0.6000310.5757750.5757930.5723440.5586360.5477560.5282270.5003410.4397960.502281...2.0862992.4003492.4550162.3951892.2462462.1085851.8928251.8041051.7407971.790995
GDP1.8681641.8140161.7942701.7822521.7327141.6934281.6642401.6135431.5652911.612862...14.61729914.99775615.01826814.74168814.33250013.84605813.33931212.96826312.74626212.620268
\n", + "

5 rows × 72 columns

\n", + "
" + ], + "text/plain": [ + "country Canada \\\n", + "year 2017 2016 2015 2014 2013 2012 \n", + "GovExpend 0.372665 0.364899 0.358303 0.353485 0.351541 0.354342 \n", + "Consumption 1.095475 1.058426 1.035208 1.011988 0.986400 0.961226 \n", + "Exports 0.582831 0.576394 0.568859 0.550323 0.518040 0.505969 \n", + "Imports 0.600031 0.575775 0.575793 0.572344 0.558636 0.547756 \n", + "GDP 1.868164 1.814016 1.794270 1.782252 1.732714 1.693428 \n", + "\n", + "country ... United States \\\n", + "year 2011 2010 2009 2008 ... 2009 \n", + "GovExpend 0.351887 0.347332 0.339686 0.330766 ... 2.507390 \n", + "Consumption 0.943145 0.921952 0.890078 0.889602 ... 10.010687 \n", + "Exports 0.492349 0.469949 0.440692 0.506350 ... 1.646432 \n", + "Imports 0.528227 0.500341 0.439796 0.502281 ... 2.086299 \n", + "GDP 1.664240 1.613543 1.565291 1.612862 ... 14.617299 \n", + "\n", + "country \\\n", + "year 2008 2007 2006 2005 2004 2003 \n", + "GovExpend 2.407771 2.351987 2.314957 2.287022 2.267999 2.233519 \n", + "Consumption 10.137847 10.159387 9.938503 9.643098 9.311431 8.974708 \n", + "Exports 1.797347 1.701096 1.564920 1.431205 1.335978 1.218199 \n", + "Imports 2.400349 2.455016 2.395189 2.246246 2.108585 1.892825 \n", + "GDP 14.997756 15.018268 14.741688 14.332500 13.846058 13.339312 \n", + "\n", + "country \n", + "year 2002 2001 2000 \n", + "GovExpend 2.193188 2.112038 2.040500 \n", + "Consumption 8.698306 8.480461 8.272097 \n", + "Exports 1.192180 1.213253 1.287739 \n", + "Imports 1.804105 1.740797 1.790995 \n", + "GDP 12.968263 12.746262 12.620268 \n", + "\n", + "[5 rows x 72 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdiT = wdi.T # .T means \"transpose\" or \"swap rows and columns\"\n", + "wdiT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that `wdiT` seems to have two levels of names for the columns.\n", + "\n", + "The same logic laid out in the above row slicing 
rules applies when we\n", + "have a hierarchical index for column names." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
year201720162015201420132012201120102009200820072006200520042003200220012000
GovExpend2.4057432.4079812.3731302.3340712.3533812.3988732.4343782.5101432.5073902.4077712.3519872.3149572.2870222.2679992.2335192.1931882.1120382.040500
Consumption12.01926611.72213311.40980011.00061910.68721410.53404210.37806010.18583610.01068710.13784710.1593879.9385039.6430989.3114318.9747088.6983068.4804618.272097
Exports2.2870712.2199372.2222282.2095552.1186392.0455091.9780831.8462801.6464321.7973471.7010961.5649201.4312051.3359781.2181991.1921801.2132531.287739
Imports3.0699542.9360042.8813372.7322282.6001982.5606772.4931942.3601832.0862992.4003492.4550162.3951892.2462462.1085851.8928251.8041051.7407971.790995
GDP17.34862716.97234816.71045916.24252615.85379615.56703815.22455514.99205314.61729914.99775615.01826814.74168814.33250013.84605813.33931212.96826312.74626212.620268
\n", + "
" + ], + "text/plain": [ + "year 2017 2016 2015 2014 2013 2012 \\\n", + "GovExpend 2.405743 2.407981 2.373130 2.334071 2.353381 2.398873 \n", + "Consumption 12.019266 11.722133 11.409800 11.000619 10.687214 10.534042 \n", + "Exports 2.287071 2.219937 2.222228 2.209555 2.118639 2.045509 \n", + "Imports 3.069954 2.936004 2.881337 2.732228 2.600198 2.560677 \n", + "GDP 17.348627 16.972348 16.710459 16.242526 15.853796 15.567038 \n", + "\n", + "year 2011 2010 2009 2008 2007 2006 \\\n", + "GovExpend 2.434378 2.510143 2.507390 2.407771 2.351987 2.314957 \n", + "Consumption 10.378060 10.185836 10.010687 10.137847 10.159387 9.938503 \n", + "Exports 1.978083 1.846280 1.646432 1.797347 1.701096 1.564920 \n", + "Imports 2.493194 2.360183 2.086299 2.400349 2.455016 2.395189 \n", + "GDP 15.224555 14.992053 14.617299 14.997756 15.018268 14.741688 \n", + "\n", + "year 2005 2004 2003 2002 2001 2000 \n", + "GovExpend 2.287022 2.267999 2.233519 2.193188 2.112038 2.040500 \n", + "Consumption 9.643098 9.311431 8.974708 8.698306 8.480461 8.272097 \n", + "Exports 1.431205 1.335978 1.218199 1.192180 1.213253 1.287739 \n", + "Imports 2.246246 2.108585 1.892825 1.804105 1.740797 1.790995 \n", + "GDP 14.332500 13.846058 13.339312 12.968263 12.746262 12.620268 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdiT.loc[:, \"United States\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryCanada...United States
year2017201620152014201320122011201020092008...2009200820072006200520042003200220012000
GovExpend0.3726650.3648990.3583030.3534850.3515410.3543420.3518870.3473320.3396860.330766...2.5073902.4077712.3519872.3149572.2870222.2679992.2335192.1931882.1120382.040500
Consumption1.0954751.0584261.0352081.0119880.9864000.9612260.9431450.9219520.8900780.889602...10.01068710.13784710.1593879.9385039.6430989.3114318.9747088.6983068.4804618.272097
Exports0.5828310.5763940.5688590.5503230.5180400.5059690.4923490.4699490.4406920.506350...1.6464321.7973471.7010961.5649201.4312051.3359781.2181991.1921801.2132531.287739
Imports0.6000310.5757750.5757930.5723440.5586360.5477560.5282270.5003410.4397960.502281...2.0862992.4003492.4550162.3951892.2462462.1085851.8928251.8041051.7407971.790995
GDP1.8681641.8140161.7942701.7822521.7327141.6934281.6642401.6135431.5652911.612862...14.61729914.99775615.01826814.74168814.33250013.84605813.33931212.96826312.74626212.620268
\n", + "

5 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + "country Canada \\\n", + "year 2017 2016 2015 2014 2013 2012 \n", + "GovExpend 0.372665 0.364899 0.358303 0.353485 0.351541 0.354342 \n", + "Consumption 1.095475 1.058426 1.035208 1.011988 0.986400 0.961226 \n", + "Exports 0.582831 0.576394 0.568859 0.550323 0.518040 0.505969 \n", + "Imports 0.600031 0.575775 0.575793 0.572344 0.558636 0.547756 \n", + "GDP 1.868164 1.814016 1.794270 1.782252 1.732714 1.693428 \n", + "\n", + "country ... United States \\\n", + "year 2011 2010 2009 2008 ... 2009 \n", + "GovExpend 0.351887 0.347332 0.339686 0.330766 ... 2.507390 \n", + "Consumption 0.943145 0.921952 0.890078 0.889602 ... 10.010687 \n", + "Exports 0.492349 0.469949 0.440692 0.506350 ... 1.646432 \n", + "Imports 0.528227 0.500341 0.439796 0.502281 ... 2.086299 \n", + "GDP 1.664240 1.613543 1.565291 1.612862 ... 14.617299 \n", + "\n", + "country \\\n", + "year 2008 2007 2006 2005 2004 2003 \n", + "GovExpend 2.407771 2.351987 2.314957 2.287022 2.267999 2.233519 \n", + "Consumption 10.137847 10.159387 9.938503 9.643098 9.311431 8.974708 \n", + "Exports 1.797347 1.701096 1.564920 1.431205 1.335978 1.218199 \n", + "Imports 2.400349 2.455016 2.395189 2.246246 2.108585 1.892825 \n", + "GDP 14.997756 15.018268 14.741688 14.332500 13.846058 13.339312 \n", + "\n", + "country \n", + "year 2002 2001 2000 \n", + "GovExpend 2.193188 2.112038 2.040500 \n", + "Consumption 8.698306 8.480461 8.272097 \n", + "Exports 1.192180 1.213253 1.287739 \n", + "Imports 1.804105 1.740797 1.790995 \n", + "GDP 12.968263 12.746262 12.620268 \n", + "\n", + "[5 rows x 36 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdiT.loc[:, [\"United States\", \"Canada\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryCanadaUnited States
year20102010
GovExpend0.3473322.510143
Consumption0.92195210.185836
Exports0.4699491.846280
Imports0.5003412.360183
GDP1.61354314.992053
\n", + "
" + ], + "text/plain": [ + "country Canada United States\n", + "year 2010 2010\n", + "GovExpend 0.347332 2.510143\n", + "Consumption 0.921952 10.185836\n", + "Exports 0.469949 1.846280\n", + "Imports 0.500341 2.360183\n", + "GDP 1.613543 14.992053" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdiT.loc[:, ([\"United States\", \"Canada\"], 2010)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-setting the Index\n", + "\n", + "The `df.reset_index` method will move one or more levels of the index\n", + "back into the DataFrame as normal columns.\n", + "\n", + "With no additional arguments, it moves all levels out of the index and\n", + "sets the index of the returned DataFrame to the default of\n", + "`range(df.shape[0])`." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDP
0Canada20170.3726651.0954750.5828310.6000311.868164
1Canada20160.3648991.0584260.5763940.5757751.814016
2Canada20150.3583031.0352080.5688590.5757931.794270
3Canada20140.3534851.0119880.5503230.5723441.782252
4Canada20130.3515410.9864000.5180400.5586361.732714
........................
67United States20042.2679999.3114311.3359782.10858513.846058
68United States20032.2335198.9747081.2181991.89282513.339312
69United States20022.1931888.6983061.1921801.80410512.968263
70United States20012.1120388.4804611.2132531.74079712.746262
71United States20002.0405008.2720971.2877391.79099512.620268
\n", + "

72 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports GDP\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "1 Canada 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2 Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "3 Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "4 Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + ".. ... ... ... ... ... ... ...\n", + "67 United States 2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + "68 United States 2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + "69 United States 2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + "70 United States 2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + "71 United States 2000 2.040500 8.272097 1.287739 1.790995 12.620268\n", + "\n", + "[72 rows x 7 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.reset_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Choose the Index Carefully\n", + "\n", + "So, now that we know that we use index and column names for\n", + "aligning data, “how should we pick the index?” is a natural question to ask.\n", + "\n", + "To guide us to the right answer, we will list the first two components\n", + "of [Hadley Wickham’s](http://hadley.nz/) description of [tidy\n", + "data](http://vita.had.co.nz/papers/tidy-data.html):\n", + "\n", + "1. Each column should have one variable. \n", + "1. Each row should have one observation. 
\n", + "\n", + "\n", + "If we strive to have our data in a tidy form (we should), then when\n", + "choosing the index, we should set:\n", + "\n", + "- the row labels (index) to be a unique identifier for an observation\n", + " of data \n", + "- the column names to identify one variable \n", + "\n", + "\n", + "For example, suppose we are looking at data on interest rates.\n", + "\n", + "Each column might represent one bond or asset and each row might\n", + "represent the date.\n", + "\n", + "Using hierarchical row and column indices allows us to store higher-\n", + "dimensional data in our (inherently) two-dimensional DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Know Your Goal\n", + "\n", + "The correct column(s) to choose for the index often depends on the context of\n", + "your analysis.\n", + "\n", + "For example, if I were studying how GDP and consumption evolved over time for\n", + "various countries, I would want time (year) and country name on the index.\n", + "\n", + "On the other hand, if I were trying to look at the differences across countries\n", + "and variables within a particular year, I may opt to put the country and\n", + "variable on the index and have years be columns.\n", + "\n", + "Following the tidy data rules above and thinking about how you intend to *use*\n", + "the data – and a little practice – will enable you to consistently select the\n", + "correct index." 
+ ] + } + ], + "metadata": { + "date": 1584040764.873878, + "filename": "the_index.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "The Index" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/3_the_index_exercises.ipynb b/Session_7/3_the_index_exercises.ipynb new file mode 100644 index 0000000..e631853 --- /dev/null +++ b/Session_7/3_the_index_exercises.ipynb @@ -0,0 +1,3443 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: scikit-learn 
in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.8.0)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: six>=1.0.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow->qeds) (1.12.0)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) 
(4.4.1)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n" + ] + } + ], + "source": [ + "! 
pip install qeds\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Index - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For these exercises we load data on GDP components collected from the World Bank’s World Development Indicators Dataset.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For the purpose of this exercise we extract smaller DataFrames.**" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "url = \"https://datascience.quantecon.org/assets/data/wdi_data.csv\"\n", + "df = pd.read_csv(url)\n", + "#df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "df_small = df.head(5)\n", + "#df_small" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "df_tiny = df.iloc[[0, 3, 2, 4], :]\n", + "#df_tiny" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "im_ex = df_small[[\"Imports\", \"Exports\"]]\n", + "im_ex_copy = im_ex.copy()\n", + "#im_ex_copy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Create a new DataFrame as follows: im_ex_tiny = df_tiny + im_ex. Analyze the outcome.**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConsumptionExportsGDPGovExpendImportscountryyear
0NaN1.165661NaNNaN1.200063NaNNaN
1NaNNaNNaNNaNNaNNaNNaN
2NaN1.137718NaNNaN1.151585NaNNaN
3NaN1.100646NaNNaN1.144688NaNNaN
4NaN1.036081NaNNaN1.117272NaNNaN
\n", + "
" + ], + "text/plain": [ + " Consumption Exports GDP GovExpend Imports country year\n", + "0 NaN 1.165661 NaN NaN 1.200063 NaN NaN\n", + "1 NaN NaN NaN NaN NaN NaN NaN\n", + "2 NaN 1.137718 NaN NaN 1.151585 NaN NaN\n", + "3 NaN 1.100646 NaN NaN 1.144688 NaN NaN\n", + "4 NaN 1.036081 NaN NaN 1.117272 NaN NaN" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "im_ex_tiny = df_tiny + im_ex\n", + "im_ex_tiny" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **What happens when you apply the mean method to im_ex_tiny? In particular, what happens to columns that have missing data?**\n", + "\n", + " - HINT: also looking at the output of the sum method might help" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Consumption NaN\n", + "Exports 1.110027\n", + "GDP NaN\n", + "GovExpend NaN\n", + "Imports 1.153402\n", + "country NaN\n", + "year NaN\n", + "dtype: float64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "im_ex_tiny.mean()\n", + "\n", + "#we get missing values for the columns where all values are missing (NaN)\n", + "#the mean is computed for the other columns even though there are missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Consumption 0.000000\n", + "Exports 4.440106\n", + "GDP 0.000000\n", + "GovExpend 0.000000\n", + "Imports 4.613608\n", + "country 0.000000\n", + "year 0.000000\n", + "dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "im_ex_tiny.sum() \n", + "\n", + "#we get 0s for the columns where all values are missing (NaN)\n", + "#the sum is computed for the other columns (even though there are missing values)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Exercises 2-6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We create a new DataFrame to use throughout the exercises 2-6. Now we set \"country\" and \"year\" as indices.**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20170.3726651.0954750.5828310.6000311.868164
20160.3648991.0584260.5763940.5757751.814016
20150.3583031.0352080.5688590.5757931.794270
20140.3534851.0119880.5503230.5723441.782252
20130.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + " 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + " 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + " 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + " 2013 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi = df.set_index([\"country\", \"year\"])\n", + "wdi.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For each of the examples below do the following and write your answers:**\n", + "\n", + " - Determine which of the rules above applies.\n", + " - Identify the type of the returned value.\n", + " - Explain why the slicing operation returned the data it did." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20170.3726651.0954750.5828310.6000311.868164
20160.3648991.0584260.5763940.5757751.814016
20150.3583031.0352080.5688590.5757931.794270
20140.3534851.0119880.5503230.5723441.782252
20130.3515410.9864000.5180400.5586361.732714
20120.3543420.9612260.5059690.5477561.693428
20110.3518870.9431450.4923490.5282271.664240
20100.3473320.9219520.4699490.5003411.613543
20090.3396860.8900780.4406920.4397961.565291
20080.3307660.8896020.5063500.5022811.612862
20070.3187770.8640120.5304530.4980021.596876
20060.3113820.8276430.5244610.4709311.564608
20050.3030430.7943900.5199500.4472221.524608
20040.2998540.7643570.5086570.4167541.477317
20030.2943350.7417960.4819930.3841991.433089
20020.2860940.7219740.4904650.3686151.407725
20010.2797670.6942300.4846960.3620231.366590
20000.2705530.6777130.4995260.3808231.342805
United States20172.40574312.0192662.2870713.06995417.348627
20162.40798111.7221332.2199372.93600416.972348
20152.37313011.4098002.2222282.88133716.710459
20142.33407111.0006192.2095552.73222816.242526
20132.35338110.6872142.1186392.60019815.853796
20122.39887310.5340422.0455092.56067715.567038
20112.43437810.3780601.9780832.49319415.224555
20102.51014310.1858361.8462802.36018314.992053
20092.50739010.0106871.6464322.08629914.617299
20082.40777110.1378471.7973472.40034914.997756
20072.35198710.1593871.7010962.45501615.018268
20062.3149579.9385031.5649202.39518914.741688
20052.2870229.6430981.4312052.24624614.332500
20042.2679999.3114311.3359782.10858513.846058
20032.2335198.9747081.2181991.89282513.339312
20022.1931888.6983061.1921801.80410512.968263
20012.1120388.4804611.2132531.74079712.746262
20002.0405008.2720971.2877391.79099512.620268
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + " 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + " 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + " 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + " 2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + " 2012 0.354342 0.961226 0.505969 0.547756 1.693428\n", + " 2011 0.351887 0.943145 0.492349 0.528227 1.664240\n", + " 2010 0.347332 0.921952 0.469949 0.500341 1.613543\n", + " 2009 0.339686 0.890078 0.440692 0.439796 1.565291\n", + " 2008 0.330766 0.889602 0.506350 0.502281 1.612862\n", + " 2007 0.318777 0.864012 0.530453 0.498002 1.596876\n", + " 2006 0.311382 0.827643 0.524461 0.470931 1.564608\n", + " 2005 0.303043 0.794390 0.519950 0.447222 1.524608\n", + " 2004 0.299854 0.764357 0.508657 0.416754 1.477317\n", + " 2003 0.294335 0.741796 0.481993 0.384199 1.433089\n", + " 2002 0.286094 0.721974 0.490465 0.368615 1.407725\n", + " 2001 0.279767 0.694230 0.484696 0.362023 1.366590\n", + " 2000 0.270553 0.677713 0.499526 0.380823 1.342805\n", + "United States 2017 2.405743 12.019266 2.287071 3.069954 17.348627\n", + " 2016 2.407981 11.722133 2.219937 2.936004 16.972348\n", + " 2015 2.373130 11.409800 2.222228 2.881337 16.710459\n", + " 2014 2.334071 11.000619 2.209555 2.732228 16.242526\n", + " 2013 2.353381 10.687214 2.118639 2.600198 15.853796\n", + " 2012 2.398873 10.534042 2.045509 2.560677 15.567038\n", + " 2011 2.434378 10.378060 1.978083 2.493194 15.224555\n", + " 2010 2.510143 10.185836 1.846280 2.360183 14.992053\n", + " 2009 2.507390 10.010687 1.646432 2.086299 14.617299\n", + " 2008 2.407771 10.137847 1.797347 2.400349 14.997756\n", + " 2007 2.351987 10.159387 1.701096 2.455016 15.018268\n", + " 2006 2.314957 9.938503 1.564920 2.395189 14.741688\n", + " 2005 2.287022 9.643098 1.431205 2.246246 14.332500\n", + " 2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + " 
2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + " 2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + " 2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + " 2000 2.040500 8.272097 1.287739 1.790995 12.620268" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[[\"United States\", \"Canada\"]]\n", + "\n", + "#row slicing\n", + "#all rows where the outer-most index is either \"United States\" or \"Canada\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20120.3543420.9612260.5059690.5477561.693428
20110.3518870.9431450.4923490.5282271.664240
20100.3473320.9219520.4699490.5003411.613543
United States20122.39887310.5340422.0455092.56067715.567038
20112.43437810.3780601.9780832.49319415.224555
20102.51014310.1858361.8462802.36018314.992053
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2012 0.354342 0.961226 0.505969 0.547756 1.693428\n", + " 2011 0.351887 0.943145 0.492349 0.528227 1.664240\n", + " 2010 0.347332 0.921952 0.469949 0.500341 1.613543\n", + "United States 2012 2.398873 10.534042 2.045509 2.560677 15.567038\n", + " 2011 2.434378 10.378060 1.978083 2.493194 15.224555\n", + " 2010 2.510143 10.185836 1.846280 2.360183 14.992053" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[([\"United States\", \"Canada\"], [2010, 2011, 2012]), :]\n", + "\n", + "#row slicing\n", + "#all rows where the outer-most index is either \"United States\" or \"Canada\" AND \n", + "#where the second level index is either 2010 or 2011 or 2012" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
year
20172.40574312.0192662.2870713.06995417.348627
20162.40798111.7221332.2199372.93600416.972348
20152.37313011.4098002.2222282.88133716.710459
20142.33407111.0006192.2095552.73222816.242526
20132.35338110.6872142.1186392.60019815.853796
20122.39887310.5340422.0455092.56067715.567038
20112.43437810.3780601.9780832.49319415.224555
20102.51014310.1858361.8462802.36018314.992053
20092.50739010.0106871.6464322.08629914.617299
20082.40777110.1378471.7973472.40034914.997756
20072.35198710.1593871.7010962.45501615.018268
20062.3149579.9385031.5649202.39518914.741688
20052.2870229.6430981.4312052.24624614.332500
20042.2679999.3114311.3359782.10858513.846058
20032.2335198.9747081.2181991.89282513.339312
20022.1931888.6983061.1921801.80410512.968263
20012.1120388.4804611.2132531.74079712.746262
20002.0405008.2720971.2877391.79099512.620268
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "year \n", + "2017 2.405743 12.019266 2.287071 3.069954 17.348627\n", + "2016 2.407981 11.722133 2.219937 2.936004 16.972348\n", + "2015 2.373130 11.409800 2.222228 2.881337 16.710459\n", + "2014 2.334071 11.000619 2.209555 2.732228 16.242526\n", + "2013 2.353381 10.687214 2.118639 2.600198 15.853796\n", + "2012 2.398873 10.534042 2.045509 2.560677 15.567038\n", + "2011 2.434378 10.378060 1.978083 2.493194 15.224555\n", + "2010 2.510143 10.185836 1.846280 2.360183 14.992053\n", + "2009 2.507390 10.010687 1.646432 2.086299 14.617299\n", + "2008 2.407771 10.137847 1.797347 2.400349 14.997756\n", + "2007 2.351987 10.159387 1.701096 2.455016 15.018268\n", + "2006 2.314957 9.938503 1.564920 2.395189 14.741688\n", + "2005 2.287022 9.643098 1.431205 2.246246 14.332500\n", + "2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + "2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + "2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + "2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + "2000 2.040500 8.272097 1.287739 1.790995 12.620268" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[\"United States\"]\n", + "\n", + "#row slicing\n", + "#all rows where the outer most index value is equal to United States" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GDP 14.992053\n", + "Exports 1.846280\n", + "Name: (United States, 2010), dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[(\"United States\", 2010), [\"GDP\", \"Exports\"]] \n", + "\n", + "#row + column slicing\n", + "#we get the values for the GDP and Exports columns in the United States in 2010 \n", + "#however, this is not presented as expected (the column names appear as indices)\n", + "# 
Tuple used to select the United States in year 2010. Then, only the GDP and Exports columns are extracted" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GovExpend 2.510143\n", + "Consumption 10.185836\n", + "Exports 1.846280\n", + "Imports 2.360183\n", + "GDP 14.992053\n", + "Name: (United States, 2010), dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[(\"United States\", 2010)]\n", + "\n", + "#row slicing\n", + "#we get the values for all columns in the United States in 2010 \n", + "#the result is a Series rather than a DataFrame, so the column names appear as its index" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
United States20102.51014310.1858361.8462802.36018314.992053
Canada20150.3583031.0352080.5688590.5757931.794270
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "United States 2010 2.510143 10.185836 1.846280 2.360183 14.992053\n", + "Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[[(\"United States\", 2010), (\"Canada\", 2015)]]\n", + "\n", + "#row slicing\n", + "#all rows where the the two hierarchical indices are either (\"United States\", 2010) or (\"Canada\", 2015)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "country year\n", + "Canada 2017 1.868164\n", + " 2016 1.814016\n", + " 2015 1.794270\n", + " 2014 1.782252\n", + " 2013 1.732714\n", + " 2012 1.693428\n", + " 2011 1.664240\n", + " 2010 1.613543\n", + " 2009 1.565291\n", + " 2008 1.612862\n", + " 2007 1.596876\n", + " 2006 1.564608\n", + " 2005 1.524608\n", + " 2004 1.477317\n", + " 2003 1.433089\n", + " 2002 1.407725\n", + " 2001 1.366590\n", + " 2000 1.342805\n", + "United States 2017 17.348627\n", + " 2016 16.972348\n", + " 2015 16.710459\n", + " 2014 16.242526\n", + " 2013 15.853796\n", + " 2012 15.567038\n", + " 2011 15.224555\n", + " 2010 14.992053\n", + " 2009 14.617299\n", + " 2008 14.997756\n", + " 2007 15.018268\n", + " 2006 14.741688\n", + " 2005 14.332500\n", + " 2004 13.846058\n", + " 2003 13.339312\n", + " 2002 12.968263\n", + " 2001 12.746262\n", + " 2000 12.620268\n", + "Name: GDP, dtype: float64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[[\"United States\", \"Canada\"], \"GDP\"]\n", + "\n", + "#row + column slicing\n", + "#all rows for United States or Canada only for the column GDP" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "year\n", + "2017 17.348627\n", + "2016 
16.972348\n", + "2015 16.710459\n", + "2014 16.242526\n", + "2013 15.853796\n", + "2012 15.567038\n", + "2011 15.224555\n", + "2010 14.992053\n", + "2009 14.617299\n", + "2008 14.997756\n", + "2007 15.018268\n", + "2006 14.741688\n", + "2005 14.332500\n", + "2004 13.846058\n", + "2003 13.339312\n", + "2002 12.968263\n", + "2001 12.746262\n", + "2000 12.620268\n", + "Name: GDP, dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi.loc[\"United States\", \"GDP\"]\n", + "\n", + "#row + column slicing\n", + "#all rows for the US for column GDP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Try setting my_df to some subset of the rows in wdi (use one of the .loc variations above).**" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
United States20102.51014310.1858361.8462802.36018314.992053
Canada20150.3583031.0352080.5688590.5757931.794270
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "United States 2010 2.510143 10.185836 1.846280 2.360183 14.992053\n", + "Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#below we take a few examples (we comment the ones not used in the analysis below)\n", + "\n", + "my_df = wdi.loc[[(\"United States\", 2010), (\"Canada\", 2015)]]\n", + "#my_df = wdi.loc[\"United States\"]\n", + "#my_df = wdi.loc[(\"United States\", 2010)]\n", + "#my_df = wdi.loc[\"United States\", \"GDP\"]\n", + "#my_df = wdi.loc[[\"United States\", \"Canada\"]]\n", + "\n", + "my_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Then see what happens when you do wdi / my_df or my_df ** wdi.**" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada2000NaNNaNNaNNaNNaN
2001NaNNaNNaNNaNNaN
2002NaNNaNNaNNaNNaN
2003NaNNaNNaNNaNNaN
2004NaNNaNNaNNaNNaN
.....................
United States2013NaNNaNNaNNaNNaN
2014NaNNaNNaNNaNNaN
2015NaNNaNNaNNaNNaN
2016NaNNaNNaNNaNNaN
2017NaNNaNNaNNaNNaN
\n", + "

72 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2000 NaN NaN NaN NaN NaN\n", + " 2001 NaN NaN NaN NaN NaN\n", + " 2002 NaN NaN NaN NaN NaN\n", + " 2003 NaN NaN NaN NaN NaN\n", + " 2004 NaN NaN NaN NaN NaN\n", + "... ... ... ... ... ...\n", + "United States 2013 NaN NaN NaN NaN NaN\n", + " 2014 NaN NaN NaN NaN NaN\n", + " 2015 NaN NaN NaN NaN NaN\n", + " 2016 NaN NaN NaN NaN NaN\n", + " 2017 NaN NaN NaN NaN NaN\n", + "\n", + "[72 rows x 5 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#we create a variable that does wdi/my_df and we analyze the outcome\n", + "x = wdi / my_df\n", + "x\n", + "\n", + "#since it is not clear at a first look, it's better if we extract the subsets used before to see what happens (uncomment to check):\n", + " #x_df = x.loc[[(\"United States\", 2010), (\"Canada\", 2015)]]\n", + " #x_df = x.loc[\"United States\"]\n", + " #x_df = x.loc[(\"United States\", 2010)]\n", + " #x_df = x.loc[\"United States\", \"GDP\"]\n", + " #x_df = x.loc[[\"United States\", \"Canada\"]]\n", + " #x_df" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada2000NaNNaNNaNNaNNaN
2001NaNNaNNaNNaNNaN
2002NaNNaNNaNNaNNaN
2003NaNNaNNaNNaNNaN
2004NaNNaNNaNNaNNaN
.....................
United States2013NaNNaNNaNNaNNaN
2014NaNNaNNaNNaNNaN
2015NaNNaNNaNNaNNaN
2016NaNNaNNaNNaNNaN
2017NaNNaNNaNNaNNaN
\n", + "

72 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2000 NaN NaN NaN NaN NaN\n", + " 2001 NaN NaN NaN NaN NaN\n", + " 2002 NaN NaN NaN NaN NaN\n", + " 2003 NaN NaN NaN NaN NaN\n", + " 2004 NaN NaN NaN NaN NaN\n", + "... ... ... ... ... ...\n", + "United States 2013 NaN NaN NaN NaN NaN\n", + " 2014 NaN NaN NaN NaN NaN\n", + " 2015 NaN NaN NaN NaN NaN\n", + " 2016 NaN NaN NaN NaN NaN\n", + " 2017 NaN NaN NaN NaN NaN\n", + "\n", + "[72 rows x 5 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#we create a variable that does wdi**my_df and we analyze the outcome\n", + "y = wdi ** my_df\n", + "y\n", + "\n", + "#since it is not clear at a first look, it's better if we extract the subsets used before to see what happens (uncomment to check):\n", + " #y_df = y.loc[[(\"United States\", 2010), (\"Canada\", 2015)]]\n", + " #y_df = y.loc[\"United States\"]\n", + " #y_df = y.loc[(\"United States\", 2010)]\n", + " #y_df = y.loc[\"United States\", \"GDP\"]\n", + " #y_df = y.loc[[\"United States\", \"Canada\"]]\n", + " #y_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Try changing the subset of rows in my_df and repeat until you understand what is happening.**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "#conclusion: the operation is aligned on the index, so it is computed only for the rows present in my_df; all other rows of wdi come out as NaN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Below, we create wdi2, which is the same as wdi except that the levels of the index are swapped.**" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
yearcountry
2017Canada0.3726651.0954750.5828310.6000311.868164
2016Canada0.3648991.0584260.5763940.5757751.814016
2015Canada0.3583031.0352080.5688590.5757931.794270
2014Canada0.3534851.0119880.5503230.5723441.782252
2013Canada0.3515410.9864000.5180400.5586361.732714
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "year country \n", + "2017 Canada 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "2016 Canada 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2015 Canada 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "2014 Canada 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "2013 Canada 0.351541 0.986400 0.518040 0.558636 1.732714" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi2 = df.set_index([\"year\", \"country\"])\n", + "wdi2.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **In the cells after wdi2 is defined, we have commented out a few of the slicing examples from the previous exercise. For each of these examples, use pd.IndexSlice to extract the same data from wdi2.**\n", + "\n", + " - HINT: You will need to swap the order of the row slicing arguments within the pd.IndexSlice." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
United States20172.40574312.0192662.2870713.06995417.348627
20162.40798111.7221332.2199372.93600416.972348
20152.37313011.4098002.2222282.88133716.710459
20142.33407111.0006192.2095552.73222816.242526
20132.35338110.6872142.1186392.60019815.853796
20122.39887310.5340422.0455092.56067715.567038
20112.43437810.3780601.9780832.49319415.224555
20102.51014310.1858361.8462802.36018314.992053
20092.50739010.0106871.6464322.08629914.617299
20082.40777110.1378471.7973472.40034914.997756
20072.35198710.1593871.7010962.45501615.018268
20062.3149579.9385031.5649202.39518914.741688
20052.2870229.6430981.4312052.24624614.332500
20042.2679999.3114311.3359782.10858513.846058
20032.2335198.9747081.2181991.89282513.339312
20022.1931888.6983061.1921801.80410512.968263
20012.1120388.4804611.2132531.74079712.746262
20002.0405008.2720971.2877391.79099512.620268
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "United States 2017 2.405743 12.019266 2.287071 3.069954 17.348627\n", + " 2016 2.407981 11.722133 2.219937 2.936004 16.972348\n", + " 2015 2.373130 11.409800 2.222228 2.881337 16.710459\n", + " 2014 2.334071 11.000619 2.209555 2.732228 16.242526\n", + " 2013 2.353381 10.687214 2.118639 2.600198 15.853796\n", + " 2012 2.398873 10.534042 2.045509 2.560677 15.567038\n", + " 2011 2.434378 10.378060 1.978083 2.493194 15.224555\n", + " 2010 2.510143 10.185836 1.846280 2.360183 14.992053\n", + " 2009 2.507390 10.010687 1.646432 2.086299 14.617299\n", + " 2008 2.407771 10.137847 1.797347 2.400349 14.997756\n", + " 2007 2.351987 10.159387 1.701096 2.455016 15.018268\n", + " 2006 2.314957 9.938503 1.564920 2.395189 14.741688\n", + " 2005 2.287022 9.643098 1.431205 2.246246 14.332500\n", + " 2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + " 2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + " 2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + " 2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + " 2000 2.040500 8.272097 1.287739 1.790995 12.620268" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#the equivalent of wdi.loc[\"United States\"] is below:\n", + "\n", + "#we create a shortcut for pd.IndexSlice to use in what follows\n", + "idx = pd.IndexSlice\n", + "\n", + "wdi.loc[idx[\"United States\",:],:]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20120.3543420.9612260.5059690.5477561.693428
20110.3518870.9431450.4923490.5282271.664240
20100.3473320.9219520.4699490.5003411.613543
United States20122.39887310.5340422.0455092.56067715.567038
20112.43437810.3780601.9780832.49319415.224555
20102.51014310.1858361.8462802.36018314.992053
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2012 0.354342 0.961226 0.505969 0.547756 1.693428\n", + " 2011 0.351887 0.943145 0.492349 0.528227 1.664240\n", + " 2010 0.347332 0.921952 0.469949 0.500341 1.613543\n", + "United States 2012 2.398873 10.534042 2.045509 2.560677 15.567038\n", + " 2011 2.434378 10.378060 1.978083 2.493194 15.224555\n", + " 2010 2.510143 10.185836 1.846280 2.360183 14.992053" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#the equivalent of wdi.loc[([\"United States\", \"Canada\"], [2010, 2011, 2012]), :] is below:\n", + "\n", + "wdi.loc[idx[([\"United States\", \"Canada\"],[2010, 2011, 2012])], :]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "country year\n", + "Canada 2017 1.868164\n", + " 2016 1.814016\n", + " 2015 1.794270\n", + " 2014 1.782252\n", + " 2013 1.732714\n", + " 2012 1.693428\n", + " 2011 1.664240\n", + " 2010 1.613543\n", + " 2009 1.565291\n", + " 2008 1.612862\n", + " 2007 1.596876\n", + " 2006 1.564608\n", + " 2005 1.524608\n", + " 2004 1.477317\n", + " 2003 1.433089\n", + " 2002 1.407725\n", + " 2001 1.366590\n", + " 2000 1.342805\n", + "United States 2017 17.348627\n", + " 2016 16.972348\n", + " 2015 16.710459\n", + " 2014 16.242526\n", + " 2013 15.853796\n", + " 2012 15.567038\n", + " 2011 15.224555\n", + " 2010 14.992053\n", + " 2009 14.617299\n", + " 2008 14.997756\n", + " 2007 15.018268\n", + " 2006 14.741688\n", + " 2005 14.332500\n", + " 2004 13.846058\n", + " 2003 13.339312\n", + " 2002 12.968263\n", + " 2001 12.746262\n", + " 2000 12.620268\n", + "Name: GDP, dtype: float64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#the equivalent of wdi.loc[[\"United States\", \"Canada\"], \"GDP\"] is below:\n", + "\n", + "wdi.loc[idx[[\"United 
States\", \"Canada\"],:], \"GDP\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 5\n", + "\n", + "**For the purpose of this exercise, we create a new DataFrame that swaps the rows and columns of the wdi DataFrame.**" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryCanada...United States
year2017201620152014201320122011201020092008...2009200820072006200520042003200220012000
GovExpend0.3726650.3648990.3583030.3534850.3515410.3543420.3518870.3473320.3396860.330766...2.5073902.4077712.3519872.3149572.2870222.2679992.2335192.1931882.1120382.040500
Consumption1.0954751.0584261.0352081.0119880.9864000.9612260.9431450.9219520.8900780.889602...10.01068710.13784710.1593879.9385039.6430989.3114318.9747088.6983068.4804618.272097
Exports0.5828310.5763940.5688590.5503230.5180400.5059690.4923490.4699490.4406920.506350...1.6464321.7973471.7010961.5649201.4312051.3359781.2181991.1921801.2132531.287739
Imports0.6000310.5757750.5757930.5723440.5586360.5477560.5282270.5003410.4397960.502281...2.0862992.4003492.4550162.3951892.2462462.1085851.8928251.8041051.7407971.790995
GDP1.8681641.8140161.7942701.7822521.7327141.6934281.6642401.6135431.5652911.612862...14.61729914.99775615.01826814.74168814.33250013.84605813.33931212.96826312.74626212.620268
\n", + "

5 rows × 72 columns

\n", + "
" + ], + "text/plain": [ + "country Canada \\\n", + "year 2017 2016 2015 2014 2013 2012 \n", + "GovExpend 0.372665 0.364899 0.358303 0.353485 0.351541 0.354342 \n", + "Consumption 1.095475 1.058426 1.035208 1.011988 0.986400 0.961226 \n", + "Exports 0.582831 0.576394 0.568859 0.550323 0.518040 0.505969 \n", + "Imports 0.600031 0.575775 0.575793 0.572344 0.558636 0.547756 \n", + "GDP 1.868164 1.814016 1.794270 1.782252 1.732714 1.693428 \n", + "\n", + "country ... United States \\\n", + "year 2011 2010 2009 2008 ... 2009 \n", + "GovExpend 0.351887 0.347332 0.339686 0.330766 ... 2.507390 \n", + "Consumption 0.943145 0.921952 0.890078 0.889602 ... 10.010687 \n", + "Exports 0.492349 0.469949 0.440692 0.506350 ... 1.646432 \n", + "Imports 0.528227 0.500341 0.439796 0.502281 ... 2.086299 \n", + "GDP 1.664240 1.613543 1.565291 1.612862 ... 14.617299 \n", + "\n", + "country \\\n", + "year 2008 2007 2006 2005 2004 2003 \n", + "GovExpend 2.407771 2.351987 2.314957 2.287022 2.267999 2.233519 \n", + "Consumption 10.137847 10.159387 9.938503 9.643098 9.311431 8.974708 \n", + "Exports 1.797347 1.701096 1.564920 1.431205 1.335978 1.218199 \n", + "Imports 2.400349 2.455016 2.395189 2.246246 2.108585 1.892825 \n", + "GDP 14.997756 15.018268 14.741688 14.332500 13.846058 13.339312 \n", + "\n", + "country \n", + "year 2002 2001 2000 \n", + "GovExpend 2.193188 2.112038 2.040500 \n", + "Consumption 8.698306 8.480461 8.272097 \n", + "Exports 1.192180 1.213253 1.287739 \n", + "Imports 1.804105 1.740797 1.790995 \n", + "GDP 12.968263 12.746262 12.620268 \n", + "\n", + "[5 rows x 72 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdiT = wdi.T # .T means \"transpose\" or \"swap rows and columns\"\n", + "wdiT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Use pd.IndexSlice to extract all data from wdiT where the year level of the column names (the second level) is one of 2010, 2012, and 
2014.**" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryCanadaGermanyUnited KingdomUnited States
year201420122010201420122010201420122010201420122010
GovExpend0.3534850.3543420.3473320.6859900.6664540.6533860.5388880.5281940.5211462.3340712.3988732.510143
Consumption1.0119880.9612260.9219521.9999531.9673901.9154811.6757161.6125501.59856311.00061910.53404210.185836
Exports0.5503230.5059690.4699491.7122701.6074551.4437350.7740220.7454840.6908242.2095552.0455091.846280
Imports0.5723440.5477560.5003411.4454091.3541221.2661260.8273110.7726920.7450652.7322282.5606772.360183
GDP1.7822521.6934281.6135433.6549243.5595873.4170952.6571592.5293232.45290016.24252615.56703814.992053
\n", + "
" + ], + "text/plain": [ + "country Canada Germany \\\n", + "year 2014 2012 2010 2014 2012 2010 \n", + "GovExpend 0.353485 0.354342 0.347332 0.685990 0.666454 0.653386 \n", + "Consumption 1.011988 0.961226 0.921952 1.999953 1.967390 1.915481 \n", + "Exports 0.550323 0.505969 0.469949 1.712270 1.607455 1.443735 \n", + "Imports 0.572344 0.547756 0.500341 1.445409 1.354122 1.266126 \n", + "GDP 1.782252 1.693428 1.613543 3.654924 3.559587 3.417095 \n", + "\n", + "country United Kingdom United States \\\n", + "year 2014 2012 2010 2014 2012 \n", + "GovExpend 0.538888 0.528194 0.521146 2.334071 2.398873 \n", + "Consumption 1.675716 1.612550 1.598563 11.000619 10.534042 \n", + "Exports 0.774022 0.745484 0.690824 2.209555 2.045509 \n", + "Imports 0.827311 0.772692 0.745065 2.732228 2.560677 \n", + "GDP 2.657159 2.529323 2.452900 16.242526 15.567038 \n", + "\n", + "country \n", + "year 2010 \n", + "GovExpend 2.510143 \n", + "Consumption 10.185836 \n", + "Exports 1.846280 \n", + "Imports 2.360183 \n", + "GDP 14.992053 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdiT.loc[:, idx[:,[2010, 2012, 2014]]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 6\n", + "\n", + "1. **Move just the `year` level of the index back as a column.**" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearGovExpendConsumptionExportsImportsGDP
country
Canada20170.3726651.0954750.5828310.6000311.868164
Canada20160.3648991.0584260.5763940.5757751.814016
Canada20150.3583031.0352080.5688590.5757931.794270
Canada20140.3534851.0119880.5503230.5723441.782252
Canada20130.3515410.9864000.5180400.5586361.732714
.....................
United States20042.2679999.3114311.3359782.10858513.846058
United States20032.2335198.9747081.2181991.89282513.339312
United States20022.1931888.6983061.1921801.80410512.968263
United States20012.1120388.4804611.2132531.74079712.746262
United States20002.0405008.2720971.2877391.79099512.620268
\n", + "

72 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " year GovExpend Consumption Exports Imports GDP\n", + "country \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "Canada 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + "... ... ... ... ... ... ...\n", + "United States 2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + "United States 2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + "United States 2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + "United States 2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + "United States 2000 2.040500 8.272097 1.287739 1.790995 12.620268\n", + "\n", + "[72 rows x 6 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# remove just year level and add as column\n", + "wdi_1 = df.set_index([\"country\", \"year\"])\n", + "wdi_1.reset_index([\"year\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Completely throw away all levels of the index.** " + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDP
0Canada20170.3726651.0954750.5828310.6000311.868164
1Canada20160.3648991.0584260.5763940.5757751.814016
2Canada20150.3583031.0352080.5688590.5757931.794270
3Canada20140.3534851.0119880.5503230.5723441.782252
4Canada20130.3515410.9864000.5180400.5586361.732714
........................
67United States20042.2679999.3114311.3359782.10858513.846058
68United States20032.2335198.9747081.2181991.89282513.339312
69United States20022.1931888.6983061.1921801.80410512.968263
70United States20012.1120388.4804611.2132531.74079712.746262
71United States20002.0405008.2720971.2877391.79099512.620268
\n", + "

72 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports GDP\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "1 Canada 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2 Canada 2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "3 Canada 2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "4 Canada 2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + ".. ... ... ... ... ... ... ...\n", + "67 United States 2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + "68 United States 2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + "69 United States 2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + "70 United States 2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + "71 United States 2000 2.040500 8.272097 1.287739 1.790995 12.620268\n", + "\n", + "[72 rows x 7 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# throw away all levels of index\n", + "wdi_2 = df.set_index([\"country\", \"year\"])\n", + "wdi_2.reset_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Remove the `country` of the index and *do not* keep it as a column.**" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
year
20170.3726651.0954750.5828310.6000311.868164
20160.3648991.0584260.5763940.5757751.814016
20150.3583031.0352080.5688590.5757931.794270
20140.3534851.0119880.5503230.5723441.782252
20130.3515410.9864000.5180400.5586361.732714
..................
20042.2679999.3114311.3359782.10858513.846058
20032.2335198.9747081.2181991.89282513.339312
20022.1931888.6983061.1921801.80410512.968263
20012.1120388.4804611.2132531.74079712.746262
20002.0405008.2720971.2877391.79099512.620268
\n", + "

72 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "year \n", + "2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "2015 0.358303 1.035208 0.568859 0.575793 1.794270\n", + "2014 0.353485 1.011988 0.550323 0.572344 1.782252\n", + "2013 0.351541 0.986400 0.518040 0.558636 1.732714\n", + "... ... ... ... ... ...\n", + "2004 2.267999 9.311431 1.335978 2.108585 13.846058\n", + "2003 2.233519 8.974708 1.218199 1.892825 13.339312\n", + "2002 2.193188 8.698306 1.192180 1.804105 12.968263\n", + "2001 2.112038 8.480461 1.213253 1.740797 12.746262\n", + "2000 2.040500 8.272097 1.287739 1.790995 12.620268\n", + "\n", + "[72 rows x 5 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Remove country from the index -- don't keep it as a column\n", + "wdi_3 = df.set_index([\"country\", \"year\"])\n", + "wdi_3.reset_index([\"country\"], drop=True)" + ] + } + ], + "metadata": { + "date": 1584040764.873878, + "filename": "the_index.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "The Index" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Session_7/4_storage_formats.ipynb b/Session_7/4_storage_formats.ipynb new file mode 100644 index 0000000..f00663c --- /dev/null +++ b/Session_7/4_storage_formats.ipynb @@ -0,0 +1,1080 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Storage Formats\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [Intro to DataFrames and Series](https://datascience.quantecon.org/intro.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- 
Understand that data can be saved in various formats \n", + "- Know where to get help on file input and output \n", + "- Know when to use csv, xlsx, feather, and sql formats \n", + "\n", + "\n", + "**Data**\n", + "\n", + "- Results for all NFL games between September 1920 to February 2017 " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [Storage Formats](#Storage-Formats) \n", + " - [File Formats](#File-Formats) \n", + " - [Writing DataFrames](#Writing-DataFrames) \n", + " - [Reading Files into DataFrames](#Reading-Files-into-DataFrames) \n", + " - [Practice](#Practice) " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# Uncomment following line to install on colab\n", + "#! pip install qeds" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## File Formats\n", + "\n", + "Data can be saved in a variety of formats.\n", + "\n", + "pandas understands how to write and read DataFrames to and from many of\n", + "these formats.\n", + "\n", + "We defer to the [official\n", + "documentation](https://pandas.pydata.org/pandas-docs/stable/io.html)\n", + "for a full description of how to interact with all the file formats, but\n", + "will briefly discuss a few of them here." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CSV\n", + "\n", + "**What is it?** CSVs store data as plain text (strings) where each row is a\n", + "line and columns are separated by `,`.\n", + "\n", + "**Pros**\n", + "\n", + "- Widely used (you should be familiar with it) \n", + "- Plain text file (can open on any computer, “future proof”) \n", + "- Can be read from and written to by most data software \n", + "\n", + "\n", + "**Cons**\n", + "\n", + "- Not the most efficient way to store or access \n", + "- No formal standard, so there is room for user interpretation on how to\n", + " handle edge cases (e.g. what to do about a data field that itself includes\n", + " a comma) \n", + "\n", + "\n", + "**When to use**:\n", + "\n", + "- A great default option for most use cases " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### xlsx\n", + "\n", + "**What is it?** xlsx is a binary file format used as Excel’s default.\n", + "\n", + "**Pros**:\n", + "\n", + "- Standard format in many industries \n", + "- Easy to share with colleagues that use Excel \n", + "\n", + "\n", + "**Cons**:\n", + "\n", + "- Quite slow to read/write large amounts of data \n", + "- Stores both data and *metadata* like styling and display information\n", + " and even plots. This metadata is not always portable to other file formats\n", + " or programs. 
\n", + "\n", + "\n", + "**When to use**:\n", + "\n", + "- When sharing data with Excel \n", + "- When you would like special formatting to be applied to the\n", + " spreadsheet when viewed in Excel " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parquet\n", + "\n", + "**What is it?** Parquet is a custom binary format designed for efficient reading and\n", + "writing of data stored in columns.\n", + "\n", + "**Pros**:\n", + "\n", + "- *Very* fast \n", + "- Naturally understands all `dtypes` used by pandas, including\n", + " multi-index DataFrames \n", + "- Very common in “big data” systems like Hadoop or Spark \n", + "- Supports various compression algorithms \n", + "\n", + "\n", + "**Cons**:\n", + "\n", + "- Binary storage format that is not human-readable \n", + "\n", + "\n", + "**When to use**:\n", + "\n", + "- If you have “not small” amounts (> 100 MB) of unchanging data that\n", + " you want to read quickly \n", + "- If you want to store data in a size-and-time-efficient way that may\n", + " be accessed by external systems " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feather\n", + "\n", + "**What is it?** Feather is a custom binary format designed for efficient reading and\n", + "writing of data stored in columns.\n", + "\n", + "**Pros**:\n", + "\n", + "- *Very* fast – even faster than parquet \n", + "- Naturally understands all `dtypes` used by pandas \n", + "\n", + "\n", + "**Cons**:\n", + "\n", + "- Can only read and write from Python and a handful of other\n", + " programming languages \n", + "- New file format (introduced in March ‘16), so most files don’t come\n", + " in this format \n", + "- Only supports standard pandas index, so you need to `reset_index`\n", + " before saving and then `set_index` after loading \n", + "\n", + "\n", + "**When to use**:\n", + "\n", + "- Use as an alternative to Parquet if you need the absolute best read and write\n", + " speeds for unchanging datasets
\n", + "- Only use when you will not need to access the data in a programming language\n", + " or software outside of Python, R, and Julia " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SQL\n", + "\n", + "**What is it?** SQL is a language used to interact with relational\n", + "databases… [more info](https://en.wikipedia.org/wiki/SQL)\n", + "\n", + "**Pros**:\n", + "\n", + "- Well established industry standard for handling data \n", + "- Much of the world’s data is in a SQL database somewhere \n", + "\n", + "\n", + "**Cons**:\n", + "\n", + "- Complicated: to have full control you need to learn another language\n", + " (SQL) \n", + "\n", + "\n", + "**When to use**:\n", + "\n", + "- When reading from or writing to existing SQL databases \n", + "\n", + "\n", + "**NOTE**: We can cover interacting with SQL databases in a dedicated\n", + "lecture – contact us for more information." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Writing DataFrames\n", + "\n", + "Let’s now talk about saving a DataFrame to a file.\n", + "\n", + "As a general rule of thumb, if we have a DataFrame `df` and we would\n", + "like to save it as a file of type `FOO`, then we would call\n", + "the method named `df.to_FOO(...)`.\n", + "\n", + "We will show you how this can be done and try to highlight some of the\n", + "items mentioned above.\n", + "\n", + "But, we will not cover all possible options and features — we feel\n", + "it is best to learn these as you need them by consulting the appropriate\n", + "documentation.\n", + "\n", + "First, we need some DataFrames to save. Let’s make them now.\n", + "\n", + "Note that by default `df2` will be approximately 10 MB.\n", + "\n", + "If you need to change this number, adjust the value of\n", + "the `wanted_mb` variable below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df2.shape = (100000, 13)\n", + "df2 is approximately 9.9183349609375 MB\n" + ] + } + ], + "source": [ + "np.random.seed(42) # makes sure we get the same random numbers each time\n", + "df1 = pd.DataFrame(\n", + " np.random.randint(0, 100, size=(10, 4)),\n", + " columns=[\"a\", \"b\", \"c\", \"d\"]\n", + ")\n", + "\n", + "wanted_mb = 10 # CHANGE THIS LINE\n", + "nrow = 100000\n", + "ncol = int(((wanted_mb * 1024**2) / 8) / nrow)\n", + "df2 = pd.DataFrame(\n", + " np.random.rand(nrow, ncol),\n", + " columns=[\"x{}\".format(i) for i in range(ncol)]\n", + ")\n", + "\n", + "print(\"df2.shape = \", df2.shape)\n", + "print(\"df2 is approximately {} MB\".format(df2.memory_usage().sum() / (1024**2)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [df.to_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html)\n", + "\n", + "Let’s start with `df.to_csv`.\n", + "\n", + "Without any additional arguments, the `df.to_csv` function will return\n", + "a string containing the csv form of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ",a,b,c,d\n", + "0,51,92,14,71\n", + "1,60,20,82,86\n", + "2,74,74,87,99\n", + "3,23,2,21,52\n", + "4,1,87,29,37\n", + "5,1,63,59,20\n", + "6,32,75,57,21\n", + "7,88,48,90,58\n", + "8,41,91,59,79\n", + "9,14,61,61,46\n", + "\n" + ] + } + ], + "source": [ + "# notice the plain text format -- one row per line, columns separated by `'`\n", + "print(df1.to_csv())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we do pass an argument, the first argument will be used as the file name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "df1.to_csv(\"df1.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the cell below to verify that the file was created." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "os.path.isfile(\"df1.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s see how long it takes to save `df2` to a file. (Because of the `%%time` at\n", + "the top, Jupyter will report the total time to run all code in\n", + "the cell)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 4.18 s\n" + ] + } + ], + "source": [ + "%%time\n", + "df2.to_csv(\"df2.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we will see below, this isn’t the fastest file format we could choose." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [df.to_excel](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html)\n", + "\n", + "When saving a DataFrame to an Excel workbook, we can\n", + "choose both the name of the workbook (file) and the name of the sheet\n", + "within the file where the DataFrame should be written.\n", + "\n", + "We do this by passing the workbook name as the first argument and the\n", + "sheet name as the second argument as follows."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "df1.to_excel(\"df1.xlsx\", \"df1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "pandas also gives us the option to write more than one DataFrame to a\n", + "workbook.\n", + "\n", + "To do this, we need to first construct an instance of `pd.ExcelWriter`\n", + "and then pass that as the first argument to `df.to_excel`.\n", + "\n", + "Let’s see how this works." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "with pd.ExcelWriter(\"df1.xlsx\") as writer:\n", + " df1.to_excel(writer, \"df1\")\n", + " (df1 + 10).to_excel(writer, \"df1 plus 10\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "with ... as ... :\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "syntax used above is an example of a *context manager*.\n", + "\n", + "We don’t need to understand all the details behind what this means\n", + "(google it if you are curious).\n", + "\n", + "For now, just recognize that particular syntax as the way to write\n", + "multiple sheets to an Excel workbook.\n", + "\n", + "

\n", + "\n", + "WARNING:\n", + "\n", + "

\n", + "\n", + "Saving `df2` to an excel file takes a very long time.\n", + "\n", + "For that reason, we will just show the code and hard-code the output\n", + "we saw when we ran the code." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "%%time\n", + "df2.to_excel(\"df2.xlsx\")\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + " Wall time: 25.7 s\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [pyarrow.feather.write_feather](https://arrow.apache.org/docs/python/generated/pyarrow.feather.write_feather.html#pyarrow.feather.write_feather)\n", + "\n", + "As noted above, the feather file format was developed for very efficient\n", + "reading and writing between Python and your computer.\n", + "\n", + "Support for this format is provided by a separate Python package called `pyarrow`.\n", + "\n", + "This package is not installed by default. To install it, copy/paste the code\n", + "below into a code cell and execute." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```markdown\n", + "!pip install pyarrow\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The parameters for `pyarrow.feather.write_feather` are the DataFrame and file name.\n", + "\n", + "Let’s try it out." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import pyarrow.feather\n", + "pyarrow.feather.write_feather(df1, \"df1.feather\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 40.4 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "pyarrow.feather.write_feather(df2, \"df2.feather\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An example timing result:\n", + "\n", + "|format|time|\n", + "|:---------:|:----------------------:|\n", + "|csv|2.66 seconds|\n", + "|xlsx|25.7 seconds|\n", + "|feather|43 milliseconds|\n", + "As you can see, saving this DataFrame in the feather format was far\n", + "faster than either CSV or Excel." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading Files into DataFrames\n", + "\n", + "As with the `df.to_FOO` family of methods, there are similar\n", + "`pd.read_FOO` functions. (Note: they are defined in pandas, not as\n", + "methods on a DataFrame.)\n", + "\n", + "These methods have many more options because data storage can be messy or wrong.\n", + "\n", + "We will explore these in more detail in a separate lecture.\n", + "\n", + "For now, we just want to highlight the differences in how to read data\n", + "from each of the file formats.\n", + "\n", + "Let’s start by reading the files we just created to verify that they\n", + "match the data we began with." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
051921471
160208286
274748799
32322152
41872937
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "0 51 92 14 71\n", + "1 60 20 82 86\n", + "2 74 74 87 99\n", + "3 23 2 21 52\n", + "4 1 87 29 37" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# notice that index was specified in the first (0th -- why?) column of the file\n", + "df1_csv = pd.read_csv(\"df1.csv\", index_col=0)\n", + "df1_csv.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
051921471
160208286
274748799
32322152
41872937
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "0 51 92 14 71\n", + "1 60 20 82 86\n", + "2 74 74 87 99\n", + "3 23 2 21 52\n", + "4 1 87 29 37" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_xlsx = pd.read_excel(\"df1.xlsx\", \"df1\", index_col=0)\n", + "df1_xlsx.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
051921471
160208286
274748799
32322152
41872937
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "0 51 92 14 71\n", + "1 60 20 82 86\n", + "2 74 74 87 99\n", + "3 23 2 21 52\n", + "4 1 87 29 37" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# notice feather already knows what the index is\n", + "df1_feather = pyarrow.feather.read_feather(\"df1.feather\")\n", + "df1_feather.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the `pd.read_FOO` family of functions, we can also read files\n", + "from places on the internet.\n", + "\n", + "We saved our `df1` DataFrame to a file\n", + "and posted it online.\n", + "\n", + "Below, we show an example of using `pd.read_csv` to read this file." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
051921471
160208286
274748799
32322152
41872937
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "0 51 92 14 71\n", + "1 60 20 82 86\n", + "2 74 74 87 99\n", + "3 23 2 21 52\n", + "4 1 87 29 37" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_url = \"https://storage.googleapis.com/workshop_materials/df1.csv\"\n", + "df1_web = pd.read_csv(df1_url, index_col=0)\n", + "df1_web.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanup\n", + "\n", + "If you want to remove the files we just created, run the following cell." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "def try_remove(file):\n", + " if os.path.isfile(file):\n", + " os.remove(file)\n", + "\n", + "for df in [\"df1\", \"df2\"]:\n", + " for extension in [\"csv\", \"feather\", \"xlsx\"]:\n", + " filename = df + \".\" + extension\n", + " try_remove(filename)" + ] + } + ], + "metadata": { + "date": 1584040764.364358, + "filename": "storage_formats.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "Storage Formats" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/4_storage_formats_exercises.ipynb b/Session_7/4_storage_formats_exercises.ipynb new file mode 100644 index 0000000..7893f00 --- /dev/null +++ b/Session_7/4_storage_formats_exercises.ipynb @@ -0,0 +1,584 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages 
(0.6.2)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: six>=1.0.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow->qeds) (1.12.0)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already 
satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.16.0)\n", + "Requirement already satisfied: six>=1.0.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow) (1.12.0)\n", + "Requirement already satisfied: numpy>=1.14 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow) (1.16.5)\n" + ] + } + ], + "source": [ + "! pip install qeds\n", + "import pandas as pd\n", + "import numpy as np\n", + "!pip install pyarrow\n", + "import pyarrow.feather" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Storage Formats - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**In the cell below, the variable `url` contains a web address to a csv file containing the result of all NFL games from September 1920 to February 2017.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
**Use `pd.read_csv` to read this file into a DataFrame named `nfl`.** " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seasonneutralplayoffteam1team2elo1elo2elo_prob1score1score2result1
date
1920-09-26192000RIISTP1503.9470001300.0000000.8246514801.0
1920-10-03192000AKRWHE1503.4200001300.0000000.8242124301.0
1920-10-03192000RCHABU1503.4200001300.0000000.8242121001.0
1920-10-03192000DAYCOL1493.0020001504.9080000.5758191401.0
1920-10-03192000RIIMUN1516.1080001478.0040000.6441714501.0
....................................
2017-01-15201601DALGB1617.7946831635.4511720.56771431340.0
2017-01-15201601KCPIT1681.9264631647.7341790.63899316180.0
2017-01-22201601ATLGB1664.1272661651.5377310.60984044211.0
2017-01-22201601NEPIT1747.1603211662.4372150.70305236171.0
2017-02-05201611ATLNE1688.0814181763.8181680.39270028340.0
\n", + "

15740 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " season neutral playoff team1 team2 elo1 elo2 \\\n", + "date \n", + "1920-09-26 1920 0 0 RII STP 1503.947000 1300.000000 \n", + "1920-10-03 1920 0 0 AKR WHE 1503.420000 1300.000000 \n", + "1920-10-03 1920 0 0 RCH ABU 1503.420000 1300.000000 \n", + "1920-10-03 1920 0 0 DAY COL 1493.002000 1504.908000 \n", + "1920-10-03 1920 0 0 RII MUN 1516.108000 1478.004000 \n", + "... ... ... ... ... ... ... ... \n", + "2017-01-15 2016 0 1 DAL GB 1617.794683 1635.451172 \n", + "2017-01-15 2016 0 1 KC PIT 1681.926463 1647.734179 \n", + "2017-01-22 2016 0 1 ATL GB 1664.127266 1651.537731 \n", + "2017-01-22 2016 0 1 NE PIT 1747.160321 1662.437215 \n", + "2017-02-05 2016 1 1 ATL NE 1688.081418 1763.818168 \n", + "\n", + " elo_prob1 score1 score2 result1 \n", + "date \n", + "1920-09-26 0.824651 48 0 1.0 \n", + "1920-10-03 0.824212 43 0 1.0 \n", + "1920-10-03 0.824212 10 0 1.0 \n", + "1920-10-03 0.575819 14 0 1.0 \n", + "1920-10-03 0.644171 45 0 1.0 \n", + "... ... ... ... ... \n", + "2017-01-15 0.567714 31 34 0.0 \n", + "2017-01-15 0.638993 16 18 0.0 \n", + "2017-01-22 0.609840 44 21 1.0 \n", + "2017-01-22 0.703052 36 17 1.0 \n", + "2017-02-05 0.392700 28 34 0.0 \n", + "\n", + "[15740 rows x 11 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://raw.githubusercontent.com/fivethirtyeight/nfl-elo-game/\"\n", + "url = url + \"3488b7d0b46c5f6583679bc40fb3a42d729abd39/data/nfl_games.csv\"\n", + "\n", + "nfl = pd.read_csv(url, index_col=0)\n", + "nfl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
**Print the shape and column names of `nfl`.**" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "nfl.shape= (15740, 11)\n", + "Index(['season', 'neutral', 'playoff', 'team1', 'team2', 'elo1', 'elo2',\n", + " 'elo_prob1', 'score1', 'score2', 'result1'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(\"nfl.shape=\",nfl.shape)\n", + "print(nfl.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Save the DataFrame to a file named `nfl.xlsx`** " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "nfl.to_excel(\"nfl.xlsx\", \"nfl\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. **Open the spreadsheet using Excel on your computer.** " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Compute the average total points in each game (note, you will need to sum two of the columns to get total points).** " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39.95368487928844" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#we compute the total points as score1 + score 2, then we take the average\n", + "nfl[\"total_points\"] = nfl[\"score1\"]+nfl[\"score2\"]\n", + "nfl[\"total_points\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
**Repeat the above calculation, but only for playoff games.** " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "42.70404411764706" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#here we compute the average total points for the subset corresponding to playoff games\n", + "nfl[\"total_points\"][nfl[\"playoff\"]==1].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Compute the average score for your favorite team (you’ll need to consider when they were team1 vs team2).** " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17.125" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#I don't have a smarter solution\n", + "\n", + "#in order to get the average for AKR:\n", + " #I sum the total points obtained by AKR either as team1 or team2\n", + " #I sum the total number of games of AKR either as team1 or team2\n", + " #then I divide the total points by the total number of games\n", + "\n", + "sum_teams12 = nfl[\"total_points\"][nfl[\"team1\"]==\"AKR\"].sum() + nfl[\"total_points\"][nfl[\"team2\"]==\"AKR\"].sum()\n", + "count_teams12 = nfl[\"total_points\"][nfl[\"team1\"]==\"AKR\"].count() + nfl[\"total_points\"][nfl[\"team2\"]==\"AKR\"].count()\n", + "average = sum_teams12/count_teams12\n", + "average" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. **Compute the ratio of “upsets” to total games played. 
An upset is defined as a team with a lower ELO winning the game.** " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.34358322744599745" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#for all the matches \n", + "nfl.loc[((nfl['elo1']>nfl['elo2']) & (nfl['result1']==0)) | ((nfl['elo2']>nfl['elo1']) & (nfl['result1']==1)) , 'Upset?' ] =1 \n", + "nfl.loc[(nfl['Upset?']!=1), 'Upset?']=0\n", + "nfl['Upset?'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.21739130434782608" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#only for the matches of AKR (as team1)\n", + "nfl.loc[((nfl['elo1']>nfl['elo2']) & (nfl['result1']==0)) | ((nfl['elo2']>nfl['elo1']) & (nfl['result1']==1)) , 'Upset?' ] =1 \n", + "nfl.loc[(nfl['Upset?']!=1), 'Upset?']=0\n", + "nfl['Upset?'][nfl[\"team1\"] == \"AKR\"].mean()" + ] + } + ], + "metadata": { + "date": 1584040764.364358, + "filename": "storage_formats.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "Storage Formats" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Session_7/5_data_clean.ipynb b/Session_7/5_data_clean.ipynb new file mode 100644 index 0000000..425d5fe --- /dev/null +++ b/Session_7/5_data_clean.ipynb @@ -0,0 +1,1947 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cleaning Data\n", + "\n", + "**Prerequisites**\n", + "\n", + "- 
[Intro](https://datascience.quantecon.org/intro.html) \n", + "- [Boolean selection](https://datascience.quantecon.org/basics.html) \n", + "- [Indexing](https://datascience.quantecon.org/the_index.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- Be able to use string methods to clean data that comes as a string \n", + "- Be able to drop missing data \n", + "- Use cleaning methods to prepare and analyze a real dataset \n", + "\n", + "\n", + "**Data**\n", + "\n", + "- Item information from about 3,000 Chipotle meals from about 1,800\n", + " Grubhub orders " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: statsmodels in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2.8.0)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.12.0)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + 
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n" + ] + } + ], + "source": [ + "# Uncomment following line to install on colab\n", + "! 
pip install qeds" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import qeds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [Cleaning Data](#Cleaning-Data) \n", + " - [Cleaning Data](#Cleaning-Data) \n", + " - [String Methods](#String-Methods) \n", + " - [Type Conversions](#Type-Conversions) \n", + " - [Missing Data](#Missing-Data) \n", + " - [Case Study](#Case-Study) \n", + " - [Appendix: Performance of `.str` Methods](#Appendix:-Performance-of-`.str`-Methods) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning Data\n", + "\n", + "For many data projects, a [significant proportion of\n", + "time](https://www.forbes.com/sites/gilpress/2016/03/23/data-preparation-most-time-consuming-least-enjoyable-data-science-task-survey-says/#74d447456f63)\n", + "is spent collecting and cleaning the data — not performing the analysis.\n", + "\n", + "This non-analysis work is often called “data cleaning”.\n", + "\n", + "pandas provides very powerful data cleaning tools, which we\n", + "will demonstrate using the following dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_column
0#2323green0
1#2424red1
2#1818yellow0
3#1414orange2
4#12NaNpurple1
5#10XYZblue0
6#3535pink2
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column\n", + "0 #23 23 green 0\n", + "1 #24 24 red 1\n", + "2 #18 18 yellow 0\n", + "3 #14 14 orange 2\n", + "4 #12 NaN purple 1\n", + "5 #10 XYZ blue 0\n", + "6 #35 35 pink 2" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\"numbers\": [\"#23\", \"#24\", \"#18\", \"#14\", \"#12\", \"#10\", \"#35\"],\n", + " \"nums\": [\"23\", \"24\", \"18\", \"14\", np.nan, \"XYZ\", \"35\"],\n", + " \"colors\": [\"green\", \"red\", \"yellow\", \"orange\", \"purple\", \"blue\", \"pink\"],\n", + " \"other_column\": [0, 1, 0, 2, 1, 0, 2]})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What would happen if we wanted to try and compute the mean of\n", + "`numbers`?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "df[\"numbers\"].mean()\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It throws an error!\n", + "\n", + "Can you figure out why?\n", + "\n", + "Hint: When looking at error messages, start at the very\n", + "bottom.\n", + "\n", + "The final error says, `TypeError: Could not convert #23#24... to numeric`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## String Methods\n", + "\n", + "Our solution to the previous exercise was to remove the `#` by using\n", + "the `replace` string method: `int(c2n.replace(\"#\", \"\"))`.\n", + "\n", + "One way to make this change to every element of a column would be to\n", + "loop through all elements of the column and apply the desired string\n", + "methods…" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 6.51 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# Iterate over all rows\n", + "for row in df.iterrows():\n", + "\n", + " # `iterrows` method produces a tuple with two elements...\n", + " # The first element is an index and the second is a Series with the data from that row\n", + " index_value, column_values = row\n", + "\n", + " # Apply string method\n", + " clean_number = int(column_values[\"numbers\"].replace(\"#\", \"\"))\n", + "\n", + " # The `at` method is very similar to the `loc` method, but it is specialized\n", + " # for accessing single elements at a time... 
We wanted to use it here to give\n", + " # the loop the best chance to beat a faster method which we show you next.\n", + " df.at[index_value, \"numbers_loop\"] = clean_number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While this is fast for a small dataset like this, this method slows for larger datasets.\n", + "\n", + "One *significantly* faster (and easier) method is to apply a string\n", + "method to an entire column of data.\n", + "\n", + "Most methods that are available to a Python string (we learned a\n", + "few of them in the [strings lecture](https://datascience.quantecon.org/../python_fundamentals/basics.html)) are\n", + "also available to a pandas Series that has `dtype` object.\n", + "\n", + "We access them by doing `s.str.method_name` where `method_name` is\n", + "the name of the method.\n", + "\n", + "When we apply the method to a Series, it is applied to all rows in the\n", + "Series in one shot!\n", + "\n", + "Let’s redo our previous example using a pandas `.str` method." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 0 ns\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# ~2x faster than loop... However, speed gain increases with size of DataFrame. The\n", + "# speedup can be in the ballpark of ~100-500x faster for big DataFrames.\n", + "# See appendix at the end of the lecture for an application on a larger DataFrame\n", + "df[\"numbers_str\"] = df[\"numbers\"].str.replace(\"#\", \"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use `.str` to access almost any string method that works on\n", + "normal strings. 
(See the [official\n", + "documentation](https://pandas.pydata.org/pandas-docs/stable/text.html)\n", + "for more information.)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 True\n", + "5 False\n", + "6 True\n", + "Name: colors, dtype: bool" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"colors\"].str.contains(\"p\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Green\n", + "1 Red\n", + "2 Yellow\n", + "3 Orange\n", + "4 Purple\n", + "5 Blue\n", + "6 Pink\n", + "Name: colors, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"colors\"].str.capitalize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Type Conversions\n", + "\n", + "In our example above, the `dtype` of the `numbers_str` column shows that pandas still treats\n", + "it as a string even after we have removed the `\"#\"`.\n", + "\n", + "We need to convert this column to numbers.\n", + "\n", + "The best way to do this is using the `pd.to_numeric` function.\n", + "\n", + "This method attempts to convert whatever is stored in a Series into\n", + "numeric values\n", + "\n", + "For example, after the `\"#\"` removed, the numbers of column\n", + "`\"numbers\"` are ready to be converted to actual numbers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "df[\"numbers_numeric\"] = pd.to_numeric(df[\"numbers_str\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "numbers object\n", + "nums object\n", + "colors object\n", + "other_column int64\n", + "numbers_loop float64\n", + "numbers_str object\n", + "numbers_numeric int64\n", + "dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0#2323green023.02323
1#2424red124.02424
2#1818yellow018.01818
3#1414orange214.01414
4#12NaNpurple112.01212
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 #23 23 green 0 23.0 23 \n", + "1 #24 24 red 1 24.0 24 \n", + "2 #18 18 yellow 0 18.0 18 \n", + "3 #14 14 orange 2 14.0 14 \n", + "4 #12 NaN purple 1 12.0 12 \n", + "\n", + " numbers_numeric \n", + "0 23 \n", + "1 24 \n", + "2 18 \n", + "3 14 \n", + "4 12 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can convert to other types as well.\n", + "\n", + "Using the `astype` method, we can convert to any of the supported\n", + "pandas `dtypes` (recall the [intro lecture](https://datascience.quantecon.org/intro.html)).\n", + "\n", + "Below are some examples. (Pay attention to the reported `dtype`)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 23\n", + "1 24\n", + "2 18\n", + "3 14\n", + "4 12\n", + "5 10\n", + "6 35\n", + "Name: numbers_numeric, dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"numbers_numeric\"].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 23.0\n", + "1 24.0\n", + "2 18.0\n", + "3 14.0\n", + "4 12.0\n", + "5 10.0\n", + "6 35.0\n", + "Name: numbers_numeric, dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"numbers_numeric\"].astype(float)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Missing Data\n", + "\n", + "Many datasets have missing data.\n", + "\n", + "In our example, we are missing an element from the `\"nums\"` column."
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0#2323green023.02323
1#2424red124.02424
2#1818yellow018.01818
3#1414orange214.01414
4#12NaNpurple112.01212
5#10XYZblue010.01010
6#3535pink235.03535
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 #23 23 green 0 23.0 23 \n", + "1 #24 24 red 1 24.0 24 \n", + "2 #18 18 yellow 0 18.0 18 \n", + "3 #14 14 orange 2 14.0 14 \n", + "4 #12 NaN purple 1 12.0 12 \n", + "5 #10 XYZ blue 0 10.0 10 \n", + "6 #35 35 pink 2 35.0 35 \n", + "\n", + " numbers_numeric \n", + "0 23 \n", + "1 24 \n", + "2 18 \n", + "3 14 \n", + "4 12 \n", + "5 10 \n", + "6 35 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can find missing data by using the `isnull` method." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0FalseFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalse
4FalseTrueFalseFalseFalseFalseFalse
5FalseFalseFalseFalseFalseFalseFalse
6FalseFalseFalseFalseFalseFalseFalse
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 False False False False False False \n", + "1 False False False False False False \n", + "2 False False False False False False \n", + "3 False False False False False False \n", + "4 False True False False False False \n", + "5 False False False False False False \n", + "6 False False False False False False \n", + "\n", + " numbers_numeric \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + "5 False \n", + "6 False " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We might want to know whether particular rows or columns have any\n", + "missing data.\n", + "\n", + "To do this we can use the `.any` method on the boolean DataFrame\n", + "`df.isnull()`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "numbers False\n", + "nums True\n", + "colors False\n", + "other_column False\n", + "numbers_loop False\n", + "numbers_str False\n", + "numbers_numeric False\n", + "dtype: bool" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().any(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 True\n", + "5 False\n", + "6 False\n", + "dtype: bool" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().any(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Many approaches have been developed to deal with missing data, but the two most commonly used 
(and the corresponding DataFrame method) are:\n", + "\n", + "- Exclusion: Ignore any data that is missing (`.dropna`). \n", + "- Imputation: Compute “predicted” values for the data that is missing\n", + " (`.fillna`). \n", + "\n", + "\n", + "For the advantages and disadvantages of these (and other) approaches,\n", + "consider reading the [Wikipedia\n", + "article](https://en.wikipedia.org/wiki/Missing_data).\n", + "\n", + "For now, let’s see some examples." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0#2323green023.02323
1#2424red124.02424
2#1818yellow018.01818
3#1414orange214.01414
5#10XYZblue010.01010
6#3535pink235.03535
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 #23 23 green 0 23.0 23 \n", + "1 #24 24 red 1 24.0 24 \n", + "2 #18 18 yellow 0 18.0 18 \n", + "3 #14 14 orange 2 14.0 14 \n", + "5 #10 XYZ blue 0 10.0 10 \n", + "6 #35 35 pink 2 35.0 35 \n", + "\n", + " numbers_numeric \n", + "0 23 \n", + "1 24 \n", + "2 18 \n", + "3 14 \n", + "5 10 \n", + "6 35 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# drop all rows containing a missing observation\n", + "df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0#2323green023.02323
1#2424red124.02424
2#1818yellow018.01818
3#1414orange214.01414
4#12100purple112.01212
5#10XYZblue010.01010
6#3535pink235.03535
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 #23 23 green 0 23.0 23 \n", + "1 #24 24 red 1 24.0 24 \n", + "2 #18 18 yellow 0 18.0 18 \n", + "3 #14 14 orange 2 14.0 14 \n", + "4 #12 100 purple 1 12.0 12 \n", + "5 #10 XYZ blue 0 10.0 10 \n", + "6 #35 35 pink 2 35.0 35 \n", + "\n", + " numbers_numeric \n", + "0 23 \n", + "1 24 \n", + "2 18 \n", + "3 14 \n", + "4 12 \n", + "5 10 \n", + "6 35 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fill the missing values with a specific value\n", + "df.fillna(value=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0#2323green023.02323
1#2424red124.02424
2#1818yellow018.01818
3#1414orange214.01414
4#12XYZpurple112.01212
5#10XYZblue010.01010
6#3535pink235.03535
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 #23 23 green 0 23.0 23 \n", + "1 #24 24 red 1 24.0 24 \n", + "2 #18 18 yellow 0 18.0 18 \n", + "3 #14 14 orange 2 14.0 14 \n", + "4 #12 XYZ purple 1 12.0 12 \n", + "5 #10 XYZ blue 0 10.0 10 \n", + "6 #35 35 pink 2 35.0 35 \n", + "\n", + " numbers_numeric \n", + "0 23 \n", + "1 24 \n", + "2 18 \n", + "3 14 \n", + "4 12 \n", + "5 10 \n", + "6 35 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# use the _next_ valid observation to fill the missing data\n", + "df.fillna(method=\"bfill\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columnnumbers_loopnumbers_strnumbers_numeric
0#2323green023.02323
1#2424red124.02424
2#1818yellow018.01818
3#1414orange214.01414
4#1214purple112.01212
5#10XYZblue010.01010
6#3535pink235.03535
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column numbers_loop numbers_str \\\n", + "0 #23 23 green 0 23.0 23 \n", + "1 #24 24 red 1 24.0 24 \n", + "2 #18 18 yellow 0 18.0 18 \n", + "3 #14 14 orange 2 14.0 14 \n", + "4 #12 14 purple 1 12.0 12 \n", + "5 #10 XYZ blue 0 10.0 10 \n", + "6 #35 35 pink 2 35.0 35 \n", + "\n", + " numbers_numeric \n", + "0 23 \n", + "1 24 \n", + "2 18 \n", + "3 14 \n", + "4 12 \n", + "5 10 \n", + "6 35 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# use the _previous_ valid observation to fill missing data\n", + "df.fillna(method=\"ffill\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will see more examples of dealing with missing data in future\n", + "chapters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Study\n", + "\n", + "We will now use data from an\n", + "[article](https://www.nytimes.com/interactive/2015/02/17/upshot/what-do-people-actually-order-at-chipotle.html)\n", + "written by The Upshot at the NYTimes.\n", + "\n", + "This data has order information from almost 2,000 Chipotle orders and\n", + "includes information on what was ordered and how much it cost." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "\n", + " choice_description item_price \n", + "0 NaN $2.39 \n", + "1 [Clementine] $3.39 \n", + "2 [Apple] $3.39 \n", + "3 NaN $2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipotle = qeds.data.load(\"chipotle_raw\")\n", + "chipotle.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Appendix: Performance of `.str` Methods\n", + "\n", + "Let’s repeat the “remove the `#`” example from above, but this time on\n", + "a much larger dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
floatsstrings
015.5215.52%
197.9897.98%
277.8177.81%
372.9972.99%
428.4728.47%
\n", + "
" + ], + "text/plain": [ + " floats strings\n", + "0 15.52 15.52%\n", + "1 97.98 97.98%\n", + "2 77.81 77.81%\n", + "3 72.99 72.99%\n", + "4 28.47 28.47%" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "test = pd.DataFrame({\"floats\": np.round(100*np.random.rand(100000), 2)})\n", + "test[\"strings\"] = test[\"floats\"].astype(str) + \"%\"\n", + "test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 19.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "for row in test.iterrows():\n", + " index_value, column_values = row\n", + " clean_number = column_values[\"strings\"].replace(\"%\", \"\")\n", + " test.at[index_value, \"numbers_loop\"] = clean_number" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 62.3 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "test[\"numbers_str_method\"] = test[\"strings\"].str.replace(\"%\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test[\"numbers_str_method\"].equals(test[\"numbers_loop\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We got the exact same result in a fraction of the time!" 
+ ] + } + ], + "metadata": { + "date": 1584040759.478476, + "filename": "data_clean.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "Cleaning Data" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/5_data_clean_exercises.ipynb b/Session_7/5_data_clean_exercises.ipynb new file mode 100644 index 0000000..746e4e2 --- /dev/null +++ b/Session_7/5_data_clean_exercises.ipynb @@ -0,0 +1,915 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already 
satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (1.12.0)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement 
already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n" + ] + } + ], + "source": [ + "! 
pip install qeds\n", + "import pandas as pd\n", + "import numpy as np\n", + "import qeds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cleaning Data - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Convert the string below into a number.**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "39.0\n" + ] + } + ], + "source": [ + "c2n = \"#39\"\n", + "\n", + "new_c2n = c2n.replace(\"#\", \"\")\n", + "\n", + "print(float(new_c2n))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercises 2-3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For these exercises, we create the following DataFrame:**" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_column
0#2323green0
1#2424red1
2#1818yellow0
3#1414orange2
4#12NaNpurple1
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column\n", + "0 #23 23 green 0\n", + "1 #24 24 red 1\n", + "2 #18 18 yellow 0\n", + "3 #14 14 orange 2\n", + "4 #12 NaN purple 1" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\"numbers\": [\"#23\", \"#24\", \"#18\", \"#14\", \"#12\", \"#10\", \"#35\"],\n", + " \"nums\": [\"23\", \"24\", \"18\", \"14\", np.nan, \"XYZ\", \"35\"],\n", + " \"colors\": [\"green\", \"red\", \"yellow\", \"orange\", \"purple\", \"blue\", \"pink\"],\n", + " \"other_column\": [0, 1, 0, 2, 1, 0, 2]})\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " **Make a new column called `colors_upper` that contains the elements of `colors` with all uppercase letters.**" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columncolors_upper
0#2323green0GREEN
1#2424red1RED
2#1818yellow0YELLOW
3#1414orange2ORANGE
4#12NaNpurple1PURPLE
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column colors_upper\n", + "0 #23 23 green 0 GREEN\n", + "1 #24 24 red 1 RED\n", + "2 #18 18 yellow 0 YELLOW\n", + "3 #14 14 orange 2 ORANGE\n", + "4 #12 NaN purple 1 PURPLE" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['colors_upper']=df['colors'].str.upper()\n", + "df.head()\n", + "\n", + "#if we want only the first letter as capital letter \n", + "#df['colors_upper'] = df[\"colors\"].str.capitalize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Convert the column `\"nums\"` to a numeric type using `pd.to_numeric` and save it to the DataFrame as `\"nums_tonumeric\"`.**\n", + "\n", + " - Notice that there is a missing value, and a value that is not a number. Look at the documentation for `pd.to_numeric` and think about how to overcome this." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbersnumscolorsother_columncolors_uppernums_tonumeric
0#2323green0GREEN23.0
1#2424red1RED24.0
2#1818yellow0YELLOW18.0
3#1414orange2ORANGE14.0
4#12NaNpurple1PURPLENaN
5#10XYZblue0BLUENaN
6#3535pink2PINK35.0
\n", + "
" + ], + "text/plain": [ + " numbers nums colors other_column colors_upper nums_tonumeric\n", + "0 #23 23 green 0 GREEN 23.0\n", + "1 #24 24 red 1 RED 24.0\n", + "2 #18 18 yellow 0 YELLOW 18.0\n", + "3 #14 14 orange 2 ORANGE 14.0\n", + "4 #12 NaN purple 1 PURPLE NaN\n", + "5 #10 XYZ blue 0 BLUE NaN\n", + "6 #35 35 pink 2 PINK 35.0" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"nums_tonumeric\"] = pd.to_numeric(df[\"nums\"], errors='coerce')\n", + "df\n", + "# errors \"coerce\" means that we force the conversion of string to NaN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Think about why this could be a bad idea if used without knowing what your data looks like. (Think about what happens when you apply it to the `\"numbers\"` column before replacing the `\"#\"`.)**" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "#that would set all values to NaN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For this exercise, we use data from an article written by The Upshot at the NYTimes, which has order information from almost 2,000 Chipotle orders and includes information on what was ordered and how much it cost.**" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "\n", + " choice_description item_price \n", + "0 NaN $2.39 \n", + "1 [Clementine] $3.39 \n", + "2 [Apple] $3.39 \n", + "3 NaN $2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipotle = qeds.data.load(\"chipotle_raw\")\n", + "chipotle.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Make sure the `item_price` column has a numeric `dtype` (probably float).**" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_priceitem_price_numeric
011Chips and Fresh Tomato SalsaNaN$2.392.39
111Izze[Clementine]$3.393.39
211Nantucket Nectar[Apple]$3.393.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.392.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.9816.98
\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "\n", + " choice_description item_price \\\n", + "0 NaN $2.39 \n", + "1 [Clementine] $3.39 \n", + "2 [Apple] $3.39 \n", + "3 NaN $2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 \n", + "\n", + " item_price_numeric \n", + "0 2.39 \n", + "1 3.39 \n", + "2 3.39 \n", + "3 2.39 \n", + "4 16.98 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipotle[\"item_price_numeric\"] = pd.to_numeric(chipotle[\"item_price\"].str.replace(\"$\", \"\"))\n", + "chipotle.dtypes\n", + "chipotle.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **What is the average price of an item with chicken?**" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10.133724358974309" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipotle.loc[chipotle['item_name'].str.match('Chicken'),\"item_price_numeric\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **What is the average price of an item with steak?** " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10.518888888888851" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipotle.loc[chipotle['item_name'].str.match('Steak'),\"item_price_numeric\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. 
**Did chicken or steak produce more revenue (total)?** " + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipotle.loc[chipotle['item_name'].str.match('Chicken'),\"item_price_numeric\"].sum()>chipotle.loc[chipotle['item_name'].str.match('Steak'),\"item_price_numeric\"].sum()\n", + "\n", + "#we create a boolean which confirms that chicken produces more total revenue compared to steak" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Product containing steak generated a revenue of 8072.619999999973 dollars\n", + "Product containing chicken generated a revenue of 17742.14999999992 dollars\n" + ] + } + ], + "source": [ + "#Alternative solution:\n", + "chipotle['revenue']=chipotle['item_price_numeric']*chipotle['quantity']\n", + "\n", + "steak= chipotle[chipotle['item_name'].str.contains(pat='Steak')].sum()\n", + "print('Product containing steak generated a revenue of', steak.loc['revenue'], 'dollars')\n", + "steak= chipotle[chipotle['item_name'].str.contains(pat='Chicken')].sum()\n", + "print('Product containing chicken generated a revenue of', steak.loc['revenue'], 'dollars')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "5. **How many missing items are there in this dataset? 
How many missing items in each column?**" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1246" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#in the dataset\n", + "chipotle.isnull().sum().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "order_id 0\n", + "quantity 0\n", + "item_name 0\n", + "choice_description 1246\n", + "item_price 0\n", + "item_price_numeric 0\n", + "revenue 0\n", + "dtype: int64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#in each column\n", + "chipotle.isnull().sum()" + ] + } + ], + "metadata": { + "date": 1584040759.478476, + "filename": "data_clean.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "Cleaning Data" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/6_reshape.ipynb b/Session_7/6_reshape.ipynb new file mode 100644 index 0000000..c7a6948 --- /dev/null +++ b/Session_7/6_reshape.ipynb @@ -0,0 +1,5296 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reshape\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [pandas intro](https://datascience.quantecon.org/intro.html) \n", + "- [pandas basics](https://datascience.quantecon.org/basics.html) \n", + "- [Importance of index](https://datascience.quantecon.org/the_index.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- Understand and be able to apply the `melt`/`stack`/`unstack`/`pivot` methods \n", + "- 
Practice transformations of indices \n", + "- Understand tidy data " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) 
(0.8.1)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.12.0)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2.8.0)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages 
(from requests->qeds) (2.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n" + ] + } + ], + "source": [ + "# Uncomment following line to install on colab\n", + "! 
pip install qeds" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "%matplotlib inline\n", + "# activate plot theme\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [Reshape](#Reshape) \n", + " - [Tidy Data](#Tidy-Data) \n", + " - [Reshaping your Data](#Reshaping-your-Data) \n", + " - [Long vs Wide](#Long-vs-Wide) \n", + " - [`set_index`, `reset_index`, and Transpose](#`set_index`,-`reset_index`,-and-Transpose) \n", + " - [`stack` and `unstack`](#`stack`-and-`unstack`) \n", + " - [`melt`](#`melt`) \n", + " - [`pivot` and `pivot_table`](#`pivot`-and-`pivot_table`) \n", + " - [Visualizing Reshaping](#Visualizing-Reshaping) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tidy Data\n", + "\n", + "While pushed more generally in the `R` language, the concept of “[tidy data](https://en.wikipedia.org/wiki/Tidy_data)” is helpful in understanding the\n", + "objectives for reshaping data, which in turn makes advanced features like\n", + "[GroupBy](https://datascience.quantecon.org/groupby.html) more seamless.\n", + "\n", + "Hadley Wickham gives a terminology slightly better-adapted for the experimental\n", + "sciences, but nevertheless useful for the social sciences.\n", + "\n", + "> A dataset is a collection of values, usually either numbers (if\n", + "quantitative) or strings (if qualitative). Values are organized in two\n", + "ways. Every value belongs to a variable and an observation. A variable\n", + "contains all values that measure the same underlying attribute (like\n", + "height, temperature, duration) across units. An observation contains all\n", + "values measured on the same unit (like a person, or a day, or a race)\n", + "across attributes. 
– [Tidy Data (Journal of Statistical Software 2013)](https://www.jstatsoft.org/index.php/jss/article/view/v059i10/v59i10.pdf)\n", + "\n", + "\n", + "With this framing,\n", + "\n", + "> A dataset is messy or tidy depending on how rows, columns and tables are\n", + "matched with observations, variables, and types. In tidy data:\n", + "1. Each variable forms a column.\n", + "2. Each observation forms a row.\n", + "3. Each type of observational unit forms a table.\n", + "\n", + "\n", + "The “column” and “row” terms map directly to pandas columns and rows, while the\n", + "“table” maps to a pandas DataFrame.\n", + "\n", + "With this thinking and interpretation, it becomes essential to think through\n", + "what uniquely identifies an “observation” in your data.\n", + "\n", + "Is it a country? A year? A combination of country and year?\n", + "\n", + "These will become the indices of your DataFrame.\n", + "\n", + "For those with more of a database background, the “tidy” format matches the\n", + "[3rd normal form](https://en.wikipedia.org/wiki/Third_normal_form) in\n", + "database theory, where the referential integrity of the database is maintained\n", + "by the uniqueness of the index.\n", + "\n", + "When considering how to map this to the social sciences, note that\n", + "reshaping data can change what we consider to be the variable and\n", + "observation in a way that doesn’t occur within the natural sciences.\n", + "\n", + "For example, if the “observation” is uniquely identified by a country and year and\n", + "the “variable” is GDP, you may wish to reshape it so that the “observation” is a\n", + "country, and the variables are a GDP for each year.\n", + "\n", + "A word of caution: The tidy approach, where there is no redundancy and each\n", + "type of observational unit forms a table, is a good approach for storing data,\n", + "but you will frequently reshape/merge/etc. in order to make graphing or\n", + "analysis easier. 
This doesn’t break the tidy format since those examples are\n", + "ephemeral states used in analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reshaping your Data\n", + "\n", + "The data you receive is not always in a “shape” that makes it easy to analyze.\n", + "\n", + "What do we mean by shape? The number of rows and columns in a\n", + "DataFrame and how information is stored in the index and column names.\n", + "\n", + "This lecture will teach you the basic concepts of reshaping data.\n", + "\n", + "As with other topics, we recommend reviewing the [pandas\n", + "documentation](https://pandas.pydata.org/pandas-docs/stable/reshaping.html)\n", + "on this subject for additional information.\n", + "\n", + "We will keep our discussion here as brief and simple as possible because\n", + "these tools will reappear in subsequent lectures." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 9 entries, 0 to 8\n", + "Data columns (total 8 columns):\n", + "Year 9 non-null int64\n", + "Player 9 non-null object\n", + "Team 9 non-null object\n", + "TeamName 9 non-null object\n", + "Games 9 non-null int64\n", + "Pts 9 non-null float64\n", + "Assist 9 non-null float64\n", + "Rebound 9 non-null float64\n", + "dtypes: float64(3), int64(2), object(3)\n", + "memory usage: 704.0+ bytes\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNameGamesPtsAssistRebound
02015CurryGSWWarriors7930.16.75.4
12016CurryGSWWarriors7925.36.64.5
22017CurryGSWWarriors5126.46.15.1
32015DurantOKCThunder7228.25.08.2
42016DurantGSWWarriors6225.14.88.3
52017DurantGSWWarriors6826.45.46.8
62015IbakaOKCThunder7812.60.86.8
72016IbakaORLMagic5615.11.16.8
82016IbakaTORRaptors2314.20.76.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName Games Pts Assist Rebound\n", + "0 2015 Curry GSW Warriors 79 30.1 6.7 5.4\n", + "1 2016 Curry GSW Warriors 79 25.3 6.6 4.5\n", + "2 2017 Curry GSW Warriors 51 26.4 6.1 5.1\n", + "3 2015 Durant OKC Thunder 72 28.2 5.0 8.2\n", + "4 2016 Durant GSW Warriors 62 25.1 4.8 8.3\n", + "5 2017 Durant GSW Warriors 68 26.4 5.4 6.8\n", + "6 2015 Ibaka OKC Thunder 78 12.6 0.8 6.8\n", + "7 2016 Ibaka ORL Magic 56 15.1 1.1 6.8\n", + "8 2016 Ibaka TOR Raptors 23 14.2 0.7 6.8" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://datascience.quantecon.org/assets/data/bball.csv\"\n", + "bball = pd.read_csv(url)\n", + "bball.info()\n", + "\n", + "bball" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Long vs Wide\n", + "\n", + "Many of these operations change between long and wide DataFrames.\n", + "\n", + "What does it mean for a DataFrame to be long or wide?\n", + "\n", + "Here is one possible long-form representation of our basketball data." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNamevariablevalue
02015CurryGSWWarriorsGames79.0
12016CurryGSWWarriorsGames79.0
22017CurryGSWWarriorsGames51.0
32015DurantOKCThunderGames72.0
42016DurantGSWWarriorsGames62.0
52017DurantGSWWarriorsGames68.0
62015IbakaOKCThunderGames78.0
72016IbakaORLMagicGames56.0
82016IbakaTORRaptorsGames23.0
92015CurryGSWWarriorsPts30.1
102016CurryGSWWarriorsPts25.3
112017CurryGSWWarriorsPts26.4
122015DurantOKCThunderPts28.2
132016DurantGSWWarriorsPts25.1
142017DurantGSWWarriorsPts26.4
152015IbakaOKCThunderPts12.6
162016IbakaORLMagicPts15.1
172016IbakaTORRaptorsPts14.2
182015CurryGSWWarriorsAssist6.7
192016CurryGSWWarriorsAssist6.6
202017CurryGSWWarriorsAssist6.1
212015DurantOKCThunderAssist5.0
222016DurantGSWWarriorsAssist4.8
232017DurantGSWWarriorsAssist5.4
242015IbakaOKCThunderAssist0.8
252016IbakaORLMagicAssist1.1
262016IbakaTORRaptorsAssist0.7
272015CurryGSWWarriorsRebound5.4
282016CurryGSWWarriorsRebound4.5
292017CurryGSWWarriorsRebound5.1
302015DurantOKCThunderRebound8.2
312016DurantGSWWarriorsRebound8.3
322017DurantGSWWarriorsRebound6.8
332015IbakaOKCThunderRebound6.8
342016IbakaORLMagicRebound6.8
352016IbakaTORRaptorsRebound6.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName variable value\n", + "0 2015 Curry GSW Warriors Games 79.0\n", + "1 2016 Curry GSW Warriors Games 79.0\n", + "2 2017 Curry GSW Warriors Games 51.0\n", + "3 2015 Durant OKC Thunder Games 72.0\n", + "4 2016 Durant GSW Warriors Games 62.0\n", + "5 2017 Durant GSW Warriors Games 68.0\n", + "6 2015 Ibaka OKC Thunder Games 78.0\n", + "7 2016 Ibaka ORL Magic Games 56.0\n", + "8 2016 Ibaka TOR Raptors Games 23.0\n", + "9 2015 Curry GSW Warriors Pts 30.1\n", + "10 2016 Curry GSW Warriors Pts 25.3\n", + "11 2017 Curry GSW Warriors Pts 26.4\n", + "12 2015 Durant OKC Thunder Pts 28.2\n", + "13 2016 Durant GSW Warriors Pts 25.1\n", + "14 2017 Durant GSW Warriors Pts 26.4\n", + "15 2015 Ibaka OKC Thunder Pts 12.6\n", + "16 2016 Ibaka ORL Magic Pts 15.1\n", + "17 2016 Ibaka TOR Raptors Pts 14.2\n", + "18 2015 Curry GSW Warriors Assist 6.7\n", + "19 2016 Curry GSW Warriors Assist 6.6\n", + "20 2017 Curry GSW Warriors Assist 6.1\n", + "21 2015 Durant OKC Thunder Assist 5.0\n", + "22 2016 Durant GSW Warriors Assist 4.8\n", + "23 2017 Durant GSW Warriors Assist 5.4\n", + "24 2015 Ibaka OKC Thunder Assist 0.8\n", + "25 2016 Ibaka ORL Magic Assist 1.1\n", + "26 2016 Ibaka TOR Raptors Assist 0.7\n", + "27 2015 Curry GSW Warriors Rebound 5.4\n", + "28 2016 Curry GSW Warriors Rebound 4.5\n", + "29 2017 Curry GSW Warriors Rebound 5.1\n", + "30 2015 Durant OKC Thunder Rebound 8.2\n", + "31 2016 Durant GSW Warriors Rebound 8.3\n", + "32 2017 Durant GSW Warriors Rebound 6.8\n", + "33 2015 Ibaka OKC Thunder Rebound 6.8\n", + "34 2016 Ibaka ORL Magic Rebound 6.8\n", + "35 2016 Ibaka TOR Raptors Rebound 6.8" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Don't worry about what this command does -- We'll see it soon\n", + "bball_long = bball.melt(id_vars=[\"Year\", \"Player\", \"Team\", \"TeamName\"])\n", + "\n", + "bball_long" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "And here is a wide-form version." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
variableAssistGamesPtsReboundAssistGamesPts...AssistGamesPtsRebound
TeamGSWGSWGSWGSWGSWOKCGSWOKCGSWOKC...TOROKCORLTOROKCORLTOROKCORLTOR
Year
20156.779.030.15.4NaN5.0NaN72.0NaN28.2...NaN78.0NaNNaN12.6NaNNaN6.8NaNNaN
20166.679.025.34.54.8NaN62.0NaN25.1NaN...0.7NaN56.023.0NaN15.114.2NaN6.86.8
20176.151.026.45.15.4NaN68.0NaN26.4NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant ... \\\n", + "variable Assist Games Pts Rebound Assist Games Pts ... \n", + "Team GSW GSW GSW GSW GSW OKC GSW OKC GSW OKC ... \n", + "Year ... \n", + "2015 6.7 79.0 30.1 5.4 NaN 5.0 NaN 72.0 NaN 28.2 ... \n", + "2016 6.6 79.0 25.3 4.5 4.8 NaN 62.0 NaN 25.1 NaN ... \n", + "2017 6.1 51.0 26.4 5.1 5.4 NaN 68.0 NaN 26.4 NaN ... \n", + "\n", + "Player Ibaka \n", + "variable Assist Games Pts Rebound \n", + "Team TOR OKC ORL TOR OKC ORL TOR OKC ORL TOR \n", + "Year \n", + "2015 NaN 78.0 NaN NaN 12.6 NaN NaN 6.8 NaN NaN \n", + "2016 0.7 NaN 56.0 23.0 NaN 15.1 14.2 NaN 6.8 6.8 \n", + "2017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Again, don't worry about this command... We'll see it soon too\n", + "bball_wide = bball_long.pivot_table(\n", + " index=\"Year\",\n", + " columns=[\"Player\", \"variable\", \"Team\"],\n", + " values=\"value\"\n", + ")\n", + "bball_wide" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `set_index`, `reset_index`, and Transpose\n", + "\n", + "We have already seen a few basic methods for reshaping a\n", + "DataFrame.\n", + "\n", + "- `set_index`: Move one or more columns into the index. \n", + "- `reset_index`: Move one or more index levels out of the index and make\n", + " them either columns or drop from DataFrame. \n", + "- `T`: Swap row and column labels. \n", + "\n", + "\n", + "Sometimes, the simplest approach is the right approach.\n", + "\n", + "Let’s review them briefly." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TeamTeamNameGamesPtsAssistRebound
PlayerYear
Curry2015GSWWarriors7930.16.75.4
2016GSWWarriors7925.36.64.5
2017GSWWarriors5126.46.15.1
Durant2015OKCThunder7228.25.08.2
2016GSWWarriors6225.14.88.3
\n", + "
" + ], + "text/plain": [ + " Team TeamName Games Pts Assist Rebound\n", + "Player Year \n", + "Curry 2015 GSW Warriors 79 30.1 6.7 5.4\n", + " 2016 GSW Warriors 79 25.3 6.6 4.5\n", + " 2017 GSW Warriors 51 26.4 6.1 5.1\n", + "Durant 2015 OKC Thunder 72 28.2 5.0 8.2\n", + " 2016 GSW Warriors 62 25.1 4.8 8.3" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball2 = bball.set_index([\"Player\", \"Year\"])\n", + "bball2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
Year201520162017201520162017201520162016
TeamGSWGSWGSWOKCGSWGSWOKCORLTOR
TeamNameWarriorsWarriorsWarriorsThunderWarriorsWarriorsThunderMagicRaptors
Games797951726268785623
Pts30.125.326.428.225.126.412.615.114.2
Assist6.76.66.154.85.40.81.10.7
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka \\\n", + "Year 2015 2016 2017 2015 2016 2017 2015 \n", + "Team GSW GSW GSW OKC GSW GSW OKC \n", + "TeamName Warriors Warriors Warriors Thunder Warriors Warriors Thunder \n", + "Games 79 79 51 72 62 68 78 \n", + "Pts 30.1 25.3 26.4 28.2 25.1 26.4 12.6 \n", + "Assist 6.7 6.6 6.1 5 4.8 5.4 0.8 \n", + "\n", + "Player \n", + "Year 2016 2016 \n", + "Team ORL TOR \n", + "TeamName Magic Raptors \n", + "Games 56 23 \n", + "Pts 15.1 14.2 \n", + "Assist 1.1 0.7 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball3 = bball2.T\n", + "bball3.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `stack` and `unstack`\n", + "\n", + "The `stack` and `unstack` methods operate directly on the index\n", + "and/or column labels." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `stack`\n", + "\n", + "`stack` is used to move certain levels of the column labels into the\n", + "index (i.e. moving from wide to long)\n", + "\n", + "Let’s take `ball_wide` as an example." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
variableAssistGamesPtsReboundAssistGamesPts...AssistGamesPtsRebound
TeamGSWGSWGSWGSWGSWOKCGSWOKCGSWOKC...TOROKCORLTOROKCORLTOROKCORLTOR
Year
20156.779.030.15.4NaN5.0NaN72.0NaN28.2...NaN78.0NaNNaN12.6NaNNaN6.8NaNNaN
20166.679.025.34.54.8NaN62.0NaN25.1NaN...0.7NaN56.023.0NaN15.114.2NaN6.86.8
20176.151.026.45.15.4NaN68.0NaN26.4NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant ... \\\n", + "variable Assist Games Pts Rebound Assist Games Pts ... \n", + "Team GSW GSW GSW GSW GSW OKC GSW OKC GSW OKC ... \n", + "Year ... \n", + "2015 6.7 79.0 30.1 5.4 NaN 5.0 NaN 72.0 NaN 28.2 ... \n", + "2016 6.6 79.0 25.3 4.5 4.8 NaN 62.0 NaN 25.1 NaN ... \n", + "2017 6.1 51.0 26.4 5.1 5.4 NaN 68.0 NaN 26.4 NaN ... \n", + "\n", + "Player Ibaka \n", + "variable Assist Games Pts Rebound \n", + "Team TOR OKC ORL TOR OKC ORL TOR OKC ORL TOR \n", + "Year \n", + "2015 NaN 78.0 NaN NaN 12.6 NaN NaN 6.8 NaN NaN \n", + "2016 0.7 NaN 56.0 23.0 NaN 15.1 14.2 NaN 6.8 6.8 \n", + "2017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_wide" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose that we want to be able to use the `mean` method to compute the\n", + "average value of each stat for each player, regardless of year or team.\n", + "\n", + "To do that, we need two column levels: one for the player and one for the variable.\n", + "\n", + "We can achieve this using the `stack` method." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "hide-output": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
variableAssistGamesPtsReboundAssistGamesPtsReboundAssistGamesPtsRebound
YearTeam
2015GSW6.779.030.15.4NaNNaNNaNNaNNaNNaNNaNNaN
OKCNaNNaNNaNNaN5.072.028.28.20.878.012.66.8
2016GSW6.679.025.34.54.862.025.18.3NaNNaNNaNNaN
ORLNaNNaNNaNNaNNaNNaNNaNNaN1.156.015.16.8
TORNaNNaNNaNNaNNaNNaNNaNNaN0.723.014.26.8
2017GSW6.151.026.45.15.468.026.46.8NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka \\\n", + "variable Assist Games Pts Rebound Assist Games Pts Rebound Assist Games \n", + "Year Team \n", + "2015 GSW 6.7 79.0 30.1 5.4 NaN NaN NaN NaN NaN NaN \n", + " OKC NaN NaN NaN NaN 5.0 72.0 28.2 8.2 0.8 78.0 \n", + "2016 GSW 6.6 79.0 25.3 4.5 4.8 62.0 25.1 8.3 NaN NaN \n", + " ORL NaN NaN NaN NaN NaN NaN NaN NaN 1.1 56.0 \n", + " TOR NaN NaN NaN NaN NaN NaN NaN NaN 0.7 23.0 \n", + "2017 GSW 6.1 51.0 26.4 5.1 5.4 68.0 26.4 6.8 NaN NaN \n", + "\n", + "Player \n", + "variable Pts Rebound \n", + "Year Team \n", + "2015 GSW NaN NaN \n", + " OKC 12.6 6.8 \n", + "2016 GSW NaN NaN \n", + " ORL 15.1 6.8 \n", + " TOR 14.2 6.8 \n", + "2017 GSW NaN NaN " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_wide.stack()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can compute the statistic we are after." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Player variable\n", + "Curry Assist 6.466667\n", + " Games 69.666667\n", + " Pts 27.266667\n", + " Rebound 5.000000\n", + "Durant Assist 5.066667\n", + " Games 67.333333\n", + " Pts 26.566667\n", + " Rebound 7.766667\n", + "Ibaka Assist 0.866667\n", + " Games 52.333333\n", + " Pts 13.966667\n", + " Rebound 6.800000\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "player_stats = bball_wide.stack().mean()\n", + "player_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now suppose instead of that we wanted to compute the average for each team and\n", + "stat, averaging over years and players.\n", + "\n", + "We’d need to move the `Player` level down into the index so we are\n", + "left with column levels for Team and variable.\n", + "\n", + "We can 
ask pandas to do this using the `level` keyword argument." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variableAssistGamesPtsRebound
TeamGSWOKCORLTORGSWOKCORLTORGSWOKCORLTORGSWOKCORLTOR
YearPlayer
2015Curry6.7NaNNaNNaN79.0NaNNaNNaN30.1NaNNaNNaN5.4NaNNaNNaN
DurantNaN5.0NaNNaNNaN72.0NaNNaNNaN28.2NaNNaNNaN8.2NaNNaN
IbakaNaN0.8NaNNaNNaN78.0NaNNaNNaN12.6NaNNaNNaN6.8NaNNaN
2016Curry6.6NaNNaNNaN79.0NaNNaNNaN25.3NaNNaNNaN4.5NaNNaNNaN
Durant4.8NaNNaNNaN62.0NaNNaNNaN25.1NaNNaNNaN8.3NaNNaNNaN
IbakaNaNNaN1.10.7NaNNaN56.023.0NaNNaN15.114.2NaNNaN6.86.8
2017Curry6.1NaNNaNNaN51.0NaNNaNNaN26.4NaNNaNNaN5.1NaNNaNNaN
Durant5.4NaNNaNNaN68.0NaNNaNNaN26.4NaNNaNNaN6.8NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + "variable Assist Games Pts \\\n", + "Team GSW OKC ORL TOR GSW OKC ORL TOR GSW OKC ORL \n", + "Year Player \n", + "2015 Curry 6.7 NaN NaN NaN 79.0 NaN NaN NaN 30.1 NaN NaN \n", + " Durant NaN 5.0 NaN NaN NaN 72.0 NaN NaN NaN 28.2 NaN \n", + " Ibaka NaN 0.8 NaN NaN NaN 78.0 NaN NaN NaN 12.6 NaN \n", + "2016 Curry 6.6 NaN NaN NaN 79.0 NaN NaN NaN 25.3 NaN NaN \n", + " Durant 4.8 NaN NaN NaN 62.0 NaN NaN NaN 25.1 NaN NaN \n", + " Ibaka NaN NaN 1.1 0.7 NaN NaN 56.0 23.0 NaN NaN 15.1 \n", + "2017 Curry 6.1 NaN NaN NaN 51.0 NaN NaN NaN 26.4 NaN NaN \n", + " Durant 5.4 NaN NaN NaN 68.0 NaN NaN NaN 26.4 NaN NaN \n", + "\n", + "variable Rebound \n", + "Team TOR GSW OKC ORL TOR \n", + "Year Player \n", + "2015 Curry NaN 5.4 NaN NaN NaN \n", + " Durant NaN NaN 8.2 NaN NaN \n", + " Ibaka NaN NaN 6.8 NaN NaN \n", + "2016 Curry NaN 4.5 NaN NaN NaN \n", + " Durant NaN 8.3 NaN NaN NaN \n", + " Ibaka 14.2 NaN NaN 6.8 6.8 \n", + "2017 Curry NaN 5.1 NaN NaN NaN \n", + " Durant NaN 6.8 NaN NaN NaN " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_wide.stack(level=\"Player\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can compute the mean." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "variable Team\n", + "Assist GSW 5.92\n", + " OKC 2.90\n", + " ORL 1.10\n", + " TOR 0.70\n", + "Games GSW 67.80\n", + " OKC 75.00\n", + " ORL 56.00\n", + " TOR 23.00\n", + "Pts GSW 26.66\n", + " OKC 20.40\n", + " ORL 15.10\n", + " TOR 14.20\n", + "Rebound GSW 6.02\n", + " OKC 7.50\n", + " ORL 6.80\n", + " TOR 6.80\n", + "dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_wide.stack(level=\"Player\").mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice a few features of the `stack` method:\n", + "\n", + "- Without any arguments, the `stack` method moves the level of column\n", + " labels closest to the data (also called inner-most or bottom level of labels)\n", + " to become the index level closest to the data (also called the inner-most or\n", + " right-most level of the index). In our example, this moved `Team` down from\n", + " columns to the index. \n", + "- When we do pass a level, that level of column labels is moved down to the\n", + " right-most level of the index and all other column labels stay in their\n", + " relative position. \n", + "\n", + "\n", + "Note that we can also move multiple levels at a time in one call to `stack`." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variableAssistGamesPtsRebound
YearPlayerTeam
2015CurryGSW6.779.030.15.4
DurantOKC5.072.028.28.2
IbakaOKC0.878.012.66.8
2016CurryGSW6.679.025.34.5
DurantGSW4.862.025.18.3
IbakaORL1.156.015.16.8
TOR0.723.014.26.8
2017CurryGSW6.151.026.45.1
DurantGSW5.468.026.46.8
\n", + "
" + ], + "text/plain": [ + "variable Assist Games Pts Rebound\n", + "Year Player Team \n", + "2015 Curry GSW 6.7 79.0 30.1 5.4\n", + " Durant OKC 5.0 72.0 28.2 8.2\n", + " Ibaka OKC 0.8 78.0 12.6 6.8\n", + "2016 Curry GSW 6.6 79.0 25.3 4.5\n", + " Durant GSW 4.8 62.0 25.1 8.3\n", + " Ibaka ORL 1.1 56.0 15.1 6.8\n", + " TOR 0.7 23.0 14.2 6.8\n", + "2017 Curry GSW 6.1 51.0 26.4 5.1\n", + " Durant GSW 5.4 68.0 26.4 6.8" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_wide.stack(level=[\"Player\", \"Team\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the example above, we started with one level on the index (just the year) and\n", + "stacked two levels to end up with a three-level index.\n", + "\n", + "Notice that the two new index levels went closer to the data than the existing\n", + "level and that their order matched the order we passed in our list argument to\n", + "`level`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `unstack`\n", + "\n", + "Now suppose that we wanted to see a bar chart of each player’s stats.\n", + "\n", + "This chart should have one “section” for each player and a different colored\n", + "bar for each variable.\n", + "\n", + "As we’ll learn in more detail in a later lecture, we will\n", + "need to have the player’s name on the index and the variables as columns to do this.\n", + "\n", + ">**Note**\n", + ">\n", + ">In general, for a DataFrame, calling the `plot` method will put the index\n", + "on the horizontal (x) axis and make a new line/bar/etc. for each column.\n", + "\n", + "Notice that we are close to that with the `player_stats` variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Player variable\n", + "Curry Assist 6.466667\n", + " Games 69.666667\n", + " Pts 27.266667\n", + " Rebound 5.000000\n", + "Durant Assist 5.066667\n", + " Games 67.333333\n", + " Pts 26.566667\n", + " Rebound 7.766667\n", + "Ibaka Assist 0.866667\n", + " Games 52.333333\n", + " Pts 13.966667\n", + " Rebound 6.800000\n", + "dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "player_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now need to rotate the variable level of the index up to be column layers.\n", + "\n", + "We use the `unstack` method for this." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variableAssistGamesPtsRebound
Player
Curry6.46666769.66666727.2666675.000000
Durant5.06666767.33333326.5666677.766667
Ibaka0.86666752.33333313.9666676.800000
\n", + "
" + ], + "text/plain": [ + "variable Assist Games Pts Rebound\n", + "Player \n", + "Curry 6.466667 69.666667 27.266667 5.000000\n", + "Durant 5.066667 67.333333 26.566667 7.766667\n", + "Ibaka 0.866667 52.333333 13.966667 6.800000" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "player_stats.unstack()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can make our plot!" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEoCAYAAACjGLHcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de3hU5bn38e/kAEEgBJHzQQSRg76AiEDltXi6tYoV3FZtdbd4xMsi3VgUrW2Vui81HIq6EW3RVGN1V2m1G90U8X6t1Q0VKepWBHWLiOGk4RBCNueEef+YCSaYkJlkJpNZ+X2uiyuz1qy15p6s8MuTZz3zrFA4HEZERIIlI9UFiIhI4incRUQCSOEuIhJACncRkQBKebgXFRWFi4qKdFVXRCSBslJdQKWdO3cGNuDLyspo27ZtqsuQetL5S19BP3d5eXmh2p5LectdREQST+EuIhJACncRkQBSuIuIBFCTuaAqIsFWXl7O9u3bOXjwYKO95qFDhygtLW2010uW7OxsOnToQFZW7JGtcBeRRrF9+3ZatWpFp06dCIVqHeSRUBUVFWRmZjbKayVLOBymrKyM7du307lz55j3qzPczaw/8HyVVX2Au4Gno+t7A+uBK9y9JI6aRaQZOXjwYKMGe1CEQiHatm3Lrl274tqvzj53d//E3Ye6+1DgNGAP8GfgTuA1d+8HvBZdFhGplYK9furzfYv3guq5wGfu/gUwDiiMri8Exsf96iIikhTx9rl/H/hD9HFnd98C4O5bzKzTkRub2URg4tEOWFBQAEQ+SRZUlX1mkp50/hLj0KFDVFRUNPrr1vaa+fn5TJ48mdatW9e674QJEygsLPzG+kcffZRhw4YxatSohNVZl0OHDn3j5zAvL6/W7WMOdzNrAVwC/CzWfdx9PjD/aNtUziuTDh8RXnvV5Hrt1/m396fF+5OaBf0j7I2ltLS00S9u1nRBNRwOEw6H+fnPfx7TMWqqORQKkZGR0ajvJyMjI66fw3ha7hcC77r7V9Hlr8ysa7TV3hUojuNYIiIN8swzz9CxY0cuuOACABYsWEAoFGLNmjXs3r2b8vJyrrzySkaOHElxcTH3338/J598Mv/zP//DtGnTuOeee8jPzyc3N5eZM2eybds2Dh48yEUXXYSZHX6dwsJCVq9eTevWrZkyZQrt2rWrVsdnn31GYWEh+/btIzc3l0mTJtG+fftG/V7UJJ4+9x/wdZcM
wEvAhOjjCcDCRBUlIlKX0aNHs2zZssPLf//73znrrLO4/fbbmTlzJtOnT+f3v/89lbcS3bx5M2PGjGHWrFl07Nix2rFuvvlmZs6cSX5+PosXLz7c/bF//3769OnDzJkzGTRoEH/84x+r7VdeXs7vfvc7pk6dysyZMzn77LP5wx/+QFMQU8vdzI4BDLipyup8YIGZXQ8UAZcnvjwRkZqdcMIJ7Nq1ix07drBr1y7atGlD+/bteeqpp/joo48IhULs2LGDnTt3AnDcccdx0kkn1XisxYsXs2LFCgC2bdvGli1baNu2LaFQiDPOOAOAb3/728yaNavafps3b2bDhg3867/+KxDpF28KrXaIMdzdfQ/Q4Yh124mMnhERSYlRo0axfPlydu7cyejRo/mv//ovdu3axYwZM8jKyuLmm28+/InYnJycGo+xevVqPvjgA+677z5atmzJPffcU+unaI8ckhgOh+nRowf3339/Yt9YAmhuGRFJW5VdM8uXL2fUqFHs2bOHdu3akZWVxYcffsi2bdvqPMaePXto06YNLVu2ZNOmTXz66aeHnwuHwyxfvhyApUuXMmDAgGr7duvWjV27dvHJJ58AkW6aDRs2JPAd1p+mHxCRtNWzZ0/27t3LscceS/v27TnzzDPJz8/njjvuoHfv3nTr1q3OYwwdOpRXX32VqVOn0q1bN/r163f4uZYtW7JhwwamTZvGMcccw6233lpt3+zsbKZOncqTTz7Jnj17qKioYOzYsfTs2TPh7zVeocqLDalSORQyNzc3pXXEQkMh05vOX2pt3LiRHj16NOprBmFumUo1ff90JyYRkWZG4S4iEkAKdxGRAFK4i4gEkMJdRCSAFO4iIgGkce4ikhL9J72S0ON9Mu87MW339ttvM3v2bB566CG6d+8e12s899xzDBo0iMGDB9f4/IoVK+jatWuTGOeulruINCvLli1jwIAB1SYdi9X3v//9WoMdIuG+cePGhpSXMGq5i0izsXfvXj7++GOmT5/OjBkzuOKKKygpKeHBBx9kz549HDp0iBtvvJGTTjqJxx57jHXr1gFwzjnncPHFF/PII49w2mmn8a1vfYtnnnmGlStXkpmZyZAhQxg5ciQrV65kzZo1vPDCC9x222106dIlZe9V4S4izcY//vEPhg4dSrdu3WjTpg3r1q1j9erVDBkyhMsuu4yKigoOHDjA+vXr2bFjB3PmzAFg9+7d1Y5TVlbGihUrePjhhwmFQuzevZvWrVszfPjww+GfauqWEZFmY+nSpYwePRqITDq2dOlS+vbty+uvv86CBQsoKiqiVatWdO7cmeLiYgoKCnjvvfdo1apVteMcc8wxtGjRgscee4y3336bFi1apOLtHJVa7iLSLJSVlfHhhx+yYcMGQqEQhw4dAuCHP/wh9957L++++y5z585l3Lhxh2/q8f7777NkyRLeeustfvzjHx8+VmZmJg888ACrVq1i2bJlLF68mOnTp6fondVM4S4izcLy5csZM2YMN9309T2H7r77btasWcOAAQM477zz2L9/P+vWrePUU08lKyuLUaNG0blzZ+bNm1ftWHv37uXAgQMMGzaMfv36MXlyZFK6Vq1asW/fvkZ9X7VRuItISsQ6dDFRli5dyvjx46utGzVqFPPmzSMnJ4fMzExycnK45ZZb2LFjB48++ujh1v3VV19dbb99+/YxY8YMDh48SDgc5pprrgEiXT2/+c1v+Mtf/sLUqVNTekFVU/7GQVPGpjedv9TSlL8Noyl/RURE4S4iEkQx9bmbWR7wBHAKEAauAz4Bngd6A+uBK9y9JClViohIXGJtuT8MvOLuA4AhwEfAncBr7t4PeC26LCIiTUCd4W5mucC3gQIAdz/g7juBcUBhdLNCYHzNRxARkcZW52gZMxsKzAfWEGm1vwP8C7DJ3fOqbFfi7u2P2HciMPFoxy8oKDgNIBSq9aJvk1FeUlqv/TLzctPi/QWdzl9qlZSUxD0Lo3xt06ZN
tG9fLWLp2bNnrT+YsfS5ZwHDgMnu/raZPUyMXTDuPp/IL4ZaVQ6FTIehZmtvuqte+2koXdOg85dapaWl1YYl1ndoam1O/Pe531h35FDInTt38tRTT/Hpp5/SunVrsrKyGDduHCNHjkxoLcmQkZER189hLOG+Edjo7m9Hl/9EJNy/MrOu7r7FzLoCxXFXKyLSSMLhMDNnzuSss85iypQpAGzdupWVK1emuLLkqDPc3f1LM9tgZv3d/RPgXCJdNGuACUB+9OvCpFYqItIAH374IVlZWZx//vmH13Xs2JELL7yQ4uJi5s6dy/79+wG4/vrr6d+/P6tXr+b5558nLy+P9evXM2LECI4//ngWLVrEgQMHmDZtGl26dKG0tJTHH3+cbdu2AXDNNdcwYMAAVq9ezZNPPglEup7vvffeb0xCliyxTj8wGXjWzFoA64BriVyMXWBm1wNFwOXJKVFEpOE2bNhAnz59anyuXbt2/PKXv6RFixZs2bKFhx56iBkzZgDwxRdfcOutt9KmTRsmTZrEueeeS35+PosWLWLx4sVce+21PPnkk4wdO5aBAweydetW7rvvPh566CFefvllbrjhBgYMGMDevXsbdfbImMLd3f8bGF7DU+cmthwRkcbxxBNP8PHHH5OVlcUvf/lLCgoKWL9+PRkZGWzZsuXwdn379j18IbNLly4MGTIEgF69erF69WoAVq1aVe0OTHv27GHv3r3079+fwsJCzjzzTEaOHNlorXbQxGEi0kz07NmT5cuXH16+4YYb2LVrF3feeSeLFi0iLy+P2bNnEw6Hueqqqw5vl52dffhxKBQ6vJyRkUFFRQUQ6c+/7777aNmyZbXXvPTSSxk2bBjvvfced911F3fffXejjRjS9AMi0iyccsopHDx4kCVLlhxeV9nHvmfPHtq3b09GRgZvvvnm4dkgYzV48GBeeeXrG35//vnnAHz55Zccf/zxjB8/nr59+7Jp06YEvJPYqOUuIilR09DFZAqFQkybNo2nnnqKhQsXkpubS05ODldffTV9+vRh9uzZvPXWW5x88snfaIHX5brrruOJJ55g6tSpVFRUMGjQICZOnMiiRYtYvXo1GRkZ9OjRg1NPPTVJ7+6bNOVvHDRlbHrT+UstTfnbMJryV0REFO4iIkGkcBcRCSCFu4hIACncRUQCSOEuIhJAGucuIinR7qOzE3q80oGv17nNFVdcQa9evTh06BDdu3fnlltuoby8nKVLl3LBBRcktJ5UU8tdRJqNFi1aMHv2bObMmUNWVhavvvoqu3fvrvap1aBQy11EmqWBAwfyxRdf8Oyzz/Lll19y2223MXjwYL773e/y4IMPsmfPHg4dOsSNN97IwIEDU11u3BTuItLsVFRU8N577zF06FBOPfVUNmzYwOzZswF4+eWXGTJkCJdddhkVFRUcOHAgxdXWj8JdRJqNAwcOcNtttwGRlvs555xDSUlJtW369u3Lo48+SkVFBaeffjonnHBCKkptMIW7iDQblX3uRzNo0CDuvfde3n33XebOncu4ceMYM2ZMI1WYOLqgKiLNWqtWrdi7d+/h5a1bt9KuXTvOO+88zj33XNatW5fC6upPLXcRSYlYhi5W2reuqO6Natgm+/i6b4zRtm1b+vfvz09/+lOGDh1Kr169eOmll8jMzCQnJ4dbbrkl5jqbEoW7iDQbzzzzTI3rp0yZUm35rLPOaoRqkkvdMiIiAaRwFxEJoJi6ZcxsPVAGVADl7j7czI4Fngd6A+uBK9y9pLZjiIhI44mn5X62uw919+HR5TuB19y9H/BadFlERJqAhnTLjAMKo48LgfENL0dERBIhphtkm9nnQAkQBn7r7vPNbKe751XZpsTd2x+x30Rg4tGOXVBQcBpE7kze1JWXlNZrv8y83LR4f0Gn85daJSUldO9e99DEmoTLK+q1XygrGDfHBti0aRPt21eLWHr27FnrD2asQyFHu/tmM+sEuJl9HMtO7j4fmH+0bYqKisJA
Wtxdfu1Nd9Vrv86/vT8t3l/Q6fylVmlpKZmZX4ftyls3J/T4p0wu/8a67OO7V3vNqlP+durUicmTJ9O6detaj1lcXEx+fj5z5sxJaK1Hs2DBAnJycrjkkkuqrc/IyIjr5zCmbhl33xz9Wgz8GRgBfGVmXQGiX4tjflURkRSoOuVvmzZteOWVV1JdUtLU2XI3s9ZAhruXRR+fD9wLvARMAPKjXxcms1ARkUQ66aST+OKLLw4vL1y4kLfeeouDBw8yYsQIrrzySiAyg+QjjzzC559/TteuXZk8eTItW7Zk1apVPP3001RUVHDiiSdy4403kp2dzY9//GPy8/PJzc3ls88+4+mnn+ZXv/oVCxYsYNu2bXz11Vds27aNsWPHctFFFwHwwgsv8MYbb3DccceRm5tLnz59Gvz+Ymm5dwaWmtn7wApgkbu/QiTUzcw+BSy6LCLS5FVUVLBq1SqGD48M/nv//ffZsmULDzzwALNmzWLdunWsWbMGgM2bN3Peeefx61//mmOOOYYlS5Zw4MAB5s2bx6233sqcOXOoqKjg1VdfrfN1N23axC9+8QseeOAB/vjHP1JeXs5nn33GsmXLmDVrFrfddhtr165NyHuss+Xu7uuAITWs3w6cm5AqREQaQeWUv1u3bqVPnz4MHjwYiIT7Bx98wO233w7Avn372LJlC8cddxwdOnRgwIABAJx55pksXryYwYMH06lTJ7p16wZEpit45ZVXGDt27FFff9iwYWRnZ5OdnU27du0oLS3l448/ZsSIEbRs2RLg8C+chtLcMiLSbFT2ue/evZv8/HyWLFnCRRddRDgc5tJLL8XMqm1fXFxc40ipo40yzMjIOPz8kTf6yM7OrrZdRUVkFFAyRmNp+gERaXZat27Nddddx0svvUR5eTlDhw7lr3/96+Gpf7dv305paWTo7LZt2/jkk08AWLZsGQMHDqR79+4UFxezZcsWAN544w0GDRoEQKdOnQ5PE/z222/XWcvAgQNZsWIF+/fvZ+/evbzzzjsJeY9quYtISgx/sFvM28Y05W+cTjjhBHr37s2yZcsYM2YMGzdu5Oc//zkAOTk5/OQnPyEjI4Pu3bvzxhtvMH/+fLp27cr5559PixYtmDRp0uH+9hNPPJHzzz8fgMsvv5zHHnuMF198kX79+tVZR58+fTjjjDO4/fbb6dixY8Lu1xrTh5iSqXKce25ubkrriMXaqybXaz+Nk24adP5Sa+PGjfTo0aNe+9Y33I8c557Oavr+5eXl1dqfo24ZEZEAUriLiASQwl1EGk2qu4HTVX2+bwp3EWkU2dnZlJWVKeDjFA6HKSsrqzaMMhYaLSMijaJDhw5s376dXbt2xb3vwdKd9XrNzE2R8eTpLjs7mw4dOsS1j8JdRBpFVlYWnTt3rte+a6fNqNd+xzXjkU7p/ytNRES+QeEuIhJACncRkQBSuIuIBJDCXUQkgBTuIiIBpHAXEQkghbuISAAp3EVEAkjhLiISQAp3EZEAinluGTPLBFYCm9z9YjM7AXgOOBZ4F/ihux842jFERKRxxNNy/xfgoyrLM4AH3b0fUAJcn8jCRESk/mIKdzPrAYwFnoguh4BzgD9FNykExiejQBERiV+s3TIPAdOAyrkzOwA73b08urwR6H7kTmY2EZh4tAMXFBQAUFZWFmMpqdMh/4567Vc52b6kls5f+tK5q1leXl6tz9UZ7mZ2MVDs7u+Y2VnR1TXdcfsbt1dx9/nA/KMdv6ioKAykxZzLa2+6q177dW7Gc0o3JTp/6UvnLn6xdMuMBi4xs/VELqCeQ6Qln2dmlb8cegCbk1KhiIjErc5wd/efuXsPd+8NfB/4q7tfDbwOfC+62QRgYdKqFBGRuDRknPsdwE/NbC2RPviCxJQkIiINFdc9VN39b8Dfoo/XASMSX5KIiDSUPqEqIhJACncRkQBSuIuIBJDCXUQkgBTuIiIBpHAXEQkghbuISAAp3EVEAkjhLiISQAp3EZEAUriL
iASQwl1EJIAU7iIiAaRwFxEJIIW7iEgAKdxFRAJI4S4iEkAKdxGRAFK4i4gEkMJdRCSA6rxBtpnlAG8CLaPb/8nd7zGzE4DngGOBd4EfuvuBZBYrIiKxiaXlvh84x92HAEOB75jZKGAG8KC79wNKgOuTV6aIiMSjzpa7u4eB/40uZkf/hYFzgKui6wuB6cBjiS9RRETiVWe4A5hZJvAOcCIwD/gM2Onu5dFNNgLda9hvIjDxaMcuKCgAoKysLOaiU6VD/h312i8cDqfF+ws6nb/0pXNXs7y8vFqfiync3b0CGGpmecCfgYE1bBauYb/5wPyjHbuoqCgM0LZt21hKSam1N91Vr/06//b+tHh/Qafzl7507uIX12gZd98J/A0YBeSZWeUvhx7A5sSWJiIi9VVnuJtZx2iLHTNrBZwHfAS8DnwvutkEYGGyihQRkfjE0nLvCrxuZh8A/wDc3f8TuAP4qZmtBToABckrU0RE4hHLaJkPgFNrWL8OGJGMokREpGH0CVURkQBSuIuIBJDCXUQkgGIa5y4NEyrfTruPLol7v9KBryehGhFpDtRyFxEJILXcReqgv7wkHanlLiISQAp3EZEAUriLiASQwl1EJIAU7iIiAaRwFxEJIIW7iEgAKdxFRAJI4S4iEkAKdxGRAFK4i4gEkMJdRCSAFO4iIgGkcBcRCSCFu4hIANU5n7uZ9QSeBroAh4D57v6wmR0LPA/0BtYDV7h7SfJKFRGRWMXSci8Hprr7QGAUMMnMBgF3Aq+5ez/gteiyiIg0AXWGu7tvcfd3o4/LgI+A7sA4oDC6WSEwPllFiohIfELhcDjmjc2sN/AmcApQ5O55VZ4rcff2R2w/EZh4tGMWFBScBhAKhWKvOkXKS0rrtV9muzZkHNoZ937hrA71ej2pmc5f+qr3ucvLTYtsqa+ePXvW+uZivoeqmbUBXgCmuPsuM6tzH3efD8w/2jZFRUVhgLZt28ZaSsqsvemueu3XZd5Uun95bdz76R6ciaXzl77qe+46//b+tMiWZIhptIyZZRMJ9mfd/cXo6q/MrGv0+a5AcXJKFBGReNUZ7mYWAgqAj9x9TpWnXgImRB9PABYmvjwREamPWLplRgM/BFaZ2X9H190F5AMLzOx6oAi4PDkliohIvOoMd3dfCtTWaX9uYssREZFE0CdURUQCSOEuIhJACncRkQBSuIuIBJDCXUQkgBTuIiIBpHAXEQkghbuISAAp3EVEAkjhLiISQAp3EZEAUriLiARQzDfrEBFJN6Hy7bT76JK49wvCjVbUchcRCSCFu4hIACncRUQCSOEuIhJACncRkQBSuIuIBJDCXUQkgBTuIiIBVOeHmMzsd8DFQLG7nxJddyzwPNAbWA9c4e4lyStTRETiEUvL/SngO0esuxN4zd37Aa9Fl0VEpImoM9zd/U1gxxGrxwGF0ceFwPgE1yUiIg0QCofDdW5kZr2B/6zSLbPT3fOqPF/i7u1r2G8iMPFoxy4oKDgNIBQKxVd5CpSXlNZrv8x2bcg4tDPu/cJZHer1elIznb/0pXNXs549e9YanEmdOMzd5wPzj7ZNUVFRGKBt27bJLCUh1t50V7326zJvKt2/vDbu/YIweVFTovOXvnTu4lff0TJfmVlXgOjX4sSVJCIiDVXfcH8JmBB9PAFYmJhyREQkEWIZCvkH4CzgODPbCNwD5AMLzOx6oAi4PJlFiohIfOoMd3f/QS1PnZvgWkREJEH0CVURkQDSbfZEkmTlrZvrtd/wB7sluBJpjtRyFxEJILXcRUSOEIS/uppluPef9Eq99luU4DpERJJF3TIiIgGkcBcRCSCFu4hIACncRUQCqFleUE0XQbhinwy6IC5SN7XcRUQCSOEuIhJACncRkQBSuIuIBJDCXUQkgBTuIiIBpHAXEQkghbuISAAp3EVEAkjhLiISQAp3EZEAatDcMmb2HeBhIBN4wt3zE1KViASS5gVqPPVuuZtZJjAPuBAYBPzAzAYl
qjAREam/UDgcrteOZvYtYLq7XxBd/hmAuz8Qz3GKiorqV4CIiNCrV69QTesb0i3THdhQZXkjMLLqBmY2EZh4tIMUFBQ0oAQREalJQ8K9pt8W1Vrh7j4fmN+A1wgEM1vp7sNTXYfUj85f+mrO564ho2U2Aj2rLPcA6nd3CRERSaiGtNz/AfQzsxOATcD3gasSUpWIiDRIvVvu7l4O3AIsAT4CFrj76kQVJiIi9degce7u/hfgLwmqRUREEkSfUBURCSCFu4hIACncRUQCSOEuIhJACvfG0ew/yJXmdP7SV7M9d/WeW0ZERJoutdxFRAJI4S4iEkAKdxGRAFK4i4gEkMI9SaJ3qpI0ZWaXx7JOmiYza29mI8zs25X/Ul1TY1O4J89aM5ulWw+mrZ/FuE6aGDO7AXiTyKSGv4p+nZ7KmlKhQROHyVENJjIN8hNmlgH8DnjO3Xeltiw5GjO7ELgI6G5m/1blqVygPDVVSZz+BTgdWO7uZ5vZACIh36yo5Z4k7l7m7o+7+xnANOAeYIuZFZrZiSkuT2q3GVgJ7APeqfLvJeCCFNYlsdvn7vsAzKylu38M9E9xTY1OLfckifa5jwWuBXoDvwaeBc4kMk3ySSkrTmrl7u8D75vZv7v7wVTXI/Wy0czygP8A3MxKaIZ3iVO4J8+nwOvALHf/e5X1f2qOF3fS0Agzmw4cT+T/SQgIu3uflFYldXL3S6MPp5vZ60A7YHEKS0oJhXsSRFvtT7n7vTU97+4/aeSSJH4FwK1EumQqUlyLxMHMrnf3AgB3fyO6Lh+4M6WFNTKFexK4e4WZnQ3UGO6SFkrdvdm19gLie2a2z92fBTCzR4GWKa6p0WnisCQxs/uI/Dn4PLC7cr27v5uyoiRm0ZZeJvAisL9yvc5f02dmrYhcAP8dcCGww92npLaqxqeWe/KcEf1atfUeBs5JQS0Sv5HRr8OrrNP5a8LM7NgqizcQuaC6DLjXzI519x2pqSw11HJPgui49u+5+4JU1yLSXJjZ50R+AYeqfK3U7C6GK9yTxMzedHeNikljZjYWOBnIqVxX20VykaZG3TLJ42Z2G9/sc29WfxqmKzP7DXAMcDbwBPA9YEVKi5KYmdkpwCCq/2J+OnUVNT59QjV5rgMmEZnjovJTjitTWpHE4wx3/xFQ4u6/Ar4F9ExxTRIDM7sHmBv9dzYwE7gkpUWlgFruSeLuJ6S6BmmQfdGve8ysG7Ad0DlND98DhgDvufu1ZtaZyF9fzYrCPUnM7Ec1rW9ufxqmsZejH2GfBbxL5ALd46ktSWK0190PmVm5meUCxUCzupgKCvdkOr3K4xzgXCIhoXBv4qKjnV5z953AC2b2n0COu5emuDSJzcroL+bHiXSH/i/N8HqJRss0EjNrB/ze3Ztd3186MrO33P1bqa5DGsbMegO57v5BqmtpbGq5N549QL9UFyExe9XMLgNedHe1gNKMmf0T8H+JdKctBZpduKvlniRm9jKRHyyIjEoaBCxw92Y1eVG6MrMyoDWRG3Ts4+tZIXNTWpjUKTqXzInAH6KrrgQ+c/dJqauq8anlnmDRG3F0BmZXWV1OZJ6STSkpSuLm7m1TXYPU2xjglMq/uMysEFiV2pIan8I98R4C7jqyj8/Mhkef+25KqpK41Dbnvru/2di1SNw+AXoBX0SXe9IMu2UU7onXu6aLN+6+MnpxR9LD7VUe5wAjiIy80MRhTVSVrtB2wEdmtiK6PBL4+9H2DSKFe+LlHOW5Vo1WhTSIu1f7C8vMehL5pKM0XbPr3qT5ULgn3j/M7EZ3r/aBFzO7nkjLT9LTRuCUVBchtau865JEKNwTbwrwZzO7mq/DfDjQAri01r2kSTGzuVQf7TQUeJprwXwAAAMFSURBVD91FUldoiOcahr+1yxHOmkoZJJEb7NX2dJb7e5/TWU9Eh8zm1BlsRxY7+7LUlWPSLwU7iK1MLOOAO6+NdW1iMRL4S5ShZmFgHuAW4j8OZ9BpOU+Vzfq
kHSi+dxFqpsCjAZOd/cO7t6eyFC60WZ2a2pLE4mdwl2kuh8BP3D3zytXuPs64J+jz4mkBYW7SHXZ7r7tyJXRfvfsFNQjUi8Kd5HqDtTzOZEmRePcRaobYma7algf4uifPhZpUjRaRkQkgNQtIyISQAp3EZEAUrhLs2NmfzOzG1Jdh0gy6YKqBJaZrSdyV6wKYDfwF2ByKmsSaSxquUvQfdfd2wDDgNOBX6SiCDNTQ0oalX7gpFlw901mtpgj5mQ3s77A48AQItPFLgEmuftOM7sdGOXul1XZfi5Q4e5TzKwdMAe4CDgEPAnc4+4VZnYNcCOwApgAPEqKfrFI86SWuzQL0TspXQS8d8RTIeABoBswkMj9NqdHn3sG+I6Z5UWPkQVcCfw++nwhkUnFTgROBc4HqvbljwTWAZ2A+xL6hkTqoJa7BN1/mFk5UAosAu4HFlc+6e5rgbXRxa1mNofIrJC4+xYzexO4nEjr/jvANnd/x8w6AxcCee6+F9htZg8CE4HfRo+32d3nRh+XJ/NNihxJ4S5BN97d/1/VFWZW9XEn4N+AM4G2RP6aLamyeSFwM5Fw/2e+brUfT2SumS1VjpcBbKiyb9XHIo1K4S7N3QNE+toHu/t2MxsPPFLl+f8AHjOzU4CLgWnR9RuA/cBx7l5bq1wf/5aUUZ+7NHdtgf8FdppZd+D2qk+6+z7gT8C/AyvcvSi6fgvwKvBrM8s1swwz62tmYxq3fJGaKdylufsVkWGSlX3yL9awTSHwf/i6S6bSj4jc+HwNka6cPwFdk1apSBw0cZhIHcysF/Ax0MXda5oxUqTJUctd5CjMLAP4KfCcgl3SiS6oitTCzFoDXwFfEBkGKZI21C0jIhJA6pYREQkghbuISAAp3EVEAkjhLiISQAp3EZEA+v8Wv/kwWzb3/wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "player_stats.unstack().plot.bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This particular visualization would be helpful if we wanted to see which stats\n", + "for which each player is strongest.\n", + "\n", + "For example, we can see that Steph Curry scores far more points than he does\n", + "rebound, but Serge Ibaka is a bit more balanced.\n", + "\n", + "What if we wanted to be able to compare all players for each statistic?\n", + "\n", + "This would be easier to do if the bars were grouped by variable, with a\n", + "different bar for each player.\n", + "\n", + "To plot this, we need to have the variables on the index and the player\n", + "name as column names.\n", + "\n", + "We can get this DataFrame by setting `level=\"Player\"` when calling `unstack`." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
variable
Assist6.4666675.0666670.866667
Games69.66666767.33333352.333333
Pts27.26666726.56666713.966667
Rebound5.0000007.7666676.800000
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka\n", + "variable \n", + "Assist 6.466667 5.066667 0.866667\n", + "Games 69.666667 67.333333 52.333333\n", + "Pts 27.266667 26.566667 13.966667\n", + "Rebound 5.000000 7.766667 6.800000" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "player_stats.unstack(level=\"Player\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "player_stats.unstack(level=\"Player\").plot.bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use the chart to make a number of statements about players:\n", + "\n", + "- Ibaka does not get many assists, compared to Curry and Durant. \n", + "- Steph and Kevin Durant are both high scorers. \n", + "\n", + "\n", + "Based on the examples above, notice a few things about `unstack`:\n", + "\n", + "- It is the *inverse* of `stack`; `stack` will move labels down\n", + " from columns to index, while `unstack` moves them up from index to columns. \n", + "- By default, `unstack` will move the level of the index closest to the data\n", + " and place it in the column labels closest to the data. \n", + "\n", + "\n", + ">**Note**\n", + ">\n", + ">Just as we can pass multiple levels to `stack`, we can also pass multiple\n", + "levels to `unstack`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary\n", + "\n", + "In some ways `set_index`, `reset_index`, `stack`, and `unstack`\n", + "are the “most fundamental” reshaping operations…\n", + "\n", + "The other operations we discuss can be formulated with these\n", + "four operations (and, in fact, some of them are exactly written as these\n", + "operations in `pandas`’s code base).\n", + "\n", + "*Pro tip*: We remember stack vs unstack with a mnemonic: **U**nstack moves index\n", + "levels **U**p" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `melt`\n", + "\n", + "The `melt` method is used to move from wide to long form.\n", + "\n", + "It can be used to move all of the “values” stored in your DataFrame to a\n", + "single column with all other columns being used to contain identifying\n", + "information.\n", + "\n", + "**Warning**: When you use `melt`, any index that you currently have\n", + "will be deleted.\n", + "\n", + "We used `melt` above when we constructed `bball_long`:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNameGamesPtsAssistRebound
02015CurryGSWWarriors7930.16.75.4
12016CurryGSWWarriors7925.36.64.5
22017CurryGSWWarriors5126.46.15.1
32015DurantOKCThunder7228.25.08.2
42016DurantGSWWarriors6225.14.88.3
52017DurantGSWWarriors6826.45.46.8
62015IbakaOKCThunder7812.60.86.8
72016IbakaORLMagic5615.11.16.8
82016IbakaTORRaptors2314.20.76.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName Games Pts Assist Rebound\n", + "0 2015 Curry GSW Warriors 79 30.1 6.7 5.4\n", + "1 2016 Curry GSW Warriors 79 25.3 6.6 4.5\n", + "2 2017 Curry GSW Warriors 51 26.4 6.1 5.1\n", + "3 2015 Durant OKC Thunder 72 28.2 5.0 8.2\n", + "4 2016 Durant GSW Warriors 62 25.1 4.8 8.3\n", + "5 2017 Durant GSW Warriors 68 26.4 5.4 6.8\n", + "6 2015 Ibaka OKC Thunder 78 12.6 0.8 6.8\n", + "7 2016 Ibaka ORL Magic 56 15.1 1.1 6.8\n", + "8 2016 Ibaka TOR Raptors 23 14.2 0.7 6.8" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNamevariablevalue
02015CurryGSWWarriorsGames79.0
12016CurryGSWWarriorsGames79.0
22017CurryGSWWarriorsGames51.0
32015DurantOKCThunderGames72.0
42016DurantGSWWarriorsGames62.0
52017DurantGSWWarriorsGames68.0
62015IbakaOKCThunderGames78.0
72016IbakaORLMagicGames56.0
82016IbakaTORRaptorsGames23.0
92015CurryGSWWarriorsPts30.1
102016CurryGSWWarriorsPts25.3
112017CurryGSWWarriorsPts26.4
122015DurantOKCThunderPts28.2
132016DurantGSWWarriorsPts25.1
142017DurantGSWWarriorsPts26.4
152015IbakaOKCThunderPts12.6
162016IbakaORLMagicPts15.1
172016IbakaTORRaptorsPts14.2
182015CurryGSWWarriorsAssist6.7
192016CurryGSWWarriorsAssist6.6
202017CurryGSWWarriorsAssist6.1
212015DurantOKCThunderAssist5.0
222016DurantGSWWarriorsAssist4.8
232017DurantGSWWarriorsAssist5.4
242015IbakaOKCThunderAssist0.8
252016IbakaORLMagicAssist1.1
262016IbakaTORRaptorsAssist0.7
272015CurryGSWWarriorsRebound5.4
282016CurryGSWWarriorsRebound4.5
292017CurryGSWWarriorsRebound5.1
302015DurantOKCThunderRebound8.2
312016DurantGSWWarriorsRebound8.3
322017DurantGSWWarriorsRebound6.8
332015IbakaOKCThunderRebound6.8
342016IbakaORLMagicRebound6.8
352016IbakaTORRaptorsRebound6.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName variable value\n", + "0 2015 Curry GSW Warriors Games 79.0\n", + "1 2016 Curry GSW Warriors Games 79.0\n", + "2 2017 Curry GSW Warriors Games 51.0\n", + "3 2015 Durant OKC Thunder Games 72.0\n", + "4 2016 Durant GSW Warriors Games 62.0\n", + "5 2017 Durant GSW Warriors Games 68.0\n", + "6 2015 Ibaka OKC Thunder Games 78.0\n", + "7 2016 Ibaka ORL Magic Games 56.0\n", + "8 2016 Ibaka TOR Raptors Games 23.0\n", + "9 2015 Curry GSW Warriors Pts 30.1\n", + "10 2016 Curry GSW Warriors Pts 25.3\n", + "11 2017 Curry GSW Warriors Pts 26.4\n", + "12 2015 Durant OKC Thunder Pts 28.2\n", + "13 2016 Durant GSW Warriors Pts 25.1\n", + "14 2017 Durant GSW Warriors Pts 26.4\n", + "15 2015 Ibaka OKC Thunder Pts 12.6\n", + "16 2016 Ibaka ORL Magic Pts 15.1\n", + "17 2016 Ibaka TOR Raptors Pts 14.2\n", + "18 2015 Curry GSW Warriors Assist 6.7\n", + "19 2016 Curry GSW Warriors Assist 6.6\n", + "20 2017 Curry GSW Warriors Assist 6.1\n", + "21 2015 Durant OKC Thunder Assist 5.0\n", + "22 2016 Durant GSW Warriors Assist 4.8\n", + "23 2017 Durant GSW Warriors Assist 5.4\n", + "24 2015 Ibaka OKC Thunder Assist 0.8\n", + "25 2016 Ibaka ORL Magic Assist 1.1\n", + "26 2016 Ibaka TOR Raptors Assist 0.7\n", + "27 2015 Curry GSW Warriors Rebound 5.4\n", + "28 2016 Curry GSW Warriors Rebound 4.5\n", + "29 2017 Curry GSW Warriors Rebound 5.1\n", + "30 2015 Durant OKC Thunder Rebound 8.2\n", + "31 2016 Durant GSW Warriors Rebound 8.3\n", + "32 2017 Durant GSW Warriors Rebound 6.8\n", + "33 2015 Ibaka OKC Thunder Rebound 6.8\n", + "34 2016 Ibaka ORL Magic Rebound 6.8\n", + "35 2016 Ibaka TOR Raptors Rebound 6.8" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this is how we made ``bball_long``\n", + "bball.melt(id_vars=[\"Year\", \"Player\", \"Team\", \"TeamName\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the columns we specified as 
`id_vars` remained columns, but all\n", + "other columns were put into two new columns:\n", + "\n", + "1. `variable`: This has dtype string and contains the former column names\n", + " as values. \n", + "1. `value`: This has the former values. \n", + "\n", + "\n", + "Using this method is an effective way to get our data in *tidy* form as noted\n", + "above." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `pivot` and `pivot_table`\n", + "\n", + "The next two reshaping methods that we will use are closely related.\n", + "\n", + "Some of you might even already be familiar with these ideas because you\n", + "have previously used *pivot tables* in Excel.\n", + "\n", + "- If so, good news. We think this is even more powerful than Excel\n", + " and easier to use! \n", + "- If not, good news. You are about to learn a very powerful and user-friendly tool. \n", + "\n", + "\n", + "We will begin with `pivot`.\n", + "\n", + "The `pivot` method:\n", + "\n", + "- Takes the unique values of one column and places them along the index. \n", + "- Takes the unique values of another column and places them along the\n", + " columns. \n", + "- Takes the values that correspond to a third column and fills in the\n", + " DataFrame values that correspond to that index/column pair. \n", + "\n", + "\n", + "We’ll illustrate with an example." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant
Year
201530.128.2
201625.325.1
201726.426.4
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant\n", + "Year \n", + "2015 30.1 28.2\n", + "2016 25.3 25.1\n", + "2017 26.4 26.4" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .head 8 excludes Ibaka -- will discuss why later\n", + "bball.head(6).pivot(index=\"Year\", columns=\"Player\", values=\"Pts\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can replicate `pivot` using three of the fundamental operations\n", + "from above:\n", + "\n", + "1. Call `set_index` with the `index` and `columns` arguments \n", + "1. Extract the `values` column \n", + "1. `unstack` the columns level of the new index " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant
Year
201530.128.2
201625.325.1
201726.426.4
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant\n", + "Year \n", + "2015 30.1 28.2\n", + "2016 25.3 25.1\n", + "2017 26.4 26.4" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1--------------------------------------- 2--- 3----------------------\n", + "bball.head(6).set_index([\"Year\", \"Player\"])[\"Pts\"].unstack(level=\"Player\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One important thing to be aware of is that in order for `pivot` to\n", + "work, the index/column pairs must be *unique*!\n", + "\n", + "Below, we demonstrate the error that occurs when they are not unique." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide-output": false + }, + "source": [ + "```python\n", + "# Ibaka shows up twice in 2016 because he was traded mid-season from\n", + "# the Orlando Magic to the Toronto Raptors\n", + "bball.pivot(index=\"Year\", columns=\"Player\", values=\"Pts\")\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `pivot_table`\n", + "\n", + "The `pivot_table` method is a generalization of `pivot`.\n", + "\n", + "It overcomes two limitations of `pivot`:\n", + "\n", + "1. It allows you to choose multiple columns for the index/columns/values\n", + " arguments. \n", + "1. It allows you to deal with duplicate entries by\n", + " having you choose how to combine them. " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNameGamesPtsAssistRebound
02015CurryGSWWarriors7930.16.75.4
12016CurryGSWWarriors7925.36.64.5
22017CurryGSWWarriors5126.46.15.1
32015DurantOKCThunder7228.25.08.2
42016DurantGSWWarriors6225.14.88.3
52017DurantGSWWarriors6826.45.46.8
62015IbakaOKCThunder7812.60.86.8
72016IbakaORLMagic5615.11.16.8
82016IbakaTORRaptors2314.20.76.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName Games Pts Assist Rebound\n", + "0 2015 Curry GSW Warriors 79 30.1 6.7 5.4\n", + "1 2016 Curry GSW Warriors 79 25.3 6.6 4.5\n", + "2 2017 Curry GSW Warriors 51 26.4 6.1 5.1\n", + "3 2015 Durant OKC Thunder 72 28.2 5.0 8.2\n", + "4 2016 Durant GSW Warriors 62 25.1 4.8 8.3\n", + "5 2017 Durant GSW Warriors 68 26.4 5.4 6.8\n", + "6 2015 Ibaka OKC Thunder 78 12.6 0.8 6.8\n", + "7 2016 Ibaka ORL Magic 56 15.1 1.1 6.8\n", + "8 2016 Ibaka TOR Raptors 23 14.2 0.7 6.8" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we can replicate the functionality of `pivot` if we pass\n", + "the same arguments." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant
Year
201530.128.2
201625.325.1
201726.426.4
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant\n", + "Year \n", + "2015 30.1 28.2\n", + "2016 25.3 25.1\n", + "2017 26.4 26.4" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.head(6).pivot(index=\"Year\", columns=\"Player\", values=\"Pts\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But we can also choose multiple columns to be used in\n", + "index/columns/values." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
YearTeam
2015GSW30.1NaNNaN
OKCNaN28.212.6
2016GSW25.325.1NaN
ORLNaNNaN15.1
TORNaNNaN14.2
2017GSW26.426.4NaN
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka\n", + "Year Team \n", + "2015 GSW 30.1 NaN NaN\n", + " OKC NaN 28.2 12.6\n", + "2016 GSW 25.3 25.1 NaN\n", + " ORL NaN NaN 15.1\n", + " TOR NaN NaN 14.2\n", + "2017 GSW 26.4 26.4 NaN" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=[\"Year\", \"Team\"], columns=\"Player\", values=\"Pts\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
TeamGSWGSWOKCOKCORLTOR
Year
201530.1NaN28.212.6NaNNaN
201625.325.1NaNNaN15.114.2
201726.426.4NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka \n", + "Team GSW GSW OKC OKC ORL TOR\n", + "Year \n", + "2015 30.1 NaN 28.2 12.6 NaN NaN\n", + "2016 25.3 25.1 NaN NaN 15.1 14.2\n", + "2017 26.4 26.4 NaN NaN NaN NaN" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=\"Year\", columns=[\"Player\", \"Team\"], values=\"Pts\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "AND we can deal with duplicated index/column pairs." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
Year
201530.128.212.60
201625.325.114.65
201726.426.4NaN
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka\n", + "Year \n", + "2015 30.1 28.2 12.60\n", + "2016 25.3 25.1 14.65\n", + "2017 26.4 26.4 NaN" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This produced an error\n", + "# bball.pivot(index=\"Year\", columns=\"Player\", values=\"Pts\")\n", + "\n", + "# This doesn't!\n", + "bball_pivoted = bball.pivot_table(index=\"Year\", columns=\"Player\", values=\"Pts\")\n", + "bball_pivoted" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`pivot_table` handles duplicate index/column pairs using an aggregation.\n", + "\n", + "By default, the aggregation is the mean.\n", + "\n", + "For example, our duplicated index/column pair is `(\"x\", 1)` and had\n", + "associated values of 2 and 5.\n", + "\n", + "Notice that `bball_pivoted.loc[2016, \"Ibaka\"]` is `(15.1 + 14.2)/2 = 14.65`.\n", + "\n", + "We can choose how `pandas` aggregates all of the values.\n", + "\n", + "For example, here’s how we would keep the max." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
Year
201530.128.212.6
201625.325.115.1
201726.426.4NaN
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka\n", + "Year \n", + "2015 30.1 28.2 12.6\n", + "2016 25.3 25.1 15.1\n", + "2017 26.4 26.4 NaN" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=\"Year\", columns=\"Player\", values=\"Pts\", aggfunc=max)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Maybe we wanted to count how many values there were." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurantIbaka
Year
20151.01.01.0
20161.01.02.0
20171.01.0NaN
\n", + "
" + ], + "text/plain": [ + "Player Curry Durant Ibaka\n", + "Year \n", + "2015 1.0 1.0 1.0\n", + "2016 1.0 1.0 2.0\n", + "2017 1.0 1.0 NaN" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=\"Year\", columns=\"Player\", values=\"Pts\", aggfunc=len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can even pass multiple aggregation functions!" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
maxlen
PlayerCurryDurantIbakaCurryDurantIbaka
Year
201530.128.212.61.01.01.0
201625.325.115.11.01.02.0
201726.426.4NaN1.01.0NaN
\n", + "
" + ], + "text/plain": [ + " max len \n", + "Player Curry Durant Ibaka Curry Durant Ibaka\n", + "Year \n", + "2015 30.1 28.2 12.6 1.0 1.0 1.0\n", + "2016 25.3 25.1 15.1 1.0 1.0 2.0\n", + "2017 26.4 26.4 NaN 1.0 1.0 NaN" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=\"Year\", columns=\"Player\", values=\"Pts\", aggfunc=[max, len])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Reshaping\n", + "\n", + "Now that you have learned the basics and had a chance to experiment,\n", + "we will use some generic data to provide a visualization of what the above\n", + "reshape operations do.\n", + "\n", + "The data we will use is:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 4 entries, 0 to 3\n", + "Data columns (total 5 columns):\n", + "A 4 non-null int64\n", + "B 4 non-null object\n", + "C 4 non-null int64\n", + "D 4 non-null int64\n", + "E 4 non-null int64\n", + "dtypes: int64(4), object(1)\n", + "memory usage: 288.0+ bytes\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
00x1102
10y2201
21x1305
31z4204
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "0 0 x 1 10 2\n", + "1 0 y 2 20 1\n", + "2 1 x 1 30 5\n", + "3 1 z 4 20 4" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# made up\n", + "# columns A and B are \"identifiers\" while C, D, and E are variables.\n", + "df = pd.DataFrame({\n", + " \"A\": [0, 0, 1, 1],\n", + " \"B\": \"x y x z\".split(),\n", + " \"C\": [1, 2, 1, 4],\n", + " \"D\": [10, 20, 30, 20,],\n", + " \"E\": [2, 1, 5, 4,]\n", + "})\n", + "\n", + "df.info()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CDE
AB
0x1102
y2201
1x1305
z4204
\n", + "
" + ], + "text/plain": [ + " C D E\n", + "A B \n", + "0 x 1 10 2\n", + " y 2 20 1\n", + "1 x 1 30 5\n", + " z 4 20 4" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = df.set_index([\"A\", \"B\"])\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
A01
Bxyxz
C1214
D10203020
E2154
\n", + "
" + ], + "text/plain": [ + "A 0 1 \n", + "B x y x z\n", + "C 1 2 1 4\n", + "D 10 20 30 20\n", + "E 2 1 5 4" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3 = df2.T\n", + "df3.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `stack` and `unstack`\n", + "\n", + "Below is an animation that shows how stacking works.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/reshape_files/stack.gif](https://datascience.quantecon.org/assets/_static/reshape_files/stack.gif) " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CDE
AB
0x1102
y2201
1x1305
z4204
\n", + "
" + ], + "text/plain": [ + " C D E\n", + "A B \n", + "0 x 1 10 2\n", + " y 2 20 1\n", + "1 x 1 30 5\n", + " z 4 20 4" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "A B \n", + "0 x C 1\n", + " D 10\n", + " E 2\n", + " y C 2\n", + " D 20\n", + " E 1\n", + "1 x C 1\n", + " D 30\n", + " E 5\n", + " z C 4\n", + " D 20\n", + " E 4\n", + "dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2_stack = df2.stack()\n", + "df2_stack" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And here is an animation that shows how unstacking works.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/reshape_files/unstack_level0.gif](https://datascience.quantecon.org/assets/_static/reshape_files/unstack_level0.gif) " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CDE
AB
0x1102
y2201
1x1305
z4204
\n", + "
" + ], + "text/plain": [ + " C D E\n", + "A B \n", + "0 x 1 10 2\n", + " y 2 20 1\n", + "1 x 1 30 5\n", + " z 4 20 4" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CDE
Bxyzxyzxyz
A
01.02.0NaN10.020.0NaN2.01.0NaN
11.0NaN4.030.0NaN20.05.0NaN4.0
\n", + "
" + ], + "text/plain": [ + " C D E \n", + "B x y z x y z x y z\n", + "A \n", + "0 1.0 2.0 NaN 10.0 20.0 NaN 2.0 1.0 NaN\n", + "1 1.0 NaN 4.0 30.0 NaN 20.0 5.0 NaN 4.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.unstack()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `melt`\n", + "\n", + "As noted above, the `melt` method transforms data from wide to long in form.\n", + "\n", + "Here’s a visualization of that operation.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/reshape_files/melt.gif](https://datascience.quantecon.org/assets/_static/reshape_files/melt.gif) " + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
00x1102
10y2201
21x1305
31z4204
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "0 0 x 1 10 2\n", + "1 0 y 2 20 1\n", + "2 1 x 1 30 5\n", + "3 1 z 4 20 4" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABvariablevalue
00xC1
10yC2
21xC1
31zC4
40xD10
50yD20
61xD30
71zD20
80xE2
90yE1
101xE5
111zE4
\n", + "
" + ], + "text/plain": [ + " A B variable value\n", + "0 0 x C 1\n", + "1 0 y C 2\n", + "2 1 x C 1\n", + "3 1 z C 4\n", + "4 0 x D 10\n", + "5 0 y D 20\n", + "6 1 x D 30\n", + "7 1 z D 20\n", + "8 0 x E 2\n", + "9 0 y E 1\n", + "10 1 x E 5\n", + "11 1 z E 4" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_melted = df.melt(id_vars=[\"A\", \"B\"])\n", + "df_melted" + ] + } + ], + "metadata": { + "date": 1584040763.8179712, + "filename": "reshape.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "Reshape" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/6_reshape_exercises.ipynb b/Session_7/6_reshape_exercises.ipynb new file mode 100644 index 0000000..7ab7376 --- /dev/null +++ b/Session_7/6_reshape_exercises.ipynb @@ -0,0 +1,3882 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: quantecon in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (1.12.0)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages 
(from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement 
already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n" + ] + } + ], + "source": [ + "! pip install qeds\n", + "import numpy as np\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reshape - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For these exercises we use basketball data. After loading the data:**\n", + "- we create a long-form version\n", + "- we create a wide-form version " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNameGamesPtsAssistRebound
02015CurryGSWWarriors7930.16.75.4
12016CurryGSWWarriors7925.36.64.5
22017CurryGSWWarriors5126.46.15.1
32015DurantOKCThunder7228.25.08.2
42016DurantGSWWarriors6225.14.88.3
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName Games Pts Assist Rebound\n", + "0 2015 Curry GSW Warriors 79 30.1 6.7 5.4\n", + "1 2016 Curry GSW Warriors 79 25.3 6.6 4.5\n", + "2 2017 Curry GSW Warriors 51 26.4 6.1 5.1\n", + "3 2015 Durant OKC Thunder 72 28.2 5.0 8.2\n", + "4 2016 Durant GSW Warriors 62 25.1 4.8 8.3" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://datascience.quantecon.org/assets/data/bball.csv\"\n", + "bball = pd.read_csv(url)\n", + "bball.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNamevariablevalue
02015CurryGSWWarriorsGames79.0
12016CurryGSWWarriorsGames79.0
22017CurryGSWWarriorsGames51.0
32015DurantOKCThunderGames72.0
42016DurantGSWWarriorsGames62.0
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName variable value\n", + "0 2015 Curry GSW Warriors Games 79.0\n", + "1 2016 Curry GSW Warriors Games 79.0\n", + "2 2017 Curry GSW Warriors Games 51.0\n", + "3 2015 Durant OKC Thunder Games 72.0\n", + "4 2016 Durant GSW Warriors Games 62.0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_long = bball.melt(id_vars=[\"Year\", \"Player\", \"Team\", \"TeamName\"])\n", + "bball_long.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
variableAssistGamesPtsReboundAssistGamesPts...AssistGamesPtsRebound
TeamGSWGSWGSWGSWGSWOKCGSWOKCGSWOKC...TOROKCORLTOROKCORLTOROKCORLTOR
Year
20156.779.030.15.4NaN5.0NaN72.0NaN28.2...NaN78.0NaNNaN12.6NaNNaN6.8NaNNaN
20166.679.025.34.54.8NaN62.0NaN25.1NaN...0.7NaN56.023.0NaN15.114.2NaN6.86.8
20176.151.026.45.15.4NaN68.0NaN26.4NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant ... \\\n", + "variable Assist Games Pts Rebound Assist Games Pts ... \n", + "Team GSW GSW GSW GSW GSW OKC GSW OKC GSW OKC ... \n", + "Year ... \n", + "2015 6.7 79.0 30.1 5.4 NaN 5.0 NaN 72.0 NaN 28.2 ... \n", + "2016 6.6 79.0 25.3 4.5 4.8 NaN 62.0 NaN 25.1 NaN ... \n", + "2017 6.1 51.0 26.4 5.1 5.4 NaN 68.0 NaN 26.4 NaN ... \n", + "\n", + "Player Ibaka \n", + "variable Assist Games Pts Rebound \n", + "Team TOR OKC ORL TOR OKC ORL TOR OKC ORL TOR \n", + "Year \n", + "2015 NaN 78.0 NaN NaN 12.6 NaN NaN 6.8 NaN NaN \n", + "2016 0.7 NaN 56.0 23.0 NaN 15.1 14.2 NaN 6.8 6.8 \n", + "2017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball_wide = bball_long.pivot_table(\n", + " index=\"Year\",\n", + " columns=[\"Player\", \"variable\", \"Team\"],\n", + " values=\"value\"\n", + ")\n", + "bball_wide.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**(*Warning*: This one is challenging):**\n", + "\n", + "**Recall the `bball_wide` DataFrame from above (repeated below to jog your memory).**\n", + "\n", + "**In this task, you will start from `bball` and recreate `bball_wide` by combining the operations we just learned about. There are many ways to do this, so be creative.**\n", + "\n", + "**The solution presented in the lecture used `set_index`, `T`, `stack`, and `unstack` in that order.**\n", + "\n", + "**Here are a few hints:**\n", + "\n", + "- Think about what columns you will need to call `set_index` on so that their data ends up as labels (either in index or columns). \n", + "- Leave other columns (e.g. the actual game stats) as actual columns so their data can stay data during your reshaping. 
\n", + "\n", + "**Hint**: You might need to add `.sort_index(axis=1)` after you are finished to get the columns in the same order.\n", + "\n", + "**Hint**: You may not end up with a `variable` header on the second level of column labels. This is ok." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
variableAssistGamesPtsReboundAssistGamesPts...AssistGamesPtsRebound
TeamGSWGSWGSWGSWGSWOKCGSWOKCGSWOKC...TOROKCORLTOROKCORLTOROKCORLTOR
Year
20156.779.030.15.4NaN5.0NaN72.0NaN28.2...NaN78.0NaNNaN12.6NaNNaN6.8NaNNaN
20166.679.025.34.54.8NaN62.0NaN25.1NaN...0.7NaN56.023.0NaN15.114.2NaN6.86.8
20176.151.026.45.15.4NaN68.0NaN26.4NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant ... \\\n", + "variable Assist Games Pts Rebound Assist Games Pts ... \n", + "Team GSW GSW GSW GSW GSW OKC GSW OKC GSW OKC ... \n", + "Year ... \n", + "2015 6.7 79.0 30.1 5.4 NaN 5.0 NaN 72.0 NaN 28.2 ... \n", + "2016 6.6 79.0 25.3 4.5 4.8 NaN 62.0 NaN 25.1 NaN ... \n", + "2017 6.1 51.0 26.4 5.1 5.4 NaN 68.0 NaN 26.4 NaN ... \n", + "\n", + "Player Ibaka \n", + "variable Assist Games Pts Rebound \n", + "Team TOR OKC ORL TOR OKC ORL TOR OKC ORL TOR \n", + "Year \n", + "2015 NaN 78.0 NaN NaN 12.6 NaN NaN 6.8 NaN NaN \n", + "2016 0.7 NaN 56.0 23.0 NaN 15.1 14.2 NaN 6.8 6.8 \n", + "2017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#we have to recreate this DataFrame in a different way\n", + "bball_wide" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
AssistGamesPtsReboundAssistGamesPts...AssistGamesPtsRebound
TeamGSWGSWGSWGSWGSWOKCGSWOKCGSWOKC...TOROKCORLTOROKCORLTOROKCORLTOR
Year
20156.779.030.15.4NaN5.0NaN72.0NaN28.2...NaN78.0NaNNaN12.6NaNNaN6.8NaNNaN
20166.679.025.34.54.8NaN62.0NaN25.1NaN...0.7NaN56.023.0NaN15.114.2NaN6.86.8
20176.151.026.45.15.4NaN68.0NaN26.4NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant ... \\\n", + " Assist Games Pts Rebound Assist Games Pts ... \n", + "Team GSW GSW GSW GSW GSW OKC GSW OKC GSW OKC ... \n", + "Year ... \n", + "2015 6.7 79.0 30.1 5.4 NaN 5.0 NaN 72.0 NaN 28.2 ... \n", + "2016 6.6 79.0 25.3 4.5 4.8 NaN 62.0 NaN 25.1 NaN ... \n", + "2017 6.1 51.0 26.4 5.1 5.4 NaN 68.0 NaN 26.4 NaN ... \n", + "\n", + "Player Ibaka \n", + " Assist Games Pts Rebound \n", + "Team TOR OKC ORL TOR OKC ORL TOR OKC ORL TOR \n", + "Year \n", + "2015 NaN 78.0 NaN NaN 12.6 NaN NaN 6.8 NaN NaN \n", + "2016 0.7 NaN 56.0 23.0 NaN 15.1 14.2 NaN 6.8 6.8 \n", + "2017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.drop(\"TeamName\", axis=1).set_index([\"Year\", \"Player\", \"Team\"]).stack().unstack(level=[1, 3, 2]).sort_index(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**(Tentative) explanation of the code:**" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamGamesPtsAssistRebound
02015CurryGSW7930.16.75.4
12016CurryGSW7925.36.64.5
22017CurryGSW5126.46.15.1
32015DurantOKC7228.25.08.2
42016DurantGSW6225.14.88.3
52017DurantGSW6826.45.46.8
62015IbakaOKC7812.60.86.8
72016IbakaORL5615.11.16.8
82016IbakaTOR2314.20.76.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team Games Pts Assist Rebound\n", + "0 2015 Curry GSW 79 30.1 6.7 5.4\n", + "1 2016 Curry GSW 79 25.3 6.6 4.5\n", + "2 2017 Curry GSW 51 26.4 6.1 5.1\n", + "3 2015 Durant OKC 72 28.2 5.0 8.2\n", + "4 2016 Durant GSW 62 25.1 4.8 8.3\n", + "5 2017 Durant GSW 68 26.4 5.4 6.8\n", + "6 2015 Ibaka OKC 78 12.6 0.8 6.8\n", + "7 2016 Ibaka ORL 56 15.1 1.1 6.8\n", + "8 2016 Ibaka TOR 23 14.2 0.7 6.8" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# '.drop(\"TeamName\", axis=1)' drops the column (axis=1) TeamName, to avoid having a useless (maybe awkward?) variable in the final dataset\n", + "bball.drop(\"TeamName\", axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GamesPtsAssistRebound
YearPlayerTeam
2015CurryGSW7930.16.75.4
2016CurryGSW7925.36.64.5
2017CurryGSW5126.46.15.1
2015DurantOKC7228.25.08.2
2016DurantGSW6225.14.88.3
2017DurantGSW6826.45.46.8
2015IbakaOKC7812.60.86.8
2016IbakaORL5615.11.16.8
TOR2314.20.76.8
\n", + "
" + ], + "text/plain": [ + " Games Pts Assist Rebound\n", + "Year Player Team \n", + "2015 Curry GSW 79 30.1 6.7 5.4\n", + "2016 Curry GSW 79 25.3 6.6 4.5\n", + "2017 Curry GSW 51 26.4 6.1 5.1\n", + "2015 Durant OKC 72 28.2 5.0 8.2\n", + "2016 Durant GSW 62 25.1 4.8 8.3\n", + "2017 Durant GSW 68 26.4 5.4 6.8\n", + "2015 Ibaka OKC 78 12.6 0.8 6.8\n", + "2016 Ibaka ORL 56 15.1 1.1 6.8\n", + " TOR 23 14.2 0.7 6.8" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# set_index([\"Year\", \"Player\", \"Team\"]) sets multindex, the hierarchy is Year > Player > Team. This is necessary if we want to stack the data (ie group and 'stack'), with each component of the stack corresponding to the values in a given year of a player in a given team\n", + "bball.drop(\"TeamName\", axis=1).set_index([\"Year\", \"Player\", \"Team\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Year Player Team \n", + "2015 Curry GSW Games 79.0\n", + " Pts 30.1\n", + " Assist 6.7\n", + " Rebound 5.4\n", + "2016 Curry GSW Games 79.0\n", + " Pts 25.3\n", + " Assist 6.6\n", + " Rebound 4.5\n", + "2017 Curry GSW Games 51.0\n", + " Pts 26.4\n", + " Assist 6.1\n", + " Rebound 5.1\n", + "2015 Durant OKC Games 72.0\n", + " Pts 28.2\n", + " Assist 5.0\n", + " Rebound 8.2\n", + "2016 Durant GSW Games 62.0\n", + " Pts 25.1\n", + " Assist 4.8\n", + " Rebound 8.3\n", + "2017 Durant GSW Games 68.0\n", + " Pts 26.4\n", + " Assist 5.4\n", + " Rebound 6.8\n", + "2015 Ibaka OKC Games 78.0\n", + " Pts 12.6\n", + " Assist 0.8\n", + " Rebound 6.8\n", + "2016 Ibaka ORL Games 56.0\n", + " Pts 15.1\n", + " Assist 1.1\n", + " Rebound 6.8\n", + " TOR Games 23.0\n", + " Pts 14.2\n", + " Assist 0.7\n", + " Rebound 6.8\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# stack stacks the data for each 
YEAR, PLAYER, TEAM triple. The outcome is a 'stack', each component of this stack being the values of all the variables (i.e. the non-index columns) for a given combination of the index levels.\n", + "bball.drop(\"TeamName\", axis=1).set_index([\"Year\", \"Player\", \"Team\"]).stack()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
GamesPtsAssistReboundGamesPtsAssistReboundGamesPts...AssistReboundGamesPtsAssistReboundGamesPtsAssistRebound
TeamGSWGSWGSWGSWOKCOKCOKCOKCGSWGSW...OKCOKCORLORLORLORLTORTORTORTOR
Year
201579.030.16.75.472.028.25.08.2NaNNaN...0.86.8NaNNaNNaNNaNNaNNaNNaNNaN
201679.025.36.64.5NaNNaNNaNNaN62.025.1...NaNNaN56.015.11.16.823.014.20.76.8
201751.026.46.15.1NaNNaNNaNNaN68.026.4...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant \\\n", + " Games Pts Assist Rebound Games Pts Assist Rebound Games Pts \n", + "Team GSW GSW GSW GSW OKC OKC OKC OKC GSW GSW \n", + "Year \n", + "2015 79.0 30.1 6.7 5.4 72.0 28.2 5.0 8.2 NaN NaN \n", + "2016 79.0 25.3 6.6 4.5 NaN NaN NaN NaN 62.0 25.1 \n", + "2017 51.0 26.4 6.1 5.1 NaN NaN NaN NaN 68.0 26.4 \n", + "\n", + "Player ... Ibaka \\\n", + " ... Assist Rebound Games Pts Assist Rebound Games Pts Assist \n", + "Team ... OKC OKC ORL ORL ORL ORL TOR TOR TOR \n", + "Year ... \n", + "2015 ... 0.8 6.8 NaN NaN NaN NaN NaN NaN NaN \n", + "2016 ... NaN NaN 56.0 15.1 1.1 6.8 23.0 14.2 0.7 \n", + "2017 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "Player \n", + " Rebound \n", + "Team TOR \n", + "Year \n", + "2015 NaN \n", + "2016 6.8 \n", + "2017 NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# An intuitive idea to explain that ?\n", + "bball.drop(\"TeamName\", axis=1).set_index([\"Year\", \"Player\", \"Team\"]).stack().unstack(level=[1,3,2])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PlayerCurryDurant...Ibaka
AssistGamesPtsReboundAssistGamesPts...AssistGamesPtsRebound
TeamGSWGSWGSWGSWGSWOKCGSWOKCGSWOKC...TOROKCORLTOROKCORLTOROKCORLTOR
Year
20156.779.030.15.4NaN5.0NaN72.0NaN28.2...NaN78.0NaNNaN12.6NaNNaN6.8NaNNaN
20166.679.025.34.54.8NaN62.0NaN25.1NaN...0.7NaN56.023.0NaN15.114.2NaN6.86.8
20176.151.026.45.15.4NaN68.0NaN26.4NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Player Curry Durant ... \\\n", + " Assist Games Pts Rebound Assist Games Pts ... \n", + "Team GSW GSW GSW GSW GSW OKC GSW OKC GSW OKC ... \n", + "Year ... \n", + "2015 6.7 79.0 30.1 5.4 NaN 5.0 NaN 72.0 NaN 28.2 ... \n", + "2016 6.6 79.0 25.3 4.5 4.8 NaN 62.0 NaN 25.1 NaN ... \n", + "2017 6.1 51.0 26.4 5.1 5.4 NaN 68.0 NaN 26.4 NaN ... \n", + "\n", + "Player Ibaka \n", + " Assist Games Pts Rebound \n", + "Team TOR OKC ORL TOR OKC ORL TOR OKC ORL TOR \n", + "Year \n", + "2015 NaN 78.0 NaN NaN 12.6 NaN NaN 6.8 NaN NaN \n", + "2016 0.7 NaN 56.0 23.0 NaN 15.1 14.2 NaN 6.8 6.8 \n", + "2017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort_index(axis=1) sort the variables in the alphabetical order. Changes the order of the columns variable only\n", + "bball.drop(\"TeamName\", axis=1).set_index([\"Year\", \"Player\", \"Team\"]).stack().unstack(level=[1,3,2]).sort_index(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **What do you think would happen if we wrote `bball.melt(id_vars=[\"Year\", \"Player\"])` rather than `bball.melt(id_vars=[\"Year\", \"Player\", \"Team\", \"TeamName\"])`? Were you right? 
Write your thoughts.** " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# for bball.melt(id_vars=[\"Year\", \"Player\"]) we would keep only the columns Year and Player as identifiers, and all the other columns \n", + "# would be melted into two new columns: variable (Team, TeamName, Games, etc.) and value (the value associated with each)\n", + "\n", + "# for bball.melt(id_vars=[\"Year\", \"Player\", \"Team\", \"TeamName\"]) we would keep the columns Year, Player, Team and TeamName\n", + "# and all the remaining columns would be melted into two new columns: variable and value" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayervariablevalue
02015CurryTeamGSW
12016CurryTeamGSW
22017CurryTeamGSW
32015DurantTeamOKC
42016DurantTeamGSW
52017DurantTeamGSW
62015IbakaTeamOKC
72016IbakaTeamORL
82016IbakaTeamTOR
92015CurryTeamNameWarriors
102016CurryTeamNameWarriors
112017CurryTeamNameWarriors
122015DurantTeamNameThunder
132016DurantTeamNameWarriors
142017DurantTeamNameWarriors
152015IbakaTeamNameThunder
162016IbakaTeamNameMagic
172016IbakaTeamNameRaptors
182015CurryGames79
192016CurryGames79
202017CurryGames51
212015DurantGames72
222016DurantGames62
232017DurantGames68
242015IbakaGames78
252016IbakaGames56
262016IbakaGames23
272015CurryPts30.1
282016CurryPts25.3
292017CurryPts26.4
302015DurantPts28.2
312016DurantPts25.1
322017DurantPts26.4
332015IbakaPts12.6
342016IbakaPts15.1
352016IbakaPts14.2
362015CurryAssist6.7
372016CurryAssist6.6
382017CurryAssist6.1
392015DurantAssist5
402016DurantAssist4.8
412017DurantAssist5.4
422015IbakaAssist0.8
432016IbakaAssist1.1
442016IbakaAssist0.7
452015CurryRebound5.4
462016CurryRebound4.5
472017CurryRebound5.1
482015DurantRebound8.2
492016DurantRebound8.3
502017DurantRebound6.8
512015IbakaRebound6.8
522016IbakaRebound6.8
532016IbakaRebound6.8
\n", + "
" + ], + "text/plain": [ + " Year Player variable value\n", + "0 2015 Curry Team GSW\n", + "1 2016 Curry Team GSW\n", + "2 2017 Curry Team GSW\n", + "3 2015 Durant Team OKC\n", + "4 2016 Durant Team GSW\n", + "5 2017 Durant Team GSW\n", + "6 2015 Ibaka Team OKC\n", + "7 2016 Ibaka Team ORL\n", + "8 2016 Ibaka Team TOR\n", + "9 2015 Curry TeamName Warriors\n", + "10 2016 Curry TeamName Warriors\n", + "11 2017 Curry TeamName Warriors\n", + "12 2015 Durant TeamName Thunder\n", + "13 2016 Durant TeamName Warriors\n", + "14 2017 Durant TeamName Warriors\n", + "15 2015 Ibaka TeamName Thunder\n", + "16 2016 Ibaka TeamName Magic\n", + "17 2016 Ibaka TeamName Raptors\n", + "18 2015 Curry Games 79\n", + "19 2016 Curry Games 79\n", + "20 2017 Curry Games 51\n", + "21 2015 Durant Games 72\n", + "22 2016 Durant Games 62\n", + "23 2017 Durant Games 68\n", + "24 2015 Ibaka Games 78\n", + "25 2016 Ibaka Games 56\n", + "26 2016 Ibaka Games 23\n", + "27 2015 Curry Pts 30.1\n", + "28 2016 Curry Pts 25.3\n", + "29 2017 Curry Pts 26.4\n", + "30 2015 Durant Pts 28.2\n", + "31 2016 Durant Pts 25.1\n", + "32 2017 Durant Pts 26.4\n", + "33 2015 Ibaka Pts 12.6\n", + "34 2016 Ibaka Pts 15.1\n", + "35 2016 Ibaka Pts 14.2\n", + "36 2015 Curry Assist 6.7\n", + "37 2016 Curry Assist 6.6\n", + "38 2017 Curry Assist 6.1\n", + "39 2015 Durant Assist 5\n", + "40 2016 Durant Assist 4.8\n", + "41 2017 Durant Assist 5.4\n", + "42 2015 Ibaka Assist 0.8\n", + "43 2016 Ibaka Assist 1.1\n", + "44 2016 Ibaka Assist 0.7\n", + "45 2015 Curry Rebound 5.4\n", + "46 2016 Curry Rebound 4.5\n", + "47 2017 Curry Rebound 5.1\n", + "48 2015 Durant Rebound 8.2\n", + "49 2016 Durant Rebound 8.3\n", + "50 2017 Durant Rebound 6.8\n", + "51 2015 Ibaka Rebound 6.8\n", + "52 2016 Ibaka Rebound 6.8\n", + "53 2016 Ibaka Rebound 6.8" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.melt(id_vars=[\"Year\", \"Player\"])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayerTeamTeamNamevariablevalue
02015CurryGSWWarriorsGames79.0
12016CurryGSWWarriorsGames79.0
22017CurryGSWWarriorsGames51.0
32015DurantOKCThunderGames72.0
42016DurantGSWWarriorsGames62.0
52017DurantGSWWarriorsGames68.0
62015IbakaOKCThunderGames78.0
72016IbakaORLMagicGames56.0
82016IbakaTORRaptorsGames23.0
92015CurryGSWWarriorsPts30.1
102016CurryGSWWarriorsPts25.3
112017CurryGSWWarriorsPts26.4
122015DurantOKCThunderPts28.2
132016DurantGSWWarriorsPts25.1
142017DurantGSWWarriorsPts26.4
152015IbakaOKCThunderPts12.6
162016IbakaORLMagicPts15.1
172016IbakaTORRaptorsPts14.2
182015CurryGSWWarriorsAssist6.7
192016CurryGSWWarriorsAssist6.6
202017CurryGSWWarriorsAssist6.1
212015DurantOKCThunderAssist5.0
222016DurantGSWWarriorsAssist4.8
232017DurantGSWWarriorsAssist5.4
242015IbakaOKCThunderAssist0.8
252016IbakaORLMagicAssist1.1
262016IbakaTORRaptorsAssist0.7
272015CurryGSWWarriorsRebound5.4
282016CurryGSWWarriorsRebound4.5
292017CurryGSWWarriorsRebound5.1
302015DurantOKCThunderRebound8.2
312016DurantGSWWarriorsRebound8.3
322017DurantGSWWarriorsRebound6.8
332015IbakaOKCThunderRebound6.8
342016IbakaORLMagicRebound6.8
352016IbakaTORRaptorsRebound6.8
\n", + "
" + ], + "text/plain": [ + " Year Player Team TeamName variable value\n", + "0 2015 Curry GSW Warriors Games 79.0\n", + "1 2016 Curry GSW Warriors Games 79.0\n", + "2 2017 Curry GSW Warriors Games 51.0\n", + "3 2015 Durant OKC Thunder Games 72.0\n", + "4 2016 Durant GSW Warriors Games 62.0\n", + "5 2017 Durant GSW Warriors Games 68.0\n", + "6 2015 Ibaka OKC Thunder Games 78.0\n", + "7 2016 Ibaka ORL Magic Games 56.0\n", + "8 2016 Ibaka TOR Raptors Games 23.0\n", + "9 2015 Curry GSW Warriors Pts 30.1\n", + "10 2016 Curry GSW Warriors Pts 25.3\n", + "11 2017 Curry GSW Warriors Pts 26.4\n", + "12 2015 Durant OKC Thunder Pts 28.2\n", + "13 2016 Durant GSW Warriors Pts 25.1\n", + "14 2017 Durant GSW Warriors Pts 26.4\n", + "15 2015 Ibaka OKC Thunder Pts 12.6\n", + "16 2016 Ibaka ORL Magic Pts 15.1\n", + "17 2016 Ibaka TOR Raptors Pts 14.2\n", + "18 2015 Curry GSW Warriors Assist 6.7\n", + "19 2016 Curry GSW Warriors Assist 6.6\n", + "20 2017 Curry GSW Warriors Assist 6.1\n", + "21 2015 Durant OKC Thunder Assist 5.0\n", + "22 2016 Durant GSW Warriors Assist 4.8\n", + "23 2017 Durant GSW Warriors Assist 5.4\n", + "24 2015 Ibaka OKC Thunder Assist 0.8\n", + "25 2016 Ibaka ORL Magic Assist 1.1\n", + "26 2016 Ibaka TOR Raptors Assist 0.7\n", + "27 2015 Curry GSW Warriors Rebound 5.4\n", + "28 2016 Curry GSW Warriors Rebound 4.5\n", + "29 2017 Curry GSW Warriors Rebound 5.1\n", + "30 2015 Durant OKC Thunder Rebound 8.2\n", + "31 2016 Durant GSW Warriors Rebound 8.3\n", + "32 2017 Durant GSW Warriors Rebound 6.8\n", + "33 2015 Ibaka OKC Thunder Rebound 6.8\n", + "34 2016 Ibaka ORL Magic Rebound 6.8\n", + "35 2016 Ibaka TOR Raptors Rebound 6.8" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.melt(id_vars=[\"Year\", \"Player\", \"Team\", \"TeamName\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Read the documentation and focus on the argument `value_vars`. 
How does `bball.melt(id_vars=[\"Year\", \"Player\"], value_vars=[\"Pts\", \"Rebound\"])` differ from `bball.melt(id_vars=[\"Year\", \"Player\"])`?** " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayervariablevalue
02015CurryPts30.1
12016CurryPts25.3
22017CurryPts26.4
32015DurantPts28.2
42016DurantPts25.1
52017DurantPts26.4
62015IbakaPts12.6
72016IbakaPts15.1
82016IbakaPts14.2
92015CurryRebound5.4
102016CurryRebound4.5
112017CurryRebound5.1
122015DurantRebound8.2
132016DurantRebound8.3
142017DurantRebound6.8
152015IbakaRebound6.8
162016IbakaRebound6.8
172016IbakaRebound6.8
\n", + "
" + ], + "text/plain": [ + " Year Player variable value\n", + "0 2015 Curry Pts 30.1\n", + "1 2016 Curry Pts 25.3\n", + "2 2017 Curry Pts 26.4\n", + "3 2015 Durant Pts 28.2\n", + "4 2016 Durant Pts 25.1\n", + "5 2017 Durant Pts 26.4\n", + "6 2015 Ibaka Pts 12.6\n", + "7 2016 Ibaka Pts 15.1\n", + "8 2016 Ibaka Pts 14.2\n", + "9 2015 Curry Rebound 5.4\n", + "10 2016 Curry Rebound 4.5\n", + "11 2017 Curry Rebound 5.1\n", + "12 2015 Durant Rebound 8.2\n", + "13 2016 Durant Rebound 8.3\n", + "14 2017 Durant Rebound 6.8\n", + "15 2015 Ibaka Rebound 6.8\n", + "16 2016 Ibaka Rebound 6.8\n", + "17 2016 Ibaka Rebound 6.8" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.melt(id_vars=[\"Year\", \"Player\"], value_vars=[\"Pts\", \"Rebound\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearPlayervariablevalue
02015CurryTeamGSW
12016CurryTeamGSW
22017CurryTeamGSW
32015DurantTeamOKC
42016DurantTeamGSW
52017DurantTeamGSW
62015IbakaTeamOKC
72016IbakaTeamORL
82016IbakaTeamTOR
92015CurryTeamNameWarriors
102016CurryTeamNameWarriors
112017CurryTeamNameWarriors
122015DurantTeamNameThunder
132016DurantTeamNameWarriors
142017DurantTeamNameWarriors
152015IbakaTeamNameThunder
162016IbakaTeamNameMagic
172016IbakaTeamNameRaptors
182015CurryGames79
192016CurryGames79
202017CurryGames51
212015DurantGames72
222016DurantGames62
232017DurantGames68
242015IbakaGames78
252016IbakaGames56
262016IbakaGames23
272015CurryPts30.1
282016CurryPts25.3
292017CurryPts26.4
302015DurantPts28.2
312016DurantPts25.1
322017DurantPts26.4
332015IbakaPts12.6
342016IbakaPts15.1
352016IbakaPts14.2
362015CurryAssist6.7
372016CurryAssist6.6
382017CurryAssist6.1
392015DurantAssist5
402016DurantAssist4.8
412017DurantAssist5.4
422015IbakaAssist0.8
432016IbakaAssist1.1
442016IbakaAssist0.7
452015CurryRebound5.4
462016CurryRebound4.5
472017CurryRebound5.1
482015DurantRebound8.2
492016DurantRebound8.3
502017DurantRebound6.8
512015IbakaRebound6.8
522016IbakaRebound6.8
532016IbakaRebound6.8
\n", + "
" + ], + "text/plain": [ + " Year Player variable value\n", + "0 2015 Curry Team GSW\n", + "1 2016 Curry Team GSW\n", + "2 2017 Curry Team GSW\n", + "3 2015 Durant Team OKC\n", + "4 2016 Durant Team GSW\n", + "5 2017 Durant Team GSW\n", + "6 2015 Ibaka Team OKC\n", + "7 2016 Ibaka Team ORL\n", + "8 2016 Ibaka Team TOR\n", + "9 2015 Curry TeamName Warriors\n", + "10 2016 Curry TeamName Warriors\n", + "11 2017 Curry TeamName Warriors\n", + "12 2015 Durant TeamName Thunder\n", + "13 2016 Durant TeamName Warriors\n", + "14 2017 Durant TeamName Warriors\n", + "15 2015 Ibaka TeamName Thunder\n", + "16 2016 Ibaka TeamName Magic\n", + "17 2016 Ibaka TeamName Raptors\n", + "18 2015 Curry Games 79\n", + "19 2016 Curry Games 79\n", + "20 2017 Curry Games 51\n", + "21 2015 Durant Games 72\n", + "22 2016 Durant Games 62\n", + "23 2017 Durant Games 68\n", + "24 2015 Ibaka Games 78\n", + "25 2016 Ibaka Games 56\n", + "26 2016 Ibaka Games 23\n", + "27 2015 Curry Pts 30.1\n", + "28 2016 Curry Pts 25.3\n", + "29 2017 Curry Pts 26.4\n", + "30 2015 Durant Pts 28.2\n", + "31 2016 Durant Pts 25.1\n", + "32 2017 Durant Pts 26.4\n", + "33 2015 Ibaka Pts 12.6\n", + "34 2016 Ibaka Pts 15.1\n", + "35 2016 Ibaka Pts 14.2\n", + "36 2015 Curry Assist 6.7\n", + "37 2016 Curry Assist 6.6\n", + "38 2017 Curry Assist 6.1\n", + "39 2015 Durant Assist 5\n", + "40 2016 Durant Assist 4.8\n", + "41 2017 Durant Assist 5.4\n", + "42 2015 Ibaka Assist 0.8\n", + "43 2016 Ibaka Assist 1.1\n", + "44 2016 Ibaka Assist 0.7\n", + "45 2015 Curry Rebound 5.4\n", + "46 2016 Curry Rebound 4.5\n", + "47 2017 Curry Rebound 5.1\n", + "48 2015 Durant Rebound 8.2\n", + "49 2016 Durant Rebound 8.3\n", + "50 2017 Durant Rebound 6.8\n", + "51 2015 Ibaka Rebound 6.8\n", + "52 2016 Ibaka Rebound 6.8\n", + "53 2016 Ibaka Rebound 6.8" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.melt(id_vars=[\"Year\", \"Player\"])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# for bball.melt(id_vars=[\"Year\", \"Player\"], value_vars=[\"Pts\", \"Rebound\"]), while we keep the columns Year and Player as\n", + "# before, we also indicate which of the remaining columns to be considered in the two new column\n", + "\n", + "# thus, variable will only get Pts and Rebound from the remaining columns and values will take the values associated " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Consider the differences between `bball.stack` and `bball.melt`. Is there a way to make them generate the same output? Write your thoughts.**\n", + " - Hint: you might need to use both `stack` and another method from above" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variablevalue
0Year2015
1PlayerCurry
2TeamGSW
3TeamNameWarriors
4Games79
.........
67TeamNameRaptors
68Games23
69Pts14.2
70Assist0.7
71Rebound6.8
\n", + "

72 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " variable value\n", + "0 Year 2015\n", + "1 Player Curry\n", + "2 Team GSW\n", + "3 TeamName Warriors\n", + "4 Games 79\n", + ".. ... ...\n", + "67 TeamName Raptors\n", + "68 Games 23\n", + "69 Pts 14.2\n", + "70 Assist 0.7\n", + "71 Rebound 6.8\n", + "\n", + "[72 rows x 2 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bball.stack() also pairs each variable name with its value, but it returns them in a Series indexed by (row label, variable name) rather than in regular columns\n", + "# so we reset the index, drop the row label and rename the remaining columns to match the output of melt \n", + "\n", + "bball.stack().reset_index().drop(\"level_0\", axis=1).rename(columns={\"level_1\":\"variable\", 0: \"value\"})\n", + "#bball.melt()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Can you think of a reason to ever use `pivot` rather than `pivot_table`? Write your thoughts.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why use pivot rather than pivot_table? \\\n", + "Reminders: \n", + "- The pivot method:\n", + " - Takes the unique values of one column and places them along the index.\n", + " - Takes the unique values of another column and places them along the columns.\n", + " - Takes the values that correspond to a third column and fills in the DataFrame values that correspond to that index / column pair\n", + "- The pivot_table method:\n", + " - It allows you to choose multiple columns for the index/columns/values\n", + " arguments.\n", + " - It allows you to deal with duplicate entries by\n", + " having you choose how to combine them. \n", + " \n", + "pivot_table deals with non-unique values by aggregating them. This can be risky: for instance, here it would aggregate the two different values for (IBAKA 2016) in a way that is probably wrong. 
For instance, if the aggregator is a mean, the two parts of the season in which IBAKA played for different teams would be given the same weight, which probably does not reflect reality. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Create a pivot table with column `Player` as the index, `TeamName` as the columns, and `[Rebound, Assist]` as the values. What happens when you use `aggfunc=[np.max, np.min, len]`?** " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AssistRebound
TeamNameMagicRaptorsThunderWarriorsMagicRaptorsThunderWarriors
Player
CurryNaNNaNNaN6.466667NaNNaNNaN5.00
DurantNaNNaN5.05.100000NaNNaN8.27.55
Ibaka1.10.70.8NaN6.86.86.8NaN
\n", + "
" + ], + "text/plain": [ + " Assist Rebound \n", + "TeamName Magic Raptors Thunder Warriors Magic Raptors Thunder Warriors\n", + "Player \n", + "Curry NaN NaN NaN 6.466667 NaN NaN NaN 5.00\n", + "Durant NaN NaN 5.0 5.100000 NaN NaN 8.2 7.55\n", + "Ibaka 1.1 0.7 0.8 NaN 6.8 6.8 6.8 NaN" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=\"Player\", columns=\"TeamName\", values=[\"Rebound\",\"Assist\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amaxaminlen
AssistReboundAssist...ReboundAssistRebound
TeamNameMagicRaptorsThunderWarriorsMagicRaptorsThunderWarriorsMagicRaptors...ThunderWarriorsMagicRaptorsThunderWarriorsMagicRaptorsThunderWarriors
Player
CurryNaNNaNNaN6.7NaNNaNNaN5.4NaNNaN...NaN4.5NaNNaNNaN3.0NaNNaNNaN3.0
DurantNaNNaN5.05.4NaNNaN8.28.3NaNNaN...8.26.8NaNNaN1.02.0NaNNaN1.02.0
Ibaka1.10.70.8NaN6.86.86.8NaN1.10.7...6.8NaN1.01.01.0NaN1.01.01.0NaN
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " amax \\\n", + " Assist Rebound \n", + "TeamName Magic Raptors Thunder Warriors Magic Raptors Thunder Warriors \n", + "Player \n", + "Curry NaN NaN NaN 6.7 NaN NaN NaN 5.4 \n", + "Durant NaN NaN 5.0 5.4 NaN NaN 8.2 8.3 \n", + "Ibaka 1.1 0.7 0.8 NaN 6.8 6.8 6.8 NaN \n", + "\n", + " amin ... len \\\n", + " Assist ... Rebound Assist \n", + "TeamName Magic Raptors ... Thunder Warriors Magic Raptors Thunder Warriors \n", + "Player ... \n", + "Curry NaN NaN ... NaN 4.5 NaN NaN NaN 3.0 \n", + "Durant NaN NaN ... 8.2 6.8 NaN NaN 1.0 2.0 \n", + "Ibaka 1.1 0.7 ... 6.8 NaN 1.0 1.0 1.0 NaN \n", + "\n", + " \n", + " Rebound \n", + "TeamName Magic Raptors Thunder Warriors \n", + "Player \n", + "Curry NaN NaN NaN 3.0 \n", + "Durant NaN NaN 1.0 2.0 \n", + "Ibaka 1.0 1.0 1.0 NaN \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bball.pivot_table(index=\"Player\", columns=\"TeamName\", values=[\"Rebound\",\"Assist\"], aggfunc=[np.max, np.min, len])" + ] + } + ], + "metadata": { + "date": 1584040763.8179712, + "filename": "reshape.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "Reshape" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/7_merge.ipynb b/Session_7/7_merge.ipynb new file mode 100644 index 0000000..501a541 --- /dev/null +++ b/Session_7/7_merge.ipynb @@ -0,0 +1,5601 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merge\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [Reshape](https://datascience.quantecon.org/reshape.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", 
+ "- Know the different pandas routines for combining datasets \n", + "- Know when to use `pd.concat` vs `pd.merge` vs `pd.join` \n", + "- Be able to apply the three main combining routines \n", + "\n", + "\n", + "**Data**\n", + "\n", + "- WDI data on GDP components, population, and square miles of countries \n", + "- Book ratings: 6,000,000 ratings for the 10,000 most rated books on\n", + " [Goodreads](https://www.goodreads.com/) \n", + "- Details for all delayed US domestic flights in November 2016,\n", + " obtained from the [Bureau of Transportation\n", + " Statistics](https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [Merge](#Merge) \n", + " - [Combining Datasets](#Combining-Datasets) \n", + " - [`pd.concat`](#`pd.concat`) \n", + " - [`pd.merge`](#`pd.merge`) \n", + " - [Arguments to `merge`](#Arguments-to-`merge`) \n", + " - [`df.join`](#`df.join`) \n", + " - [Case Study](#Case-Study) \n", + " - [Extra Example: Airline Delays](#Extra-Example:-Airline-Delays) \n", + " - [Visualizing Merge Operations](#Visualizing-Merge-Operations) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already 
satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: six>=1.0.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow->qeds) (1.12.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2.8.0)\n", 
+ "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from 
kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n" + ] + } + ], + "source": [ + "# Uncomment following line to install on colab\n", + "! pip install qeds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combining Datasets\n", + "\n", + "Often, we will want to perform joint analysis on data from different sources.\n", + "\n", + "For example, when analyzing the regional sales for a company, we might\n", + "want to include industry aggregates or demographic information for each\n", + "region.\n", + "\n", + "Or perhaps we are working with product-level data, have a list of\n", + "product groups in a separate dataset, and want to compute aggregate\n", + "statistics for each group." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import qeds\n", + "%matplotlib inline\n", + "# activate plot theme\n", + "import qeds\n", + "qeds.themes.mpl_style();\n", + "\n", + "from IPython.display import display" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "MultiIndex: 72 entries, (Canada, 2017) to (United States, 2000)\n", + "Data columns (total 5 columns):\n", + "GovExpend 72 non-null float64\n", + "Consumption 72 non-null float64\n", + "Exports 72 non-null float64\n", + "Imports 72 non-null float64\n", + "GDP 72 non-null float64\n", + "dtypes: float64(5)\n", + "memory usage: 3.2+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
country
Canada0.3726651.0954750.5828310.6000311.868164
Germany0.7455792.1120091.9305631.6663483.883870
United Kingdom0.5495381.8091540.8626290.9331452.818704
United States2.40574312.0192662.2870713.06995417.348627
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704\n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# from WDI. Units trillions of 2010 USD\n", + "url = \"https://datascience.quantecon.org/assets/data/wdi_data.csv\"\n", + "wdi = pd.read_csv(url).set_index([\"country\", \"year\"])\n", + "wdi.info()\n", + "\n", + "wdi2017 = wdi.xs(2017, level=\"year\")\n", + "wdi2017" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
countryyear
Canada20170.3726651.0954750.5828310.6000311.868164
20160.3648991.0584260.5763940.5757751.814016
Germany20170.7455792.1120091.9305631.6663483.883870
20160.7340142.0756151.8449491.5894953.801859
United Kingdom20170.5495381.8091540.8626290.9331452.818704
20160.5505961.7723480.8167920.9014942.768241
United States20172.40574312.0192662.2870713.06995417.348627
20162.40798111.7221332.2199372.93600416.972348
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country year \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164\n", + " 2016 0.364899 1.058426 0.576394 0.575775 1.814016\n", + "Germany 2017 0.745579 2.112009 1.930563 1.666348 3.883870\n", + " 2016 0.734014 2.075615 1.844949 1.589495 3.801859\n", + "United Kingdom 2017 0.549538 1.809154 0.862629 0.933145 2.818704\n", + " 2016 0.550596 1.772348 0.816792 0.901494 2.768241\n", + "United States 2017 2.405743 12.019266 2.287071 3.069954 17.348627\n", + " 2016 2.407981 11.722133 2.219937 2.936004 16.972348" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi2016_17 = wdi.loc[pd.IndexSlice[:, [2016, 2017]],: ]\n", + "wdi2016_17" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sq_miles
country
United States3.8000
Canada3.8000
Germany0.1370
United Kingdom0.0936
Russia6.6000
\n", + "
" + ], + "text/plain": [ + " sq_miles\n", + "country \n", + "United States 3.8000\n", + "Canada 3.8000\n", + "Germany 0.1370\n", + "United Kingdom 0.0936\n", + "Russia 6.6000" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Data from https://www.nationmaster.com/country-info/stats/Geography/Land-area/Square-miles\n", + "# units -- millions of square miles\n", + "sq_miles = pd.Series({\n", + " \"United States\": 3.8,\n", + " \"Canada\": 3.8,\n", + " \"Germany\": 0.137,\n", + " \"United Kingdom\": 0.0936,\n", + " \"Russia\": 6.6,\n", + "}, name=\"sq_miles\").to_frame()\n", + "sq_miles.index.name = \"country\"\n", + "sq_miles" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "MultiIndex: 72 entries, (Canada, 2017) to (United States, 2000)\n", + "Data columns (total 1 columns):\n", + "Population 72 non-null float64\n", + "dtypes: float64(1)\n", + "memory usage: 1005.0+ bytes\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Population
countryyear
Canada201736.540268
201636.109487
201535.702908
201435.437435
201335.082954
201234.714222
201134.339328
201034.004889
200933.628895
200833.247118
\n", + "
" + ], + "text/plain": [ + " Population\n", + "country year \n", + "Canada 2017 36.540268\n", + " 2016 36.109487\n", + " 2015 35.702908\n", + " 2014 35.437435\n", + " 2013 35.082954\n", + " 2012 34.714222\n", + " 2011 34.339328\n", + " 2010 34.004889\n", + " 2009 33.628895\n", + " 2008 33.247118" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# from WDI. Units millions of people\n", + "pop_url = \"https://datascience.quantecon.org/assets/data/wdi_population.csv\"\n", + "pop = pd.read_csv(pop_url).set_index([\"country\", \"year\"])\n", + "pop.info()\n", + "pop.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose that we were asked to compute a number of statistics with the data above:\n", + "\n", + "- As a measure of land usage or productivity, what is Consumption per square mile? \n", + "- What is GDP per capita (per person) for each country in each year? How about\n", + " Consumption per person? \n", + "- What is the population density of each country? How much does it change over time? \n", + "\n", + "\n", + "Notice that to answer any of the questions from above, we will have to use data\n", + "from more than one of our DataFrames.\n", + "\n", + "In this lecture, we will learn many techniques for combining datasets that\n", + "originate from different sources, careful to ensure that data is properly\n", + "aligned.\n", + "\n", + "In pandas three main methods can combine datasets:\n", + "\n", + "1. `pd.concat([dfs...])` \n", + "1. `pd.merge(df1, df2)` \n", + "1. `df1.join(df2)` \n", + "\n", + "\n", + "We’ll look at each one." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `pd.concat`\n", + "\n", + "The `pd.concat` function is used to stack two or more DataFrames\n", + "together.\n", + "\n", + "An example of when you might want to do this is if you have monthly data\n", + "in separate files on your computer and would like to have 1 year of data\n", + "in a single DataFrame.\n", + "\n", + "The first argument to `pd.concat` is a list of DataFrames to be\n", + "stitched together.\n", + "\n", + "The other commonly used argument is named `axis`.\n", + "\n", + "As we have seen before, many pandas functions have an `axis` argument\n", + "that specifies whether a particular operation should happen down rows\n", + "(`axis=0`) or along columns (`axis=1`).\n", + "\n", + "In the context of `pd.concat`, setting `axis=0` (the default case)\n", + "will stack DataFrames on top of one another while `axis=1` stacks them\n", + "side by side.\n", + "\n", + "We’ll look at each case separately." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `axis=0`\n", + "\n", + "When we call `pd.concat` and set `axis=0`, the list of DataFrames\n", + "passed in the first argument will be stacked on top of one another.\n", + "\n", + "Let’s try it out here." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\asus\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConsumptionExportsGDPGovExpendImportssq_miles
country
Canada1.0954750.5828311.8681640.3726650.600031NaN
Germany2.1120091.9305633.8838700.7455791.666348NaN
United Kingdom1.8091540.8626292.8187040.5495380.933145NaN
United States12.0192662.28707117.3486272.4057433.069954NaN
United StatesNaNNaNNaNNaNNaN3.8000
CanadaNaNNaNNaNNaNNaN3.8000
GermanyNaNNaNNaNNaNNaN0.1370
United KingdomNaNNaNNaNNaNNaN0.0936
RussiaNaNNaNNaNNaNNaN6.6000
\n", + "
" + ], + "text/plain": [ + " Consumption Exports GDP GovExpend Imports \\\n", + "country \n", + "Canada 1.095475 0.582831 1.868164 0.372665 0.600031 \n", + "Germany 2.112009 1.930563 3.883870 0.745579 1.666348 \n", + "United Kingdom 1.809154 0.862629 2.818704 0.549538 0.933145 \n", + "United States 12.019266 2.287071 17.348627 2.405743 3.069954 \n", + "United States NaN NaN NaN NaN NaN \n", + "Canada NaN NaN NaN NaN NaN \n", + "Germany NaN NaN NaN NaN NaN \n", + "United Kingdom NaN NaN NaN NaN NaN \n", + "Russia NaN NaN NaN NaN NaN \n", + "\n", + " sq_miles \n", + "country \n", + "Canada NaN \n", + "Germany NaN \n", + "United Kingdom NaN \n", + "United States NaN \n", + "United States 3.8000 \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "Russia 6.6000 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# equivalent to pd.concat([wdi2017, sq_miles]) -- axis=0 is default\n", + "pd.concat([wdi2017, sq_miles], axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice a few things:\n", + "\n", + "- \n", + "
  The number of rows in the output is the total number\n", + "  of rows in all inputs. The labels are all from the original\n", + "  DataFrames. \n", + "
\n", + " \n", + "- The column labels are all the distinct column labels from all the inputs. \n", + "- For columns that appeared only in one input, the value for all row labels\n", + " originating from a different input is equal to `NaN` (marked as missing). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `axis=1`\n", + "\n", + "In this example, concatenating by stacking\n", + "side-by-side makes more sense.\n", + "\n", + "We accomplish this by passing `axis=1` to `pd.concat`:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\asus\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
RussiaNaNNaNNaNNaNNaN6.6000
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "Russia NaN NaN NaN NaN NaN \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "Russia 6.6000 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([wdi2017, sq_miles], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice here that\n", + "\n", + "- The index entries are all unique index entries that appeared in any DataFrame. \n", + "- The column labels are all column labels from the inputs. \n", + "- As `wdi2017` didn’t have a `Russia` row, the value for all of its columns\n", + " is `NaN`. \n", + "\n", + "\n", + "Now we can answer one of our questions from above: What is\n", + "Consumption per square mile?" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\asus\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/plain": [ + "Canada 0.288283\n", + "Germany 15.416124\n", + "Russia NaN\n", + "United Kingdom 19.328569\n", + "United States 3.162965\n", + "dtype: float64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp = pd.concat([wdi2017, sq_miles], axis=1)\n", + "temp[\"Consumption\"] / temp[\"sq_miles\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `pd.merge`\n", + "\n", + "`pd.merge` operates on two DataFrames at a time and is primarily used\n", + "to bring columns from one DataFrame into another, *aligning* data based\n", + "on one or more “key” columns.\n", + "\n", + "This is a somewhat difficult concept to grasp by reading, so let’s look at some\n", + "examples." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2017, sq_miles, on=\"country\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output here looks very similar to what we saw with `concat` and\n", + "`axis=1`, except that the row for `Russia` does not appear.\n", + "\n", + "We will talk more about why this happened soon.\n", + "\n", + "For now, let’s look at a slightly more intriguing example:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Canada0.3648991.0584260.5763940.5757751.8140163.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
Germany0.7340142.0756151.8449491.5894953.8018590.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United Kingdom0.5505961.7723480.8167920.9014942.7682410.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
United States2.40798111.7221332.2199372.93600416.9723483.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Canada 0.364899 1.058426 0.576394 0.575775 1.814016 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "Germany 0.734014 2.075615 1.844949 1.589495 3.801859 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United Kingdom 0.550596 1.772348 0.816792 0.901494 2.768241 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "United States 2.407981 11.722133 2.219937 2.936004 16.972348 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 \n", + "United States 3.8000 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2016_17, sq_miles, on=\"country\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here’s how we think about what happened:\n", + "\n", + "- The data in `wdi2016_17` is copied over exactly as is. \n", + "- Because `country` was on the index for both DataFrames, it is on the\n", + " index of the output. \n", + "- We lost the year on the index – we’ll work on getting it back below. \n", + "- The additional column in `sq_miles` was added to column labels for the\n", + " output. \n", + "- The data from the `sq_miles` column was added to the output by looking up\n", + " rows where the `country` in the two DataFrames lined up.\n", + " - Note that all the countries appeared twice, and the data in `sq_miles` was repeated. This is because `wdi2016_17` had two rows for each country.\n", + " - Also note that because `Russia` did not appear in `wdi2016_17`, the value `sq_miles.loc[\"Russia\"]` (i.e. `6.6`) is not used in the output. 
\n", + "\n", + "\n", + "How do we get the year back?\n", + "\n", + "We must first call `reset_index` on `wdi2016_17` so\n", + "that in the first step when all columns are copied over, `year` is included." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearGovExpendConsumptionExportsImportsGDPsq_miles
0Canada20170.3726651.0954750.5828310.6000311.8681643.8000
1Canada20160.3648991.0584260.5763940.5757751.8140163.8000
2Germany20170.7455792.1120091.9305631.6663483.8838700.1370
3Germany20160.7340142.0756151.8449491.5894953.8018590.1370
4United Kingdom20170.5495381.8091540.8626290.9331452.8187040.0936
5United Kingdom20160.5505961.7723480.8167920.9014942.7682410.0936
6United States20172.40574312.0192662.2870713.06995417.3486273.8000
7United States20162.40798111.7221332.2199372.93600416.9723483.8000
\n", + "
" + ], + "text/plain": [ + " country year GovExpend Consumption Exports Imports \\\n", + "0 Canada 2017 0.372665 1.095475 0.582831 0.600031 \n", + "1 Canada 2016 0.364899 1.058426 0.576394 0.575775 \n", + "2 Germany 2017 0.745579 2.112009 1.930563 1.666348 \n", + "3 Germany 2016 0.734014 2.075615 1.844949 1.589495 \n", + "4 United Kingdom 2017 0.549538 1.809154 0.862629 0.933145 \n", + "5 United Kingdom 2016 0.550596 1.772348 0.816792 0.901494 \n", + "6 United States 2017 2.405743 12.019266 2.287071 3.069954 \n", + "7 United States 2016 2.407981 11.722133 2.219937 2.936004 \n", + "\n", + " GDP sq_miles \n", + "0 1.868164 3.8000 \n", + "1 1.814016 3.8000 \n", + "2 3.883870 0.1370 \n", + "3 3.801859 0.1370 \n", + "4 2.818704 0.0936 \n", + "5 2.768241 0.0936 \n", + "6 17.348627 3.8000 \n", + "7 16.972348 3.8000 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2016_17.reset_index(), sq_miles, on=\"country\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multiple Columns\n", + "\n", + "Sometimes, we need to merge multiple columns.\n", + "\n", + "For example our `pop` and `wdi2016_17` DataFrames both have observations\n", + "organized by country and year.\n", + "\n", + "To properly merge these datasets, we would need to align the data by\n", + "both country and year.\n", + "\n", + "We pass a list to the `on` argument to accomplish this:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPPopulation
countryyear
Canada20170.3726651.0954750.5828310.6000311.86816436.540268
20160.3648991.0584260.5763940.5757751.81401636.109487
Germany20170.7455792.1120091.9305631.6663483.88387082.657002
20160.7340142.0756151.8449491.5894953.80185982.348669
United Kingdom20170.5495381.8091540.8626290.9331452.81870466.058859
20160.5505961.7723480.8167920.9014942.76824165.595565
United States20172.40574312.0192662.2870713.06995417.348627325.147121
20162.40798111.7221332.2199372.93600416.972348323.071342
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country year \n", + "Canada 2017 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + " 2016 0.364899 1.058426 0.576394 0.575775 1.814016 \n", + "Germany 2017 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + " 2016 0.734014 2.075615 1.844949 1.589495 3.801859 \n", + "United Kingdom 2017 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + " 2016 0.550596 1.772348 0.816792 0.901494 2.768241 \n", + "United States 2017 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + " 2016 2.407981 11.722133 2.219937 2.936004 16.972348 \n", + "\n", + " Population \n", + "country year \n", + "Canada 2017 36.540268 \n", + " 2016 36.109487 \n", + "Germany 2017 82.657002 \n", + " 2016 82.348669 \n", + "United Kingdom 2017 66.058859 \n", + " 2016 65.595565 \n", + "United States 2017 325.147121 \n", + " 2016 323.071342 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2016_17, pop, on=[\"country\", \"year\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can answer more of our questions from above: What is GDP per capita (per\n", + "person) for each country in each year? How about Consumption per person?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "country year\n", + "Canada 2017 0.051126\n", + " 2016 0.050237\n", + "Germany 2017 0.046988\n", + " 2016 0.046168\n", + "United Kingdom 2017 0.042670\n", + " 2016 0.042202\n", + "United States 2017 0.053356\n", + " 2016 0.052534\n", + "dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi_pop = pd.merge(wdi2016_17, pop, on=[\"country\", \"year\"])\n", + "wdi_pop[\"GDP\"] / wdi_pop[\"Population\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "country year\n", + "Canada 2017 0.029980\n", + " 2016 0.029312\n", + "Germany 2017 0.025551\n", + " 2016 0.025205\n", + "United Kingdom 2017 0.027387\n", + " 2016 0.027019\n", + "United States 2017 0.036966\n", + " 2016 0.036283\n", + "dtype: float64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi_pop[\"Consumption\"] / wdi_pop[\"Population\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Arguments to `merge`\n", + "\n", + "The `pd.merge` function can take many optional arguments.\n", + "\n", + "We’ll talk about a few of the most commonly-used ones here and refer you\n", + "to the\n", + "[documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.merge.html#pandas.merge)\n", + "for more details.\n", + "\n", + "We’ll follow the pandas convention and refer to the first argument to\n", + "`pd.merge` as `left` and call the second `right`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `on`\n", + "\n", + "We have already seen this one used before, but we want to point out that `on`\n", + "is optional.\n", + "\n", + "If nothing is given for this argument, pandas will use **all** columns\n", + "in `left` and `right` with the same name.\n", + "\n", + "In our example, `country` is the only column that appears in both\n", + "DataFrames, so it is used for `on` if we don’t pass anything.\n", + "\n", + "The following two are equivalent." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2017, sq_miles, on=\"country\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryGovExpendConsumptionExportsImportsGDPsq_miles
0Canada0.3726651.0954750.5828310.6000311.8681643.8000
1Germany0.7455792.1120091.9305631.6663483.8838700.1370
2United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
3United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " country GovExpend Consumption Exports Imports GDP \\\n", + "0 Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "1 Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "2 United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "3 United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "0 3.8000 \n", + "1 0.1370 \n", + "2 0.0936 \n", + "3 3.8000 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if we move index back to columns, the `on` is un-necessary\n", + "pd.merge(wdi2017.reset_index(), sq_miles.reset_index())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `left_on`, `right_on`\n", + "\n", + "Above, we used the `on` argument to identify a column in both `left`\n", + "and `right` that was used to align data.\n", + "\n", + "Sometimes, both DataFrames don’t have the same name for this column.\n", + "\n", + "In that case, we use the `left_on` and `right_on` arguments, passing\n", + "the proper column name(s) to align the data.\n", + "\n", + "We’ll show you an example below, but it is somewhat silly as our\n", + "DataFrames do both have the `country` column." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2017, sq_miles, left_on=\"country\", right_on=\"country\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `left_index`, `right_index`\n", + "\n", + "Sometimes, as in our example, the key used to align data is actually in the\n", + "index instead of one of the columns.\n", + "\n", + "In this case, we can use the `left_index` or `right_index` arguments.\n", + "\n", + "We should only set these values to a boolean (`True` or `False`).\n", + "\n", + "Let’s practice with this." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(wdi2017, sq_miles, left_on=\"country\", right_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `how`\n", + "\n", + "The `how` is perhaps the most powerful, but most conceptually\n", + "difficult of the arguments we will cover.\n", + "\n", + "This argument controls which values from the key column(s) appear in the\n", + "output.\n", + "\n", + "The 4 possible options for this argument are summarized in\n", + "the image below.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/merge_files/merge_venns.png](https://datascience.quantecon.org/assets/_static/merge_files/merge_venns.png) \n", + "In words, we have:\n", + "\n", + "- `left`: The behavior we described above. It uses\n", + " the keys from the `left` DataFrame. \n", + "- `right`: Output will contain all keys from `right`. \n", + "- `inner`: Default. The output will only contain keys that appear in *both*\n", + " `left` and `right` (in the examples above, this gives the same result as `left`). \n", + "- `outer`: The output will contain any key found in either `left`\n", + " or `right`. \n", + "\n", + "\n", + "In addition to the above, we will use the following two DataFrames to\n", + "illustrate the `how` option." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
country
Canada0.3726651.0954750.5828310.6000311.868164
Germany0.7455792.1120091.9305631.6663483.883870
United Kingdom0.5495381.8091540.8626290.9331452.818704
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi2017_no_US = wdi2017.drop(\"United States\")\n", + "wdi2017_no_US" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sq_miles
country
United States3.8000
Canada3.8000
United Kingdom0.0936
Russia6.6000
\n", + "
" + ], + "text/plain": [ + " sq_miles\n", + "country \n", + "United States 3.8000\n", + "Canada 3.8000\n", + "United Kingdom 0.0936\n", + "Russia 6.6000" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sq_miles_no_germany = sq_miles.drop(\"Germany\")\n", + "sq_miles_no_germany" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let’s see all the possible `how` options." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# keep all keys from `left` (note: the pandas default is `inner`)\n", + "pd.merge(wdi2017, sq_miles, on=\"country\", how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
RussiaNaNNaNNaNNaNNaN6.6000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "Russia NaN NaN NaN NaN NaN \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 \n", + "Russia 6.6000 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# notice ``Russia`` is included\n", + "pd.merge(wdi2017, sq_miles, on=\"country\", how=\"right\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP sq_miles\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 3.8000\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 0.1370\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 0.0936" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# notice no United States or Russia\n", + "pd.merge(wdi2017_no_US, sq_miles, on=\"country\", how=\"inner\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.883870NaN
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United StatesNaNNaNNaNNaNNaN3.8000
RussiaNaNNaNNaNNaNNaN6.6000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP sq_miles\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 3.8000\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 NaN\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 0.0936\n", + "United States NaN NaN NaN NaN NaN 3.8000\n", + "Russia NaN NaN NaN NaN NaN 6.6000" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# includes all 5, even though they don't all appear in either DataFrame\n", + "pd.merge(wdi2017_no_US, sq_miles_no_germany, on=\"country\", how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `df.merge(df2)`\n", + "\n", + "Note that the DataFrame type has a `merge` *method*.\n", + "\n", + "It is the same as the function we have been working with, but passes the\n", + "DataFrame before the period as `left`.\n", + "\n", + "Thus `df.merge(other)` is equivalent to `pd.merge(df, other)`." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
RussiaNaNNaNNaNNaNNaN6.6000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "Russia NaN NaN NaN NaN NaN \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 \n", + "Russia 6.6000 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi2017.merge(sq_miles, on=\"country\", how=\"right\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `df.join`\n", + "\n", + "The `join` method for a DataFrame is very similar to the `merge`\n", + "method described above, but only allows you to use the index of the\n", + "`right` DataFrame as the join key.\n", + "\n", + "Thus, `left.join(right, on=\"country\")` is equivalent to calling\n", + "`pd.merge(left, right, left_on=\"country\", right_index=True)`.\n", + "\n", + "The implementation of the `join` method calls `merge` internally,\n", + "but sets the `left_on` and `right_index` arguments for you.\n", + "\n", + "You can do anything with `df.join` that you can do with\n", + "`df.merge`, but `df.join` is more convenient to use if the keys of `right`\n", + "are in the index." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi2017.join(sq_miles, on=\"country\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wdi2017.merge(sq_miles, left_on=\"country\", right_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Study\n", + "\n", + "Let’s put these tools to practice by loading some real datasets and\n", + "seeing how these functions can be applied.\n", + "\n", + "We’ll analyze ratings of books from the website [Goodreads](https://www.goodreads.com/).\n", + "\n", + "We accessed the data [here](https://github.com/zygmuntz/goodbooks-10k).\n", + "\n", + "Let’s load it up." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idbook_idrating
012585
1240814
222605
3292965
4223183
\n", + "
" + ], + "text/plain": [ + " user_id book_id rating\n", + "0 1 258 5\n", + "1 2 4081 4\n", + "2 2 260 5\n", + "3 2 9296 5\n", + "4 2 2318 3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5976479 entries, 0 to 5976478\n", + "Data columns (total 3 columns):\n", + "user_id int64\n", + "book_id int64\n", + "rating int64\n", + "dtypes: int64(3)\n", + "memory usage: 136.8 MB\n" + ] + } + ], + "source": [ + "ratings = qeds.data.load(\"goodreads_ratings\")\n", + "display(ratings.head())\n", + "ratings.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can already do some interesting things with just the ratings data.\n", + "\n", + "Let’s see how many ratings of each number are in our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ratings[\"rating\"].value_counts().sort_index().plot(kind=\"bar\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s also see how many users have rated `N` books, for all `N`\n", + "possible.\n", + "\n", + "To do this, we will use `value_counts` twice (can you think of why?).\n", + "\n", + "We will see a more flexible way of performing similar grouped operations in\n", + "a future lecture." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
N_ratingsN_users
0191
1201
2213
32213
4235
52411
62513
72623
82734
92826
\n", + "
" + ], + "text/plain": [ + " N_ratings N_users\n", + "0 19 1\n", + "1 20 1\n", + "2 21 3\n", + "3 22 13\n", + "4 23 5\n", + "5 24 11\n", + "6 25 13\n", + "7 26 23\n", + "8 27 34\n", + "9 28 26" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_by_n = (\n", + " ratings[\"user_id\"]\n", + " .value_counts() # Series. Index: user_id, value: n ratings by user\n", + " .value_counts() # Series. Index: n_ratings by user, value: N_users with this many ratings\n", + " .sort_index() # Sort our Series by the index (number of ratings)\n", + " .reset_index() # Dataframe with columns `index` (from above) and `user_id`\n", + " .rename(columns={\"index\": \"N_ratings\", \"user_id\": \"N_users\"})\n", + ")\n", + "users_by_n.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s look at some statistics on that dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
N_ratingsN_users
count181.00000181.000000
mean109.01105295.160221
std52.41342309.461848
min19.000001.000000
25%64.0000040.000000
50%109.00000158.000000
75%154.00000538.000000
max200.00000964.000000
\n", + "
" + ], + "text/plain": [ + " N_ratings N_users\n", + "count 181.00000 181.000000\n", + "mean 109.01105 295.160221\n", + "std 52.41342 309.461848\n", + "min 19.00000 1.000000\n", + "25% 64.00000 40.000000\n", + "50% 109.00000 158.000000\n", + "75% 154.00000 538.000000\n", + "max 200.00000 964.000000" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_by_n.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the same data visually in a box plot." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "N_ratings AxesSubplot(0.125,0.125;0.352273x0.755)\n", + "N_users AxesSubplot(0.547727,0.125;0.352273x0.755)\n", + "dtype: object" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "users_by_n.plot(kind=\"box\", subplots=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s practice applying the want operator…\n", + "\n", + "**Want**: Determine whether a relationship between the number of\n", + "ratings a user has written and the distribution of the ratings exists. (Maybe we\n", + "are an author hoping to inflate our ratings and wonder if we should\n", + "target “more experienced” Goodreads users, or focus on newcomers.)\n", + "\n", + "Let’s start from the result and work our way backwards:\n", + "\n", + "1. We can answer our question if we have two similar DataFrames: \n", + " - All ratings by the `N` (e.g. 5) users with the most ratings \n", + " - All ratings by the `N` users with the least number of\n", + " ratings \n", + "1. To get that, we will need to extract rows of `ratings` with\n", + " `user_id` associated with the `N` most and least prolific raters \n", + "1. For that, we need the most and least active `user_id`s \n", + "1. To get that info, we need a count of how many ratings each user left. \n", + " - We can get that with `df[\"user_id\"].value_counts()`, so let’s\n", + " start there. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "30944 200\n", + "12874 200\n", + "52036 199\n", + "28158 199\n", + "12381 199\n", + "Name: user_id, dtype: int64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# step 4\n", + "n_ratings = ratings[\"user_id\"].value_counts()\n", + "n_ratings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# step 3\n", + "N = 5\n", + "most_prolific_users = n_ratings.nlargest(5).index.tolist()\n", + "least_prolific_users = n_ratings.nsmallest(5).index.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# step 2\n", + "active_ratings = ratings.loc[ratings[\"user_id\"].isin(most_prolific_users), :]\n", + "inactive_ratings = ratings.loc[ratings[\"user_id\"].isin(least_prolific_users), :]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# step 1 -- get the answer!\n", + "active_ratings[\"rating\"].value_counts().sort_index().plot(\n", + " kind=\"bar\", title=\"Distribution of ratings by most active users\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "inactive_ratings[\"rating\"].value_counts().sort_index().plot(\n", + " kind=\"bar\", title=\"Distribution of ratings by least active users\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice! From the picture above, the new users look much more\n", + "likely to leave 5 star ratings than more experienced users." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Book Data\n", + "\n", + "We know what you are probably thinking: “Isn’t this a lecture on merging?\n", + "Why are we only using one dataset?”\n", + "\n", + "We hear you.\n", + "\n", + "Let’s also load a dataset containing information on the actual books." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (10000, 3)\n", + "dtypes:\n", + "book_id int64\n", + "authors object\n", + "title object\n", + "dtype: object\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
book_idauthorstitle
01Suzanne CollinsThe Hunger Games (The Hunger Games, #1)
12J.K. Rowling, Mary GrandPréHarry Potter and the Sorcerer's Stone (Harry P...
23Stephenie MeyerTwilight (Twilight, #1)
34Harper LeeTo Kill a Mockingbird
45F. Scott FitzgeraldThe Great Gatsby
\n", + "
" + ], + "text/plain": [ + " book_id authors \\\n", + "0 1 Suzanne Collins \n", + "1 2 J.K. Rowling, Mary GrandPré \n", + "2 3 Stephenie Meyer \n", + "3 4 Harper Lee \n", + "4 5 F. Scott Fitzgerald \n", + "\n", + " title \n", + "0 The Hunger Games (The Hunger Games, #1) \n", + "1 Harry Potter and the Sorcerer's Stone (Harry P... \n", + "2 Twilight (Twilight, #1) \n", + "3 To Kill a Mockingbird \n", + "4 The Great Gatsby " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "books = qeds.data.load(\"goodreads_books\")\n", + "# we only need a few of the columns\n", + "books = books[[\"book_id\", \"authors\", \"title\"]]\n", + "print(\"shape: \", books.shape)\n", + "print(\"dtypes:\\n\", books.dtypes, sep=\"\")\n", + "books.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We could do similar interesting things with just the books dataset,\n", + "but we will skip it for now and merge them together." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "rated_books = pd.merge(ratings, books)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let’s see which books have been most often rated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',\n", + " \"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)\",\n", + " 'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',\n", + " 'The Great Gatsby',\n", + " 'To Kill a Mockingbird',\n", + " 'The Hobbit',\n", + " 'Twilight (Twilight, #1)',\n", + " 'The Hunger Games (The Hunger Games, #1)',\n", + " 'Catching Fire (The Hunger Games, #2)',\n", + " 'Mockingjay (The Hunger Games, #3)']" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "most_rated_books_id = rated_books[\"book_id\"].value_counts().nlargest(10).index\n", + "most_rated_books = rated_books.loc[rated_books[\"book_id\"].isin(most_rated_books_id), :]\n", + "list(most_rated_books[\"title\"].unique())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s use our `pivot_table` knowledge to compute the average rating\n", + "for each of these books." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rating
title
Catching Fire (The Hunger Games, #2)4.133422
Harry Potter and the Chamber of Secrets (Harry Potter, #2)4.229418
Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)4.418732
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)4.351350
Mockingjay (The Hunger Games, #3)3.853131
The Great Gatsby3.772224
The Hobbit4.148477
The Hunger Games (The Hunger Games, #1)4.279707
To Kill a Mockingbird4.329369
Twilight (Twilight, #1)3.214341
\n", + "
" + ], + "text/plain": [ + " rating\n", + "title \n", + "Catching Fire (The Hunger Games, #2) 4.133422\n", + "Harry Potter and the Chamber of Secrets (Harry ... 4.229418\n", + "Harry Potter and the Prisoner of Azkaban (Harry... 4.418732\n", + "Harry Potter and the Sorcerer's Stone (Harry Po... 4.351350\n", + "Mockingjay (The Hunger Games, #3) 3.853131\n", + "The Great Gatsby 3.772224\n", + "The Hobbit 4.148477\n", + "The Hunger Games (The Hunger Games, #1) 4.279707\n", + "To Kill a Mockingbird 4.329369\n", + "Twilight (Twilight, #1) 3.214341" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "most_rated_books.pivot_table(values=\"rating\", index=\"title\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These ratings seem surprisingly low, given that they are the most often\n", + "rated books on Goodreads.\n", + "\n", + "I wonder what the bottom of the distribution looks like…" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s compute the average rating for each book in our sample." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rating
title
The Complete Calvin and Hobbes4.829876
ESV Study Bible4.818182
Attack of the Deranged Mutant Killer Monster Snow Goons4.768707
The Indispensable Calvin and Hobbes4.766355
The Revenge of the Baby-Sat4.761364
There's Treasure Everywhere: A Calvin and Hobbes Collection4.760456
The Authoritative Calvin and Hobbes: A Calvin and Hobbes Treasury4.757202
It's a Magical World: A Calvin and Hobbes Collection4.747396
Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)4.736842
The Calvin and Hobbes Tenth Anniversary Book4.728528
\n", + "
" + ], + "text/plain": [ + " rating\n", + "title \n", + "The Complete Calvin and Hobbes 4.829876\n", + "ESV Study Bible 4.818182\n", + "Attack of the Deranged Mutant Killer Monster Sn... 4.768707\n", + "The Indispensable Calvin and Hobbes 4.766355\n", + "The Revenge of the Baby-Sat 4.761364\n", + "There's Treasure Everywhere: A Calvin and Hobbe... 4.760456\n", + "The Authoritative Calvin and Hobbes: A Calvin a... 4.757202\n", + "It's a Magical World: A Calvin and Hobbes Colle... 4.747396\n", + "Harry Potter Boxed Set, Books 1-5 (Harry Potter... 4.736842\n", + "The Calvin and Hobbes Tenth Anniversary Book 4.728528" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_ratings = (\n", + " rated_books\n", + " .pivot_table(values=\"rating\", index=\"title\")\n", + " .sort_values(by=\"rating\", ascending=False)\n", + ")\n", + "average_ratings.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What does the overall distribution of average ratings look like?" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot a kernel density estimate of average ratings\n", + "average_ratings.plot.density(xlim=(1, 5))\n", + "\n", + "# or a histogram\n", + "average_ratings.plot.hist(bins=30, xlim=(1, 5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks like most books have an average rating of just below 4." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extra Example: Airline Delays\n", + "\n", + "Let’s look at one more example.\n", + "\n", + "This time, we will use a dataset from the [Bureau of Transportation\n", + "Statistics](https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time)\n", + "that describes the cause of all US domestic flight delays\n", + "in December 2016:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 460949 entries, 0 to 460948\n", + "Data columns (total 4 columns):\n", + "CRSDepTime 460949 non-null datetime64[ns]\n", + "Carrier 460949 non-null object\n", + "CarrierDelay 460949 non-null float64\n", + "ArrDelay 452229 non-null float64\n", + "dtypes: datetime64[ns](1), float64(2), object(1)\n", + "memory usage: 14.1+ MB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRSDepTimeCarrierCarrierDelayArrDelay
02016-12-18 15:58:00AA0.020.0
12016-12-19 15:58:00AA0.020.0
22016-12-20 15:58:00AA0.0-3.0
32016-12-21 15:58:00AA0.0-10.0
42016-12-22 15:58:00AA0.0-8.0
\n", + "
" + ], + "text/plain": [ + " CRSDepTime Carrier CarrierDelay ArrDelay\n", + "0 2016-12-18 15:58:00 AA 0.0 20.0\n", + "1 2016-12-19 15:58:00 AA 0.0 20.0\n", + "2 2016-12-20 15:58:00 AA 0.0 -3.0\n", + "3 2016-12-21 15:58:00 AA 0.0 -10.0\n", + "4 2016-12-22 15:58:00 AA 0.0 -8.0" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_perf = qeds.load(\"airline_performance_dec16\")[[\"CRSDepTime\", \"Carrier\", \"CarrierDelay\", \"ArrDelay\"]]\n", + "air_perf.info()\n", + "air_perf.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Carrier` column identifies the airline and the `CarrierDelay`\n", + "reports the number of minutes of the total delay assigned as the\n", + "“carrier’s fault”.\n", + "\n", + "**Want**: Determine which airlines, on average, contribute most to\n", + "delays.\n", + "\n", + "We can do this using `pivot_table`:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierDelay
Carrier
F97.856566
EV7.125663
OO6.705469
B65.588006
DL4.674957
HA4.577753
UA4.368148
NK4.166264
AA4.073358
VX3.342923
\n", + "
" + ], + "text/plain": [ + " CarrierDelay\n", + "Carrier \n", + "F9 7.856566\n", + "EV 7.125663\n", + "OO 6.705469\n", + "B6 5.588006\n", + "DL 4.674957\n", + "HA 4.577753\n", + "UA 4.368148\n", + "NK 4.166264\n", + "AA 4.073358\n", + "VX 3.342923" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_delays = (\n", + " air_perf\n", + " .pivot_table(index=\"Carrier\", values=\"CarrierDelay\", aggfunc=\"mean\")\n", + " .sort_values(\"CarrierDelay\")\n", + " .nlargest(10, \"CarrierDelay\")\n", + ")\n", + "avg_delays" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The one issue with this dataset is that we don’t know what all those two\n", + "letter carrier codes are!\n", + "\n", + "Thankfully, we have a second dataset that maps the two letter code\n", + "into the full airline name." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Description
Code
ZWAir Wisconsin Airlines Corp (1994 - )
ZXAirbc Ltd. (1990 - 2000)
ZXAir Georgian (2002 - )
ZYAtlantic Gulf Airlines (1985 - 1986)
ZYZSkyway Aviation Inc. (1960 - 2002)
\n", + "
" + ], + "text/plain": [ + " Description\n", + "Code \n", + "ZW Air Wisconsin Airlines Corp (1994 - )\n", + "ZX Airbc Ltd. (1990 - 2000)\n", + "ZX Air Georgian (2002 - )\n", + "ZY Atlantic Gulf Airlines (1985 - 1986)\n", + "ZYZ Skyway Aviation Inc. (1960 - 2002)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "carrier_code = qeds.load(\"airline_carrier_codes\")\n", + "carrier_code.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s merge these names so we know which airlines we should avoid\n", + "flying…" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierDelayDescription
F97.856566Frontier Airlines Inc. (1994 - )
EV7.125663ExpressJet Airlines Inc. (2012 - )
EV7.125663Atlantic Southeast Airlines (1993 - 2011)
OO6.705469SkyWest Airlines Inc. (2003 - )
B65.588006JetBlue Airways (2000 - )
DL4.674957Delta Air Lines Inc. (1960 - )
HA4.577753Hawaiian Airlines Inc. (1960 - )
UA4.368148United Air Lines Inc. (1960 - )
NK4.166264Spirit Air Lines (1992 - )
AA4.073358American Airlines Inc. (1960 - )
VX3.342923Virgin America (2007 - )
VX3.342923Aces Airlines (1992 - 2003)
\n", + "
" + ], + "text/plain": [ + " CarrierDelay Description\n", + "F9 7.856566 Frontier Airlines Inc. (1994 - )\n", + "EV 7.125663 ExpressJet Airlines Inc. (2012 - )\n", + "EV 7.125663 Atlantic Southeast Airlines (1993 - 2011)\n", + "OO 6.705469 SkyWest Airlines Inc. (2003 - )\n", + "B6 5.588006 JetBlue Airways (2000 - )\n", + "DL 4.674957 Delta Air Lines Inc. (1960 - )\n", + "HA 4.577753 Hawaiian Airlines Inc. (1960 - )\n", + "UA 4.368148 United Air Lines Inc. (1960 - )\n", + "NK 4.166264 Spirit Air Lines (1992 - )\n", + "AA 4.073358 American Airlines Inc. (1960 - )\n", + "VX 3.342923 Virgin America (2007 - )\n", + "VX 3.342923 Aces Airlines (1992 - 2003)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_delays_w_code = avg_delays.join(carrier_code)\n", + "avg_delays_w_code.sort_values(\"CarrierDelay\", ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on that information, which airlines would you avoid near\n", + "the holidays?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Merge Operations\n", + "\n", + "As we did in the [reshape lecture](https://datascience.quantecon.org/reshape.html), we will visualize the\n", + "various merge operations using artificial DataFrames.\n", + "\n", + "First, we create some dummy DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is dfL: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2
L1A110
L2B220
L3A330
L4C440
\n", + "
" + ], + "text/plain": [ + " Key C1 C2\n", + "L1 A 1 10\n", + "L2 B 2 20\n", + "L3 A 3 30\n", + "L4 C 4 40" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is dfR:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC3
R1A100
R2B200
R3C300
R4D400
\n", + "
" + ], + "text/plain": [ + " Key C3\n", + "R1 A 100\n", + "R2 B 200\n", + "R3 C 300\n", + "R4 D 400" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dfL = pd.DataFrame(\n", + " {\"Key\": [\"A\", \"B\", \"A\", \"C\"], \"C1\":[1, 2, 3, 4], \"C2\": [10, 20, 30, 40]},\n", + " index=[\"L1\", \"L2\", \"L3\", \"L4\"]\n", + ")[[\"Key\", \"C1\", \"C2\"]]\n", + "\n", + "print(\"This is dfL: \")\n", + "display(dfL)\n", + "\n", + "dfR = pd.DataFrame(\n", + " {\"Key\": [\"A\", \"B\", \"C\", \"D\"], \"C3\": [100, 200, 300, 400]},\n", + " index=[\"R1\", \"R2\", \"R3\", \"R4\"]\n", + ")[[\"Key\", \"C3\"]]\n", + "\n", + "print(\"This is dfR:\")\n", + "display(dfR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `pd.concat`\n", + "\n", + "Recall that calling `pd.concat(..., axis=0)` will stack DataFrames on top of\n", + "one another:" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\asus\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
C1C2C3Key
L11.010.0NaNA
L22.020.0NaNB
L33.030.0NaNA
L44.040.0NaNC
R1NaNNaN100.0A
R2NaNNaN200.0B
R3NaNNaN300.0C
R4NaNNaN400.0D
\n", + "
" + ], + "text/plain": [ + " C1 C2 C3 Key\n", + "L1 1.0 10.0 NaN A\n", + "L2 2.0 20.0 NaN B\n", + "L3 3.0 30.0 NaN A\n", + "L4 4.0 40.0 NaN C\n", + "R1 NaN NaN 100.0 A\n", + "R2 NaN NaN 200.0 B\n", + "R3 NaN NaN 300.0 C\n", + "R4 NaN NaN 400.0 D" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([dfL, dfR], axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here’s how we might visualize that.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/merge_files/concat_axis0.gif](https://datascience.quantecon.org/assets/_static/merge_files/concat_axis0.gif) \n", + "We can also set `axis=1` to stack side by side." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\asus\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2KeyC3
L1A1.010.0NaNNaN
L2B2.020.0NaNNaN
L3A3.030.0NaNNaN
L4C4.040.0NaNNaN
R1NaNNaNNaNA100.0
R2NaNNaNNaNB200.0
R3NaNNaNNaNC300.0
R4NaNNaNNaND400.0
\n", + "
" + ], + "text/plain": [ + " Key C1 C2 Key C3\n", + "L1 A 1.0 10.0 NaN NaN\n", + "L2 B 2.0 20.0 NaN NaN\n", + "L3 A 3.0 30.0 NaN NaN\n", + "L4 C 4.0 40.0 NaN NaN\n", + "R1 NaN NaN NaN A 100.0\n", + "R2 NaN NaN NaN B 200.0\n", + "R3 NaN NaN NaN C 300.0\n", + "R4 NaN NaN NaN D 400.0" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([dfL, dfR], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here’s how we might visualize that.\n", + "\n", + "![https://datascience.quantecon.org/assets/_static/merge_files/concat_axis1.gif](https://datascience.quantecon.org/assets/_static/merge_files/concat_axis1.gif) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `pd.merge`\n", + "\n", + "The animation below shows a visualization of what happens when we call" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2C3
0A110100
1A330100
2B220200
3C440300
\n", + "
" + ], + "text/plain": [ + " Key C1 C2 C3\n", + "0 A 1 10 100\n", + "1 A 3 30 100\n", + "2 B 2 20 200\n", + "3 C 4 40 300" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(dfL, dfR, on=\"Key\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![https://datascience.quantecon.org/assets/_static/merge_files/left_merge.gif](https://datascience.quantecon.org/assets/_static/merge_files/left_merge.gif) \n", + "Now, let’s focus on what happens when we set `how=\"right\"`.\n", + "\n", + "Pay special attention to what happens when filling the output value for\n", + "the key `A`." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2C3
0A1.010.0100
1A3.030.0100
2B2.020.0200
3C4.040.0300
4DNaNNaN400
\n", + "
" + ], + "text/plain": [ + " Key C1 C2 C3\n", + "0 A 1.0 10.0 100\n", + "1 A 3.0 30.0 100\n", + "2 B 2.0 20.0 200\n", + "3 C 4.0 40.0 300\n", + "4 D NaN NaN 400" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(dfL, dfR, on=\"Key\", how=\"right\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![https://datascience.quantecon.org/assets/_static/merge_files/right_merge.gif](https://datascience.quantecon.org/assets/_static/merge_files/right_merge.gif) " + ] + } + ], + "metadata": { + "date": 1584040762.8915727, + "filename": "merge.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "Merge" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/7_merge_exercises.ipynb b/Session_7/7_merge_exercises.ipynb new file mode 100644 index 0000000..a8c7694 --- /dev/null +++ b/Session_7/7_merge_exercises.ipynb @@ -0,0 +1,2465 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages 
(from qeds) (0.21.3)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: pandas-datareader in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: six>=1.0.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pyarrow->qeds) (1.12.0)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (2.8.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", 
+ "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: setuptools in 
c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n" + ] + } + ], + "source": [ + "! pip install qeds\n", + "import pandas as pd\n", + "import qeds\n", + "%matplotlib inline\n", + "import qeds\n", + "qeds.themes.mpl_style();\n", + "from IPython.display import display" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merge - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercises 1-3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For these exercises we use several DataFrames:**\n", + "- WDI data on GDP components\n", + "- population\n", + "- square miles of countries " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# from WDI. 
Units trillions of 2010 USD\n", + "url = \"https://datascience.quantecon.org/assets/data/wdi_data.csv\"\n", + "wdi = pd.read_csv(url).set_index([\"country\", \"year\"])\n", + "#we create a DataFrame corresponding to 2017 that we need later on\n", + "wdi2017 = wdi.xs(2017, level=\"year\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# Data from https://www.nationmaster.com/country-info/stats/Geography/Land-area/Square-miles\n", + "# units -- millions of square miles\n", + "sq_miles = pd.Series({\n", + " \"United States\": 3.8,\n", + " \"Canada\": 3.8,\n", + " \"Germany\": 0.137,\n", + " \"United Kingdom\": 0.0936,\n", + " \"Russia\": 6.6,\n", + "}, name=\"sq_miles\").to_frame()\n", + "sq_miles.index.name = \"country\"" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# from WDI. Units millions of people\n", + "pop_url = \"https://datascience.quantecon.org/assets/data/wdi_population.csv\"\n", + "pop = pd.read_csv(pop_url).set_index([\"country\", \"year\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Use your new `merge` skills to answer the final question from above: What is the population density of each country? How much does it change over time?**" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Populationsq_miles
countryyear
Canada201736.5402683.8
201636.1094873.8
201535.7029083.8
201435.4374353.8
201335.0829543.8
\n", + "
" + ], + "text/plain": [ + " Population sq_miles\n", + "country year \n", + "Canada 2017 36.540268 3.8\n", + " 2016 36.109487 3.8\n", + " 2015 35.702908 3.8\n", + " 2014 35.437435 3.8\n", + " 2013 35.082954 3.8" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#we first merge the DataFrames that allow us to compute the population density, namely pop and sq_miles\n", + "sqm_pop = pop.join(sq_miles, on='country')\n", + "sqm_pop.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Populationsq_milesdensity
countryyear
Canada201736.5402683.89.615860
201636.1094873.89.502497
201535.7029083.89.395502
201435.4374353.89.325641
201335.0829543.89.232356
\n", + "
" + ], + "text/plain": [ + " Population sq_miles density\n", + "country year \n", + "Canada 2017 36.540268 3.8 9.615860\n", + " 2016 36.109487 3.8 9.502497\n", + " 2015 35.702908 3.8 9.395502\n", + " 2014 35.437435 3.8 9.325641\n", + " 2013 35.082954 3.8 9.232356" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we next compute the population density of each country as Population/sq_miles\n", + " # and store it in a new column called \"density\"\n", + "sqm_pop[\"density\"] = sqm_pop[\"Population\"] / sqm_pop[\"sq_miles\"]\n", + "sqm_pop.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# finally, we plot the time series of population density for each country\n", + " \n", + "sqm_pop=sqm_pop.sort_index()\n", + "sqm_pop.head()\n", + "sqm_pop['density'].unstack(level=0).plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Create two new DataFrames:**\n", + "\n", + " - wdi2017_no_US, which excludes the United States\n", + " - sq_miles_no_germany, which excludes Germany\n", + "\n", + " **Compare the `how=\"left\"` with `how=\"inner\"` options using the DataFrames `wdi2017_no_US` and `sq_miles_no_germany`. Are they different? How?**" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is wdi2017_no_US\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDP
country
Canada0.3726651.0954750.5828310.6000311.868164
Germany0.7455792.1120091.9305631.6663483.883870
United Kingdom0.5495381.8091540.8626290.9331452.818704
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is sq_miles_no_germany\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sq_miles
country
United States3.8000
Canada3.8000
United Kingdom0.0936
Russia6.6000
\n", + "
" + ], + "text/plain": [ + " sq_miles\n", + "country \n", + "United States 3.8000\n", + "Canada 3.8000\n", + "United Kingdom 0.0936\n", + "Russia 6.6000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# We create the two DataFrames\n", + "wdi2017_no_US = wdi2017.drop(\"United States\")\n", + "print(\"This is wdi2017_no_US\")\n", + "display(wdi2017_no_US)\n", + "\n", + "sq_miles_no_germany = sq_miles.drop(\"Germany\")\n", + "print(\"This is sq_miles_no_germany\")\n", + "display(sq_miles_no_germany)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is how=\"left\"\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.883870NaN
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP sq_miles\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 3.8000\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 NaN\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 0.0936" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is how=\"inner\"\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP sq_miles\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 3.8000\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 0.0936" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "req1 = pd.merge(wdi2017_no_US, sq_miles_no_germany, on=\"country\", how=\"left\")\n", + "print(\"This is how=\\\"left\\\"\")\n", + "display(req1)\n", + "\n", + "req2 = pd.merge(wdi2017_no_US, sq_miles_no_germany, on=\"country\", how=\"inner\")\n", + "print(\"This is how=\\\"inner\\\"\")\n", + "display(req2)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "# the how=\"left\" version includes the countries that are common to both DataFrames (Canada & UK),\n", + " # plus the countries that are included in the left DataFrame (wdi2017_no_US) such as Germany, \n", + " # but not in the right DataFrame (sq_miles_no_germany)\n", + " \n", + "# the how=\"inner\" version includes only the countries that are common to both DataFrames (Canada & UK) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Will this happen for all pairs of DataFrames, or are `wdi2017_no_US` and `sq_miles_no_germany` special in some way?**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As explained earlier: \n", + "- With the 'left' method, the result contains all the keys from the left (\"base\", here wdi2017_no_US) dataframe. This implies that, for the keys (here, countries) that exist in the left dataframe but not in the right dataframe, the \"new column\" (here sq_miles) will be NaN.\n", + " - Here, the result has every country in wdi2017_no_US (Canada, Germany and the United Kingdom). But, since sq_miles_no_germany does not have an entry for Germany, the result returns NaN in the corresponding cell.
\n", + "- With the 'inner' method, the result contains only the keys that exist in both the left (\"base\", here wdi2017_no_US) and right (here sq_miles_no_germany) dataframes. \n", + " - Here, wdi2017_no_US doesn't have the US and sq_miles_no_germany doesn't have Germany, so the resulting dataframe has neither." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Also compare `how=\"right\"` and `how=\"outer\"` and answer the same questions.**" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is how=\"right\"\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United StatesNaNNaNNaNNaNNaN3.8000
RussiaNaNNaNNaNNaNNaN6.6000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP sq_miles\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 3.8000\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 0.0936\n", + "United States NaN NaN NaN NaN NaN 3.8000\n", + "Russia NaN NaN NaN NaN NaN 6.6000" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is how=\"outer\"\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.883870NaN
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United StatesNaNNaNNaNNaNNaN3.8000
RussiaNaNNaNNaNNaNNaN6.6000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP sq_miles\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 3.8000\n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 NaN\n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 0.0936\n", + "United States NaN NaN NaN NaN NaN 3.8000\n", + "Russia NaN NaN NaN NaN NaN 6.6000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "req1bis = pd.merge(wdi2017_no_US, sq_miles_no_germany, on=\"country\", how=\"right\")\n", + "print(\"This is how=\\\"right\\\"\")\n", + "display(req1bis)\n", + "\n", + "req2bis = pd.merge(wdi2017_no_US, sq_miles_no_germany, on=\"country\", how=\"outer\")\n", + "print(\"This is how=\\\"outer\\\"\")\n", + "display(req2bis)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "# the how=\"right\" version includes the countries that are common to both DataFrames (Canada & UK),\n", + " # plus the countries that are included in the right DataFrame (sq_miles_no_germany) such as US & Russia, \n", + " # but not in the left DataFrame (wdi2017_no_us)\n", + " \n", + "# the how=\"outer\" version includes all the countries from both DataFrames (Canada, Germany, UK, US, Russia) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Can you pick the correct argument for `how` such that `pd.merge(wdi2017, sq_miles, how=\"left\")` is equal to `pd.merge(sq_miles, wdi2017, how=XXX)`?**" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GovExpendConsumptionExportsImportsGDPsq_miles
country
Canada0.3726651.0954750.5828310.6000311.8681643.8000
Germany0.7455792.1120091.9305631.6663483.8838700.1370
United Kingdom0.5495381.8091540.8626290.9331452.8187040.0936
United States2.40574312.0192662.2870713.06995417.3486273.8000
\n", + "
" + ], + "text/plain": [ + " GovExpend Consumption Exports Imports GDP \\\n", + "country \n", + "Canada 0.372665 1.095475 0.582831 0.600031 1.868164 \n", + "Germany 0.745579 2.112009 1.930563 1.666348 3.883870 \n", + "United Kingdom 0.549538 1.809154 0.862629 0.933145 2.818704 \n", + "United States 2.405743 12.019266 2.287071 3.069954 17.348627 \n", + "\n", + " sq_miles \n", + "country \n", + "Canada 3.8000 \n", + "Germany 0.1370 \n", + "United Kingdom 0.0936 \n", + "United States 3.8000 " + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#first, we check how pd.merge(wdi2017, sq_miles, on=\"country\", how=\"left\") looks like\n", + "pd.merge(wdi2017, sq_miles, on=\"country\", how=\"left\")\n", + "\n", + "#the version below is more elegant as it sorts values and orders columns\n", + " #merged1 = pd.DataFrame(pd.merge(wdi2017, sq_miles, on=\"country\", how=\"left\").sort_values(\"country\", axis = 0) , \n", + " # columns=[\"GovExpend\", \"Consumption\", \"Exports\", \"Imports\", \"GDP\", \"sq_miles\"])\n", + " #merged1" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sq_milesGovExpendConsumptionExportsImportsGDP
country
United States3.80002.40574312.0192662.2870713.06995417.348627
Canada3.80000.3726651.0954750.5828310.6000311.868164
Germany0.13700.7455792.1120091.9305631.6663483.883870
United Kingdom0.09360.5495381.8091540.8626290.9331452.818704
\n", + "
" + ], + "text/plain": [ + " sq_miles GovExpend Consumption Exports Imports \\\n", + "country \n", + "United States 3.8000 2.405743 12.019266 2.287071 3.069954 \n", + "Canada 3.8000 0.372665 1.095475 0.582831 0.600031 \n", + "Germany 0.1370 0.745579 2.112009 1.930563 1.666348 \n", + "United Kingdom 0.0936 0.549538 1.809154 0.862629 0.933145 \n", + "\n", + " GDP \n", + "country \n", + "United States 17.348627 \n", + "Canada 1.868164 \n", + "Germany 3.883870 \n", + "United Kingdom 2.818704 " + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#second, we find the right \"how\"\n", + "pd.merge(sq_miles, wdi2017, on=\"country\" , how=\"right\")\n", + "\n", + "#the version below is more elegant as it sorts values and orders columns and gives the exact same DataFrame in visual terms\n", + " #merged2 = pd.DataFrame(pd.merge(wdi2017, sq_miles, on=\"country\", how=\"right\").sort_values(\"country\", axis = 0) , \n", + " # columns=[\"GovExpend\", \"Consumption\", \"Exports\", \"Imports\", \"GDP\", \"sq_miles\"])\n", + " #merged2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is a matter of keys ! \n", + "- In the first proposed code, we want all the keys from the left dataframe, ie all the countries (keys) in wdi2017.\n", + "- To have the same dataframe in the second case, we want to have the same keys as in the previous dataframe, i.e. the countries in wdi2017. In this case, wdi2017 is the \"on the right\" (it is the dataframe that we put onto the base one) so we chose, thanks to how='right', to have the keys from the right." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For this exercise we use a dataset containing ratings of books from the website [Goodreads](https://www.goodreads.com/) and a dataset containing information on the actual books, that we are going to merge.**" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "#dataset containing ratings of books from the website [Goodreads]\n", + "ratings = qeds.data.load(\"goodreads_ratings\")" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "#dataset containing information on the actual books\n", + "books = qeds.data.load(\"goodreads_books\")\n", + "# we only need a few of the columns\n", + "books = books[[\"book_id\", \"authors\", \"title\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "#merged dataset\n", + "rated_books = pd.merge(ratings, books)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
**Determine the average rating for the books with the *most*/*least* number of ratings.**" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',\n", + " \"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)\",\n", + " 'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',\n", + " 'The Great Gatsby',\n", + " 'To Kill a Mockingbird',\n", + " 'The Hobbit',\n", + " 'Twilight (Twilight, #1)',\n", + " 'The Hunger Games (The Hunger Games, #1)',\n", + " 'Catching Fire (The Hunger Games, #2)',\n", + " 'Mockingjay (The Hunger Games, #3)']" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#most rated\n", + "most_rated_books_id = rated_books[\"book_id\"].value_counts().nlargest(10).index\n", + "most_rated_books = rated_books.loc[rated_books[\"book_id\"].isin(most_rated_books_id), :]\n", + "list(most_rated_books[\"title\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['The Batman Chronicles, Vol. 
1',\n", + " \"The King's Agent\",\n", + " 'The Magic (The Secret, #3)',\n", + " 'Fifty Shades Duo: Fifty Shades Darker / Fifty Shades Freed (Fifty Shades, #2-3)',\n", + " 'The Lake (The Lake Trilogy #1)',\n", + " 'Diary ng Panget',\n", + " \"Kindle Paperwhite User's Guide\",\n", + " \"Kindle User's Guide\",\n", + " 'Attack on Titan: No Regrets, Volume 01',\n", + " 'Manga Classics: Les Misérables']" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#least rated\n", + "least_rated_books_id = rated_books[\"book_id\"].value_counts().nsmallest(10).index\n", + "least_rated_books = rated_books.loc[rated_books[\"book_id\"].isin(least_rated_books_id), :]\n", + "list(least_rated_books[\"title\"].unique())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Is there a distinguishable difference in the average rating compared to the most rated books?**" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "least=least_rated_books.pivot_table(values=\"rating\", index=\"title\")\n", + "most=most_rated_books.pivot_table(values=\"rating\", index=\"title\")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rating
title
Attack on Titan: No Regrets, Volume 014.292683
Diary ng Panget3.545455
Fifty Shades Duo: Fifty Shades Darker / Fifty Shades Freed (Fifty Shades, #2-3)4.041667
Kindle Paperwhite User's Guide3.757576
Kindle User's Guide3.375000
Manga Classics: Les Misérables4.187500
The Batman Chronicles, Vol. 13.822222
The King's Agent3.722222
The Lake (The Lake Trilogy #1)2.960000
The Magic (The Secret, #3)4.041667
\n", + "
" + ], + "text/plain": [ + " rating\n", + "title \n", + "Attack on Titan: No Regrets, Volume 01 4.292683\n", + "Diary ng Panget 3.545455\n", + "Fifty Shades Duo: Fifty Shades Darker / Fifty S... 4.041667\n", + "Kindle Paperwhite User's Guide 3.757576\n", + "Kindle User's Guide 3.375000\n", + "Manga Classics: Les Misérables 4.187500\n", + "The Batman Chronicles, Vol. 1 3.822222\n", + "The King's Agent 3.722222\n", + "The Lake (The Lake Trilogy #1) 2.960000\n", + "The Magic (The Secret, #3) 4.041667" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rating
title
Catching Fire (The Hunger Games, #2)4.133422
Harry Potter and the Chamber of Secrets (Harry Potter, #2)4.229418
Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)4.418732
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)4.351350
Mockingjay (The Hunger Games, #3)3.853131
The Great Gatsby3.772224
The Hobbit4.148477
The Hunger Games (The Hunger Games, #1)4.279707
To Kill a Mockingbird4.329369
Twilight (Twilight, #1)3.214341
\n", + "
" + ], + "text/plain": [ + " rating\n", + "title \n", + "Catching Fire (The Hunger Games, #2) 4.133422\n", + "Harry Potter and the Chamber of Secrets (Harry ... 4.229418\n", + "Harry Potter and the Prisoner of Azkaban (Harry... 4.418732\n", + "Harry Potter and the Sorcerer's Stone (Harry Po... 4.351350\n", + "Mockingjay (The Hunger Games, #3) 3.853131\n", + "The Great Gatsby 3.772224\n", + "The Hobbit 4.148477\n", + "The Hunger Games (The Hunger Games, #1) 4.279707\n", + "To Kill a Mockingbird 4.329369\n", + "Twilight (Twilight, #1) 3.214341" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(least)\n", + "display(most)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ttest_indResult(statistic=-1.7299828900907714, pvalue=0.1007424590612311)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# To decide whether they are really different, a small ttest, underpowered and with no care for the hypothesis ^^\n", + "\n", + "import scipy.stats as stats\n", + "stats.ttest_ind(least['rating'], most['rating'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercises 5-7" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For these exercises we create the DataFrames dfL and dfR, as specified below:**" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is dfL.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2
L1A110
L2B220
L3A330
L4C440
\n", + "
" + ], + "text/plain": [ + " Key C1 C2\n", + "L1 A 1 10\n", + "L2 B 2 20\n", + "L3 A 3 30\n", + "L4 C 4 40" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is dfR.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC3
R1A100
R2B200
R3C300
R4D400
\n", + "
" + ], + "text/plain": [ + " Key C3\n", + "R1 A 100\n", + "R2 B 200\n", + "R3 C 300\n", + "R4 D 400" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dfL = pd.DataFrame(\n", + " {\"Key\": [\"A\", \"B\", \"A\", \"C\"], \"C1\":[1, 2, 3, 4], \"C2\": [10, 20, 30, 40]},\n", + " index=[\"L1\", \"L2\", \"L3\", \"L4\"]\n", + ")[[\"Key\", \"C1\", \"C2\"]]\n", + "print(\"This is dfL.\")\n", + "display(dfL)\n", + "\n", + "dfR = pd.DataFrame(\n", + " {\"Key\": [\"A\", \"B\", \"C\", \"D\"], \"C3\": [100, 200, 300, 400]},\n", + " index=[\"R1\", \"R2\", \"R3\", \"R4\"]\n", + ")[[\"Key\", \"C3\"]]\n", + "print(\"This is dfR.\")\n", + "display(dfR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**In writing, describe what the output looks like when you do `pd.concat([dfL, dfR], axis=1)` (see above and/or run the cell below). Be sure to describe things like:**\n", + "\n", + "- What are the columns? What about columns with the same name? \n", + "- What is the index? \n", + "- Do any `NaN`s get introduced? If so, where? Why? " + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\asus\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2KeyC3
L1A1.010.0NaNNaN
L2B2.020.0NaNNaN
L3A3.030.0NaNNaN
L4C4.040.0NaNNaN
R1NaNNaNNaNA100.0
R2NaNNaNNaNB200.0
R3NaNNaNNaNC300.0
R4NaNNaNNaND400.0
\n", + "
" + ], + "text/plain": [ + " Key C1 C2 Key C3\n", + "L1 A 1.0 10.0 NaN NaN\n", + "L2 B 2.0 20.0 NaN NaN\n", + "L3 A 3.0 30.0 NaN NaN\n", + "L4 C 4.0 40.0 NaN NaN\n", + "R1 NaN NaN NaN A 100.0\n", + "R2 NaN NaN NaN B 200.0\n", + "R3 NaN NaN NaN C 300.0\n", + "R4 NaN NaN NaN D 400.0" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([dfL, dfR], axis=1)\n", + "\n", + "#pd.concat([dfL, dfR], axis=1).info()\n", + "\n", + "#the columns are Key, C1, C2, Key, C3 (since Key is defined as a column and not as an index in both DataFrames, we end up with two columns Key)\n", + "#the index goes through 8 entries: L1 - L4, R1 - R4\n", + "#missing values for:\n", + " #L rows corresponding to columns Key and C3 (because the L rows do not exist in dfR)\n", + " #R rows corresponding to columns Key, C1 and C2 (because the R rows do not exist in dfL)\n", + "#while axis = 1 arranges the DataFrames side-by-side, the fact that there are no common values for the index leads to a DataFrame like this" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Determine what happens when you run each of the two cells below. For each cell, answer the list of questions from the previous exercise.**" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2
L1A110
L2B220
L3A330
L4C440
L1A110
L2B220
L3A330
L4C440
\n", + "
" + ], + "text/plain": [ + " Key C1 C2\n", + "L1 A 1 10\n", + "L2 B 2 20\n", + "L3 A 3 30\n", + "L4 C 4 40\n", + "L1 A 1 10\n", + "L2 B 2 20\n", + "L3 A 3 30\n", + "L4 C 4 40" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First code cell for above exercise\n", + "pd.concat([dfL, dfL], axis=0)\n", + "#pd.concat([dfL, dfL], axis=0).info()\n", + "\n", + "#the columns are Key, C1, C2\n", + "#the index has 8 entries: the labels L1 to L4 appear twice, because concat keeps duplicate index labels\n", + "#we don't have any missing values, because the DataFrames have identical structures\n", + "#since we used axis = 0, the DataFrames are arranged on top of one another" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC3KeyC3
R1A100A100
R2B200B200
R3C300C300
R4D400D400
\n", + "
" + ], + "text/plain": [ + " Key C3 Key C3\n", + "R1 A 100 A 100\n", + "R2 B 200 B 200\n", + "R3 C 300 C 300\n", + "R4 D 400 D 400" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Second code cell for above exercise\n", + "pd.concat([dfR, dfR], axis=1)\n", + "#pd.concat([dfR, dfR], axis=1).info()\n", + "\n", + "#the columns are Key, C3, Key, C3 (each column name appears twice, once per DataFrame)\n", + "#the index goes through 4 entries: R1 to R4\n", + "#we don't have any missing values, because the DataFrames have identical structures\n", + "#since we used axis = 1, the DataFrames are arranged side-by-side" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 7" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Describe in words why the output of `pd.merge(dfL, dfR, how=\"right\")` has more rows than either `dfL` or `dfR`. Run the cell below to see the output of that operation.**" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is merged:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2C3
0A1.010.0100
1A3.030.0100
2B2.020.0200
3C4.040.0300
4DNaNNaN400
\n", + "
" + ], + "text/plain": [ + " Key C1 C2 C3\n", + "0 A 1.0 10.0 100\n", + "1 A 3.0 30.0 100\n", + "2 B 2.0 20.0 200\n", + "3 C 4.0 40.0 300\n", + "4 D NaN NaN 400" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is dfL:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC1C2
L1A110
L2B220
L3A330
L4C440
\n", + "
" + ], + "text/plain": [ + " Key C1 C2\n", + "L1 A 1 10\n", + "L2 B 2 20\n", + "L3 A 3 30\n", + "L4 C 4 40" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is dfR:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KeyC3
R1A100
R2B200
R3C300
R4D400
\n", + "
" + ], + "text/plain": [ + " Key C3\n", + "R1 A 100\n", + "R2 B 200\n", + "R3 C 300\n", + "R4 D 400" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# the output of pd.merge(dfL, dfR, how=\"right\") has more rows than either dfL and dfR because, \n", + " # besides the information common to both DataFrames (A, B, C), \n", + " # it includes information that is in dfR and not in dfL (D)\n", + " # Note: all elements from dfL are in dfR, but A appears twice in dfL, which means that the merged DataFrame will include A twice, plus B, plus C, plus D\n", + " \n", + "merged=pd.merge(dfL, dfR, how=\"right\")\n", + "print(\"This is merged:\")\n", + "display(merged)\n", + "\n", + "print(\"This is dfL:\")\n", + "display(dfL)\n", + "\n", + "print(\"This is dfR:\")\n", + "display(dfR)" + ] + } + ], + "metadata": { + "date": 1584040762.8915727, + "filename": "merge.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "Merge" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/8_groupby.ipynb b/Session_7/8_groupby.ipynb new file mode 100644 index 0000000..94e12fe --- /dev/null +++ b/Session_7/8_groupby.ipynb @@ -0,0 +1,3015 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GroupBy\n", + "\n", + "**Prerequisites**\n", + "\n", + "- [Functions](https://datascience.quantecon.org/../python_fundamentals/functions.html) \n", + "- pandas introduction [1](https://datascience.quantecon.org/intro.html) and [2](https://datascience.quantecon.org/basics.html) \n", + "- [Reshape](https://datascience.quantecon.org/reshape.html) \n", + "\n", + "\n", + "**Outcomes**\n", + "\n", + "- Understand the 
split-apply-combine strategy for aggregate\n", + " computations on groups of data \n", + "- Be able to use basic aggregation methods on `df.groupby` to compute\n", + " within group statistics \n", + "- Understand how to group by multiple keys at once \n", + "\n", + "\n", + "**Data**\n", + "\n", + "- Details for all delayed US domestic flights in December 2016,\n", + " obtained from the [Bureau of Transportation\n", + " Statistics](https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time) " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "# Uncomment following line to install on colab\n", + "#! pip install qeds" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "import random\n", + "import numpy as np\n", + "import pandas as pd\n", + "import qeds\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "# activate plot theme\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "- [GroupBy](#GroupBy) \n", + " - [Split-Apply-Combine](#Split-Apply-Combine) \n", + " - [Case Study: Airline Delays](#Case-Study:-Airline-Delays) \n", + " - [Exercise: Cohort Analysis using Shopify Data](#Exercise:-Cohort-Analysis-using-Shopify-Data) \n", + " - [Exercises](#Exercises) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split-Apply-Combine\n", + "\n", + "One powerful paradigm for analyzing data is the “Split-Apply-Combine”\n", + "strategy.\n", + "\n", + "This strategy has three steps:\n", + "\n", + "1. `Split`: split the data into groups based on values in one or more columns. \n", + "1. `Apply`: apply a function or routine to each group separately. \n", + "1. 
`Combine`: combine the output of the apply step into a DataFrame,\n", + " using the group identifiers as the index. \n", + "\n", + "\n", + "We will cover the main components in this lecture, but we encourage you\n", + "to also study the [official\n", + "documentation](https://pandas.pydata.org/pandas-docs/stable/groupby.html)\n", + "to learn more about what is possible.\n", + "\n", + "To describe the concepts, we will need some data.\n", + "\n", + "We will begin with a simple made-up dataset to discuss the concepts and\n", + "then work through an extended example and exercises with real data." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0111.0
1112.0
2123.0
322NaN
4215.0
521NaN
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 1 1.0\n", + "1 1 1 2.0\n", + "2 1 2 3.0\n", + "3 2 2 NaN\n", + "4 2 1 5.0\n", + "5 2 1 NaN" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C = np.arange(1, 7, dtype=float)\n", + "C[[3, 5]] = np.nan\n", + "df = pd.DataFrame({\n", + " \"A\" : [1, 1, 1, 2, 2, 2],\n", + " \"B\" : [1, 1, 2, 2, 1, 1],\n", + " \"C\": C,\n", + "})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple Example\n", + "\n", + "To perform the *Split* step, we call the `groupby` method on our\n", + "DataFrame.\n", + "\n", + "The first argument to `groupby` is a description of how we want to\n", + "construct groups.\n", + "\n", + "In the most basic version, we will pass a string identifying the column\n", + "name." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "gbA = df.groupby(\"A\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `type` of variable we get back is a `DataFrameGroupBy`, which we\n", + "will sometimes refer to as GroupBy for short." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.groupby.generic.DataFrameGroupBy" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(gbA)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the “groups” inside of the GroupBy object can help us\n", + "understand what the GroupBy represents.\n", + "\n", + "We can do this with the `gb.get_group(group_name)` method." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0111.0
1112.0
2123.0
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 1 1.0\n", + "1 1 1 2.0\n", + "2 1 2 3.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbA.get_group(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
322NaN
4215.0
521NaN
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "3 2 2 NaN\n", + "4 2 1 5.0\n", + "5 2 1 NaN" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbA.get_group(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can *apply* some of our favorite aggregation functions directly on the `GroupBy` object.\n", + "\n", + "If we pass a list of strings to `groupby`, it will group based on unique combinations of values from all columns in the list.\n", + "\n", + "Let’s see an example." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.groupby.generic.DataFrameGroupBy" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbAB = df.groupby([\"A\", \"B\"])\n", + "type(gbAB)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0111.0
1112.0
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 1 1.0\n", + "1 1 1 2.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbAB.get_group((1, 1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we still have a GroupBy object, so we can apply our favorite\n", + "aggregations." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
C
AB
112
21
211
20
\n", + "
" + ], + "text/plain": [ + " C\n", + "A B \n", + "1 1 2\n", + " 2 1\n", + "2 1 1\n", + " 2 0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbAB.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the output is a DataFrame with two levels on the index\n", + "and a single column `C`. (Quiz: how do we know it is a DataFrame with\n", + "one column and not a Series?)\n", + "\n", + "This highlights a principle of how pandas handles the *Combine* part of\n", + "the strategy:\n", + "\n", + "> The index of the combined DataFrame will be the group identifiers,\n", + "with one index level per group key." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom Aggregate Functions\n", + "\n", + "So far, we have been applying built-in aggregations to our GroupBy object.\n", + "\n", + "We can also apply custom aggregations to each group of a GroupBy in two\n", + "steps:\n", + "\n", + "1. Write our custom aggregation as a Python function. \n", + "1. Pass our function as an argument to the `.agg` method of a GroupBy. \n", + "\n", + "\n", + "Let’s see an example." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "def num_missing(df):\n", + " \"Return the number of missing items in each column of df\"\n", + " return df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can call this function on our original DataFrame to get the number of\n", + "missing items in each column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "A 0\n", + "B 0\n", + "C 2\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_missing(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also apply it to a GroupBy object to get the number of missing\n", + "items in each column *for each group*." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
A
100.0
202.0
\n", + "
" + ], + "text/plain": [ + " B C\n", + "A \n", + "1 0 0.0\n", + "2 0 2.0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbA.agg(num_missing)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The key to keep in mind is that the function we pass to `agg` should\n", + "take in a DataFrame (or Series) and return a Series (or single value)\n", + "with one item per column in the original DataFrame.\n", + "\n", + "When the function is called, the data for each group will be passed to\n", + "our function as a DataFrame (or Series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transforms: The `apply` Method\n", + "\n", + "As we saw in the [basics lecture](https://datascience.quantecon.org/basics.html), we can apply transforms to DataFrames.\n", + "\n", + "We can do the same with GroupBy objects using the `.apply` method.\n", + "\n", + "Let’s see an example." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0111.0
1112.0
2123.0
322NaN
4215.0
521NaN
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 1 1.0\n", + "1 1 1 2.0\n", + "2 1 2 3.0\n", + "3 2 2 NaN\n", + "4 2 1 5.0\n", + "5 2 1 NaN" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "def smallest_by_b(df):\n", + " return df.nsmallest(2, \"B\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
A
10111.0
1112.0
24215.0
521NaN
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "A \n", + "1 0 1 1 1.0\n", + " 1 1 1 2.0\n", + "2 4 2 1 5.0\n", + " 5 2 1 NaN" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbA.apply(smallest_by_b)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the index of the value returned by applying our transform to `gbA`\n", + "has the group key on the outer level (the `A` column) and the original\n", + "index from `df` on the inner level.\n", + "\n", + "The original index came along because that was the index of the\n", + "DataFrame returned by `smallest_by_b`.\n", + "\n", + "Had our function returned a DataFrame with an index other than the one from `df`,\n", + "that index would appear in the result of the call to `.apply`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `pd.Grouper`\n", + "\n", + "Sometimes, in order to construct the groups you want, you need to give\n", + "pandas more information than just a column name.\n", + "\n", + "Some examples are:\n", + "\n", + "- Grouping by a column and a level of the index. \n", + "- Grouping time series data at a particular frequency. \n", + "\n", + "\n", + "pandas lets you do this through the `pd.Grouper` type.\n", + "\n", + "To see it in action, let’s make a copy of `df` with `A` moved to the\n", + "index and a `Date` column added." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BCDate
A
111.02020-03-31
112.02020-06-30
123.02020-09-30
22NaN2020-12-31
215.02021-03-31
21NaN2021-06-30
\n", + "
" + ], + "text/plain": [ + " B C Date\n", + "A \n", + "1 1 1.0 2020-03-31\n", + "1 1 2.0 2020-06-30\n", + "1 2 3.0 2020-09-30\n", + "2 2 NaN 2020-12-31\n", + "2 1 5.0 2021-03-31\n", + "2 1 NaN 2021-06-30" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = df.copy()\n", + "df2[\"Date\"] = pd.date_range(\n", + " start=pd.datetime.today().strftime(\"%m/%d/%Y\"),\n", + " freq=\"BQ\",\n", + " periods=df.shape[0]\n", + ")\n", + "df2 = df2.set_index(\"A\")\n", + "df2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can group by year." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
Date
2020-12-3143
2021-12-3121
\n", + "
" + ], + "text/plain": [ + " B C\n", + "Date \n", + "2020-12-31 4 3\n", + "2021-12-31 2 1" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.groupby(pd.Grouper(key=\"Date\", freq=\"A\")).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can group by the `A` level of the index." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BCDate
A
1333
2313
\n", + "
" + ], + "text/plain": [ + " B C Date\n", + "A \n", + "1 3 3 3\n", + "2 3 1 3" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.groupby(pd.Grouper(level=\"A\")).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can combine these to group by both." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
DateA
2020-12-31133
210
2021-12-31221
\n", + "
" + ], + "text/plain": [ + " B C\n", + "Date A \n", + "2020-12-31 1 3 3\n", + " 2 1 0\n", + "2021-12-31 2 2 1" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.groupby([pd.Grouper(key=\"Date\", freq=\"A\"), pd.Grouper(level=\"A\")]).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can combine `pd.Grouper` with a string, where the string\n", + "denotes a column name." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
C
DateB
2020-12-3112
21
2021-12-3111
\n", + "
" + ], + "text/plain": [ + " C\n", + "Date B \n", + "2020-12-31 1 2\n", + " 2 1\n", + "2021-12-31 1 1" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.groupby([pd.Grouper(key=\"Date\", freq=\"A\"), \"B\"]).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Study: Airline Delays\n", + "\n", + "Let’s apply our new split-apply-combine skills to the airline dataset we\n", + "saw in the [merge](https://datascience.quantecon.org/merge.html) lecture." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "air_dec = qeds.load(\"airline_performance_dec16\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we compute the average delay in arrival time for all carriers\n", + "each week." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierAAASB6DLEVF9HANKOOUAVXWN
Date
2016-12-04-1.7148872.724273-2.894269-5.0883518.655332-2.894212-0.5582825.4689092.7495735.564496-2.121821-1.663695
2016-12-111.14883312.0520315.7950622.50774513.2206734.5788612.0543028.71375515.4296604.09417612.0809381.865933
2016-12-1816.3575617.64376734.60835618.00000023.87662245.0148889.38888922.85789930.90163922.39813033.65112818.373400
2016-12-256.3645132.7196995.586836-0.9161136.85714354.0849595.07574710.44336915.0047805.33247417.28691710.197685
2017-01-012.3218361.22666210.6615772.0481166.8008988.2802986.9700168.3611238.9710830.0617861.3495805.213019
\n", + "
" + ], + "text/plain": [ + "Carrier AA AS B6 DL EV F9 \\\n", + "Date \n", + "2016-12-04 -1.714887 2.724273 -2.894269 -5.088351 8.655332 -2.894212 \n", + "2016-12-11 1.148833 12.052031 5.795062 2.507745 13.220673 4.578861 \n", + "2016-12-18 16.357561 7.643767 34.608356 18.000000 23.876622 45.014888 \n", + "2016-12-25 6.364513 2.719699 5.586836 -0.916113 6.857143 54.084959 \n", + "2017-01-01 2.321836 1.226662 10.661577 2.048116 6.800898 8.280298 \n", + "\n", + "Carrier HA NK OO UA VX WN \n", + "Date \n", + "2016-12-04 -0.558282 5.468909 2.749573 5.564496 -2.121821 -1.663695 \n", + "2016-12-11 2.054302 8.713755 15.429660 4.094176 12.080938 1.865933 \n", + "2016-12-18 9.388889 22.857899 30.901639 22.398130 33.651128 18.373400 \n", + "2016-12-25 5.075747 10.443369 15.004780 5.332474 17.286917 10.197685 \n", + "2017-01-01 6.970016 8.361123 8.971083 0.061786 1.349580 5.213019 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekly_delays = (\n", + " air_dec\n", + " .groupby([pd.Grouper(key=\"Date\", freq=\"W\"), \"Carrier\"])\n", + " [\"ArrDelay\"] # extract one column\n", + " .mean() # take average\n", + " .unstack(level=\"Carrier\") # Flip carrier up as column names\n", + ")\n", + "weekly_delays" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s also plot this data." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot\n", + "axs = weekly_delays.plot.bar(\n", + " figsize=(10, 8), subplots=True, legend=False, sharex=True,\n", + " sharey=True, layout=(4, 3), grid=False\n", + ")\n", + "\n", + "# tweak spacing between subplots and xaxis labels\n", + "axs[0,0].get_figure().tight_layout()\n", + "for ax in axs[-1, :]:\n", + " ax.set_xticklabels(weekly_delays.index.strftime(\"%a, %b. %d'\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks like more delays occurred during the week ending Sunday\n", + "December 18th than any other week (except for Frontier, who did *worse*\n", + "on Christmas week).\n", + "\n", + "Let’s see why.\n", + "\n", + "The `air_dec` DataFrame has information on the minutes of delay\n", + "attributed to 5 different categories:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "delay_cols = [\n", + " 'CarrierDelay',\n", + " 'WeatherDelay',\n", + " 'NASDelay',\n", + " 'SecurityDelay',\n", + " 'LateAircraftDelay'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s take a quick look at each of those delay categories for the week ending December 18, 2016." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierDelayWeatherDelayNASDelaySecurityDelayLateAircraftDelay
summeanpositivesummeanpositivesummeanpositivesummeanpositivesummeanpositive
Carrier
AA105732.06.2585532922.021820.01.291583456.077279.04.5743463159.0721.00.04267835.0141249.08.3608972574.0
AS8762.02.691032250.03219.00.98863661.016344.05.019656614.0163.00.05006110.013599.04.176597271.0
B649421.09.0316151575.09894.01.808114112.038741.07.0798611326.0672.00.12280730.0100811.018.4230631433.0
DL151188.08.8642122878.039145.02.295087783.075110.04.4037292605.0107.00.0062732.0122896.07.2054412289.0
EV87408.09.9395041375.03824.00.43484276.049703.05.6519221580.00.00.0000000.089773.010.2084381568.0
F919568.010.430704361.06198.03.30383857.022459.011.971748493.00.00.0000000.032236.017.183369316.0
HA7199.05.034266218.03650.02.552448145.086.00.0601404.035.00.0244763.04024.02.813986189.0
NK14735.05.294646452.02240.00.80488756.030361.010.909450840.050.00.0179665.022247.07.993891372.0
OO120307.010.4396911378.026349.02.286446308.054141.04.6981082289.0171.00.01483912.0166102.014.4135722459.0
UA66693.06.3126361851.031602.02.991197521.074992.07.0981542065.00.00.0000000.0118728.011.2378611696.0
VX8048.05.608362246.03807.02.652962126.012619.08.793728224.073.00.0508714.025242.017.590244331.0
WN123882.04.8737905393.023516.00.925171328.078645.03.0940674247.0252.00.00991418.0285073.011.2153996472.0
\n", + "
" + ], + "text/plain": [ + " CarrierDelay WeatherDelay \\\n", + " sum mean positive sum mean positive \n", + "Carrier \n", + "AA 105732.0 6.258553 2922.0 21820.0 1.291583 456.0 \n", + "AS 8762.0 2.691032 250.0 3219.0 0.988636 61.0 \n", + "B6 49421.0 9.031615 1575.0 9894.0 1.808114 112.0 \n", + "DL 151188.0 8.864212 2878.0 39145.0 2.295087 783.0 \n", + "EV 87408.0 9.939504 1375.0 3824.0 0.434842 76.0 \n", + "F9 19568.0 10.430704 361.0 6198.0 3.303838 57.0 \n", + "HA 7199.0 5.034266 218.0 3650.0 2.552448 145.0 \n", + "NK 14735.0 5.294646 452.0 2240.0 0.804887 56.0 \n", + "OO 120307.0 10.439691 1378.0 26349.0 2.286446 308.0 \n", + "UA 66693.0 6.312636 1851.0 31602.0 2.991197 521.0 \n", + "VX 8048.0 5.608362 246.0 3807.0 2.652962 126.0 \n", + "WN 123882.0 4.873790 5393.0 23516.0 0.925171 328.0 \n", + "\n", + " NASDelay SecurityDelay \\\n", + " sum mean positive sum mean positive \n", + "Carrier \n", + "AA 77279.0 4.574346 3159.0 721.0 0.042678 35.0 \n", + "AS 16344.0 5.019656 614.0 163.0 0.050061 10.0 \n", + "B6 38741.0 7.079861 1326.0 672.0 0.122807 30.0 \n", + "DL 75110.0 4.403729 2605.0 107.0 0.006273 2.0 \n", + "EV 49703.0 5.651922 1580.0 0.0 0.000000 0.0 \n", + "F9 22459.0 11.971748 493.0 0.0 0.000000 0.0 \n", + "HA 86.0 0.060140 4.0 35.0 0.024476 3.0 \n", + "NK 30361.0 10.909450 840.0 50.0 0.017966 5.0 \n", + "OO 54141.0 4.698108 2289.0 171.0 0.014839 12.0 \n", + "UA 74992.0 7.098154 2065.0 0.0 0.000000 0.0 \n", + "VX 12619.0 8.793728 224.0 73.0 0.050871 4.0 \n", + "WN 78645.0 3.094067 4247.0 252.0 0.009914 18.0 \n", + "\n", + " LateAircraftDelay \n", + " sum mean positive \n", + "Carrier \n", + "AA 141249.0 8.360897 2574.0 \n", + "AS 13599.0 4.176597 271.0 \n", + "B6 100811.0 18.423063 1433.0 \n", + "DL 122896.0 7.205441 2289.0 \n", + "EV 89773.0 10.208438 1568.0 \n", + "F9 32236.0 17.183369 316.0 \n", + "HA 4024.0 2.813986 189.0 \n", + "NK 22247.0 7.993891 372.0 \n", + "OO 166102.0 14.413572 2459.0 \n", + "UA 118728.0 11.237861 1696.0 \n", + "VX 25242.0 
17.590244 331.0 \n", + "WN 285073.0 11.215399 6472.0 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pre_christmas = air_dec.loc[\n", + " (air_dec[\"Date\"] >= \"2016-12-12\") & (air_dec[\"Date\"] <= \"2016-12-18\")\n", + "]\n", + "\n", + "# custom agg function\n", + "def positive(df):\n", + " return (df > 0).sum()\n", + "\n", + "delay_totals = pre_christmas.groupby(\"Carrier\")[delay_cols].agg([\"sum\", \"mean\", positive])\n", + "delay_totals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Want**: plot total, average, and number of each type of delay by\n", + "carrier.\n", + "\n", + "To do this, we need to have a DataFrame with:\n", + "\n", + "- Delay type in index (so it is on the horizontal axis) \n", + "- Aggregation method on the *outermost* level of columns (so we can do\n", + " `data[\"mean\"]` to get averages) \n", + "- Carrier name on inner level of columns \n", + "\n", + "\n", + "Many sequences of reshaping commands can accomplish this.\n", + "\n", + "We show one example below." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean...sum
CarrierAAASB6DLEVF9HANKOOUA...B6DLEVF9HANKOOUAVXWN
CarrierDelay6.2585532.6910329.0316158.8642129.93950410.4307045.0342665.29464610.4396916.312636...49421.0151188.087408.019568.07199.014735.0120307.066693.08048.0123882.0
WeatherDelay1.2915830.9886361.8081142.2950870.4348423.3038382.5524480.8048872.2864462.991197...9894.039145.03824.06198.03650.02240.026349.031602.03807.023516.0
NASDelay4.5743465.0196567.0798614.4037295.65192211.9717480.06014010.9094504.6981087.098154...38741.075110.049703.022459.086.030361.054141.074992.012619.078645.0
SecurityDelay0.0426780.0500610.1228070.0062730.0000000.0000000.0244760.0179660.0148390.000000...672.0107.00.00.035.050.0171.00.073.0252.0
LateAircraftDelay8.3608974.17659718.4230637.20544110.20843817.1833692.8139867.99389114.41357211.237861...100811.0122896.089773.032236.04024.022247.0166102.0118728.025242.0285073.0
\n", + "

5 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + " mean \\\n", + "Carrier AA AS B6 DL EV \n", + "CarrierDelay 6.258553 2.691032 9.031615 8.864212 9.939504 \n", + "WeatherDelay 1.291583 0.988636 1.808114 2.295087 0.434842 \n", + "NASDelay 4.574346 5.019656 7.079861 4.403729 5.651922 \n", + "SecurityDelay 0.042678 0.050061 0.122807 0.006273 0.000000 \n", + "LateAircraftDelay 8.360897 4.176597 18.423063 7.205441 10.208438 \n", + "\n", + " ... \\\n", + "Carrier F9 HA NK OO UA ... \n", + "CarrierDelay 10.430704 5.034266 5.294646 10.439691 6.312636 ... \n", + "WeatherDelay 3.303838 2.552448 0.804887 2.286446 2.991197 ... \n", + "NASDelay 11.971748 0.060140 10.909450 4.698108 7.098154 ... \n", + "SecurityDelay 0.000000 0.024476 0.017966 0.014839 0.000000 ... \n", + "LateAircraftDelay 17.183369 2.813986 7.993891 14.413572 11.237861 ... \n", + "\n", + " sum \\\n", + "Carrier B6 DL EV F9 HA NK \n", + "CarrierDelay 49421.0 151188.0 87408.0 19568.0 7199.0 14735.0 \n", + "WeatherDelay 9894.0 39145.0 3824.0 6198.0 3650.0 2240.0 \n", + "NASDelay 38741.0 75110.0 49703.0 22459.0 86.0 30361.0 \n", + "SecurityDelay 672.0 107.0 0.0 0.0 35.0 50.0 \n", + "LateAircraftDelay 100811.0 122896.0 89773.0 32236.0 4024.0 22247.0 \n", + "\n", + " \n", + "Carrier OO UA VX WN \n", + "CarrierDelay 120307.0 66693.0 8048.0 123882.0 \n", + "WeatherDelay 26349.0 31602.0 3807.0 23516.0 \n", + "NASDelay 54141.0 74992.0 12619.0 78645.0 \n", + "SecurityDelay 171.0 0.0 73.0 252.0 \n", + "LateAircraftDelay 166102.0 118728.0 25242.0 285073.0 \n", + "\n", + "[5 rows x 36 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reshaped_delays = (\n", + " delay_totals\n", + " .stack() # move aggregation method into index (with Carrier)\n", + " .T # put delay type in index and Carrier+agg in column\n", + " .swaplevel(axis=1) # make agg method outer level of column label\n", + " .sort_index(axis=1) # sort column labels so it prints nicely\n", + ")\n", + "reshaped_delays" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAm0AAAJfCAYAAAAtueEYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3XucXWV96P/PCEirDJlAy8UExEtE0F/xQgFra7nky61IaH+CKGJAJLYntVY4VbDUeAEvv56KqRdqYAjhFAV+qIXaFPo1QK09BUSKWs3xFDFMAgGsSXAQAYF9/lhrcGec2XuH2bP3Xnt/3q/Xfs1ez3rWer5rDQ/5zrPWetZQrVZDkiRJve1Z3Q5AkiRJzZm0SZIkVYBJmyRJUgWYtEmSJFWASZskSVIFmLRJkiRVgEmbpL4WEX8TEX/RYP37IuKSTsYkSc/EkPO0SRoUEXEo8LeZOb/bsUjStnKkTZIkqQIcaZPUUyJiHfA54FRgT+DvgD/KzEcj4kzgvcAuwNeBP8zM+yJiCPgEcAqwI3AP8ObM/I+IuAzYAHwU+K9y/SNlcy8BlgAvzsy3RMT1wFcy89N18XwL+GBmfikiXgp8Cng18CPgLzLz6lk7GZJUx5E2Sb3oFOAo4EUUidV5EXE4ReJ1EkUydw9wZVn/SOB1Zd0R4I3Aj+t3mJk/BY4B7svMncrPfZPa/TzwpomFiNgfeD7wDxHxXCDLOruV9T4bES9r10FLUiMmbZJ60aczc31mbgIuoEiQTgEuzcw7MvMx4FzgNRGxD/BzYBh4KTCUmWszc+MzaPfLwCsi4vnl8inAl8r2jgPWZebKzHwiM+8Avgi8YQbHKUktM2mT1IvW132/B3he+blnojAzH6YYTZuXmTcCnwY+AzwQESsiYudtbTQzx4F/AE4ui04Grii/Px84OCK2THwokro9trUdSXomtu92AJI0hb3qvu8N3Fd+JkbAKC9X7grcC5CZfw38dUTsBlwN/BkweaqPVm7i/QKwLCK+BvwqcFNZvh7458yMbT4aSWoDkzZJvWhpRHyF4oGB9wFXAWuAKyPi88Ba4CPArZm5LiJ+k+LKwR3AT4FHgSen2O8DwK4RMSczH5qm7dXApcCHgKsy86my/CvAxyLiVH5xL90rgIczc+3MDleSmvPyqKRe9Hngn4C7y8/5mbmGYuTsi8BGiocUJi5j7gxcDGymuIT6Y+B/TN5pZv5vipG0u8tLnM+bos5jwJeAhWUcE+XjFA88nEwx6nc/8HGKp1EladY55YeknlJO+fH2zPxqt2ORpF7iSJskSVIFmLRJkiRVgJdHJUmSKsCRNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqgO27HYBmX0TsAawDtgB7ZebPp6jzauA24FuZ+arORih1VqM+ERG7AsuA1wPPA34C/G/gs5n5hc5HK82uiLgMWFxX9BPge8CHM3N1Xb3nAx8BjgJ2AsaA92Tm33Us2AHnSNtgeBvwD8CPgUXT1HkHcBGwT0Qc2KnApC5p1Ce+CLyOok+8BDga+AKwaycDlDrsX4A9y88hwB3A30XEiwAiYh5wCzAEHAe8FDgTWN+VaAeUI219LiKeRdGx3kXRyZYA10yqMwy8CXgtRSK/BLi9s5FKn
dGoT0TECPC7wOsz85/KTe4BvtmFUKVOejwz7y+/3x8R5wD/DfgN4AcUI2zrMvPNddus62yIcqSt/x0JPBdYDfxP4NCIeOGkOqcA/5mZ3wYuA94UETt1NEqpcxr1iYeBcWBRRDy3S/FJXRURz6b4w+Yx4I7yD50TgH+LiC9ExIMR8Z2IODciHPzpIJO2/vcO4IrMfCIzNwJfBd4+qc4SYBVAZt5GcZ/Cm5H607R9IjOfoLi35/eBzRFxe0Qsj4jDuxeu1BGHRsTDEfEw8CjwIeCtmXkP8OvAzhQjb+sp7mn7GPBnwAe7FO9AMmnrYxGxJ8W9B6vqii8DTp/46ygiDgL+H+DzdXVWUSRyUl9ppU9k5peBeRT3sn0R2B9YExGf6Wy0UkfdCryi/LyKImlbFRFHAduVdb6dme/JzH/PzCsoLpn+cVeiHVAOa/a3Myh+x7dHRH35dsDxwJcokrPtgY11dYaAZ0XEqzLzjs6FK826VvoEmfkYcGP5+WhEnAd8OCL+MjPXdTRiqTN+lpl31S3fGRFHAH8OHAH8nOKJ0nrfBXaOiLmZublDcQ40R9r6VHkPwtsp/hJ6xaTP3wJLImJn4GRg6aT1BwA34Wib+kgrfaLB5mvLn78+mzFKPeYJ4DnllDi3AvtOWr8v8JAJW+c40ta/jgb2Bj6XmWP1KyJiJZDAW4AasDIzfzapzt8Cn4yIszPzpx2KWZpNTftEOV/hXwErgW9RzOP2cuCjwA+BOzsasdQ5zy7nL4TiQZ2jys+ysuyjwFci4v0Ut9PsTzEKt7zTgQ4yR9r61zuAWyf/41T6Z+BHFDeVfmVywlb6MvArFFOBSP2glT7xBuB/UYw+30gxwvbX5fffnWpiaqlP/A6wsfx8h6IPnEORrFFOsvsm4CTgPyj+uPkr4MPdCHZQDdVqtW7HIEmSpCYcaZMkSaoAkzZJkqQKMGmTJEmqgEombWNjY7WxsTFvxpNK9glpa/YJ9aNKT/mxZcuWZ9whx8fHGR4ebmc4xjDAMYyMjAy1MZxnzD5hDL3Svn2iPYyhN2LolT5RyZE2SZKkQWPSJkmSVAEmbZIkSRVg0iZJklQBJm2SJEkVYNImSZJUASZtkiRJFdDSPG0RMQJcArwcqAFvA74PXAXsA6wDTsrMzRExBCwHjgUeAU7LzDvK/SwGzit3e35mrmrbkUiSJPWxVkfalgPXZ+ZLgQOAtcA5wJrMXACsKZcBjgEWlJ8lwEUAEbELsAw4GDgIWBYRc9t0HJIkSX2tadIWETsDrwNGATLz8czcAiwCJkbKVgEnlN8XAZdnZi0zbwFGImJP4Khi89yUmZuBBI5u69FIkiT1qVYuj74Q+BGwMiIOAL4JvAvYPTM3AmTmxojYraw/D1hft/2Gsmy68q1ExBKKEbrpzBkdHQWK10o8U7VabUbbt4Mx9E8MIyMjbYxma/YJY6hi+/aJ9jCG3oihV/pEK0nb9sCrgHdm5q0RsZxfXAqdylTv1qo1KN9KZq4AVjQKaOIlwDN5D1i332NmDMbQKvuEMQxa+83YJ4xh0Nqf0Mo9bRuADZl5a7l8DUUS90B52ZPy54N19feq234+cF+DckmSJDXRNGnLzPuB9RGxb1l0BPA94DpgcVm2GLi2/H4d8NaIGIqIQ4CHysuoNwBHRsTc8gGEI8sySZIkNdHSlB/AO4ErIuLZwN3A6RQJ39URcQYwBpxY1l1NMd3HXRRTfpwOkJmbIuLDwDfKeh/KzE1tOQpJkqQ+11LSlpl3AgdOseqIKerWgKXT7OdS4NJtCVCSJEm+EUGSJKkSTNokSZIqwKRNkiSpAkzaJEmSKsCkTZIkqQJM2iRJkirApE2SJKkCTNokSZIqwKRNkiSpAkzaJEmSKsCkTZIkqQJM2iRJkirApE2SJKkCTNokSZIqwKRNkiSpAkzaJEmSKsCkTZIkqQJM2iRJkirApE2SJKkCtm+1YkRsB9wO3JuZx0XEC4ArgV2AO4BTM/PxiNgRu
Bx4NfBj4I2Zua7cx7nAGcCTwJ9k5g3tPBhJkqR+tS0jbe8C1tYtfxy4MDMXAJspkjHKn5sz88XAhWU9ImJ/4GTgZcDRwGfLRFCSJElNtJS0RcR84PeAS8rlIeBw4JqyyirghPL7onKZcv0RZf1FwJWZ+Vhm/hC4CzioHQchSZLU71q9PPpJ4D3AcLm8K7AlM58olzcA88rv84D1AJn5REQ8VNafB9xSt8/6bZ4WEUuAJQ1imTM6OgrA+Ph4i+H/slqtNqPt28EY+ieGkZGRNkazNfuEMVSxfftEexhDb8TQK32iadIWEccBD2bmNyPi0LJ4aIqqtSbrGm3ztMxcAaxoFNPY2FgNYHh4uFG1hsbHx2e0fTsYgzG0wj5hDIPWfjP2CWMYtPYntHJ59LXA8RGxjuLBg8MpRt5GImIi6ZsP3Fd+3wDsBVCunwNsqi+fYhtJkiQ10DRpy8xzM3N+Zu5D8SDBjZl5CnAT8Iay2mLg2vL7deUy5fobM7NWlp8cETuWT54uAG5r25FIkiT1sZnM0/Ze4KyIuIvinrXRsnwU2LUsPws4ByAzvwtcDXwPuB5YmplPzqB9SZKkgdHyPG0AmXkzcHP5/W6mePozMx8FTpxm+wuAC7Y1SEmSpEHnGxEkSZIqwKRNkiSpAkzaJEmSKsCkTZIkqQJM2iRJkirApE2SJKkCtmnKD1XPvkuvb7h+9bkHcuA5/zrt+u9/5uh2hyRJkp4BR9okSZIqwKRNkiSpAvr28qiXBSVJUj9xpE2SJKkCTNokSZIqwKRNkiSpAkzaJEmSKqBvH0SQJjR7KAV8MEWS1PscaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCmj6IEJE7AVcDuwBPAWsyMzlEbELcBWwD7AOOCkzN0fEELAcOBZ4BDgtM+8o97UYOK/c9fmZuaq9hyNJktSfWhlpewI4OzP3Aw4BlkbE/sA5wJrMXACsKZcBjgEWlJ8lwEUAZZK3DDgYOAhYFhFz23gskiRJfatp0paZGydGyjJzHFgLzAMWARMjZauAE8rvi4DLM7OWmbcAIxGxJ3BUsYvclJmbgQScR0GSJKkF2zRPW0TsA7wSuBXYPTM3QpHYRcRuZbV5wPq6zTaUZdOVT25jCcUI3XTmjI6OAjA+Pj5tpdXnHtjwWHbZaYeGdRrtu11qtdqst+N5aH4OYObnYWRkZJvjalW7+kQznfjv0RiqEUM72rdPtIcx9EYMvdInWk7aImIn4IvAn2bmTyJiuqpDU5TVGpRvJTNXACsaxTI2NlYDGB4enrZOo4lSofiH/NiP3j7t+k5Mpjo+Pt7wGNrB89D8HEBvnIfptKtPNNOJ/x6NoRoxdLv9ZuwTxjBo7U9o6enRiNiBImG7IjO/VBY/UF72pPz5YFm+AdirbvP5wH0NyiVJktRE06StfBp0FFibmZ+oW3UdsLj8vhi4tq78rRExFBGHAA+Vl1FvAI6MiLnlAwhHlmWSJElqopXLo68FTgW+ExF3lmXvAz4GXB0RZwBjwInlutUU033cRTHlx+kAmbkpIj4MfKOs96HM3NSWo5AkSepzTZO2zPw6U9+PBnDEFPVrwNJp9nUpcOm2BChJkqRtfHpUkqR+MWftYQ3XP7zHSuasPX7a9Q/td1O7Q5Ia8jVWkiRJFeBImyRJGmgzHXWFzoy8mrRJkjTAvExcHV4elSRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoA34ggaaDc9eZ3Nly/68fey13veN+061/8+U+1OyRJaokjbZIkSRXgSNss2nfp9Q3Xrz73QA4851+nXf/9zxzd7pAkSVJFOdImSZJUASZtkiRJFdDxy6MRcTSwHNgOuCQzP9bpGKRB5A34mjDT/xbA/x6kbujoSFtEbAd8BjgG2B94U0Ts38kYJEmSqmioVqt1rLGIeA3wgcw8qlw+FyAzP7ot+xkbG+tc0FKL9
t5776FutW2fUC+yT0hbm2mf6PTl0XnA+rrlDcDB9RUiYgmwpME+5oyOjs5CaFJvsk9IW7NPaGDVarWOfRYuXHjiwoULL6lbPnXhwoWf6mQMdW3/qBvtGoMx9OqnF86BMfRGDN1uv1c+vXAejKE3Yuh2+xOfTj89ugHYq255PnBfh2OYsKVL7dYzhoIx9IZeOAfGUOh2DN1uv1f0wnkwhkK3Y+h2+0DnL49+A1gQES8A7gVOBt7c4RgmPNSldusZQ8EYekMvnANjKHQ7hm633yt64TwYQ6HbMXS7faDDSVtmPhERfwzcQDHlx6WZ+d1OxjBoIuIyYHG5+CTwE+D7wPXAp+rqfQB4S2a+uLMRSp01qU/U+ymwBthvmu2eTXFl4G8y87zZik/qhgb94k3l+vcAZwDPp7g3/ZOZ+ZlOxadCx+dpy8zVwOpOtzvg/gU4iWKKl7kUD3+8B/hD4L+6GJfULRN9ot5TwIHA8RFxQGZ+a9L6P6DoP5d0ID6pG6bqF1uAC4E/o3j441vAa4AVEfF4Zl7c2RAHm+8eHQyPZ+b95ff7gO9GxDXAv1P81SQNmvo+8bSIuB54HDgT+ONJq88EMjPXzX54UldM1y92Bd6fmV8ui+6OiIOAPwdM2jrI11gNqMz8CXARMBwRv97teKRekJlPUYw+vyUifnWiPCJeBBwGrOhWbFIXDQGPTir7GfD8iPAP/w5ypG2w/Uf58wVdjULqvEMj4uFJZTdl5uspkrbdgBOBy8t1bwceAK7rXIhSx03uFw9k5oso7oX+k4hYQ/HvxkHA28o6zwPu6WyYg8ukbbBNzMzszOEaNLfyyzddP1L+/DnwFYrLoZdHxPbAacDKzHyiYxFKnTe5X0z89z4G/CdwJ8W/F/cBo8A5FA+4qUMGOWnrhcsc3Y7h5RQd8O4ux9Ht8wC9EUO39cI56FQMP8vMuxrEMAb8Y0TsR/E06e509gGEbv8uut1+r+iF89DJGKbrFxdl5oryCerdKJK2PyzX/bBDsXX7d9Ht9oEOv3tUnVc+xj0/MxdOKt+Z4q+mH2RmOOWHBsV0fWJSnWcBPwC+TJG0bZeZR3YkQKkLWukXk+r/C/BUZv7ubMalrQ3ySNsgeXZE7EFxOXQucAjFlB87An80qd4rJm37VGZ+uzNhSh0z0ScmeyAza5n5VERcTNFPhoE3djY8qXdExG8C+wB3UIy0nQ28AvjtLoY1kEzaBsPvABsp7j0Yp5hc9/PApzJzc129vSimAan3GPArnQhS6qCJPjHZr/OLuQsvBT5YLl/bobikXrQjsAx4EcWUOF8Dfiszv9PVqAaQl0clSZIqwHnaJEmSKsCkTZIkqQJM2iRJkiqgkknb2NhYbWxszJvxpJJ9QtqafUL9qNJPj27ZsuUZd8jx8XGGh4fbGY4xDHAMIyMjQ81rzT77hDH0Svv2ifYwht6IoVf6RCVH2iRJkgaNSZskSVIFmLRJkiRVgEmbJElSBZi0SZIkVYBJmyRJUgWYtEmSJFVAS/O0RcQIcAnwcqAGvA34PnAVsA+wDjgpMzdHxBCwHDgWeAQ4LTPvKPezGDiv3O35mbmqbUciSZLUx1odaVsOXJ+ZLwUOANYC5wBrMnMBsKZcBjgGWFB+lgAXAUTELsAy4GDgIGBZRMxt03FIkiT1taZJW0TsDLwOGAXIzMczcwuwCJgYKVsFnFB+XwRcnpm1zLwFGImIPYGjis1zU2ZuBhI4uq1HI0mS1KdauTz6QuBHwMqIOAD4JvAuYPfM3AiQmRsjYrey/jxgfd32G8qy6cq3EhFLKEbopjNndHQUKF4r8UzVarUZbd8OxtA/MYyMjLQxmq3ZJ4yhiu3bJ9rDGHojhl7pE60kbdsDrwLemZm3RsRyfnEpdCpTvVur1qB8K5m5AljRKKCJlwDP5D1g3X6PmTEYQ6vsE8YwaO03Y58whkFrf0Ir97RtADZk5q3l8jUUSdwD5WVPyp8P1tXfq277+cB9DcolSZLUR
NOkLTPvB9ZHxL5l0RHA94DrgMVl2WLg2vL7dcBbI2IoIg4BHiovo94AHBkRc8sHEI4syyRJktRES1N+AO8EroiIZwN3A6dTJHxXR8QZwBhwYll3NcV0H3dRTPlxOkBmboqIDwPfKOt9KDM3teUoJEmS+lxLSVtm3gkcOMWqI6aoWwOWTrOfS4FLtyVASZIk+UYESZKkSjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCti+1YoRsR1wO3BvZh4XES8ArgR2Ae4ATs3MxyNiR+By4NXAj4E3Zua6ch/nAmcATwJ/kpk3tPNgJEmS+tW2jLS9C1hbt/xx4MLMXABspkjGKH9uzswXAxeW9YiI/YGTgZcBRwOfLRNBSZIkNdFS0hYR84HfAy4pl4eAw4FryiqrgBPK74vKZcr1R5T1FwFXZuZjmflD4C7goHYchCRJUr9r9fLoJ4H3AMPl8q7Alsx8olzeAMwrv88D1gNk5hMR8VBZfx5wS90+67d5WkQsAZY0iGXO6OgoAOPj4y2G/8tqtdqMtm8HY+ifGEZGRtoYzdbsE8ZQxfbtE+1hDL0RQ6/0iaZJW0QcBzyYmd+MiEPL4qEpqtaarGu0zdMycwWwolFMY2NjNYDh4eFG1RoaHx+f0fbtYAzG0Ar7hDEMWvvN2CeMYdDan9DK5dHXAsdHxDqKBw8Opxh5G4mIiaRvPnBf+X0DsBdAuX4OsKm+fIptJEmS1EDTpC0zz83M+Zm5D8WDBDdm5inATcAbymqLgWvL79eVy5Trb8zMWll+ckTsWD55ugC4rW1HIkmS1MdmMk/be4GzIuIuinvWRsvyUWDXsvws4ByAzPwucDXwPeB6YGlmPjmD9iVJkgZGy/O0AWTmzcDN5fe7meLpz8x8FDhxmu0vAC7Y1iAlSZIGnW9EkCRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQK2KbXWFXJ7e++r+H6F579HG5///R1Drzwee0OSZIk6RlzpE2SJKkCTNokSZIqwKRNkiSpAvr2njYVvLdPkqT+4EibJElSBZi0SZIkVYBJmyRJUgV4T5v6XrP7+sB7+yRJva9p0hYRewGXA3sATwErMnN5ROwCXAXsA6wDTsrMzRExBCwHjgUeAU7LzDvKfS0Gzit3fX5mrmrv4UiSJPWnVi6PPgGcnZn7AYcASyNif+AcYE1mLgDWlMsAxwALys8S4CKAMslbBhwMHAQsi4i5bTwWSZKkvtU0acvMjRMjZZk5DqwF5gGLgImRslXACeX3RcDlmVnLzFuAkYjYEziq2EVuyszNQAJHt/VoJEmS+tQ23dMWEfsArwRuBXbPzI1QJHYRsVtZbR6wvm6zDWXZdOWT21hCMUI3nTmjo6MAjI+PT1vphWc/p+GxbLfTUMM6jfbdLrVabdbb8Tw0Pwcw8/MwMjKyzXG1ql19oplO/PdoDNWIoR3t2yfawxh6I4Ze6RMtJ20RsRPwReBPM/MnETFd1aEpymoNyreSmSuAFY1iGRsbqwEMDw9PW6fRTeVQ/EN+9189Mu36Ttx4Pj4+3vAY2sHz0PwcQG+ch+m0q08004n/Ho2hGjF0u/1m7BPGMGjtT2hpyo+I2IEiYbsiM79UFj9QXvak/PlgWb4B2Ktu8/nAfQ3KJUmS1ETTpK18GnQUWJuZn6hbdR2wuPy+GLi2rvytETEUEYcAD5WXUW8AjoyIueUDCEeWZZIkSWqilcujrwVOBb4TEXeWZe8DPgZcHRFnAGPAieW61RTTfdxFMeXH6QCZuSkiPgx8o6z3oczc1JajkCRJ6nNNk7bM/DpT348GcMQU9WvA0mn2dSlw6
bYEKEmSJN+IIGnA7PD/vaRxhVP/kR0uevW0q3/+nv/T5og6b6bnAPrjPEhV47tHJUmSKsCkTZIkqQK8PCoNCC8LSlK1OdImSZJUASZtkiRJFWDSJkmSVAEmbZIkSRVg0iZJklQBJm2SJEkVYNImSZJUAc7TNotuf/d9Dde/8OzncPv7p69z4IXPa3dIkiSpohxpkyRJqgCTNkmSpAowaZMkSaoA72mTJGmA7f7VMxuu//or3s+Lbz1r2vUPLLy43SFpGo60SZIkVYAjbZKkgeQIk6rGpE2SJA20mSbw0JkkvuNJW0QcDSwHtgMuycyPdToGSZKkqunoPW0RsR3wGeAYYH/gTRGxfydjkCRJqqKhWq3WscYi4jXABzLzqHL5XIDM/Oi27GdsbKxzQUst2nvvvYe61bZ9Qr3IPiFtbaZ9otOXR+cB6+uWNwAH11eIiCXAkgb7mDM6OjoLoUm9yT4hbc0+oYFVq9U69lm4cOGJCxcuvKRu+dSFCxd+qpMx1LX9o260awzG0KufXjgHxtAbMXS7/V759MJ5MIbeiKHb7U98Oj1P2wZgr7rl+UDjt6rPni1dareeMRSMoTf0wjkwhkK3Y+h2+72iF86DMRS6HUO32wc6f3n0G8CCiHgBcC9wMvDmDscw4aEutVvPGArG0Bt64RwYQ6HbMXS7/V7RC+fBGArdjqHb7QMdTtoy84mI+GPgBoopPy7NzO92MoZBEhGXAfMzc+EU62rAqZPK3gBcBVybmX/QiRilTir7xGLgk5n57knrnu4TEbGOYkqi8+vWLwYupniY6iMdClmaVRGxO3Ae8HrgeRTJyT8D52fmnXX1dgDeDbwFWAD8HPh3YHlmfqnTcQ+qjr/GKjNXZ+ZLMvNFmXlBp9tXQ0uAjwPHRMQe3Q5GmiU/A5ZGxEta3aB80v1i4O0mbOoXEbEXcDvwW8AfAS8Gfo8iIbulnFcVYAj4R+Bs4JMUU3YdAtwIXBURH+hs5IPLNyIIgIh4IfA6isvVrwDeBviPk/rR/wKeC/wlsKhRxYh4FvDXFKNzr8/MG2Y/PKljPgPsAByWmT8py8Yo5lBdDVxW3s60G/Aq4JDMvLVu+w9GxKPARyPi7zPzm50MfhD5wnhNWAKszsz/Ai4D3h4RXZtjSZpl7wZeHxGHNajzK8A1wBuAQ03Y1E8iYi7FqNqn6xK2eh8FdgcC2BVYMylhm7CcYvT6lNmKVb/gSFv/OzQiHm5UobxX4XR+Me/RtcDJM14xAAAgAElEQVTfUHTWf5rd8KTOy8xbIuIq4BMR8erMfGqKan8GPAX8Rmb+Z2cjlGbdAoqBm+nuK58o3xfYcbp6mfloRPygrKdZ5khb/7uV4nLn5E+936f4b2E1QGY+BlxJ48krpao7B3gpcNo0679KcW/PR8o/bKR+0uxKim+U6EGDPNK2otsB0JkYfpaZd00ujIj6GJYAvwb8rK58CHgyInbPzAdmOcZB+V30ul44Bx2LITPviYgLgfMj4upJMbwP+DfgAxRPu385It6QmY92KLxu/y663X6v6IXzMFsx/CfFSPLLgS9Psf7l5c/vU8yn+vIp6hARvwK8iOKPnNnU7d9Ft9sHOvzuUXVWi1N+3AL8H+APgMnJ3TXAZZn5sdmNVOqMyX0iIoYp/vG6mGLag1Mz82/rp/yIiN8AEvgOsCgzf9qN2KV2i4ivAAcCL5l8X1tE/CPwSuAFwFKKB3cmP4hARLwX+BhwoA8izL5BHmlTYQlwd2b+3eQV5ejD2yPi45lpdq++k5njEfEXFDdTT1fn2xHxOoqRhBsi4thpbtyWqmYpxdPUN0bEeRT3re1B8aDOYcAJmfmziFhO8dDCdRFxDnAzxYM6J1H8sfMhE7bO8J62wfYsivt5/v9p1l9FMex9RKcCkrpglGK0bVqZ+X3gd4A9gTURsUsnApNmU2beQzHSdivwOeAHFPOx7Qi8JjOvL+v9HDgKuJBirra1wG0U/zacnJnLOh/9YPLyq
CRJUgU40iZJklQBJm2SJEkVYNImSZJUAZVM2sbGxmpjY2PejCeV7BPS1uwT6keVnvJjy5Ytz7hDjo+PMzw83M5wjGGAYxgZGemJ97TaJ4yhV9q3T7SHMfRGDL3SJyo50iZJkjRoTNokSZIqwKRNkiSpAkzaJEmSKsCkTZIkqQJM2iRJkirApE2SJKkCWpqnLSJGgEuAlwM14G3A94GrgH2AdcBJmbk5IoaA5cCxwCPAaZl5R7mfxcB55W7Pz8xVbTsSSZKkPtbqSNty4PrMfClwALAWOAdYk5kLgDXlMsAxwILyswS4CCAidgGWAQcDBwHLImJum45DkiSprzVN2iJiZ+B1wChAZj6emVuARcDESNkq4ITy+yLg8sysZeYtwEhE7AkcVWyemzJzM5DA0W09GkmSpD7VyuXRFwI/AlZGxAHAN4F3Abtn5kaAzNwYEbuV9ecB6+u231CWTVe+lYhYQjFCN505o6OjQPFaiWeqVqvNaPt2MIb+iWFkZKSN0WzNPmEMVWzfPtEextAbMfRKn2gladseeBXwzsy8NSKW84tLoVOZ6t1atQblW8nMFcCKRgFNvAR4Ju8B6/Z7zIzBGFplnzCGQWu/GfuEMQxa+xNauadtA7AhM28tl6+hSOIeKC97Uv58sK7+XnXbzwfua1AuSZKkJpombZl5P7A+IvYti44AvgdcBywuyxYD15bfrwPeGhFDEXEI8FB5GfUG4MiImFs+gHBkWSZJkqQmWpryA3gncEVEPBu4GzidIuG7OiLOAMaAE8u6qymm+7iLYsqP0wEyc1NEfBj4RlnvQ5m5qS1HIUmS1OdaStoy807gwClWHTFF3RqwdJr9XApcui0BSpIkyTciSJIkVYJJmyRJUgWYtEmSJFWASZskSVIFmLRJkiRVgEmbJElSBZi0SZIkVYBJmyRJUgWYtEmSJFWASZskSVIFmLRJkiRVgEmbJElSBZi0SZIkVYBJmyRJUgWYtEmSJFWASZskSVIFmLRJkiRVgEmbJElSBZi0SZIkVcD2rVaMiO2A24F7M/O4iHgBcCWwC3AHcGpmPh4ROwKXA68Gfgy8MTPXlfs4FzgDeBL4k8y8oZ0HI0mS1K+2ZaTtXcDauuWPAxdm5gJgM0UyRvlzc2a+GLiwrEdE7A+cDLwMOBr4bJkISpIkqYmWkraImA/8HnBJuTwEHA5cU1ZZBZxQfl9ULlOuP6Ksvwi4MjMfy8wfAncBB7XjICRJkvpdq5dHPwm8Bxgul3cFtmTmE+XyBmBe+X0esB4gM5+IiIfK+vOAW+r2Wb/N0yJiCbCkQSxzRkdHARgfH28x/F9Wq9VmtH07GEP/xDAyMtLGaLZmnzCGKrZvn2gPY+iNGHqlTzRN2iLiOODBzPxmRBxaFg9NUbXWZF2jbZ6WmSuAFY1iGhsbqwEMDw83qtbQ+Pj4jLZvB2MwhlbYJ4xh0Npvxj5hDIPW/oRWLo++Fjg+ItZRPHhwOMXI20hETCR984H7yu8bgL0AyvVzgE315VNsI0mSpAaaJm2ZeW5mzs/MfSgeJLgxM08BbgLeUFZbDFxbfr+uXKZcf2Nm1srykyNix/LJ0wXAbW07EkmSpD42k3na3gucFRF3UdyzNlqWjwK7luVnAecAZOZ3gauB7wHXA0sz88kZtC9JkjQwWp6nDSAzbwZuLr/fzRRPf2bmo8CJ02x/AXDBtgYpSZI06HwjgiRJUgWYtEmSJFWASZskSVIFmLRJkiRVgEmbJElSBZi0SZIkVYBJmyRJUgWYtEmSJFWASZskSVIFmLRJkiRVgEmbJElSBZi0SZIkVYBJmyRJUgWYtEmSJFWASZskSVIFbN/tACRJkrpp7gcPa7j+tjNXsvcnjm9YZ/Oym9oZ0pQcaZMkSaoAkzZJkqQKaHp5NCL2Ai4H9gCeAlZk5vKI2AW4CtgHWAeclJmbI2IIWA4cCzwCnJaZd5T7WgycV+76/Mxc1d7DkSRJ22KmlwY7cVlQhVZG2p4Az
s7M/YBDgKURsT9wDrAmMxcAa8plgGOABeVnCXARQJnkLQMOBg4ClkXE3DYeiyRJUt9qmrRl5saJkbLMHAfWAvOARcDESNkq4ITy+yLg8sysZeYtwEhE7AkcVewiN2XmZiCBo9t6NJIkSX1qm+5pi4h9gFcCtwK7Z+ZGKBI7YLey2jxgfd1mG8qy6colSZLURMtTfkTETsAXgT/NzJ9ExHRVh6YoqzUon9zOEorLqtOZMzo6CsD4+HijkBuq1Woz2r4djKF/YhgZGWljNFuzTxhDFdu3T7RHJ2K47cyVDdf/2nNGGtbpxDma7fMw03MAzc9DO/pES0lbROxAkbBdkZlfKosfiIg9M3NjefnzwbJ8A7BX3ebzgfvK8kMnld88ua3MXAGsaBTP2NhYDWB4eLiV8Kc0Pj4+o+3bwRiMoRX2CWMYtPabsU+0V7P5x247cyUHXXz6tOs78SDCbJ+HmZ4D6JF52sqnQUeBtZn5ibpV1wGLy++LgWvryt8aEUMRcQjwUHn59AbgyIiYWz6AcGRZJkmSpCZaGWl7LXAq8J2IuLMsex/wMeDqiDgDGANOLNetppju4y6KKT9OB8jMTRHxYeAbZb0PZeamthyFJElSn2uatGXm15n6fjSAI6aoXwOWTrOvS4FLtyVASZIk+UYESZKkSjBpkyRJqgCTNkmSpApoeZ42SZL6ie/cVNU40iZJklQBJm2SJEkVYNImSZJUASZtkiRJFWDSJkmSVAEmbZIkSRVg0iZJklQBztMmaaD8yyde1HD9/m/6J/7l4ldMu/53zvpBu0OSpJY40iZJklQBJm2SJEkVYNImSZJUASZtkiRJFeCDCNKA8AZ8Sao2kzZJGjAzTeDBJF7qBpM2SZK6ZO4HD2u4/rYzV7L3J46fdv3mZTe1OyT1sI4nbRFxNLAc2A64JDM/NhvtzH3lBxuuv+2LZ7L36z4x7frN/76s3SFJkiQ9Yx1N2iJiO+AzQAAbgG9ExHWZ+b1OxtEpJo69odnvAfxdSJJ631CtVutYYxHxGuADmXlUuXwuQGZ+dFv2MzY21rmgpRbtvffeQ91q2z6hXmSfkLY20z7R6cuj84D1dcsbgIPrK0TEEmBJg33MGR0dnYXQpN5kn5C2Zp/QwKrVah37LFy48MSFCxdeUrd86sKFCz/VyRjq2v5RN9o1BmPo1U8vnANj6I0Yut1+r3x64TwYQ2/E0O32Jz6dnlx3A7BX3fJ84L4OxzBhS5farWcMBWPoDb1wDoyh0O0Yut1+r+iF82AMhW7H0O32gc5fHv0GsCAiXgDcC5wMvLnDMUx4qEvt1jOGgjH0hl44B8ZQ6HYM3W6/V/TCeTCGQrdj6Hb7QIeTtsx8IiL+GLiBYsqPSzPzu52MYZBExM3AXZn59knl8ynuLTxsUvmngT8C3p2Zf92pOKVOiIhrgT0z86Ap1j2bYtT/MuBlEXFVZr6xbv0OwL8B92fmcR0KWZoVEXEG8DfALpk5Xlf+bWC/yeXA/hGxCrgJWAncAfxmZj5Vt+3NTPHvjdqr4+8ezczVmfmSzHxRZl7Q6fY1tYh4DvAW4CM0vsFXqqrPAb8ZEQdMse4PgLnAp4G7gRMi4tS69R8E9gbOmPUopdn3VYpBm9+dKIiIXwNeBmycovxXgTVlUa2s99ZOBatf8IXxmvBGin+szgeeFxG/1eV4pHa7HrgHOHOKdWcCmZnrgJ8B7wc+HRHPj4jfBt4DnJGZD3QqWGm2ZOY9wA+AI+qKDwf+A7h2inL4RdL2FPBJ4IKIeO4sh6pJTNo04R3AZZn5GHAljrapz5SXci4B3hIRvzpRHhEvorhVYEVd9b8E7gSuAC4HRjPz7zsYrjTb1vDLydmN5Wdy+aOZeW9d2UcoRureM9tBamsmbaK8XPQq4Atl0WXASREx0rWgpNkxCjwXOLGu7O3AA8B1EwVlgncm8FqK+2/P6mCMUiesAV4eEbuVy4dT3LP2zxT3sNWX/6R+w8z8CcVo9H8v7
5FWh5i0CYpRttWZ+SOAzLwN+CHFPW5S38jMjcBXKC+RRsT2wGnAysx8YlL1JcBPgT2Bl3QwTKkTbix/HlEmXi8E/jkzNwHfritfAIxPsf0lFLfUfKQTwaowyEnbiuZVZt1sx/AYMGeK8okRtEcpRtVOAY6PiCcmPhRPEHXqEukg/C6qoBfOQSdi+Bzw2xGxH3A8sDvFP0BPxxARhwHvBk6iuMfnf0bEjh2I7ekYOthWL7bfK3rhPMxKDJn5X8C3KC6FHgHckZkT01rcVFf+JPCpKbZ/Ejib4naDA2cjxkm6/bvodvtAh989qs6KiOXA64EFZQebKH8bcDGwG3ACxf07v0vxVNCEOcDXgNdm5i0dC1qaZRHxLIqbsL9M8cfJdpl5ZN36EYqRhq9k5n+LiF0obtD+Qmae3Y2YpdkQEf8D+H8p/l+/MTPPKcuPo0jUvgbsm5mHlOWnAZdk5vZ1+1gN7ETxgIJTfsyyTk+uq876G4rLQCvLBG4L8JsUw9mXZ+aPI+IdwJcz8zuTN46If6UYbTNpU9/IzKci4mKKm6iHKZ6crvdZiidI/3tZf1NEnA6sjoi/z8ybOxmvNIvWUIyW7Qq8oa78axRvL/p9phhlm+Rsij9yfg7cNQsxqs4gXx7te5m5FjiE4nLo31N0rD8HPgG8IyJeQZHEXT3NLq4C3hgRU11ilarsUooHEv6L4vInABHxZoqHFN6SmY9MlGfmDRR/BF0WETt3OFZptnyNItn6FeDrE4XlgwbfpPij5quNdlD+O/M5irncNMu8PCpJklQBjrRJkiRVgEmbJElSBZi0SZIkVUAlk7axsbHa2NiYN+NJJfuEtDX7hPpRpaf82LJlyzPukOPj4wwPD7czHGMY4BhGRkaG2hjOM2afMIZead8+0R7G0Bsx9EqfqORImyRJ0qAxaZMkSaoAkzZJkqQKMGmTJEmqgEo/iCBJkjRTc9Ye1nD9w3usZM7a4xvWeWi/m9oZ0pQcaZMkSaoAkzZJkqQKaOnyaESMAJcALwdqwNuA7wNXAfsA64CTMnNzRAwBy4FjgUeA0zLzjnI/i4Hzyt2en5mr2nYkkiRJfazVkbblwPWZ+VLgAGAtcA6wJjMXAGvKZYBjgAXlZwlwEUBE7AIsAw4GDgKWRcTcNh2HJElSX2uatEXEzsDrgFGAzHw8M7cAi4CJkbJVwAnl90XA5ZlZy8xbgJGI2BM4qtg8N2XmZiCBo9t6NJIkSX2qlcujLwR+BKyMiAOAbwLvAnbPzI0AmbkxInYr688D1tdtv6Esm658KxGxhGKEbjpzRkdHgeK1Es9UrVab0fbtYAz9E8PIyEgbo9mafcIYqti+faI9jKEzMTy8x8qG65961gj3NqlTaxJfO/pEK0nb9sCrgHdm5q0RsZxfXAqdylTv1qo1KN9KZq4AVjQKaOIlwDN5D1i332NmDMbQKvuEMQxa+83YJ4yh3ZpN53HvHiuZd//pDev0ypQfG4ANmXlruXwNRRL3QHnZk/Lng3X196rbfj5wX4NySZIkNdE0acvM+4H1EbFvWXQE8D3gOmBxWbYYuLb8fh3w1ogYiohDgIfKy6g3AEdGxNzyAYQjyzJJkiQ10eobEd4JXBERzwbuBk6nSPiujogzgDHgxLLuaorpPu6imPLjdIDM3BQRHwa+Udb7UGZuastRSJIk9bmWkrbMvBM4cIpVR0xRtwYsnWY/lwKXbkuAkiRJ8o0IkiRJlWDSJkmSVAEmbZIkSRXQ6oMIkiT1lTlrD2u4/uE9Vjacv6sT83JJ9RxpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaqA7VutGBHbAbcD92bmcRHxAuBKYBfgDuDUzHw8InYELgdeDfwYeGNmriv3cS5wBvAk8CeZeUM7D0aSJG2bOWsPa
7j+4T1WMmft8dOuf2i/m9odkqaxLSNt7wLW1i1/HLgwMxcAmymSMcqfmzPzxcCFZT0iYn/gZOBlwNHAZ8tEUJIkSU20lLRFxHzg94BLyuUh4HDgmrLKKuCE8vuicply/RFl/UXAlZn5WGb+ELgLOKgdByFJktTvWr08+kngPcBwubwrsCUznyiXNwDzyu/zgPUAmflERDxU1p8H3FK3z/ptnhYRS4AlDWKZMzo6CsD4+HiL4f+yWq02o+3bwRj6J4aRkZE2RrM1+4QxVLH9KvSJh/dY2bCdp541wr0N6tQ68DvqxH8LnoeZnwNofh7a0SeaJm0RcRzwYGZ+MyIOLYuHpqhaa7Ku0TZPy8wVwIpGMY2NjdUAhoeHG1VraHx8fEbbt4MxGEMr7BPGMGjtN9OuPtHoPi2Ae/dYybz7T592fSfu5erE78LzMPNzAJ05D61cHn0tcHxErKN48OBwipG3kYiYSPrmA/eV3zcAewGU6+cAm+rLp9hGkiRJDTRN2jLz3Mycn5n7UDxIcGNmngLcBLyhrLYYuLb8fl25TLn+xsysleUnR8SO5ZOnC4Db2nYkkiRJfWwm87S9FzgrIu6iuGdttCwfBXYty88CzgHIzO8CVwPfA64HlmbmkzNoX5IkaWC0PE8bQGbeDNxcfr+bKZ7+zMxHgROn2f4C4IJtDVKSJGnQ+UYESZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCjBpkyRJqgCTNkmSpAowaZMkSaoAkzZJkqQKMGmTJEmqAJM2SZKkCti+WYWI2Au4HNgDeApYkZnLI2IX4CpgH2AdcFJmbo6IIWA5cCzwCHBaZt5R7msxcF656/Mzc1V7D0eSJKk/tTLS9gRwdmbuBxwCLI2I/YFzgDWZuQBYUy4DHAMsKD9LgIsAyiRvGXAwcBCwLCLmtvFYJEmS+lbTkbbM3AhsLL+PR8RaYB6wCDi0rLYKuBl4b1l+eWbWgFsiYiQi9izrZmZuAoiIBI4GvtDG49Ek+y69vuH61eceyIHn/Ou067//maPbHZIkSXoGmiZt9SJiH+CVwK3A7mVCR2ZujIjdymrzgPV1m20oy6Yrn9zGEooRuunMGR0dBWB8fHxbwt9KrVab0fbt0IkYVp97YMP1u+y0Q8M6nThH/fC7GBkZaWM0W7NPGEMV269Cn3h4j5UN23nqWSPc26BOrU/+/+h5mPk5gObnoR19ouWkLSJ2Ar4I/Glm/iQipqs6NEVZrUH5VjJzBbCiUSxjY2M1gOHh4UbVGhofH5/R9u3QiRgajaJBkdQd+9Hbp13fiZG2QfldPFP2CWMYtPabaVefmLP2+Ibt3LvHSubdf/q06x/a76aG27dDJ34XnoeZnwPozHloKWmLiB0oErYrMvNLZfH/be/ew+2q6nOPfzdBJFwSroJKQjQCEblYCjYEjxLdbxUvaGt8ykVEoAKPyEW0p4KtqHiKR7xHDgoEsMhBwVpL25TysyAeEBRELtZQoSAQQQFF7hKC+/wx5krW2llZoZY1xliZ7+d59pO95srm98tK3sVYc445xi8lPb85y/Z84L7m+FJgRtePbwPc0xzfe9Lx7/z+rQ/my4JmZma2NlnjjQjN3aCLgCUR8Zmupy4GDm6+Pxj4h67j75Q0Jmku8FBzGfVfgT+WtGlzA8IfN8fMzMzMbA2eyZm2vYCDgJsl3dAcOxH4BHChpMOAu4C3N88tJi33cRtpyY9DACLi15JOBq5tft/HOjclmA3Tms66gs+8mplZ/Z7J3aNX0n8+GsBr+/z+CeCo1fy3zgbO/q80aGZmZmbeEcHMzMxsJHjQZmZmZjYCPGgzMzMzGwEetJmZmZmNAA/azMzMzEaAB21mZmZmI8CDNjMzM7MR4EGbmZmZ2QjwoM3MzMxsBHjQZmZmZjYCPGgzMzMzGwEet
JmZmZmNAA/azMzMzEaAB21mZmZmI8CDNjMzM7MRsG7pBtZmOxx1ycDnF5+wO7t/8KrVPv8fp73+2W7JzMzMRpTPtJmZmZmNAJ9pMzNrmdsOOHrg85t/4i+57YgTB/6el/zfhc9mS601fcn8gc8/uvU5TF+y72qff+illz/bLVnFPGgza4n/7v+o/T9pM7Oysg/aJL0e+DwwBTgrIj6Ruwczay8PXs1sVGUdtEmaApwGCFgKXCvp4oj4Sc4+zKwMD5jMzH5/YxMTE9mKSdoT+EhEvK55fAJARJzyX/nv3HXXXfmaNnuGZs6cOVaqtjNhNXImzHr9dzOR+/LoC4G7ux4vBf6o+zdIOhw4fMB/Y/qiRYuG0JpZnZwJs17OhLXWxMREtq/x8fG3j4+Pn9X1+KDx8fGFOXvoqn1/ibruwT3U+lXDa+Ae6uihdP1avmp4HdxDHT2Urt/5yr1O21JgRtfjbYB7MvfQ8ZtCdbu5h8Q91KGG18A9JKV7KF2/FjW8Du4hKd1D6fpA/suj1wLbSXoR8HNgP+CAzD10PFSobjf3kLiHOtTwGriHpHQPpevXoobXwT0kpXsoXR/IvCNCRCwH3gv8K7AEuDAi/j1nD2ZmZmajKPs6bRGxGFicu66ZmZnZKPPeo2ZmZmYjwIM2MzMzsxHgQZuZmZnZCPCgzczMzGwEtHnQdkbpBnAPHe6hDjW8Bu4hKd1D6fq1qOF1cA9J6R5K1wcy7z1qZmZmZr+fNp9pMzMzMxsZHrSZmZmZjQAP2szMzMxGgAdtZmZmZiOgVYM2SX8n6Y2Sivy5JU0pUXey0n1IelOpv4NJfWxWuofSnImkdB81ZMJ5SJyJpHQfzkR/xf/HmdnpwAHArZI+IWlO5vq3STpV0o6Z69bWx36kv4NPSnppoR4Avi/pIklvkDRWsI+SnIk6+qghE85D4kzU0Ycz0Ucrl/yQNB3YH/gQcDdwJvDViHhqyHU3Jv1DPIQ0YD4b+FpEPDzMujX2IWka6e/gEGACOAe4ICIeydjDGDAOHAq8Avg6cG5E/DRXD7VwJsr3UToTzkMvZ6J8H87Eqlo3aJO0OfAO4CDgHuB84JXAzhGxd8Y+XgVcAGwCfAM4OSJuy1W/hj4kbUH6uzgOWAK8BPhCRCzMUX9SL/OBrwIbAjcCH4yIq3P3UYIzUU8ftWSizXkAZ6KmPpyJXuvmLliSpG8Cc4DzgDdHxL3NU1+XdF2G+lOAN5I+NcwCPk16M/gfwGJg+2H3UEMfkt5M+uQym/R38YqIuE/SBqRQZgnjpDfmXwJHAxcDLwcuAl6Uo4+SnIk6+qghE85D4kzU0Ycz0V+rBm3AFyPisn5PRMTuGerfClwOnBoR3+s6/o3mk0wupft4O/DZiPhu98GIeFzSoRnqd1xNejN4a0Qs7Tp+naQvZeyjJGeijj5qyITzkDgTdfThTPTRxsujOwE7Aut3jkXE32aqvVFEPJqj1ij0UZqksYhoVwD6cCbq6aMk52ElZ6KePkqqMROtOtMm6SRgb1IYFwP7AFcCWcIILJd0FPAyet8Mcp5dKt6HpLmkU9svBdYDpgCPRcS0HPW7bCHpf7Lq6/CazH0U40zU0UclmWh9HsCZqKUPZ6K/ti35sQB4LfCLiDgE2BV4bsb65wFbA68DrgC2AbLdLVlRH18k3RF0KzAV+HMyzWOb5HzgFtK8hI8CPwOuLdBHSc5EHX3UkAnnIXEm6ujDmeijbYO2JyLid6RPENOA+4AXZ6z/koj4a9Knha+QJnnunLF+NX00dx5NiYinI+IcYH7O+o3NI2IR8FREXNF8gpxboI+SnIlK+qggE85D4kxU0oczsapWXR4lTR7chLTezg+BR4EfZKzfWd/nN82ciV+Q7srJrXQfj0taD7hB0ieBe0m3UefWeR3ulfRG0q392xTooyRnoo4+asiE85A4E3X04Uz00apBW0S8p/n2S5IuAaZFxE0ZWzhD0qbAX5NuG94I+HDG+rX0cRBpf
sJ7gfcBM4C3Zazf8fFmAc33k067T2v6aQ1nopo+ashE6/MAzkRFfTgTfbTi7lFJuw16PiKuz9WLWQ2cCbNezoSNgracafv0gOcmgKHeCSLp+EHPR8Rnhlm/lj4k3Ux6vVdXf5dh1u/qY+Ea+jgmRx+FORMV9FFDJpyHFZyJCvpwJgZrxaAtIkpMcu+2cSSoHkgAABK9SURBVOH6HaX7eFPh+h1DX9W8ds7ECqX7qCETrc8DOBNdSvfhTAzQisujHc32F8cDMyPicEnbATtExD8Vbq11JG0LbBcR35Y0FVg3Mm4WP6mXDSPisRK1S3Mm6lFLJtqcB3AmauJMrKoVZ9q6nEO6G2he83gpaf+wLGGUtD1wOrBVROwkaRdg34j4eI76tfQh6d3A4cBmpH3ltgG+RFobKRtJewKLSBNsZ0raFTiiayJyGzgTFfRRQyachxWciQr6cCb6a9s6bbMj4pM0t/FGxBPAWMb6ZwIndNW/CdgvY/1a+jgK2At4uKl/K/C8jPU7PkdaOPJXTR83Ajn39quBM1FHHzVkwnlInIk6+nAm+mjboG1Zc4p1AkDSbODJjPU3iIjJ6/0sz1i/lj6ejIhlnQeS1mXApM9hioi7Jx16ukQfBTkTdfRRRSacB8CZqKUPZ6KPtl0ePQm4BJgh6XzSKP5dGes/0LwBdN4MFpAWDMytdB9XSDoRmCpJwHuAf8xYv+NuSfOAiWYRx2OAJQX6KMmZqKOPGjLhPCTORB19OBN9tGrQFhEh6XrSNhRjwLER8UDGFo4CzgDmSPo5cAdwYMb6tfTxQeAw4GbgCNKmzGdlrN9xJPB54IWkeSuXkl6b1nAmqumjhky0Pg/gTFTUhzPRR2vuHm1Ore4DzGkOLQEuiYjsp50lbQisU+puyRr6kLQlQETcn7u2Jc5EXX04E+U5E3X14UysqhWDNkkvAC4nndr9EenT0x8AWwPzI+KeDD3sQLoTpvvN4IyI+Omwa9fSh6Qx0qWH95L+DsZI8wMWRsTHhl1/Ui/zgaOBHZpDS4AvRsR3cvZRijNRRx+1ZKLteQBnopY+nInB2nIjwt8Ap0fE3hHxvog4LiJeDZwGnDLs4s1tw98BHiGdbj4TeAz4jqS5w65fUR/HkeaH7BERm0fEZsAfAXtJyrafm9LGv2eT5kccQDrlvxg4W9IbcvVRmDNRRx/FM+E8rOBM1NGHMzFAW+a0zY2Id00+GBFfkPQfGep/GNh/0gj9W5IuI32i2CdDDzX08U5A3fNDIuJ2Se8gzRX47JDrd/wF8Nbm9u2OGyRdR9oUeHGmPkpyJuroo4ZMOA+JM1FHH87EAG050/bEgOcez1B/dr9TqhFxBfDiDPVr6eM5/Sb0NvMVnpOhfsfWk8LY6eMmYKuMfZTkTNTRRw2ZcB4SZ6KOPpyJAdpypm26pD/tc3wMmJah/qAJnDm3xijdx7Lf87ln26A/axVblWTgTCSl+6ghE85D4kwkpftwJgZoy6DtCuDNq3nuuxnqz5D0hT7Hx0i3EudSuo9dJT3c1Ou+A2YMWD9D/Y7Zki7uc3yMvJ9oS3Im6uijhkw4D4kzUUcfzsQArRi0RcQhktYBFkTEhQVa+IsBz12XrYvCfUTElGHXeIbeMuC5T2XroiBnYgVnwnkAnIkuzkTNmZiYmGjN1/j4+HcL1p4yPj5+aunXoE9fm46Pj49lrvmp8fHxHSv4sx/7TI6tzV/ORN++WpkJ52HFn9mZWLUvZ2LAsZxfrTjT1iUkfQD4Ol3XpSPi10MvHPG0pD8cdp1BJH0YuDAibpH0XOBfgJcDyyUdEBHfztTKLcCZzUKW5wAXRMRDmWp3O5i02nW3d/U5tjZzJpyJDuchcSaciY7qMtG2Qduhza/d21BMkO8a9Y+a6+QX0ftm8M1M9f8MOLn5/mDS9fktge2BrwBZwhgRZwFnNQs4HgLcJOkq4MyIuHzY9SXtT1p758WT5i1sDPxq2PUr40y0PBPOwyqcCWei2ky0a
tAWES8q3MJmpL/w13QdmwByhXFZRHQmdr4O+FpEPA0saT7NZCNpCmm17TnAA8CNwPGSjoiI/YZc/nrSqudbAJ/uOv4IcNOQa1fFmXAmcB56OBPOBBVnolWDNkkbAMcDMyPicEnbATtExD/lqB8Rh+SoM8CTknYCfgnMBz7Q9dwGuZqQ9BnSXVqXAX8TET9onvrfmRaxvCAidpP0n83aQ63lTDgTOA89nAlngooz0apBG+m6+A+Bec3jpaRT0FnCKGl74HRgq4jYSdIuwL4R8fEc9YFjgW+QTnV/NiLuaPp6A2mvvVx+DPxVRPRbsPIVGeqvJ+lgYM9+6zJlvAxRA2fCmXAeejkTzkS1mWjLjggdsyPik8BTABHxBOl6fS5nAid01b8JGPalwBUi4vsRMSfSfm4ndx1fTNrvLZcDJwdR0r81veSYaHokMBfYhPRJrvvrTRnq18SZcCach17OhDNRbSbadqZtmaSpNAv2SZoNPJmx/gYR8QNJ3ceWZ6zfQ9J04G2kCZcvZcgLJ0pan3R6fQtJm7LyjXAa8IJh1u4WEVcCV0q6LiIW5apbKWeiSxsz4Tyswpno4kzUlYm2DdpOAi4hrfh8PrAX6fbdXB5o3gA6bwYLSJMds2nejPYlBXA30t0wbyXPit9HkD6pvYA00bPjYeC0DPV7RMSiZu7GjnSttB0Rf5u7l4KcCWcCcB66OBPOBFBnJsYmJibW/LvWIpI2J532HAOuiT4b0w6x9ouBM0hzJR4E7gDeERE/y1T/fOBVwKXA10gTPG/LfbeUpKMjYmHOmqvp4yRgb1IgFwP7AFdGxIKSfeXmTDgTTQ/OQ8OZcCaaHqrLRCvOtEma0ywUuFtzqPOpZaakmRFx/ep+9tkUEbcD45I2BNaJiEEb8w7DTqQ3gSXALc1CjtlG7ZJeExGXAT+vZHLnAmBX4EeRtrDZCjgrcw9FOBMrOBMrtTYP4Ex0cSZWqi4TrRi0kW7fPpze9VY6JuhdD2dolFaXfhswC1i3M2chIj6Wo35E7CppDumU97cl3QdsLGnriPhFhhZeTfrU1m9T5pzrEHU8ERG/k7Rc0jTgPtqzQbYzgTMxSZvzAM5Ep44zsVJ1mWjN5VGljYD3jIirCvZwCfAQ6XbypzvHI6Lfm0SOfnYnBXMBsDQi5q3hR56NmiU3ZJ7cy/8BTiTdmfV+4FHghgrWScrCmejbT2sz0fY8gDOxmn6ciZoyUXoj2sybv15duP6PS78Gq+lrbHx8/NUZ6xXbkHnSn3lG1+NZ4+Pju5Tuq8Dr4Ez076tVmXAeel4LZ6J/X85EBZloy+XRjkslvQ34ZqzcpiOn70naOSJuLlC7sxHwILlWfi62IXNXrQlJ3wL+sHn8s1y1K+NMDNaKTDgPPZyJwZyJgto2aDse2BBYLum3pDuDJiJi2jCLSrqZdC1+XeAQSbeT1v3p1N9lmPW7PNbn2IbAYcDmQJY5E5TfkLnjGkl7RMS1mevWxJlYVVsz4TwkzsSqnIlKtGlO2xgwIyLuKlB720HPR8SduXrpkLQxabuSw4ALgU9HxH25+yhJ0k+A7YE7SW9Uud8ci3ImerU9E23PAzgTkzkT9WWiNWfamlOdf09zqjNz7TsBJJ0XEQd1PyfpPOCgvj84BJI2I32SPBD4CrBbRDyYq37Twzv7HS+wYOE+metVxZlYUc+ZSFqdB3Amuuo5E0l1mWjNoK1R+lTny7ofSJpCxjcHSacCf0pauHHniHg0V+1J9uj6fn3gtaSVr3MP2p4P/HtnHaTmU+WOpE9VbeFMOBMdzkPiTDgTHdVlom2DtvnAEZKynuqUdALptuGpkh5uDo8By0jByOX9pDkSfwV8SCv3tssyZ6MjIo7ufqy0t915OWpPcjppi5aOx/ocW9s5E85Eh/OQOBPOREd1mWjboK3Iqc6IOAU4RdIpEXFCiR6aPtYpVXsNHge2K1B3rPvusGYRRWciA2dijUpkwnlInIk6ORO0bNDWNWfge
XRt/pqx/gmSNiX9w+vefDbHJrzVkPSPNJshA+uQTjeXWETxdknHkD45AbwHuL1AH8U4E3WoJBOtzwM4E7VwJvpr1aBN0r6kLUpeQNqOYlvS/movG/Rzz2L9PyfdibMNcANpQ+KrybQ9SkU+1fX9cuDOiFhaoI8jgS+QLgNMAP9G2samNZyJatSQidbnAZyJijgTfbRq0AacTArAtyPiDyTNB/bPWP9Y0uTKayJifrO/20cz1q/FXcC9EfFbAElTJc3KuXhhM7n3wIjYL1fNSjkTdSiaCeehhzNRB2eij1qvXQ/LUxHxK2AdSetExOXAyzPW/23XP8DnRsQtwA4Z69fiIuB3XY+fbo5lExFPA2/JWbNSzkQdimbCeejhTNTBmeijbWfafiNpI+C7wPmS7iOdds1lqaRNgG+Rtuh4ELgnY/1arBsRyzoPImKZpPUK9HGVpC+y6jYp1xfopRRnog41ZMJ5SJyJOjgTfbRi0CbpJcBWpFHzE8D7SIsGbgscPeBHn1UR8SfNtx+RdDkwHbgkV/2K3C9p34i4GEDSW4AHCvQxr/m1e1uWCVowd8SZqE4NmWhtHsCZqJAz0UcrBm3A54ATI6IzUv4d8BVJuwMfAd6cqxFJrwS2i4hzJG0JvBC4I1f9ShxJ+gR7GikAS4G+q18PU0TMz12zIs5EXYpnouV5AGeiNs5EH20ZtM2KiJsmH4yI6yTNytWEpJOA3UnzE84BngN8FdgrVw81iIj/BOY2lyDGOqtN5yLpHRHxVUnHr6a/z+TspxBnoiIlM+E8rOBMVMSZ6K8tNyIMWmtnarYu4E+AfWmujUfEPcDGGetXQdJWkhYBF0XEI5J2lHRYxhY2bH7duM/XRhn7KMmZqEjhTDgPiTNREWeiv7acabtW0rsj4szug80/gB9m7GNZpA2JJ5r6G67pB9ZS55I+QX6oefxT0kTPRTmKR8SXm19XuY1e0h6r/sRayZmoy7kUyoTzsIIzUZdzcSZW0ZZB23HA30s6kJXh2x1Yj/SpJpcLJX0Z2ETSu4FDgTPX8DNroy0i4kKlvfaIiOWSni7VjKQdgf1IazE9RPq3sbZzJupSTSZamgdwJmrjTPTRikFbRPwSmNcskrhTc/ifI+KyHPUlHQdcRZroOh94mDRf4cMRETl6qMxjkjan2aJE0lxSELKRtC0pgPuTbuffFtg95wK/JTkT1SmaibbnAZyJCjkTfbRi0NbRLJJ4eYHS2wCfB+YANwHfI4Uz5yn3mhwPXAzMlnQVsCWwIFdxSd8j3Ub/NWBBRNwq6Y7SYSzBmahGsUw4D72ciWo4E3205UaEoiLiAxExD9gaOBH4NemU948l/aRocxlJ2kPS1s3ChK8mvRZPApeSbufO5X7ShNKtSG8EsHJjYsvAmUgqyYTzUAFnInEmBvOgLa+pwDTSCH46aZXr7xftKK8vA50VrueRJpieBjwInJGriYh4C7AzcD3wUUl3AJtKekWuHmwFZ6JwJpyH6jgTzsRqjU1MVDF4XKtJOgN4GfAIKXzXkDYDfrBoY5lJujEidm2+Pw24PyI+0jy+ISJy7u/X3dfzgD8jzV2YEREzSvTRJs5EUmMmnIcynInEmRjMZ9rymAk8F/gF8HPSKd7fFO2ojCmSOvMoXwt0T/AtOb/ysYhY2FyaeGXBPtrEmUhqzITzUIYzkTgTA3jQlkFEvB7YA/hUc+j9pDWBLpW0yjowa7ELgCsk/QNpb7//Byv2/Mt692hTd14zV2RJ83hX4C9z99FGzsQK1WTCeSjLmVjBmRjAg7ZMImIiIn4MLAb+hXRX0Gzg2KKNZRQR/4v0RnQu8MqI6FybX4eMGzJ3+SzwOuBXTX83Aq8q0EcrORPVZcJ5KMyZcCbWpFVLfpQi6RjShMq9gKdIQbwaOBu4uWBr2UXENX2O/bREL03tuyV1Hyq2yG+bOBMr1ZQJ56EcZ2IlZ2L1PGjLYxbwDeB9EXFv4V5spbslzQMmJK0HHENzGtyGb
hbORG2ch7Jm4UzUprpMeNCWQUQcX7oH6+tI0mKWLyRN+r0UeE/RjlrCmaiS81CQM1Gl6jLhQZu12Q4RcWD3AUl7kS5LmLWN82DWq7pM+EYEa7OFz/CYWRs4D2a9qsuEz7RZ60jakzThd0tJ3ZckpgFTynRlVobzYNar5kx40GZttB6wEenf/8Zdxx8m48b1ZpVwHsx6VZsJb2NlrSVp24i4s3QfZjVwHsx61ZgJn2mzNntc0qmk/f7W7xyMiNeUa8msGOfBrFd1mfCNCNZm5wO3AC8CPgr8DLi2ZENmBTkPZr2qy4QHbdZmm0fEIuCpiLgiIg4F5pZuyqwQ58GsV3WZ8OVRa7Onml/vlfRG4B5gm4L9mJXkPJj1qi4THrRZm31c0nTS5sQLSbdzH1e2JbNinAezXtVlwnePmnWRdFxEfK50H2Y1cB7MepXOhOe0mfXy/n9mKzkPZr2KZsKDNrNeY6UbMKuI82DWq2gmPGgz6+X5AmYrOQ9mvYpmwjciWOtIeoT+wRsDpmZux6wo58GsV82Z8I0IZmZmZiPAl0fNzMzMRoAHbWZmZmYjwIM2MzMzsxHgQZuZmZnZCPj/cQ1Ac+MMPDsAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "for agg in [\"mean\", \"sum\", \"positive\"]:\n", + " axs = reshaped_delays[agg].plot(\n", + " kind=\"bar\", subplots=True, layout=(4, 3), figsize=(10, 8), legend=False,\n", + " sharex=True, sharey=True\n", + " )\n", + " fig = axs[0, 0].get_figure()\n", + " fig.suptitle(agg)\n", + "# fig.tight_layout();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s summarize what we did:\n", + "\n", + "- Computed average flight delay for each airline for each week. \n", + "- Noticed that one week had more delays for all airlines. \n", + "- Studied the flights in that week to determine the *cause* of the\n", + " delays in that week. \n", + "\n", + "\n", + "Suppose now that we want to repeat that analysis, but at a daily\n", + "frequency instead of weekly.\n", + "\n", + "We could copy/paste the code from above and change the `W` to a `D`,\n", + "but there’s a better way…\n", + "\n", + "Let’s convert the steps above into two functions:\n", + "\n", + "1. Produce the set of bar charts for average delays at each frequency. \n", + "1. Produce the second set of charts for the total, average, and number\n", + " of occurrences of each type of delay. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "hide-output": false + }, + "outputs": [], + "source": [ + "def mean_delay_plot(df, freq, figsize=(10, 8)):\n", + " \"\"\"\n", + " Make a bar chart of average flight delays for each carrier at\n", + " a given frequency.\n", + " \"\"\"\n", + " mean_delays = (\n", + " df\n", + " .groupby([pd.Grouper(key=\"Date\", freq=freq), \"Carrier\"])\n", + " [\"ArrDelay\"] # extract one column\n", + " .mean() # take average\n", + " .unstack(level=\"Carrier\") # Flip carrier up as column names\n", + " )\n", + "\n", + " # plot\n", + " axs = mean_delays.plot.bar(\n", + " figsize=figsize, subplots=True, legend=False, sharex=True,\n", + " sharey=True, layout=(4, 3), grid=False\n", + " )\n", + "\n", + " # tweak spacing between subplots and x-axis labels\n", + " axs[0, 0].get_figure().tight_layout()\n", + " for ax in axs[-1, :]:\n", + " ax.set_xticklabels(mean_delays.index.strftime(\"%a, %b. %d'\"))\n", + "\n", + " # return the axes in case we want to further tweak the plot outside the function\n", + " return axs\n", + "\n", + "\n", + "def delay_type_plot(df, start, end):\n", + " \"\"\"\n", + " Make bar charts for total minutes, average minutes, and number of\n", + " occurrences for each delay type, for all flights that were scheduled\n", + " between `start` date and `end` date\n", + " \"\"\"\n", + " sub_df = df.loc[\n", + " (df[\"Date\"] >= start) & (df[\"Date\"] <= end)\n", + " ]\n", + "\n", + " def positive(df):\n", + " return (df > 0).sum()\n", + "\n", + " aggs = sub_df.groupby(\"Carrier\")[delay_cols].agg([\"sum\", \"mean\", positive])\n", + "\n", + " reshaped = aggs.stack().T.swaplevel(axis=1).sort_index(axis=1)\n", + "\n", + " for agg in [\"mean\", \"sum\", \"positive\"]:\n", + " axs = reshaped[agg].plot(\n", + " kind=\"bar\", subplots=True, layout=(4, 3), figsize=(10, 8), legend=False,\n", + " sharex=True, sharey=True\n", + " )\n", + " fig = axs[0, 0].get_figure()\n", + " fig.suptitle(agg)\n", + "# 
fig.tight_layout();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let’s look at that plot at a daily frequency. (Note that we need the\n", + "figure to be a bit wider in order to see the dates.)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "mean_delay_plot(air_dec, \"D\", figsize=(16, 8));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we expected given our analysis above, the longest average delays\n", + "seemed to happen in the third week.\n", + "\n", + "In particular, it looks like December 17th and 18th had — on average —\n", + "higher delays than other days in December.\n", + "\n", + "Let’s use the `delay_type_plot` function to determine the cause of the\n", + "delays on those two days.\n", + "\n", + "Because our analysis is captured in a single function, we can look at\n", + "the days together and separately without much effort." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAm0AAAJfCAYAAAAtueEYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3X+cXVV96P3PCBStxkxA+WEIYiVa0KdipRBrWwXyheCjQvsIgoqBorG96e1T4VbB0kb5ofj0VhoV0cAEwq0KPCiF0hT6NWCpvQXRVLGa623AMIREsOYHgwgKnPvH3qMnw8w5k8yZfc6e83m/Xuc156y99l7fc3JW5jtr77X2QKPRQJIkSb3tWd0OQJIkSe2ZtEmSJNWASZskSVINmLRJkiTVgEmbJElSDZi0SZIk1YBJm6QZLSI+ExF/3mL7ByPiiipjkqRdMeA6bZL6RUS8AfibzDyg27FI0s5ypE2SJKkGHGmT1FMiYgPwWeA0YH/gb4E/zMzHI+I9wAeAvYCvAn+QmZsiYgD4OPAOYE/gfuDtmfnvEXEVsBH4KPCf5fbHyuZeBiwBDs7Md0bELcDNmfmppni+BXw4M78UEb8KfBJ4DfBD4M8z87pp+zAkqYkjbZJ60TuA44CXUiRW50XE0RSJ18kUydz9wDVl/WOB3ynrDgJvA37UfMDM/DFwPLApM59XPjaNaffzwKmjLyLiUODFwN9HxHOBLOvsU9b7dES8olNvWpJaMWmT1Is+lZkPZOYW4CKKBOkdwMrMXJuZTwDnAq+NiIOAnwGzgF8FBjJzXWZu3oV2bwAOi4gXl6/fAXypbO9NwIbMvDIzn8zMtcAXgbdO4X1K0qSZtEnqRQ80Pb8feFH5uH+0MDMfpRhNm5uZtwGfAi4FHoqIFRHx/J1tNDNHgL8HTimLTgE+Vz5/MXBkRGwbfVAkdfvtbDuStCt273YAkjSOeU3PDwQ2lY/RETDK05V7Aw8CZOYngE9ExD7AdcCfAmOX+pjMRbxfAJZFxB3Ac4Dby/IHgH/KzNjpdyNJHWDSJqkXLY2ImykmDHwQuBZYA1wTEZ8H1gEfAe7KzA0R8RsUZw7WAj8GHgeeGue4DwF7R8TszNw+QdurgZXA+cC1mfl0WX4zcHFEnMYvrqU7DHg0M9dN7e1KUnueHpXUiz4P/CNwX/m4MDPXUIycfRHYTDFJYfQ05vOBy4GtFKdQfwT897EHzcz/RTGSdl95ivNF49R5AvgSsLCMY7R8hGLCwykUo34/AD5GMRtVkqadS35I6inlkh/vzswvdzsWSeoljrRJkiTVgEmbJElSDXh6VJIkqQYcaZMkSaoBkzZJkqQaMGmTJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBkzZJkqQaMGmTJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBkzZJkqQaMGmTJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBkzZJkqQaMGmTJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBkzZJkqQaMGmTJEmqAZM2SZKkGti92wFo+kXEfsAGYBswLzN/Nk6d1wBfA76Vmb9ebYRStVr1iYjYG1gGvBl4EfAI8L+AT2fmF6qPVppeEXEVsLip6BHgu8AFmbm6qd6LgY8AxwHPA4aB92fm31YWbJ9zpK0//D7w98CPgBMmqPNe4DLgoIg4vKrApC5p1Se+CPwORZ94GbAI+AKwd5UBShX7Z2D/8rEAWAv8bUS8FCAi5gJ3AgPAm4BfBd4DPNCVaPuUI20zXEQ8i6Jj/b8UnWwJcP2YOrOAU4HXUSTyS4CvVxupV
I1WfSIiBoHXA2/OzH8sd7kf+EYXQpWq9NPM/EH5/AcRcQ7wX4BfA+6lGGHbkJlvb9pnQ7UhypG2me9Y4LnAauB/AG+IiF8ZU+cdwH9k5j3AVcCpEfG8SqOUqtOqTzwKjAAnRMRzuxSf1FUR8UsUf9g8Aawt/9A5EfjXiPhCRDwcEd+OiHMjwsGfCpm0zXzvBT6XmU9m5mbgy8C7x9RZAqwCyMyvUVyn8HakmWnCPpGZT1Jc2/O7wNaI+HpELI+Io7sXrlSJN0TEoxHxKPA4cD7wrsy8H3gh8HyKkbcHKK5puxj4U+DDXYq3L5m0zWARsT/FtQermoqvAs4Y/esoIo4A/i/g8011VlEkctKMMpk+kZk3AHMprmX7InAosCYiLq02WqlSdwGHlY9fp0jaVkXEccBuZZ17MvP9mflvmfk5ilOmf9SVaPuUw5oz25kU/8Zfj4jm8t2AtwBfokjOdgc2N9UZAJ4VEb+emWurC1eadpPpE2TmE8Bt5eOjEXEecEFE/GVmbqg0YqkaP8nM9U2vvxkRxwB/BhwD/IxiRmmz7wDPj4g5mbm1ojj7miNtM1R5DcK7Kf4SOmzM42+AJRHxfOAUYOmY7a8CbsfRNs0gk+kTLXZfV/584XTGKPWYJ4FfLpfEuQt4+ZjtLwe2m7BVx5G2mWsRcCDw2cwcbt4QEVcCCbwTaABXZuZPxtT5G+CvI+LszPxxRTFL06ltnyjXK/wr4ErgWxTruL0S+CjwfeCblUYsVeeXyvULoZioc1z5WFaWfRS4OSL+guJymkMpRuGWVx1oP3OkbeZ6L3DX2F9OpX8CfkhxUenNYxO20g3AsymWApFmgsn0ibcC/5Ni9Pk2ihG2T5TPXz/ewtTSDPHbwOby8W2KPnAORbJGucjuqcDJwL9T/HHzV8AF3Qi2Xw00Go1uxyBJkqQ2HGmTJEmqAZM2SZKkGjBpkyRJqoFaJm3Dw8ON4eFhL8aTSvYJaUf2Cc1EtV7yY9u2bbvcIUdGRpg1a1YnwzGGPo5hcHBwoIPh7DL7hDH0Svv2ic4wht6IoVf6RC1H2iRJkvqNSZskSVINmLRJkiTVgEmbJElSDZi0SZIk1YBJmyRJUg20XfIjIp4N3AHsWda/PjOXRcRLgGuAvYC1wGmZ+dOI2BO4GngN8CPgbZm5oTzWucCZwFPAH2fmrZ1/S5IkSTPPZEbangCOzsxXAYcBiyJiAfAx4JLMnA9spUjGKH9uzcyDgUvKekTEocApwCuARcCnI2K3Tr4ZSZKkmapt0paZjcx8tHy5R/loAEcD15flq4ATy+cnlK8ptx8TEQNl+TWZ+URmfh9YDxzRkXchSZI0w03qjgjliNg3gIOBS4F7gW2Z+WRZZSMwt3w+F3gAIDOfjIjtwN5l+Z1Nh23ep7mtJcCSFuHMHhoaAooVindVo9GY0v6dYAwzJ4bBwcEORrMj+4Qx1LF9+0RnGENvxNArfWJSSVtmPgUcFhGDwA3AIeNUG71VyHi3aWi0KB/b1gpgRat4Ru8nN5VbSnT7lhjGYAyTZZ8whn5rvx37hDH0W/ujdmr2aGZuA74CLAAGI2I06TsA2FQ+3wjMAyi3zwa2NJePs48kSZJaaJu0RcQLyxE2IuI5wEJgHXA78Nay2mLgxvL5TeVryu23ZWajLD8lIvYsZ57OB77WqTciSZI0k01mpG1/4PaIuAe4G8jMvBn4AHBWRKynuGZtqKw/BOxdlp8FnEOx03eA64DvArcAS8vTrpIkSWqj7TVtmXkP8Opxyu9jnNmfmfk4cNIEx7oIuGjnw5QkSepv3hFBkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBnZvVyEi5gFXA/sBTwMrMnN5ROwFXAscB
GwATs7MrRExACwH3gg8BpyemWvLYy0GzisPfWFmrurs25EkSZqZJjPS9iRwdmYeAiwAlkbEocA5wJrMnA+sKV8DHA/MLx9LgMsAyiRvGXAkcASwLCLmdPC9SJIkzVhtk7bM3Dw6UpaZI8A6YC5wAjA6UrYKOLF8fgJwdWY2MvNOYDAi9geOKw6RWzJzK5DAoo6+G0mSpBmq7enRZhFxEPBq4C5g38zcDEViFxH7lNXmAg807baxLJuofGwbSyhG6CYye2hoCICRkZGdCX8HjUZjSvt3gjHMnBgGBwc7GM2O7BPGUMf27ROdYQy9EUOv9IlJJ20R8Tzgi8CfZOYjETFR1YFxyhotyneQmSuAFa1iGR4ebgDMmjWrVbWWRkZGprR/JxiDMUyGfcIY+q39duwTxtBv7Y+a1OzRiNiDImH7XGZ+qSx+qDztSfnz4bJ8IzCvafcDgE0tyiVJktRG26StnA06BKzLzI83bboJWFw+Xwzc2FT+rogYiIgFwPbyNOqtwLERMaecgHBsWSZJkqQ2JnN69HXAacC3I+KbZdkHgYuB6yLiTGAYOKnctppiuY/1FEt+nAGQmVsi4gLg7rLe+Zm5pSPvQpIkaYZrm7Rl5lcZ/3o0gGPGqd8Alk5wrJXAyp0JUJIkSd4RQZIkqRZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGJnMbq1p6+dJbWm5ffe7hHH7Ov0y4/XuXLup0SJIkSbtsxiZt0qh2CTyYxEuSep+nRyVJkmrApE2SJKkGTNokSZJqwKRNkiSpBpyIMMM5i1aSpJnBkTZJkqQaMGmTJEmqAZM2SZKkGmh7TVtErATeBDycma8sy/YCrgUOAjYAJ2fm1ogYAJYDbwQeA07PzLXlPouB88rDXpiZqzr7ViRJkmauyYy0XQWMvRr9HGBNZs4H1pSvAY4H5pePJcBl8PMkbxlwJHAEsCwi5kw1eEmSpH7RNmnLzDuALWOKTwBGR8pWASc2lV+dmY3MvBMYjIj9geOKQ+WWzNwKJM9MBCVJkjSBXV3yY9/M3AyQmZsjYp+yfC7wQFO9jWXZROXPEBFLKEbpJjJ7aGgIgJGRkQkrrT738JZvYK/n7dGyTqtjd0qj0Zj2dvwc2n8GMPXPYXBwcKfjmqxO9Yl2qvg+GkM9YuhE+/aJzjCG3oihV/pEp9dpGxinrNGi/BkycwWwolUjw8PDDYBZs2ZNWKfV2mNQ/CJ/40e/PuH2KtYnGxkZafkeOsHPof1nAL3xOUykU32inSq+j8ZQjxi63X479glj6Lf2R+3q7NGHytOelD8fLss3AvOa6h0AbGpRLkmSpEnY1aTtJmBx+XwxcGNT+bsiYiAiFgDby9OotwLHRsSccgLCsWWZJEmSJmEyS358AXgD8IKI2EgxC/Ri4LqIOBMYBk4qq6+mWO5jPcWSH2cAZOaWiLgAuLusd35mjp3cIEmSpAm0Tdoy89QJNh0zTt0GsHSC46wEVu5UdJIkSQK8I4IkSVItmLRJkiTVQKeX/JAkqRZmrzuq5fZH97uS2eveMuH27Yfc3umQpJZM2iRJUl+bagIP1STxnh6VJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBpwIoIkSX3MWbT14UibJElSDZi0SZIk1YBJmyRJUg2YtEmSJNWASZskSVINOHt0Gr186S0tt68+93AOP+dfJtz+vUsXdTokSZJUU460SZIk1YBJmyRJUg2YtEmSJNWA17RJ6ivr3/5fW27f++IPsP69H5xw+8Gf/2SnQ5KkSak8aYuIRcByYDfgisy8uOoYJKmfTTVxBZNXqRsqTdoiYjfgUiCAjcDdEXFTZn63yjikfuQIk9R7vO+ndsZAo9GorLGIeC3wocw8rnx9LkBmfnRnjjM8PFxd0NIkHXjggQPdats+oV5kn5B2NNU+UfXp0bnAA02vNwJHNleIiCXAkhbHmD00NDQNo
Um9yT4h7cg+ob7VaDQqeyxcuPCkhQsXXtH0+rSFCxd+ssoYmtr+YTfaNQZj6NVHL3wGxtAbMXS7/V559MLnYAy9EUO32x99VL3kx0ZgXtPrA4BNFccwaluX2m1mDAVj6A298BkYQ6HbMXS7/V7RC5+DMRS6HUO32weqPz16NzA/Il4CPAicAry94hhGbe9Su82MoWAMvaEXPgNjKHQ7hm633yt64XMwhkK3Y+h2+0DFSVtmPhkRfwTcSrHkx8rM/E6VMfSbiLgKWFy+fAp4BPgecAvwyaZ6HwLemZkHVxuhVK0xfaLZj4E1wCET7PdLFGcGPpOZ501XfFI3tOgXp5bb3w+cCbyY4tr0v87MS6uKT4XK12nLzNXA6qrb7XP/DJxMcQeMORSTP94P/AHwn12MS+qW0T7R7GngcOAtEfGqzPzWmO2/R9F/rqggPqkbxusX24BLgD+lmPzxLeC1wIqI+GlmXl5tiP3NOyL0h59m5g/K55uA70TE9cC/UfzVJPWb5j7xcxFxC/BT4D3AH43Z/B4gM3PD9IcndcVE/WJv4C8y84ay6L6IOAL4M8CkrULee7RPZeYjwGXArIh4YbfjkXpBZj5NMfr8zoh4zmh5RLwUOApY0a3YpC4aAB4fU/YT4MUR4R/+FXKkrb/9e/nzJV2NQqreGyLi0TFlt2fmmymStn2Ak4Cry23vBh4CbqouRKlyY/vFQ5n5Uoprof84ItZQ/N44Avj9ss6LgPurDbN/mbT1t9GVmV05XP3mLp550fVj5c+fATdTnA69OiJ2B04HrszMJyuLUKre2H4x+n0fBv4D+CbF74tNwBBwDsUEN1Wkn5O2XjjN0e0YXknRAe/rchzd/hygN2Lotl74DKqK4SeZub5FDMPAP0TEIRSzSfel2gkI3f636Hb7vaIXPocqY5ioX1yWmSvKGdT7UCRtf1Bu+35FsXX736Lb7QMV33tU1SuncR+QmQvHlD+f4q+mezMzXPJD/WKiPjGmzrOAe4EbKJK23TLz2EoClLpgMv1iTP1/Bp7OzNdPZ1zaUT+PtPWTX4qI/ShOh84BFlAs+bEn8Idj6h02Zt+nM/OeasKUKjPaJ8Z6KDMbmfl0RFxO0U9mAW+rNjypd0TEbwAHAWspRtrOBg4DfquLYfUlk7b+8NvAZoprD0YoFtf9PPDJzNzaVG8exTIgzZ4Anl1FkFKFRvvEWC/kF2sXrgQ+XL6+saK4pF60J7AMeCnFkjh3AL+Zmd/ualR9yNOjkiRJNeA6bZIkSTVg0iZJklQDJm2SJEk1UMukbXh4uDE8POzFeFLJPiHtyD6hmajWs0e3bdu2yx1yZGSEWbNmdTIcY+jjGAYHBwfa15p+9glj6JX27ROdYQy9EUOv9IlajrRJkiT1G5M2SZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBkzZJkqQaaLvkR0Q8m+LmsHuW9a/PzGUR8RLgGmAvYC1wWmb+NCL2BK4GXgP8CHhbZm4oj3UucCbFjcv/ODNv7fxbkiRJmnkmM9L2BHB0Zr4KOAxYFBELgI8Bl2TmfGArRTJG+XNrZh4MXFLWIyIOBU4BXgEsAj4dEbt18s1IkiTNVG2TtsxsZOaj5cs9ykcDOBq4vixfBZxYPj+hfE25/ZiIGCjLr8nMJzLz+8B64IiOvAtJkqQZblJ3RChHxL4BHAxcCtwLbMvMJ8sqG4G55fO5wAMAmflkRGwH9i7L72w6bPM+zW0tAZa0CGf20NAQUKxQvKsajcaU9u8EY5g5MQwODnYwmh3ZJ4yhju3bJzrDGHojhl7pE5NK2jLzKeCwiBgEbgAOGafa6K1CxrtNQ6NF+di2VgArWsUzej+5qdxSotu3xDAGY5gs+4Qx9Fv77dgnjKHf2h+1U7NHM3Mb8BVgATAYEaNJ3wHApvL5RmAeQLl9NrCluXycfSRJktRC26QtIl5YjrAREc8BFgLrgNuBt5bVFgM3ls9vKl9Tbr8tMxtl+SkRsWc58
3Q+8LVOvRFJkqSZbDIjbfsDt0fEPcDdQGbmzcAHgLMiYj3FNWtDZf0hYO+y/CzgHIqdvgNcB3wXuAVYWp52lSRJUhttr2nLzHuAV49Tfh/jzP7MzMeBkyY41kXARTsfpiRJUn/zjgiSJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1sHu7ChExD7ga2A94GliRmcsjYi/gWuAgYANwcmZujYgBYDnwRuAx4PTMXFseazFwXnnoCzNzVWffjiRJ0sw0mZG2J4GzM/MQYAGwNCIOBc4B1mTmfGBN+RrgeGB++VgCXAZQJnnLgCOBI4BlETGng+9FkiRpxmqbtGXm5tGRsswcAdYBc4ETgNGRslXAieXzE4CrM7ORmXcCgxGxP3BccYjckplbgQQWdfTdSJIkzVBtT482i4iDgFcDdwH7ZuZmKBK7iNinrDYXeKBpt41l2UTlY9tYQjFCN5HZQ0NDAIyMjOxM+DtoNBpT2r8TjGHmxDA4ONjBaHZknzCGOrZvn+gMY+iNGHqlT0w6aYuI5wFfBP4kMx+JiImqDoxT1mhRvoPMXAGsaBXL8PBwA2DWrFmtqrU0MjIypf07wRiMYTLsE8bQb+23Y58whn5rf9SkZo9GxB4UCdvnMvNLZfFD5WlPyp8Pl+UbgXlNux8AbGpRLkmSpDbaJm3lbNAhYF1mfrxp003A4vL5YuDGpvJ3RcRARCwAtpenUW8Fjo2IOeUEhGPLMkmSJLUxmdOjrwNOA74dEd8syz4IXAxcFxFnAsPASeW21RTLfaynWPLjDIDM3BIRFwB3l/XOz8wtHXkXkiRJM1zbpC0zv8r416MBHDNO/QawdIJjrQRW7kyAkiRJ8o4IkiRJtWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUAyZtkiRJNWDSJkmSVAO7dzuA6fL1921quf1Xzv5lvv4XE9c5/JIXdTokSZKkXeZImyRJUg2YtEmSJNWASZskSVINmLRJkiTVgEmbJElSDbSdPRoRK4E3AQ9n5ivLsr2Aa4GDgA3AyZm5NSIGgOXAG4HHgNMzc225z2LgvPKwF2bmqs6+FWl87WYSg7OJJUm9bzIjbVcBi8aUnQOsycz5wJryNcDxwPzysQS4DH6e5C0DjgSOAJZFxJypBi9JktQv2iZtmXkHsGVM8QnA6EjZKuDEpvKrM7ORmXcCgxGxP3BccajckplbgeSZiaAkSZImsKuL6+6bmZsBMnNzROxTls8FHmiqt7Esm6j8GSJiCcUo3URmDw0NATAyMjJhpV85+5dbvoHdnjfQsk6rY3dKo9GopJ1+j6HddwGm/n0YHBzc6bgmq1N9op1++C4YQ3Xt2yc6wxh6I4Ze6ROdviPCwDhljRblz5CZK4AVrRoZHh5uAMyaNWvCOq2uT4LiF/l9f/XYhNuruIZpZGSk5XuoQj/E0O67AL3xfZhIp/pEO/3wXTCGerTfjn3CGPqt/VG7Onv0ofK0J+XPh8vyjcC8pnoHAJtalEuSJGkSdjVpuwlYXD5fDNzYVP6uiBiIiAXA9vI06q3AsRExp5yAcGxZJkmSpEmYzJIfXwDeALwgIjZSzAK9GLguIs4EhoGTyuqrKZb7WE+x5McZAJm5JSIuAO4u652fmWMnN0iSJGkCbZO2zDx1gk3HjFO3ASyd4DgrgZU7FZ2mrN0aZa5PJklSPXhHBEmSpBowaZMkSaoBkzZJkqQaMGmTJEmqgU4vritJ6nF7/H8va13htH9gj8te07LKz97/vzsYkaTJcKRNkiSpBhxpm0YutyH1nqmOMjnCJKlbTNqkPmGyIkn15
ulRSZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBkzZJkqQaMGmTJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBrwjgiSpL6075ff03L7Vw/7Cw6+66wJtz+08PJOh9QVfg71YdImSZL62lQTV6gmea08aYuIRcByYDfgisy8uOoYJEmS6qbSa9oiYjfgUuB44FDg1Ig4tMoYJEmS6mig0WhU1lhEvBb4UGYeV74+FyAzP7ozxxkeHq4uaGmSDjzwwIFutW2fUC+yT0g7mmqfqPr06FzggabXG4EjmytExBJgSYtjzB4aGpqG0KTeZJ+QdmSfUN9qNBqVPRYuXHjSwoULr2h6fdrChQs/WWUMTW3/sBvtGoMx9OqjFz4DY+iNGLrdfq88euFzMIbeiKHb7Y8+ql6nbSMwr+n1AcCmimMYta1L7TYzhoIx9IZe+AyModDtGLrdfq/ohc/BGArdjqHb7QPVnx69G5gfES8BHgROAd5ecQyjtnep3WbGUDCG3tALn4ExFLodQ7fb7xW98DkYQ6HbMXS7faDipC0zn4yIPwJupVjyY2VmfqfKGPpJRFwFHJCZC8fZ1gBOG1P2VuBa4MbM/L0qYpSqVPaJxcBfZ+b7xmz7eZ+IiA0USxJd2LR9MXA5xWSqj1QUsjStImJf4DzgzcCLKJKTfwIuzMxvNtXbA3gf8E5gPvAz4N+A5Zn5parj7leV38YqM1dn5ssy86WZeVHV7aulJcDHgOMjYr9uByNNk58ASyPiZZPdoZzpfjnwbhM2zRQRMQ/4OvCbwB8CBwP/N0VCdme5rirAAPAPwNnAX1Ms2bUAuA24NiI+VG3k/cs7IgiAiPgV4HcoTlcfBvw+4C8nzUT/E3gu8JfACa0qRsSzgE9QjM69OTNvnf7wpMpcCuwBHJWZj5RlwxRrqK4GriovZ9oH+HVgQWbe1bT/hyPiceCjEfF3mfmNKoPvR94wXqOWAKsz8z+Bq4B3R0TX1liSptn7gDdHxFEt6jwbuB54K/AGEzbNJBExh2JU7VNNCVuzjwL7AgHsDawZk7CNWk4xev2O6YpVv+BI28z3hoh4tFWF8lqFM/jFukc3Ap+h6Kz/OL3hSdXLzDsj4lrg4xHxmsx8epxqfwo8DfxaZv5HtRFK024+xcDNRNeVj5a/HNhzonqZ+XhE3FvW0zRzpG3mu4vidOfYR7PfpfgurAbIzCeAa2i9eKVUd+cAvwqcPsH2L1Nc2/OR8g8baSZpdybFO0r0oH7prz9EAAAgAElEQVQeaVvR7QCoJoafZOb6sYUR0RzDEuAFwE+aygeApyJi38x8aJpj7Jd/i17XC59BZTFk5v0RcQlwYURcNyaGDwL/CnyIYrb7DRHx1sx8vKLwuv1v0e32e0UvfA7TFcN/UIwkvxK4YZztryx/fo9iPdVXjlOHiHg28FKKP3KmU7f/LbrdPlDxvUdVrUku+XEn8L+B3wPGJnfXA1dl5sXTG6lUjbF9IiJmUfzyupxi2YPTMvNvmpf8iIhfAxL4NnBCZv64G7FLnRYRNwOHAy8be11bRPwD8GrgJcBSiok7YyciEBEfAC4GDnciwvTr55E2FZYA92Xm347dUI4+vDsiPpaZZveacTJzJCL+nOJi6onq3BMRv0MxknBrRLxxggu3pbpZSjGb+raIOI/iurX9KCbqHAWcmJk/iYjlFJMWboqIc4CvUEzUOZnij53zTdiq4TVt/e1ZFNfz/P8TbL+WYtj7mKoCkrpgiGK0bUKZ+T3gt4H9gTURsVcVgUnTKTPvpxhpuwv4LHAvxXpsewKvzcxbyno/A44DLqFYq20d8DWK3w2nZOay6qPvT54elSRJqgFH2iRJkmrApE2SJKkGTNokSZJqoJZJ2/DwcGN4eNiL8aSSfULakX1CM1Gtl/zYtm3bLnfIkZERZs2a1clwjKGPYxgcHOyJ+7TaJ4yhV9q3T3SGMfRGDL3SJ2o50iZJktRvTNokSZJqwKRNkiSpBkzaJEmSasCkT
ZIkqQbazh6NiGcDd1Dci2x34PrMXBYRLwGuAfYC1gKnZeZPI2JP4GrgNcCPgLdl5obyWOcCZwJPAX+cmbd2/i1JkiTNPJMZaXsCODozXwUcBiyKiAXAx4BLMnM+sJUiGaP8uTUzD6a4uezHACLiUOAU4BXAIuDTEbFbJ9+MJEnSTNU2acvMRmY+Wr7co3w0gKOB68vyVcCJ5fMTyteU24+JiIGy/JrMfCIzvw+sB47oyLuQJEma4SZ1TVtE7BYR3wQeBhK4F9iWmU+WVTYCc8vnc4EHAMrt24G9m8vH2UeSJEktTOqOCJn5FHBYRAwCNwCHjFNtdNXp8Vb8bbQo30FELAGWtAhn9tDQEFCsULyrGo3GlPbvBGOYOTEMDg52MJod2SeMoY7t2yc6wxh6I4Ze6RM7dRurzNwWEV8BFgCDEbF7OZp2ALCprLYRmAdsjIjdgdnAlqbyUc37NLexAljRKo7R+8lN5ZYS3b4lhjEYw2TZJ4yh39pvxz5hDP3W/qi2p0cj4oXlCBsR8RxgIbAOuB14a1ltMXBj+fym8jXl9tsys1GWnxIRe5YzT+cDX+vUG5EkSZrJJnNN2/7A7RFxD3A3kJl5M/AB4KyIWE9xzdpQWX8I2LssPws4h2Kn7wDXAd8FbgGWlqddJUmS1Ebb06OZeQ/w6nHK72Oc2Z+Z+Thw0gTHugi4aOfDlCRJ6m/eEUGSJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqYPd2FSJiHnA1sB/wNLAiM5dHxF7AtcBBwAbg5MzcGhEDwHLgjcBjwOmZubY81mLgvPLQF2bmqs6+HUmSpJlpMiNtTwJnZ+YhwAJgaUQcCpwDrMnM+cCa8jXA8cD88rEEuAygTPKWAUcCRwDLImJOB9+LJEnSjNU2acvMzaMjZZk5AqwD5gInAKMjZauAE8vnJwBXZ2YjM+8EBiNif+C44hC5JTO3Agks6ui7kSRJmqHanh5tFhEHAa8G7gL2zczNUCR2EbFPWW0u8EDTbhvLsonKx7axhGKEbiKzh4aGABgZGdmZ8HfQaDSmtH8nGMPMiWFwcLCD0ezIPmEMdWzfPtEZxtAbMfRKn5h00hYRzwO+CPxJZj4SERNVHRinrNGifAeZuQJY0SqW4eHhBsCsWbNaVWtpZGRkSvt3gjEYw2TYJ4yh39pvxz5hDP3W/qhJzR6NiD0oErbPZeaXyuKHytOelD8fLss3AvOadj8A2NSiXJIkSW20TdrK2aBDwLrM/HjTppuAxeXzxcCNTeXvioiBiFgAbC9Po94KHBsRc8oJCMeWZZIkSWpjMqdHXwecBnw7Ir5Zln0QuBi4LiLOBIaBk8ptqymW+1hPseTHGQCZuSUiLgDuLuudn5lbOvIuJEmSZri2SVtmfpXxr0cDOGac+g1g6QTHWgms3JkAJUmS5B0RJEmSasGkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSasCkTZIkqQZM2iRJkmrApE2SJKkGTNokSZJqwKRNkiSpBkzaJEmSamD3dhUiYiXwJuDhzHxlWbYXcC1wELABODkzt0bEALAceCPwGHB6Zq4t91kMnFce9sLMXNXZtyJJkjRzTWak7Spg0Ziyc4A1mTkfWFO+BjgemF8+lgCXwc+TvGXAkcARwLKImDPV4CVJkvpF26QtM+8AtowpPgEYHSlbBZzYVH51ZjYy805gMCL2B44rDpVbMnMrkDwzEZQkSdIE2p4encC+mbkZIDM3R8Q+Zflc4IGmehvLsonKnyEillCM0k1k9tDQEAAjIyO7FDxAo
9GY0v6dYAwzJ4bBwcEORrMj+4Qx1LF9+0RnGENvxNArfWJXk7aJDIxT1mhR/gyZuQJY0aqR4eHhBsCsWbN2Nr6fGxkZmdL+nWAMxjAZ9glj6Lf227FPGEOnzfnwUS23f+09V3LE5We0rLN12e2dDGlcuzp79KHytCflz4fL8o3AvKZ6BwCbWpRLkiRpEnY1absJWFw+Xwzc2FT+rogYiIgFwPbyNOqtwLERMaecgHBsWSZJkqRJmMySH18A3gC8ICI2UswCvRi4LiLOBIaBk8rqqymW+1hPseTHGQCZuSUiLgDuLuudn5ljJzdIkiRpAm2Ttsw8dYJNx4xTtwEsneA4K4GVOxWdJEmSAO+IIEmSVAudnj0qSZJqZDIzJw/8+Fsm3F7FrEkVHGmTJEmqAZM2SZKkGjBpkyRJqgGTNkmSpBowaZMkSaoBZ49KkvqSsyZVN460SZIk1YBJmyRJUg2YtEmSJNWASZskSVINOBFBUl/554+/tOX2Q0/9R/758sMm3P7bZ93b6ZAkaVIcaZMkSaoBkzZJkqQaMGmTJEmqAa9pk/qE13JJUr2ZtElSn5lqAg8m8VI3mLRJktQl3kpLO6PypC0iFgHLgd2AKzLz4qpjkCRJqptKk7aI2A24FAhgI3B3RNyUmd/tdFtzXv3hltu/9sX3cODvfHzC7Vv/bVmnQ1KXtPsugN8HSVLvG2g0GpU1FhGvBT6UmceVr88FyMyP7sxxhoeHqwtamqQDDzxwoFtt2yfUi+wT0o6m2ieqPj06F3ig6fVG4MjmChGxBFjS4hizh4aGpiE0qTfZJ6Qd2SfUtxqNRmWPhQsXnrRw4cIrml6ftnDhwk9WGUNT2z/sRrvGYAy9+uiFz8AYeiOGbrffK49e+ByMoTdi6Hb7o4+qF9fdCMxren0AsKniGEZt61K7zYyhYAy9oRc+A2ModDuGbrffK3rhczCGQrdj6Hb7QPWnR+8G5kfES4AHgVOAt1ccw6jtXWq3mTEUjKE39MJnYAyFbsfQ7fZ7RS98DsZQ6HYM3W4fqDhpy8wnI+KPgFsplvxYmZnfqTKGfhIRXwHWZ+a7x5QfQHFt4VFjyj8F/CHwvsz8RFVxSlWIiBuB/TPziHG2/RLFqP9VwCsi4trMfFvT9j2AfwV+kJlvqihkaVpExJnAZ4C9MnOkqfwe4JCx5cChEbEKuB24ElgL/EZmPt2071cY5/eNOqvye49m5urMfFlmvjQzL6q6fY0vIn4ZeCfwEVpf4CvV1WeB34iIV42z7feAOcCngPuAEyPitKbtHwYOBM6c9iil6fdlikGb148WRMQLgFcAm8cpfw6wpixqlPXeVVWw+gVvGK9Rb6P4ZXUh8KKI+M0uxyN12i3A/cB7xtn2HiAzcwPwE+AvgE9FxIsj4reA9wNnZuZDVQUrTZfMvB+4Fzimqfho4N+BG8cph18kbU8Dfw1cFBHPneZQNYZJm0a9F7gqM58ArsHRNs0w5amcK4B3RsRzRssj4qUUlwqsaKr+l8A3gc8BVwNDmfl3FYYrTbc1PDM5u618jC1/PDMfbCr7CMVI3funO0jtyKRNlKeLfh34Qll0FXByRAx2LShpegwBzwVOaip7N/AQcNNoQZngvQd4HcX1t2dVGKNUhTXAKyNin/L10RTXrP0TxTVszeWPNO+YmY9QjEb/t/IaaVXEpE1QjLKtzswfAmTm14DvU1zjJs0YmbkZuJnyFGlE7A6cDlyZmU+Oqb4E+DGwP/CyCsOUqnBb+fOYMvH6FeCfMnMLcE9T+XxgZJz9r6C4pOYjVQSrQj8nbSvaV5l20x3DE8DsccpHR9AepxhVewfwloh4cvRBMYOoqlOk/fBvUQe98BlUEcNngd+KiEOAtwD7UvwC+nkMEXEU8D7gZIprfP5HROxZQWw/j6HCtnqx/V7RC5/DtMSQmf8JfIviVOgxwNrMHF3W4vam8qeAT46z/1PA2RSXGxw+HTGO0e1/i263D1R871FVKyKWA
28G5pcdbLT894HLgX2AEymu33k9xaygUbOBO4DXZeadlQUtTbOIeBbFRdg3UPxxsltmHtu0fZBipOHmzPwvEbEXxQXaX8jMs7sRszQdIuK/A/8Pxf/1mzPznLL8TRSJ2h3AyzNzQVl+OnBFZu7edIzVwPMoJii45Mc0q3pxXVXrMxSnga4sE7htwG9QDGdfnZk/ioj3Ajdk5rfH7hwR/0Ix2mbSphkjM5+OiMspLqKeRTFzutmnKWaQ/rey/paIOANYHRF/l5lfqTJeaRqtoRgt2xt4a1P5HRR3L/pdxhllG+Nsij9yfgasn4YY1aSfT4/OeJm5DlhAcTr07yg61p8BHwfeGxGHUSRx101wiGuBt0XEeKdYpTpbSTEh4T8pTn8CEBFvp5ik8M7MfGy0PDNvpfgj6KqIeH7FsUrT5Q6KZOvZwFdHC8uJBt+g+KPmy60OUP6e+SzFWm6aZp4elSRJqgFH2iRJkmrApE2SJKkGTNokSZJqoJZJ2/DwcGN4eNiL8aSSfULakX1CM1Gtl/zYtm3bLnfIkZERZs2a1clwjKGPYxgcHBzoYDi7zD5hDL3Svn2iM4yhN2LolT5Ry5E2SZKkfmPSJkmSVAMmbZIkSTVg0iZJklQDtZ6IIEmSNFWz1x3Vcvuj+13J7HVvaVln+yG3dzKkcTnSJkmSVANtR9oi4tkUN5Xds6x/fWYui4iXANcAewFrgdMy86cRsSdwNfAa4EfA2zJzQ3msc4EzgaeAPy5vwixJkqQ2JjPS9gRwdGa+CjgMWBQRC4CPAZdk5nxgK0UyRvlza2YeDFxS1iMiDgVOAV4BLAI+HRG7dfLNSJIkzVRtk7bMbGTmo+XLPcpHAzgauL4sXwWcWD4/oXxNuf2YiBgoy6/JzCcy8/vAeuCIjrwLSZKkGW5SExHKEbFvAAcDlwL3Atsy88myykZgbvl8LvAAQGY+GRHbgb3L8jubDtu8T3NbS4AlLcKZPTQ0BBQrFO+qRqMxpf07wRhmTgyDg4MdjGZH9gljqGP79onOMIZqYnh0vytbbn/6WYM82KZOo018negTk0raMvMp4LCIGARuAA4Zp9rorULGu01Do0X52LZWACtaxTN6P7mp3FKi27fEMAZjmCz7hDH0W/vt2CeModPazQx9cL8rmfuDM1rW6bnZo5m5DfgKsAAYjIjRpO8AYFP5fCMwD6DcPhvY0lw+zj6SJElqoW3SFhEvLEfYiIjnAAuBdcDtwFvLaouBG8vnN5WvKbfflpmNsvyUiNiznHk6H/hap96IJEnSTDaZkbb9gdsj4h7gbiAz82bgA8BZEbGe4pq1obL+ELB3WX4WcA7FTt8BrgO+C9wCLC1Pu0qSJKmNtte0ZeY9wKvHKb+PcWZ/ZubjwEkTHOsi4KKdD1OSpM6a6ir4VVzDJDXzjgiSJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUAyZtkiRJNWDSJkmSVAMmbZIkSTVg0iZJklQDJm2SJEk1sHu7ChExD7ga2A94GliRmcsjYi/gWuAgYANwcmZujYgBYDnwRuAx4PTMXFseazFwXnnoCzNzVWffjiRJ0sw0mZG2J4GzM/MQYAGwNCIOBc4B1mTmfGBN+RrgeGB++VgCXAZQJnnLgCOBI4BlETGng+9FkiRpxmqbtGXm5tGRsswcAdYBc4ETgNGRslXAieXzE4CrM7ORmXcCgxGxP3BccYjckplbgQQWdfTdSJIkzVBtT482i4iDgFcDdwH7ZuZmKBK7iNinrDYXeKBpt41l2UTlY9tYQjFCN5HZQ0NDAIyMjOxM+DtoNBpT2r8TjGHmxDA4ONjBaHZknzCGOrZfhz7x6H5Xtmzn6WcN8mCLOo0K/o26/V3olxim+l2A9t+HTvSJSSdtEfE84IvAn2TmIxExUdWBccoaL
cp3kJkrgBWtYhkeHm4AzJo1q1W1lkZGRqa0fycYgzFMhn3CGPqt/XY61Sdmr3tLy3Ye3O9K5v7gjAm3bz/k9pb7d0Iv/Fv0QwxT/S5ANd+HSc0ejYg9KBK2z2Xml8rih8rTnpQ/Hy7LNwLzmnY/ANjUolySJElttE3aytmgQ8C6zPx406abgMXl88XAjU3l74qIgYhYAGwvT6PeChwbEXPKCQjHlmWSJElqYzKnR18HnAZ8OyK+WZZ9ELgYuC4izgSGgZPKbasplvtYT7HkxxkAmbklIi4A7i7rnZ+ZWzryLiRJkma4tklbZn6V8a9HAzhmnPoNYOkEx1oJrNyZACVJkuQdESRJkmrBpE2SJKkGTNokSZJqwKRNkiSpBnbqjgiSJGlmmb3uqJbbH93vypaLz1axqKwKjrRJkiTVgEmbJElSDZi0SZIk1YBJmyRJUg2YtEmSJNWASZskSVINmLRJkiTVgEmbJElSDZi0SZIk1YBJmyRJUg2YtEmSJNWASZskSVINmLRJkiTVgEmbJElSDezerkJErATeBDycma8sy/YCrgUOAjYAJ2fm1ogYAJYDbwQeA07PzLXlPouB88rDXpiZqzr7ViRJkmauyYy0XQUsGlN2DrAmM+cDa8rXAMcD88vHEuAy+HmStww4EjgCWBYRc6YavCRJUr9om7Rl5h3AljHFJwCjI2WrgBObyq/OzEZm3gkMRsT+wHHFoXJLZm4FkmcmgpIkSZrArl7Ttm9mbgYof+5Tls8FHmiqt7Esm6hckiRJk9D2mradNDBOWaNF+TNExBKKU6sTmT00NATAyMjIzsb3i8YbjSnt3wnGMHNiGBwc7GA0O7JPGEMd269Dn3h0vytbtvP0swZ5sEWdRgX/RlV8F/wcpv4ZQPvPoRN9YleTtociYv/M3Fye/ny4LN8IzGuqdwCwqSx/w5jyr4x34MxcAaxo1fjw8HADYNasWbsSO1B05Kns3wnGYAyTYZ8whn5rv51O9YnZ697Ssp0H97uSuT84Y8Lt2w+5veX+nVDFv4Wfw9Q/A6jmc9jV06M3AYvL54uBG5vK3xURAxGxANhenj69FTg2IuaUExCOLcskSZI0CZNZ8uMLFKNkL4iIjRSzQC8GrouIM4Fh4KSy+mqK5T7WUyz5cQZAZm6JiAuAu8t652fm2MkNkiRJmkDbpC0zT51g0zHj1G0ASyc4zkpg5U5FJ0mSJKDzExF6xsuX3tJy++pzD+fwc/5lwu3fu9QVSWaKdt8F8PsgSep93sZKkiSpBkzaJEmSamDGnh5VwdPEkiTNDI60SZIk1YBJmyRJUg2YtEmSJNWASZskSVINmLRJkiTVgEmbJElSDZi0SZIk1YBJmyRJUg2YtEmSJNWAd0SYRt6NQJIkdYojbZIkSTVg0iZJklQDJm2SJEk1YNImSZJUA05EkCSpS2avO6rl9kf3u5LZ694y4fbth9ze6ZDUw0zapD6x/u3/teX2vS/+AOvf+8EJtx/8+U92OiR1yVS/C+D3QeqGypO2iFgELAd2A67IzIurjkGSJKluKk3aImI34FIggI3A3RFxU2Z+t8o4JP2f9s48Ss6qTsNPE0TCEnZhEEIkAhFZFAEhOEK03+PgAi7xCCIqqMBB2XVG3ABhZEZAVGRQIAIiBwVHHXUiw09BHBEURBYVBpQ1gLLIDhKCPX/cr5KqTqVgHOre2/ne55w63fVVit/b1f183O9+d2kv7nE0xkxURsbGxrIVk7Q9cGREvK55fjhARBz7f/nv3H777flCG/MsmTp16kip2nbC1IidMKaX/68TuW+PvhC4o+v5POCV3f9A0j7APgP+G6vMmTNnCNGMqRM7YUwvdsK0lrGxsWyP0dHRt4+Ojp7e9XzP0dHRk3Jm6Kp9b4m6zuAMtT5q+AycoY4MpevX8qjhc3CGOjKUrt955F6nbR6wftfz9YC7Mmfo8GChut04Q8IZ6qCGz8AZEqUzlK5fCzV8Ds6QKJ2hdH0g/+3RK
4CNJL0IuBPYDXhn5gwdHipUtxtnSDhDHdTwGThDonSG0vVroYbPwRkSpTOUrg9k3hEhIhYAHwL+C7geOC8ifpszgzHGGGPMRCT7Om0RMReYm7uuMcYYY8xExnuPGmOMMcZMANxoM8YYY4yZALjRZowxxhgzAXCjzRhjjDFmAtDmRtuppQPgDB2coQ5q+AycIVE6Q+n6tVDD5+AMidIZStcHMu89aowxxhhj/jba3NNmjDHGGDNhcKPNGGOMMWYC4EabMcYYY8wEwI02Y4wxxpgJQKsabZL+XdIbJBX5uSVNKlF3PKVzSHpjqd/BuByrl85QGjuRKJ2jBifsQ8JOJErnsBP9Kf4/zsycArwTuEnSv0iakbn+7yUdJ2nTzHVry7Eb6XfwWUkvKZQB4BeSzpf0ekkjBXOUxE7UkaMGJ+xDwk7UkcNO9KGVS35IWgXYHfg4cAdwGvD1iHhqyHVXJv0h7kVqMH8V+EZEPDzMujXmkDSF9DvYCxgDzgDOjYhHMmYYAUaBvYFtgW8CZ0bEjbky1IKdKJ+jtBP2oRc7UT6HnVic1jXaJK0BvAvYE7gLOAd4FbB5ROyUMcergXOBVYFvAUdHxO9z1a8hh6Q1Sb+Lg4HrgRcDX4yIk3LUH5dlFvB1YEXgGuCjEXFZ7hwlsBP15KjFiTb7AHaiphx2opdlcxcsiaRvAzOAs4E3RcTdzUvflHRlhvqTgDeQrhqmASeQTgZ/D8wFNh52hhpySHoT6cplOul3sW1E3CNpBZKUWWQcd2L+E3AA8D3gZcD5wIty5CiJnagjRw1O2IeEnagjh53oT6sabcCXIuKifi9ExNYZ6t8EXAwcFxE/7zr+reZKJhelc7wdODEiftp9MCIel7R3hvodLiOdDN4cEfO6jl8p6csZc5TETtSRowYn7EPCTtSRw070oY23RzcDNgWW7xyLiK9lqr1SRDyao9ZEyFEaSSMR0S4B+mAn6slREvuwCDtRT46S1OhEq3raJB0B7ESScS6wM/AzIIuMwAJJHwReSu/JIGfvUvEckrYjdW2/BFgOmAQ8FhFTctTvYk1J/8jin8NrMucohp2oI0clTrTeB7ATteSwE/1p25Ifs4HXAn+MiL2ALYHnZ6x/NrAO8DrgEmA9INtsyYpyfIk0I+gmYDLwfjKNYxvHOcANpHEJRwG3AlcUyFESO1FHjhqcsA8JO1FHDjvRh7Y12p6IiL+SriCmAPcAG2as/+KI+CTpauEs0iDPzTPWryZHM/NoUkQ8HRFnALNy1m9YIyLmAE9FxCXNFeR2BXKUxE5UkqMCJ+xDwk5UksNOLE6rbo+SBg+uSlpv51fAo8AvM9bvrO/zYDNm4o+kWTm5KZ3jcUnLAVdL+ixwN2kadW46n8Pdkt5Amtq/XoEcJbETdeSowQn7kLATdeSwE31oVaMtIvZvvv2ypAuAKRFxbcYIp0paDfgkadrwSsCnMtavJceepPEJHwIOAdYH3paxfodjmgU0DyN1u09p8rQGO1FNjhqcaL0PYCcqymEn+tCK2aOSthr0ekRclSuLMTVgJ4zpxU6YiUBbetpOGPDaGDDUmSCSDh30ekR8bpj1a8kh6TrS572k+lsMs35XjpOeIceBOXIUxk5UkKMGJ+zDQuxEBTnsxGBa0WiLiBKD3LtZuXD9DqVzvLFw/Q5DX9W8duzEQkrnqMGJ1vsAdqKL0jnsxABacXu0Q7P9xaHA1IjYR9JGwCYR8YPC0VqHpA2AjSLiR5ImA8tGxs3ix2VZMSIeK1G7NHaiHmpxos0+gJ2oCTuxOK3oaeviDNJsoJnN83mk/cOyyChpY+AUYO2I2EzSFsAuEXFMjvq15JD0AWAfYHXSvnLrAV8mrY2UDUnbA3NIA2ynStoS2LdrIHIbsBMV5KjBCfuwEDtRQQ470Z+2rdM2PSI+SzONNyKeAEYy1j8NOLyr/rXAbhnr15Ljg8AOwMNN/ZuAF2Ss3+HzpIUj729yXAPk3NuvBuxEHTlqc
MI+JOxEHTnsRB/a1mib33SxjgFImg48mbH+ChExfr2fBRnr15LjyYiY33kiaVkGDPocJhFxx7hDT5fIURA7UUeOKpywD4CdqCWHnehD226PHgFcAKwv6RxSK/69Gevf15wAOieD2aQFA3NTOsclkj4GTJYkYH/g+xnrd7hD0kxgrFnE8UDg+gI5SmIn6popiFQAABEcSURBVMhRgxP2IWEn6shhJ/rQqkZbRISkq0jbUIwAB0XEfRkjfBA4FZgh6U7gFmCPjPVryfFR4H3AdcC+pE2ZT89Yv8N+wBeAF5LGrVxI+mxag52oJkcNTrTeB7ATFeWwE31ozezRpmt1Z2BGc+h64IKIyN7tLGlFYJlSsyVryCFpLYCIuDd3bZOwE3XlsBPlsRN15bATi9OKRpukdYGLSV27vyZdPb0cWAeYFRF3ZciwCWkmTPfJ4NSIuHHYtWvJIWmEdOvhQ6TfwQhpfMBJEfHpYdcfl2UWcACwSXPoeuBLEfGTnDlKYSfqyFGLE233AexELTnsxGDaMhHhM8ApEbFTRBwSEQdHxI7AycCxwy7eTBv+CfAIqbv5NOAx4CeStht2/YpyHEwaH7JNRKwREasDrwR2kJRtPzeljX+/Shof8U5Sl/9c4KuSXp8rR2HsRB05ijthHxZiJ+rIYScG0JYxbdtFxHvHH4yIL0r6nwz1PwXsPq6F/l1JF5GuKHbOkKGGHO8G1D0+JCJulvQu0liBE4dcv8NHgDc307c7XC3pStKmwHMz5SiJnagjRw1O2IeEnagjh50YQFt62p4Y8NrjGepP79elGhGXABtmqF9Ljuf1G9DbjFd4Xob6HdYZJ2Mnx7XA2hlzlMRO1JGjBifsQ8JO1JHDTgygLT1tq0h6a5/jI8CUDPUHDeDMuTVG6Rzz/8bXnmsG/axVbFWSATuRKJ2jBifsQ8JOJErnsBMDaEuj7RLgTUt47acZ6q8v6Yt9jo+QphLnonSOLSU93NTrngEzAiyfoX6H6ZK+1+f4CHmvaEtiJ+rIUYMT9iFhJ+rIYScG0IpGW0TsJWkZYHZEnFcgwkcGvHZlthSFc0TEpGHXeJbsOuC147OlKIidWIidsA+AnejCTtTsxNjYWGseo6OjPy1Ye9Lo6OhxpT+DPrlWGx0dHclc8/jR0dFNK/jZD3o2x5bmh53om6uVTtiHhT+znVg8l50YcCznoxU9bV2EpA8D36TrvnRE/HnohSOelvSKYdcZhKRPAedFxA2Sng/8EHgZsEDSOyPiR5mi3ACc1ixkeQZwbkQ8lKl2N+8hrXbdzXv7HFuasRN2ooN9SNgJO9GhOifa1mjbu/navQ3FGPnuUf+6uU9+Pr0ng29nqv8O4Ojm+/eQ7s+vBWwMnAVkkTEiTgdObxZw3Au4VtKlwGkRcfGw60vanbT2zobjxi2sDNw/7PqVYSda7oR9WAw7YSeqdaJVjbaIeFHhCKuTfuGv6To2BuSScX5EdAZ2vg74RkQ8DVzfXM1kQ9Ik0mrbM4D7gGuAQyXtGxG7Dbn8VaRVz9cETug6/ghw7ZBrV4WdsBPYhx7shJ2gYida1WiTtAJwKDA1IvaRtBGwSUT8IEf9iNgrR50BPClpM+BPwCzgw12vrZArhKTPkWZpXQR8JiJ+2bz0r5kWsTw3IraS9Idm7aHWYifsBPahBzthJ6jYiVY12kj3xX8FzGyezyN1QWeRUdLGwCnA2hGxmaQtgF0i4pgc9YGDgG+RurpPjIhbmlyvJ+21l4vfAJ+IiH4LVm6bof5ykt4DbN9vXaaMtyFqwE7YCfvQi52wE9U60ZYdETpMj4jPAk8BRMQTpPv1uTgNOLyr/rXAsG8FLiQifhERMyLt53Z01/G5pP3ecrHHeBEl/bjJkmOg6X7AdsCqpCu57scbM9SvCTthJ+xDL3bCTlTrRNt62uZLmkyzYJ+k6cCTGeuvEBG/lNR9bEHG+j1IWgV4G2nA5UsY8sKJkpYnda+vKWk1Fp0IpwDrDrN2N
xHxM+Bnkq6MiDm56laKneiijU7Yh8WwE13YibqcaFuj7QjgAtKKz+cAO5Cm7+bivuYE0DkZzCYNdsxGczLahSTgVqTZMG8mz4rf+5Ku1NYlDfTs8DBwcob6PUTEnGbsxqZ0rbQdEV/LnaUgdsJOAPahCzthJ4A6nRgZGxt75n+1FCFpDVK35whwefTZmHaItTcETiWNlXgAuAV4V0Tcmqn+OcCrgQuBb5AGeP4+92wpSQdExEk5ay4hxxHATiQh5wI7Az+LiNklc+XGTtiJJoN9aLATdqLJUJ0TrehpkzSjWShwq+ZQ56plqqSpEXHVkt77XBIRNwOjklYElomIQRvzDoPNSCeB64EbmoUcs7XaJb0mIi4C7qxkcOdsYEvg15G2sFkbOD1zhiLYiYXYiUW01gewE13YiUVU50QrGm2k6dv70LveSocxetfDGRpKq0u/DZgGLNsZsxARn85RPyK2lDSD1OX9I0n3ACtLWici/pghwo6kq7Z+mzLnXIeowxMR8VdJCyRNAe6hPRtk2wnsxDja7APYiU4dO7GI6pxoze1RpY2At4+ISwtmuAB4iDSd/OnO8Yjod5LIkWdrkpizgXkRMfMZ3vJc1Cy5IfP4LP8GfIw0M+sw4FHg6grWScqCneibp7VOtN0HsBNLyGMnanKi9Ea0mTd/vaxw/d+U/gyWkGtkdHR0x4z1im3IPO5nXr/r+bTR0dEtSucq8DnYif65WuWEfej5LOxE/1x2ogIn2nJ7tMOFkt4GfDsWbdORk59L2jwiritQu7MR8CByrfxcbEPmrlpjkr4LvKJ5fmuu2pVhJwbTCifsQw92YjB2oiBta7QdCqwILJD0F9LMoLGImDLMopKuI92LXxbYS9LNpHV/OvW3GGb9Lh7rc2xF4H3AGkCWMROU35C5w+WStomIKzLXrQk7sThtdcI+JOzE4tiJSmjTmLYRYP2IuL1A7Q0GvR4Rt+XK0kHSyqTtSt4HnAecEBH35M5REkm/AzYGbiOdqHKfHItiJ3ppuxNt9wHsxHjsRH1OtKanrenq/A5NV2fm2rcBSDo7Ivbsfk3S2cCefd84BCStTrqS3AM4C9gqIh7IVb/J8O5+xwssWLhz5npVYScW1rMTiVb7AHaiq56dSFTnRGsabQ2luzpf2v1E0iQynhwkHQe8lbRw4+YR8Wiu2uPYpuv75YHXkla+zt1o+zvgt511kJqryk1JV1VtwU7YiQ72IWEn7ESH6pxoW6NtFrCvpKxdnZIOJ00bnizp4ebwCDCfJEYuDiONkfgE8HEt2tsuy5iNDhFxQPdzpb3tzs5RexynkLZo6fBYn2NLO3bCTnSwDwk7YSc6VOdE2xptRbo6I+JY4FhJx0bE4SUyNDmWKVX7GXgc2KhA3ZHu2WHNIop2IgN24hkp4YR9SNiJOrETtKzR1jVm4AV0bf6asf7hklYj/eF1bz6bYxPeapD0fZrNkIFlSN3NJRZRvFnSgaQrJ4D9gZsL5CiGnaiDSpxovQ9gJ2rBTvSnVY02SbuQtihZl7QdxQak/dVeOuh9z2H995Nm4qwHXE3akPgyMm2PUhHHd32/ALgtIuYVyLEf8EXSbYAx4MekbWxag52ohhqcaL0PYCcqwk70oVWNNuBokgA/ioiXS5oF7J6x/kGkwZWXR8SsZn+3ozLWr4Xbgbsj4i8AkiZLmpZz8cJmcO8eEbFbrpqVYifqoKgT9qEHO1EHdqIPtd67HhZPRcT9wDKSlomIi4GXZaz/l64/wOdHxA3AJhnr18L5wF+7nj/dHMtGRDwN7JqzZqXYiToo6oR96MFO1IGd6EPbetoelLQS8FPgHEn3kLpdczFP0qrAd0lbdDwA3JWxfi0sGxHzO08iYr6k5QrkuFTSl1h8m5SrCmQphZ2ogxqcsA8JO1EHdqIPrWi0SXoxsDap1fwEcAhp0cANgAMGvPU5JSLe0nx7pKSLgVWAC3LVr4h7Je0SEd8DkLQrcF+BHDObr93bsozRg
rEjdqI6anCitT6AnagQO9GHVjTagM8DH4uITkv5r8BZkrYGjgTelCuIpFcBG0XEGZLWAl4I3JKrfiXsR7qCPZkkwDyg7+rXwyQiZuWuWRF2oi6KO9FyH8BO1Iad6ENbGm3TIuLa8Qcj4kpJ03KFkHQEsDVpfMIZwPOArwM75MpQAxHxB2C75hbESGe16VxIeldEfF3SoUvI97mceQphJyqipBP2YSF2oiLsRH/aMhFh0Fo7k7OlgLcAu9DcG4+Iu4CVM9avAklrS5oDnB8Rj0jaVNL7MkZYsfm6cp/HShlzlMROVERhJ+xDwk5UhJ3oT1t62q6Q9IGIOK37YPMH8KuMOeZH2pB4rKm/4jO9YSnlTNIV5Meb5zeSBnrOyVE8Ir7SfF1sGr2kbRZ/x1KJnaiLMynkhH1YiJ2oizOxE4vRlkbbwcB3JO3BIvm2BpYjXdXk4jxJXwFWlfQBYG/gtGd4z9LImhFxntJee0TEAklPlwojaVNgN9JaTA+R/jaWduxEXVTjREt9ADtRG3aiD61otEXEn4CZzSKJmzWH/zMiLspRX9LBwKWkga6zgIdJ4xU+FRGRI0NlPCZpDZotSiRtRxIhG5I2IAm4O2k6/wbA1jkX+C2JnaiOok603QewExViJ/rQikZbh2aRxIsLlF4P+AIwA7gW+DlJzpxd7jVxKPA9YLqkS4G1gNm5ikv6OWka/TeA2RFxk6RbSstYAjtRDcWcsA+92IlqsBN9aMtEhKJExIcjYiawDvAx4M+kLu/fSPpd0XAZkbSNpHWahQl3JH0WTwIXkqZz5+Je0oDStUknAli0MbHJgJ1IVOKEfagAO5GwE4Nxoy0vk4EppBb8KqRVrn9RNFFevgJ0VrieSRpgejLwAHBqrhARsSuwOXAVcJSkW4DVJG2bK4NZiJ0o7IR9qA47YSeWyMjYWBWNx6UaSacCLwUeIcl3OWkz4AeKBsuMpGsiYsvm+5OBeyPiyOb51RGRc3+/7lwvAN5BGruwfkSsXyJHm7ATiRqdsA9lsBMJOzEY97TlYSrwfOCPwJ2kLt4HiyYqwyRJnXGUrwW6B/iWHF/5WESc1NyaeFXBHG3CTiRqdMI+lMFOJOzEANxoy0BE/AOwDXB8c+gw0ppAF0pabB2YpZhzgUsk/Qdpb7//hoV7/mWdPdrUndmMFbm+eb4l8E+5c7QRO7GQapywD2WxEwuxEwNwoy0TETEWEb8B5gI/JM0Kmg4cVDRYRiLin0knojOBV0VE5978MmTckLmLE4HXAfc3+a4BXl0gRyuxE9U5YR8KYyfsxDPRqiU/SiHpQNKAyh2Ap0giXgZ8FbiuYLTsRMTlfY7dWCJLU/sOSd2Hii3y2ybsxCJqcsI+lMNOLMJOLBk32vIwDfgWcEhE3F04i1nEHZJmAmOSlgMOpOkGN0NnGnaiNuxDWaZhJ2qjOifcaMtARBxaOoPpy36kxSxfSBr0eyGwf9FELcFOVIl9KIidqJLqnHCjzbSZTSJij+4DknYg3ZYwpm3YB2N6qc4JT0QwbeakZ3nMmDZgH4zppTon3NNmWoek7UkDfteS1H1LYgowqUwqY8pgH4zppWYn3GgzbWQ5YCXS3//KXccfJuPG9cZUgn0wppdqnfA2Vqa1SNogIm4rncOYGrAPxvRSoxPuaTNt5nFJx5H2+1u+czAiXlMukjHFsA/G9FKdE56IYNrMOcANwIuAo4BbgStKBjKmIPbBmF6qc8KNNtNm1oiIOcBTEXFJROwNbFc6lDGFsA/G9FKdE749atrMU83XuyW9AbgLWK9gHmNKYh+M6aU6J9xoM23mGEmrkDYnPok0nfvgspGMKYZ9MKaX6pzw7FFjupB0cER8vnQOY2rAPhjTS2knPKbNmF68/58xi7APxvRS1Ak32ozpZaR0AGMqwj4Y00tRJ9xoM6YXjxcwZhH2wZheijrhiQimdUh6hP7ijQCTM8cxpij2wZheanbCExGMMcYYYyYAvj1qjDHGGDMBc
KPNGGOMMWYC4EabMcYYY8wEwI02Y4wxxpgJwP8CbVDuhPZZGooAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# both days\n", + "delay_type_plot(air_dec, \"12-17-16\", \"12-18-16\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# only the 17th\n", + "delay_type_plot(air_dec, \"12-17-16\", \"12-17-16\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# only the 18th\n", + "delay_type_plot(air_dec, \"12-18-16\", \"12-18-16\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this exercise was to drive home the ability to *automate*\n", + "tasks.\n", + "\n", + "We were able to write a pair of `functions` that allows us to easily\n", + "repeat the exact same analysis on different subsets of the data, or\n", + "different datasets entirely (e.g. we could do the same analysis on\n", + "November 2016 data, with two lines of code).\n", + "\n", + "These principles can be applied in many settings.\n", + "\n", + "Keep that in mind as we work through the rest of the materials." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise: Cohort Analysis using Shopify Data\n", + "\n", + "The `qeds` library includes routines to simulate data sets in the\n", + "format of common sources.\n", + "\n", + "One of these sources is [Shopify](https://www.shopify.com/) — an\n", + "e-commerce platform used by many retail companies for online sales.\n", + "\n", + "The code below will simulate a fairly large data set that has the\n", + "properties of an order-detail report from Shopify.\n", + "\n", + "We’ll first look at the data, and then describe the exercise." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 502079 entries, 0 to 502078\n", + "Data columns (total 14 columns):\n", + "Day 502079 non-null object\n", + "customer_type 502079 non-null object\n", + "Customer ID 502079 non-null int64\n", + "orders 502079 non-null int64\n", + "total_sales 502079 non-null float64\n", + "Returns 502079 non-null float64\n", + "Ordered quantity 502079 non-null int64\n", + "Gross sales 502079 non-null float64\n", + 
"Net sales 502079 non-null float64\n", + "Shipping 502079 non-null float64\n", + "Tax 502079 non-null float64\n", + "Net quantity 502079 non-null int64\n", + "Returned quantity 502079 non-null int64\n", + "Discounts 502079 non-null float64\n", + "dtypes: float64(7), int64(5), object(2)\n", + "memory usage: 53.6+ MB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Daycustomer_typeCustomer IDorderstotal_salesReturnsOrdered quantityGross salesNet salesShippingTaxNet quantityReturned quantityDiscounts
02017-01-03Returning9111616584181.140.0281.1481.140.00.0200.0
12016-07-19First-time9139556302117.740.0217.7417.740.00.0200.0
22015-08-26Returning89914591281230.030.05230.03230.030.00.0500.0
32017-10-20Returning3437913651115.280.07115.28115.280.00.0700.0
42017-12-28Returning95592596581234.380.07234.38234.380.00.0700.0
\n", + "
" + ], + "text/plain": [ + " Day customer_type Customer ID orders total_sales Returns \\\n", + "0 2017-01-03 Returning 9111616584 1 81.14 0.0 \n", + "1 2016-07-19 First-time 9139556302 1 17.74 0.0 \n", + "2 2015-08-26 Returning 8991459128 1 230.03 0.0 \n", + "3 2017-10-20 Returning 343791365 1 115.28 0.0 \n", + "4 2017-12-28 Returning 9559259658 1 234.38 0.0 \n", + "\n", + " Ordered quantity Gross sales Net sales Shipping Tax Net quantity \\\n", + "0 2 81.14 81.14 0.0 0.0 2 \n", + "1 2 17.74 17.74 0.0 0.0 2 \n", + "2 5 230.03 230.03 0.0 0.0 5 \n", + "3 7 115.28 115.28 0.0 0.0 7 \n", + "4 7 234.38 234.38 0.0 0.0 7 \n", + "\n", + " Returned quantity Discounts \n", + "0 0 0.0 \n", + "1 0 0.0 \n", + "2 0 0.0 \n", + "3 0 0.0 \n", + "4 0 0.0 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set the \"randomness\" seeds\n", + "random.seed(42)\n", + "np.random.seed(42)\n", + "\n", + "orders = qeds.data.shopify.simulate_orders(500000)\n", + "orders.info()\n", + "\n", + "orders.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a customer’s cohort as the month in which a customer placed\n", + "their first order and the customer type as an indicator of whether this\n", + "was their first order or a returning order.\n", + "\n", + "We now describe the *want* for the exercise, which we ask you to\n", + "complete.\n", + "\n", + "**Want**: Compute the monthly total number of orders, total sales, and\n", + "total quantity separated by customer cohort and customer type.\n", + "\n", + "Read that carefully one more time…" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extended Exercise\n", + "\n", + "Using the reshape and `groupby` tools you have learned, apply the want\n", + "operator described above.\n", + "\n", + "See below for advice on how to proceed.\n", + "\n", + "When you are finished, you should have something that looks like this:\n", + 
"\n", + "![https://datascience.quantecon.org/assets/_static/groupby_files/groupby_cohort_analysis_exercise_output.png](https://datascience.quantecon.org/assets/_static/groupby_files/groupby_cohort_analysis_exercise_output.png) \n", + "Two notes on the table above:\n", + "\n", + "1. \n", + "
\n", + "
Your actual output will be much bigger. This is just to give you an
\n", + "
\n", + " idea of what it might look like. \n", + "
\n", + " \n", + "
\n", + " \n", + "1. \n", + "
\n", + "
The numbers you produce should actually be the same as what are
\n", + "
\n", + " included in this table… Index into your answer and compare what you\n", + " have with this table to verify your progress. \n", + "
\n", + " \n", + "
\n", + " \n", + "\n", + "\n", + "Now, how to do it?\n", + "\n", + "There is more than one way to code this, but here are some suggested\n", + "steps.\n", + "\n", + "1. Convert the `Day` column to have a `datetime` `dtype` instead\n", + " of object (Hint: use the `pd.to_datetime` function). \n", + "1. Add a new column that specifies the date associated with each\n", + " customer’s `\"First-time\"` order. \n", + " - Hint 1: You can do this with a combination of `groupby` and\n", + " `join`. \n", + " - Hint 2: `customer_type` is always one of `Returning` or\n", + " `First-time`. \n", + " - Hint 3: Some customers don’t have a\n", + " `customer_type == \"First-time\"` entry. You will need to set the\n", + " value for these users to some date that precedes the dates in the\n", + " sample. After adding valid data back into the `orders` DataFrame,\n", + " you can identify which customers don’t have a `\"First-time\"`\n", + " entry by checking for missing data in the new column. \n", + "1. You’ll need to group by 3 things. \n", + "1. You can apply one of the built-in aggregation functions to the GroupBy. \n", + "1. After doing the aggregation, you’ll need to use your reshaping skills to\n", + " move things to the right place in rows and columns. \n", + "\n", + "\n", + "Good luck!" 
+ ] + } + ], + "metadata": { + "date": 1584040760.646586, + "filename": "groupby.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "title": "GroupBy" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/8_groupby_exercises.ipynb b/Session_7/8_groupby_exercises.ipynb new file mode 100644 index 0000000..e6db260 --- /dev/null +++ b/Session_7/8_groupby_exercises.ipynb @@ -0,0 +1,1547 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 233, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: qeds in c:\\users\\asus\\anaconda3\\lib\\site-packages (0.6.2)\n", + "Requirement already satisfied: pandas in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.25.1)\n", + "Requirement already satisfied: numpy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.16.5)\n", + "Requirement already satisfied: quantecon in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.4.6)\n", + "Requirement already satisfied: matplotlib in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.1.1)\n", + "Requirement already satisfied: scipy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (1.3.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.21.3)\n", + "Requirement already satisfied: seaborn in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.9.0)\n", + "Requirement already satisfied: statsmodels in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.10.1)\n", + "Requirement already satisfied: pandas-datareader 
in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.8.1)\n", + "Requirement already satisfied: plotly in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (4.5.4)\n", + "Requirement already satisfied: pyarrow in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (0.16.0)\n", + "Requirement already satisfied: quandl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.5.0)\n", + "Requirement already satisfied: openpyxl in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (3.0.0)\n", + "Requirement already satisfied: requests in c:\\users\\asus\\anaconda3\\lib\\site-packages (from qeds) (2.22.0)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2019.3)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from pandas->qeds) (2.8.0)\n", + "Requirement already satisfied: numba>=0.38 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (0.45.1)\n", + "Requirement already satisfied: sympy in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quantecon->qeds) (1.4)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (1.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from matplotlib->qeds) (2.4.2)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from scikit-learn->qeds) (0.13.2)\n", + "Requirement already satisfied: patsy>=0.4.0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from statsmodels->qeds) (0.5.1)\n", + "Requirement already satisfied: lxml in c:\\users\\asus\\anaconda3\\lib\\site-packages (from 
pandas-datareader->qeds) (4.4.1)\n", + "Requirement already satisfied: six in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.12.0)\n", + "Requirement already satisfied: retrying>=1.3.3 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from plotly->qeds) (1.3.3)\n", + "Requirement already satisfied: more-itertools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (7.2.0)\n", + "Requirement already satisfied: inflection>=0.3.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from quandl->qeds) (0.3.1)\n", + "Requirement already satisfied: et-xmlfile in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.0.1)\n", + "Requirement already satisfied: jdcal in c:\\users\\asus\\anaconda3\\lib\\site-packages (from openpyxl->qeds) (1.4.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2019.9.11)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (1.24.2)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from requests->qeds) (2.8)\n", + "Requirement already satisfied: llvmlite>=0.29.0dev0 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from numba>=0.38->quantecon->qeds) (0.29.0)\n", + "Requirement already satisfied: mpmath>=0.19 in c:\\users\\asus\\anaconda3\\lib\\site-packages (from sympy->quantecon->qeds) (1.1.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\asus\\anaconda3\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib->qeds) (41.4.0)\n" + ] + } + ], + "source": [ + "! 
pip install qeds\n", + "import random\n", + "import numpy as np\n", + "import pandas as pd\n", + "import qeds\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import qeds\n", + "qeds.themes.mpl_style();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GroupBy - Exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We will begin with a simple made-up dataset to discuss the concepts and then work through an extended example and exercises with real data.**" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0111.0
1112.0
2123.0
322NaN
4215.0
521NaN
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 1 1.0\n", + "1 1 1 2.0\n", + "2 1 2 3.0\n", + "3 2 2 NaN\n", + "4 2 1 5.0\n", + "5 2 1 NaN" + ] + }, + "execution_count": 234, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C = np.arange(1, 7, dtype=float)\n", + "C[[3, 5]] = np.nan\n", + "df = pd.DataFrame({\n", + " \"A\" : [1, 1, 1, 2, 2, 2],\n", + " \"B\" : [1, 1, 2, 2, 1, 1],\n", + " \"C\": C,\n", + "})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We create a DataFrameGroupBy to use in what follows.**" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0111.0
1112.0
2123.0
322NaN
4215.0
521NaN
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 1 1.0\n", + "1 1 1 2.0\n", + "2 1 2 3.0\n", + "3 2 2 NaN\n", + "4 2 1 5.0\n", + "5 2 1 NaN" + ] + }, + "execution_count": 235, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbA = df.groupby(\"A\")\n", + "gbA.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Try the following operations: gbA.sum(), gbA.mean(), gbA.count(). How did pandas compute the sum of `gbA`? What happened to the `NaN` entries in column `C`? Write your thoughts.** " + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is gbA.sum()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
A
146.0
245.0
\n", + "
" + ], + "text/plain": [ + " B C\n", + "A \n", + "1 4 6.0\n", + "2 4 5.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is gbA.mean()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
A
11.3333332.0
21.3333335.0
\n", + "
" + ], + "text/plain": [ + " B C\n", + "A \n", + "1 1.333333 2.0\n", + "2 1.333333 5.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is gbA.count()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
A
133
231
\n", + "
" + ], + "text/plain": [ + " B C\n", + "A \n", + "1 3 3\n", + "2 3 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# the NaN entries in column C are disregarded in the analysis, regardless of the operation we use\n", + " # for A = 2: the sum is 5, the mean is also 5 (computed with one element), the count gives us 1 element\n", + "\n", + "print(\"This is gbA.sum()\")\n", + "display(gbA.sum())\n", + "print(\"This is gbA.mean()\")\n", + "display(gbA.mean())\n", + "print(\"This is gbA.count()\")\n", + "display(gbA.count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Use introspection (tab completion) to see what other aggregations are defined for GroupBy objects. Pick three and evaluate them in the cells below.**" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is gbA.max()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
A
123.0
225.0
\n", + "
" + ], + "text/plain": [ + " B C\n", + "A \n", + "1 2 3.0\n", + "2 2 5.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is gbA.median()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
A
112.0
215.0
\n", + "
" + ], + "text/plain": [ + " B C\n", + "A \n", + "1 1 2.0\n", + "2 1 5.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is gbA.diff()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
0NaNNaN
10.01.0
21.01.0
3NaNNaN
4-1.0NaN
50.0NaN
\n", + "
" + ], + "text/plain": [ + " B C\n", + "0 NaN NaN\n", + "1 0.0 1.0\n", + "2 1.0 1.0\n", + "3 NaN NaN\n", + "4 -1.0 NaN\n", + "5 0.0 NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"This is gbA.max()\")\n", + "display(gbA.max())\n", + "print(\"This is gbA.median()\")\n", + "display(gbA.median())\n", + "print(\"This is gbA.diff()\")\n", + "display(gbA.diff())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Does the output of each of these commands have the same features as the output of `gbA.sum()` from above? If not, what is different?**" + ] + }, + { + "cell_type": "code", + "execution_count": 238, + "metadata": {}, + "outputs": [], + "source": [ + "#the three commands chosen seem to have the same features, excet for the last one where the key disappears" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. **Write a function that, given a DataFrame, computes each entry's deviation from the mean of its column and apply the function to `gbA`.**" + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BC
0-0.333333-1.0
1-0.3333330.0
20.6666671.0
30.666667NaN
4-0.3333330.0
5-0.333333NaN
\n", + "
" + ], + "text/plain": [ + " B C\n", + "0 -0.333333 -1.0\n", + "1 -0.333333 0.0\n", + "2 0.666667 1.0\n", + "3 0.666667 NaN\n", + "4 -0.333333 0.0\n", + "5 -0.333333 NaN" + ] + }, + "execution_count": 239, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# write function here\n", + "def mean_deviation(x):\n", + " avg = x.mean()\n", + " \n", + " return x - avg\n", + "\n", + "# apply function here\n", + "mean_deviation = gbA.apply(mean_deviation)\n", + "mean_deviation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Describe what the index and and columns are? Where are the group keys (the `A` column)?**" + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 6 entries, 0 to 5\n", + "Data columns (total 2 columns):\n", + "B 6 non-null float64\n", + "C 4 non-null float64\n", + "dtypes: float64(2)\n", + "memory usage: 224.0 bytes\n" + ] + } + ], + "source": [ + "mean_deviation.info()\n", + "#the index contains 6 entries, from 0 to 5\n", + "#the columns are B and C\n", + "#the group keys no longer appear\n", + " #gbA.groups.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Determine the correct way to add these results back into `df` as new columns.** \n", + " - Hint: remember the merge lecture" + ] + }, + { + "cell_type": "code", + "execution_count": 241, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCB_devC_dev
0111.0-0.333333-1.0
1112.0-0.3333330.0
2123.00.6666671.0
322NaN0.666667NaN
4215.0-0.3333330.0
521NaN-0.333333NaN
\n", + "
" + ], + "text/plain": [ + " A B C B_dev C_dev\n", + "0 1 1 1.0 -0.333333 -1.0\n", + "1 1 1 2.0 -0.333333 0.0\n", + "2 1 2 3.0 0.666667 1.0\n", + "3 2 2 NaN 0.666667 NaN\n", + "4 2 1 5.0 -0.333333 0.0\n", + "5 2 1 NaN -0.333333 NaN" + ] + }, + "execution_count": 241, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add output of function as new columns to df here...\n", + "mean_deviation_tu = mean_deviation.rename(columns={\"B\":\"B_dev\", \"C\":\"C_dev\"},inplace=True)\n", + "\n", + "merged=pd.concat([df, mean_deviation], axis=1)\n", + "merged\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "
Note that if the group keys
\n", + "
\n", + "remained in the index as the `.apply`'s output, the merge/join step would have been complicated." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We use an airline DataFrame:**" + ] + }, + { + "cell_type": "code", + "execution_count": 242, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierTailNumFlightNumOriginCityMarketIDOriginOriginCityNameOriginStateNameDestCityMarketIDDestDestCityName...AirTimeDistanceCarrierDelayWeatherDelayNASDelaySecurityDelayLateAircraftDelayFirstDepTimeTotalAddGTimeDate
0AAN3JHAA4631650MSPMinneapolis, MNMinnesota30977ORDChicago, IL...58.0334.00.00.020.00.00.0NaNNaN2016-12-18
1AAN3DPAA4631650MSPMinneapolis, MNMinnesota30977ORDChicago, IL...57.0334.00.00.020.00.00.0NaNNaN2016-12-19
2AAN3KUAA4631650MSPMinneapolis, MNMinnesota30977ORDChicago, IL...49.0334.00.00.00.00.00.0NaNNaN2016-12-20
3AAN3FBAA4631650MSPMinneapolis, MNMinnesota30977ORDChicago, IL...51.0334.00.00.00.00.00.0NaNNaN2016-12-21
4AAN3BLAA4631650MSPMinneapolis, MNMinnesota30977ORDChicago, IL...51.0334.00.00.00.00.00.0NaNNaN2016-12-22
\n", + "

5 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + " Carrier TailNum FlightNum OriginCityMarketID Origin OriginCityName \\\n", + "0 AA N3JHAA 46 31650 MSP Minneapolis, MN \n", + "1 AA N3DPAA 46 31650 MSP Minneapolis, MN \n", + "2 AA N3KUAA 46 31650 MSP Minneapolis, MN \n", + "3 AA N3FBAA 46 31650 MSP Minneapolis, MN \n", + "4 AA N3BLAA 46 31650 MSP Minneapolis, MN \n", + "\n", + " OriginStateName DestCityMarketID Dest DestCityName ... AirTime Distance \\\n", + "0 Minnesota 30977 ORD Chicago, IL ... 58.0 334.0 \n", + "1 Minnesota 30977 ORD Chicago, IL ... 57.0 334.0 \n", + "2 Minnesota 30977 ORD Chicago, IL ... 49.0 334.0 \n", + "3 Minnesota 30977 ORD Chicago, IL ... 51.0 334.0 \n", + "4 Minnesota 30977 ORD Chicago, IL ... 51.0 334.0 \n", + "\n", + " CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay \\\n", + "0 0.0 0.0 20.0 0.0 0.0 \n", + "1 0.0 0.0 20.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " FirstDepTime TotalAddGTime Date \n", + "0 NaN NaN 2016-12-18 \n", + "1 NaN NaN 2016-12-19 \n", + "2 NaN NaN 2016-12-20 \n", + "3 NaN NaN 2016-12-21 \n", + "4 NaN NaN 2016-12-22 \n", + "\n", + "[5 rows x 36 columns]" + ] + }, + "execution_count": 242, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_dec = qeds.load(\"airline_performance_dec16\")\n", + "air_dec.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
**Which type of delay was the most common?**" + ] + }, + { + "cell_type": "code", + "execution_count": 243, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SecurityDelay 356\n", + "WeatherDelay 5946\n", + "NASDelay 55765\n", + "LateAircraftDelay 56377\n", + "CarrierDelay 57189\n", + "dtype: int64" + ] + }, + "execution_count": 243, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#The air_dec DataFrame has information on the minutes of delay attributed to 5 different categories:\n", + "delay_cols = [\n", + " 'CarrierDelay',\n", + " 'WeatherDelay',\n", + " 'NASDelay',\n", + " 'SecurityDelay',\n", + " 'LateAircraftDelay'\n", + "]\n", + "\n", + "air_dec[delay_cols]=(air_dec[delay_cols] > 0) \n", + "air_dec[delay_cols][air_dec[delay_cols]==True].count().sort_values()\n", + "#CarrierDelay was the most frequent one" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. **Which one caused the largest average delay?** " + ] + }, + { + "cell_type": "code", + "execution_count": 244, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SecurityDelay 0.000772\n", + "WeatherDelay 0.012899\n", + "NASDelay 0.120979\n", + "LateAircraftDelay 0.122306\n", + "CarrierDelay 0.124068\n", + "dtype: float64" + ] + }, + "execution_count": 244, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_dec[delay_cols].mean().sort_values()\n", + "#CarrierDelay caused the largest average delay" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. **Does that vary by airline?** " + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "metadata": { + "hide-output": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierDelayWeatherDelayNASDelaySecurityDelayLateAircraftDelay
Carrier
AA0.1126120.0123170.1181270.0012330.089361
AS0.0671720.0161210.1544610.0020670.078333
B60.1977970.0058780.1495400.0029180.158089
DL0.1041860.0204990.0991870.0001680.076663
EV0.1134540.0045580.1240100.0000000.122304
F90.1800270.0105250.1810060.0000000.154693
HA0.1154880.0277300.0023630.0009450.077517
NK0.1178980.0074840.2032830.0014490.089731
OO0.0902510.0163820.1498950.0004840.150500
UA0.1293140.0186580.1196680.0001750.100901
VX0.1131020.0499750.1205000.0031230.142693
WN0.1500460.0062810.1060320.0006950.170568
\n", + "
" + ], + "text/plain": [ + " CarrierDelay WeatherDelay NASDelay SecurityDelay \\\n", + "Carrier \n", + "AA 0.112612 0.012317 0.118127 0.001233 \n", + "AS 0.067172 0.016121 0.154461 0.002067 \n", + "B6 0.197797 0.005878 0.149540 0.002918 \n", + "DL 0.104186 0.020499 0.099187 0.000168 \n", + "EV 0.113454 0.004558 0.124010 0.000000 \n", + "F9 0.180027 0.010525 0.181006 0.000000 \n", + "HA 0.115488 0.027730 0.002363 0.000945 \n", + "NK 0.117898 0.007484 0.203283 0.001449 \n", + "OO 0.090251 0.016382 0.149895 0.000484 \n", + "UA 0.129314 0.018658 0.119668 0.000175 \n", + "VX 0.113102 0.049975 0.120500 0.003123 \n", + "WN 0.150046 0.006281 0.106032 0.000695 \n", + "\n", + " LateAircraftDelay \n", + "Carrier \n", + "AA 0.089361 \n", + "AS 0.078333 \n", + "B6 0.158089 \n", + "DL 0.076663 \n", + "EV 0.122304 \n", + "F9 0.154693 \n", + "HA 0.077517 \n", + "NK 0.089731 \n", + "OO 0.150500 \n", + "UA 0.100901 \n", + "VX 0.142693 \n", + "WN 0.170568 " + ] + }, + "execution_count": 245, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "air_dec.groupby(\"Carrier\")[delay_cols].mean()\n", + "#it does: \n", + " #AA - LateAircraftDelay\n", + " #EV - CarrierDelay" + ] + } + ], + "metadata": { + "date": 1584040760.646586, + "filename": "groupby.rst", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "title": "GroupBy" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Session_7/README.md b/Session_7/README.md deleted file mode 100644 index e69de29..0000000 diff --git a/Session_7/Readme.md.txt b/Session_7/Readme.md.txt new file mode 100644 index 0000000..304360c --- /dev/null +++ b/Session_7/Readme.md.txt @@ -0,0 +1 @@ +Readme diff --git 
a/Session_8/Readme.md.txt b/Session_8/Readme.md.txt new file mode 100644 index 0000000..304360c --- /dev/null +++ b/Session_8/Readme.md.txt @@ -0,0 +1 @@ +Readme