From e6fd5a29fef68870e13d2545da718215a4a0cdda Mon Sep 17 00:00:00 2001 From: John Evans Date: Mon, 28 Oct 2019 10:47:11 -0400 Subject: [PATCH 1/4] Update python smoothing script for versions > 0.17 'EWMA' become an instance method in 0.18 and changed name to 'ewm'. --- Ch02/smooth.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Ch02/smooth.py b/Ch02/smooth.py index b65b70e..19fb67b 100644 --- a/Ch02/smooth.py +++ b/Ch02/smooth.py @@ -3,6 +3,6 @@ air = pd.read_csv('data/AirPassengers.csv', parse_dates = True, header = None) air.columns = ['Date', 'Passengers'] -air['Smooth.5'] = pd.ewma(air, alpha = .5).Passengers -air['Smooth.1'] = pd.ewma(air, alpha = .1).Passengers -air['Smooth.9'] = pd.ewma(air, alpha = .9).Passengers +air['Smooth.5'] = air['Passengers'].ewm(alpha=.5).mean() +air['Smooth.1'] = air['Passengers'].ewm(alpha=.1).mean() +air['Smooth.9'] = air['Passengers'].ewm(alpha=.9).mean() From aef1c3c510359b308e9135c46b7bab3070208212 Mon Sep 17 00:00:00 2001 From: John Evans Date: Fri, 24 Jan 2020 07:12:15 -0800 Subject: [PATCH 2/4] Add myemails.py --- Ch02/myemails.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 Ch02/myemails.py diff --git a/Ch02/myemails.py b/Ch02/myemails.py new file mode 100644 index 0000000..6873dbf --- /dev/null +++ b/Ch02/myemails.py @@ -0,0 +1,41 @@ +import pandas as pd + +emails = pd.read_csv('data/emails.csv', + header=0, + names=['emailsOpened', 'member', 'week'], + parse_dates=['week']) + +print(emails.head()) + +donations = pd.read_csv('data/donations.csv', parse_dates=['timestamp']) +donations.columns = ['amount', 'timestamp', 'member'] + +YearJoined = pd.read_csv('data/year_joined.csv') +YearJoined.columns = ['memberId', 'memberStats', 'yearJoined'] + +complete_idx = pd.MultiIndex.from_product((set(emails.week), set(emails.member))) + +all_email = (emails.set_index(['week', 'member']) + .reindex(complete_idx, fill_value=0) + .reset_index()) +all_email.columns = ['week', 'member', 'emailsOpened'] + +print(all_email[all_email.member==998].sort_values(by='week')) + +cutoff_dates = emails.groupby('member').week.agg(['min', 'max']).reset_index() + +for _, row in cutoff_dates.iterrows(): + member = row['member'] + start_date = row['min'] + end_date = row['max'] + query = 'member == @member and (week < @start_date or week > @end_date)' + all_email.drop(all_email.query(query).index, axis='index', inplace=True) + +# Constructing a found time series +donations.set_index('timestamp', inplace=True) +agg_don = (donations.groupby('member') + .apply(lambda df: df.amount.resample('W-MON').sum())) + +agg_don = agg_don[agg_don != 0] + + From c99d828d81fbe34a574bb6e673d48dcc41cd65f3 Mon Sep 17 00:00:00 2001 From: John Evans Date: Fri, 24 Jan 2020 21:07:07 -0800 Subject: [PATCH 3/4] Finish with emails --- Ch02/emails.ipynb | 1483 +++++++++++++++++++++++++++++++++++++++++++++ Ch02/myemails.py | 22 +- 2 files changed, 1504 insertions(+), 1 deletion(-) create mode 100644 Ch02/emails.ipynb diff --git a/Ch02/emails.ipynb b/Ch02/emails.ipynb new file mode 100644 index 0000000..fc89588 --- /dev/null +++ b/Ch02/emails.ipynb @@ -0,0 +1,1483 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emailsOpenedmemberweek
03.01.02015-06-29
12.01.02015-07-13
22.01.02015-07-20
33.01.02015-07-27
41.01.02015-08-03
\n", + "
" + ], + "text/plain": [ + " emailsOpened member week\n", + "0 3.0 1.0 2015-06-29\n", + "1 2.0 1.0 2015-07-13\n", + "2 2.0 1.0 2015-07-20\n", + "3 3.0 1.0 2015-07-27\n", + "4 1.0 1.0 2015-08-03" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "emails = pd.read_csv('data/emails.csv',\n", + " header=0,\n", + " names=['emailsOpened', 'member', 'week'],\n", + " parse_dates=['week'])\n", + "\n", + "emails.head()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amounttimestampmember
025.02017-11-12 11:13:440.0
150.02015-08-25 19:01:450.0
225.02015-03-26 12:03:470.0
350.02016-07-06 12:24:550.0
450.02016-05-11 18:13:041.0
............
267125.02016-09-02 11:20:00992.0
267250.02017-11-02 12:17:06993.0
26731000.02016-09-13 21:09:47995.0
26741000.02017-09-29 20:03:01995.0
267550.02018-01-03 19:24:24998.0
\n", + "

2676 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " amount timestamp member\n", + "0 25.0 2017-11-12 11:13:44 0.0\n", + "1 50.0 2015-08-25 19:01:45 0.0\n", + "2 25.0 2015-03-26 12:03:47 0.0\n", + "3 50.0 2016-07-06 12:24:55 0.0\n", + "4 50.0 2016-05-11 18:13:04 1.0\n", + "... ... ... ...\n", + "2671 25.0 2016-09-02 11:20:00 992.0\n", + "2672 50.0 2017-11-02 12:17:06 993.0\n", + "2673 1000.0 2016-09-13 21:09:47 995.0\n", + "2674 1000.0 2017-09-29 20:03:01 995.0\n", + "2675 50.0 2018-01-03 19:24:24 998.0\n", + "\n", + "[2676 rows x 3 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "donations = pd.read_csv('data/donations.csv', parse_dates=['timestamp'])\n", + "donations.columns = ['amount', 'timestamp', 'member']\n", + "donations\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
memberIdmemberStatsyearJoined
00silver2014
11silver2015
22silver2016
33bronze2018
44silver2018
............
995995bronze2016
996996bronze2018
997997bronze2018
998998bronze2017
999999silver2014
\n", + "

1000 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " memberId memberStats yearJoined\n", + "0 0 silver 2014\n", + "1 1 silver 2015\n", + "2 2 silver 2016\n", + "3 3 bronze 2018\n", + "4 4 silver 2018\n", + ".. ... ... ...\n", + "995 995 bronze 2016\n", + "996 996 bronze 2018\n", + "997 997 bronze 2018\n", + "998 998 bronze 2017\n", + "999 999 silver 2014\n", + "\n", + "[1000 rows x 3 columns]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "YearJoined = pd.read_csv('data/year_joined.csv')\n", + "YearJoined.columns = ['memberId', 'memberStats', 'yearJoined']\n", + "YearJoined\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
weekmemberemailsOpened
813882015-02-09998.00.0
582112015-02-16998.00.0
587502015-02-23998.00.0
657572015-03-02998.00.0
722252015-03-09998.00.0
............
150912018-04-30998.03.0
452752018-05-07998.03.0
231762018-05-14998.03.0
145522018-05-21998.03.0
26942018-05-28998.03.0
\n", + "

173 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " week member emailsOpened\n", + "81388 2015-02-09 998.0 0.0\n", + "58211 2015-02-16 998.0 0.0\n", + "58750 2015-02-23 998.0 0.0\n", + "65757 2015-03-02 998.0 0.0\n", + "72225 2015-03-09 998.0 0.0\n", + "... ... ... ...\n", + "15091 2018-04-30 998.0 3.0\n", + "45275 2018-05-07 998.0 3.0\n", + "23176 2018-05-14 998.0 3.0\n", + "14552 2018-05-21 998.0 3.0\n", + "2694 2018-05-28 998.0 3.0\n", + "\n", + "[173 rows x 3 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complete_idx = pd.MultiIndex.from_product((set(emails.week), set(emails.member)))\n", + "\n", + "all_email = (emails.set_index(['week', 'member'])\n", + " .reindex(complete_idx, fill_value=0)\n", + " .reset_index())\n", + "all_email.columns = ['week', 'member', 'emailsOpened']\n", + "\n", + "all_email[all_email.member==998].sort_values(by='week')" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
weekmemberemailsOpened
02016-12-051.03.0
32016-12-056.01.0
42016-12-059.03.0
62016-12-0514.00.0
82016-12-0520.01.0
............
932312017-05-08959.02.0
932362017-05-08970.01.0
932372017-05-08973.03.0
932412017-05-08987.03.0
932452017-05-08995.03.0
\n", + "

31836 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " week member emailsOpened\n", + "0 2016-12-05 1.0 3.0\n", + "3 2016-12-05 6.0 1.0\n", + "4 2016-12-05 9.0 3.0\n", + "6 2016-12-05 14.0 0.0\n", + "8 2016-12-05 20.0 1.0\n", + "... ... ... ...\n", + "93231 2017-05-08 959.0 2.0\n", + "93236 2017-05-08 970.0 1.0\n", + "93237 2017-05-08 973.0 3.0\n", + "93241 2017-05-08 987.0 3.0\n", + "93245 2017-05-08 995.0 3.0\n", + "\n", + "[31836 rows x 3 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cutoff_dates = emails.groupby('member').week.agg(['min', 'max']).reset_index()\n", + "\n", + "for _, row in cutoff_dates.iterrows():\n", + " member = row['member']\n", + " start_date = row['min']\n", + " end_date = row['max']\n", + " query = 'member == @member and (week < @start_date or week > @end_date)'\n", + " all_email.drop(all_email.query(query).index, axis='index', inplace=True)\n", + "\n", + "all_email" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Constructing a found time series" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
memberamount
timestamp
2015-03-300.025.0
2015-08-310.050.0
2016-07-110.050.0
2017-11-130.025.0
2016-05-091.050.0
.........
2018-05-21992.050.0
2017-11-06993.050.0
2016-09-19995.01000.0
2017-10-02995.01000.0
2018-01-08998.050.0
\n", + "

2612 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " member amount\n", + "timestamp \n", + "2015-03-30 0.0 25.0\n", + "2015-08-31 0.0 50.0\n", + "2016-07-11 0.0 50.0\n", + "2017-11-13 0.0 25.0\n", + "2016-05-09 1.0 50.0\n", + "... ... ...\n", + "2018-05-21 992.0 50.0\n", + "2017-11-06 993.0 50.0\n", + "2016-09-19 995.0 1000.0\n", + "2017-10-02 995.0 1000.0\n", + "2018-01-08 998.0 50.0\n", + "\n", + "[2612 rows x 2 columns]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Constructing a found time series\n", + "donations.set_index('timestamp', inplace=True)\n", + "agg_don = (donations.groupby('member')\n", + " .apply(lambda df: df.amount.resample('W-MON').sum()))\n", + "\n", + "agg_don = agg_don[agg_don != 0]\n", + "agg_don = agg_don.reset_index().set_index('timestamp') \n", + "agg_don" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
memberemailsOpenedamount
week
2015-06-291.03.00.0
2015-07-061.00.00.0
2015-07-131.02.00.0
2015-07-201.02.00.0
2015-07-271.03.00.0
............
2018-04-30998.03.00.0
2018-05-07998.03.00.0
2018-05-14998.03.00.0
2018-05-21998.03.00.0
2018-05-28998.03.00.0
\n", + "

31836 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " member emailsOpened amount\n", + "week \n", + "2015-06-29 1.0 3.0 0.0\n", + "2015-07-06 1.0 0.0 0.0\n", + "2015-07-13 1.0 2.0 0.0\n", + "2015-07-20 1.0 2.0 0.0\n", + "2015-07-27 1.0 3.0 0.0\n", + "... ... ... ...\n", + "2018-04-30 998.0 3.0 0.0\n", + "2018-05-07 998.0 3.0 0.0\n", + "2018-05-14 998.0 3.0 0.0\n", + "2018-05-21 998.0 3.0 0.0\n", + "2018-05-28 998.0 3.0 0.0\n", + "\n", + "[31836 rows x 3 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lst = [] \n", + "for member, member_email in all_email.groupby('member'): \n", + " member_donations = agg_don.query('member == @member') \n", + " \n", + " member_email.set_index('week', inplace=True) \n", + " member_email.sort_index(inplace=True) \n", + " \n", + " df = pd.merge(member_email, member_donations, \n", + " how='left', \n", + " left_index=True, right_index=True) \n", + " df.fillna(0, inplace=True) \n", + " df['member'] = df.member_x \n", + " lst.append(df.reset_index()[['member', 'week', 'emailsOpened', 'amount']]) \n", + " \n", + "merged_df = pd.concat(lst).set_index('week') \n", + "merged_df" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
memberemailsOpenedamount
week
2017-12-04998.01.00.0
2017-12-11998.03.00.0
2017-12-18998.03.00.0
2017-12-25998.00.00.0
2018-01-01998.03.00.0
2018-01-08998.03.050.0
2018-01-15998.02.00.0
2018-01-22998.03.00.0
2018-01-29998.02.00.0
2018-02-05998.03.00.0
2018-02-12998.03.00.0
2018-02-19998.03.00.0
2018-02-26998.02.00.0
2018-03-05998.02.00.0
2018-03-12998.03.00.0
2018-03-19998.02.00.0
2018-03-26998.02.00.0
2018-04-02998.03.00.0
2018-04-09998.03.00.0
2018-04-16998.03.00.0
2018-04-23998.00.00.0
2018-04-30998.03.00.0
2018-05-07998.03.00.0
2018-05-14998.03.00.0
2018-05-21998.03.00.0
2018-05-28998.03.00.0
\n", + "
" + ], + "text/plain": [ + " member emailsOpened amount\n", + "week \n", + "2017-12-04 998.0 1.0 0.0\n", + "2017-12-11 998.0 3.0 0.0\n", + "2017-12-18 998.0 3.0 0.0\n", + "2017-12-25 998.0 0.0 0.0\n", + "2018-01-01 998.0 3.0 0.0\n", + "2018-01-08 998.0 3.0 50.0\n", + "2018-01-15 998.0 2.0 0.0\n", + "2018-01-22 998.0 3.0 0.0\n", + "2018-01-29 998.0 2.0 0.0\n", + "2018-02-05 998.0 3.0 0.0\n", + "2018-02-12 998.0 3.0 0.0\n", + "2018-02-19 998.0 3.0 0.0\n", + "2018-02-26 998.0 2.0 0.0\n", + "2018-03-05 998.0 2.0 0.0\n", + "2018-03-12 998.0 3.0 0.0\n", + "2018-03-19 998.0 2.0 0.0\n", + "2018-03-26 998.0 2.0 0.0\n", + "2018-04-02 998.0 3.0 0.0\n", + "2018-04-09 998.0 3.0 0.0\n", + "2018-04-16 998.0 3.0 0.0\n", + "2018-04-23 998.0 0.0 0.0\n", + "2018-04-30 998.0 3.0 0.0\n", + "2018-05-07 998.0 3.0 0.0\n", + "2018-05-14 998.0 3.0 0.0\n", + "2018-05-21 998.0 3.0 0.0\n", + "2018-05-28 998.0 3.0 0.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = merged_df.query('member == 998')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
memberemailsOpenedamounttarget
week
2017-12-04998.01.00.0NaN
2017-12-11998.03.00.00.0
2017-12-18998.03.00.00.0
2017-12-25998.00.00.00.0
2018-01-01998.03.00.00.0
2018-01-08998.03.050.00.0
2018-01-15998.02.00.050.0
2018-01-22998.03.00.00.0
2018-01-29998.02.00.00.0
2018-02-05998.03.00.00.0
2018-02-12998.03.00.00.0
2018-02-19998.03.00.00.0
2018-02-26998.02.00.00.0
2018-03-05998.02.00.00.0
2018-03-12998.03.00.00.0
2018-03-19998.02.00.00.0
2018-03-26998.02.00.00.0
2018-04-02998.03.00.00.0
2018-04-09998.03.00.00.0
2018-04-16998.03.00.00.0
2018-04-23998.00.00.00.0
2018-04-30998.03.00.00.0
2018-05-07998.03.00.00.0
2018-05-14998.03.00.00.0
2018-05-21998.03.00.00.0
2018-05-28998.03.00.00.0
\n", + "
" + ], + "text/plain": [ + " member emailsOpened amount target\n", + "week \n", + "2017-12-04 998.0 1.0 0.0 NaN\n", + "2017-12-11 998.0 3.0 0.0 0.0\n", + "2017-12-18 998.0 3.0 0.0 0.0\n", + "2017-12-25 998.0 0.0 0.0 0.0\n", + "2018-01-01 998.0 3.0 0.0 0.0\n", + "2018-01-08 998.0 3.0 50.0 0.0\n", + "2018-01-15 998.0 2.0 0.0 50.0\n", + "2018-01-22 998.0 3.0 0.0 0.0\n", + "2018-01-29 998.0 2.0 0.0 0.0\n", + "2018-02-05 998.0 3.0 0.0 0.0\n", + "2018-02-12 998.0 3.0 0.0 0.0\n", + "2018-02-19 998.0 3.0 0.0 0.0\n", + "2018-02-26 998.0 2.0 0.0 0.0\n", + "2018-03-05 998.0 2.0 0.0 0.0\n", + "2018-03-12 998.0 3.0 0.0 0.0\n", + "2018-03-19 998.0 2.0 0.0 0.0\n", + "2018-03-26 998.0 2.0 0.0 0.0\n", + "2018-04-02 998.0 3.0 0.0 0.0\n", + "2018-04-09 998.0 3.0 0.0 0.0\n", + "2018-04-16 998.0 3.0 0.0 0.0\n", + "2018-04-23 998.0 0.0 0.0 0.0\n", + "2018-04-30 998.0 3.0 0.0 0.0\n", + "2018-05-07 998.0 3.0 0.0 0.0\n", + "2018-05-14 998.0 3.0 0.0 0.0\n", + "2018-05-21 998.0 3.0 0.0 0.0\n", + "2018-05-28 998.0 3.0 0.0 0.0" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['target'] = df.amount.shift(1) \n", + "df " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch02/myemails.py b/Ch02/myemails.py index 6873dbf..7d2d135 100644 --- a/Ch02/myemails.py +++ b/Ch02/myemails.py @@ -37,5 +37,25 @@ .apply(lambda df: df.amount.resample('W-MON').sum())) agg_don = agg_don[agg_don != 0] +agg_don = agg_don.reset_index().set_index('timestamp') - + +lst = [] +for member, member_email in all_email.groupby('member'): + member_donations = agg_don.query('member == @member') + + member_email.set_index('week', inplace=True) + member_email.sort_index(inplace=True) + + df = pd.merge(member_email, member_donations, + how='left', + left_index=True, right_index=True) + df.fillna(0, inplace=True) + df['member'] = df.member_x + lst.append(df.reset_index()[['member', 'week', 'emailsOpened', 'amount']]) + +merged_df = pd.concat(lst).set_index('week') +df = merged_df.query('member == 998') + +df['target'] = df.amount.shift(1) +df From e5d609d5ec2e8a92dd80614f69657b9983445f55 Mon Sep 17 00:00:00 2001 From: John Evans Date: Wed, 29 Jan 2020 18:28:25 -0800 Subject: [PATCH 4/4] mine --- Ch02/unemp.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 Ch02/unemp.py diff --git a/Ch02/unemp.py b/Ch02/unemp.py new file mode 100644 index 0000000..eed6a14 --- /dev/null +++ b/Ch02/unemp.py @@ -0,0 +1,54 @@ +import matplotlib.pyplot as plt +import pandas as pd + +unemp = pd.read_csv('data/UNRATE.csv', parse_dates=['DATE'], index_col='DATE') + +# generate a data set where data is randomly missing +rand_unemp = unemp.sample(frac=0.9).sort_index() + +# generate a dataset where data is more likely to be missing when unemployment +# is high +high_unemp = unemp.query('UNRATE > 8').sample(frac=0.2) +bias_unemp = unemp.drop(high_unemp.index, axis='index') + +# +rand_unemp = rand_unemp.resample('M').mean() +bias_unemp = bias_unemp.resample('M').mean() + +# Seems unnecessary. +rand_unemp['rpt'] = rand_unemp['UNRATE'].isnull() + +data = { + 'amt': [99, 100, 5, 15, 11, 1200], + 'dt': ['2019-02-27', '2019-03-02', '2019-06-13', '2019-08-01', '2019-08-31', '2019-09-15',] +} + +donations = pd.DataFrame(data, index=list(range(1,7))) +donations['dt'] = pd.to_datetime(donations['dt']) + +identifier = ['q4q42', '4299hj', 'bbg2'] +dt = [pd.Timestamp(x) for x in ['2019-1-1', '2019-4-1', '2019-7-1']] +publicity = pd.DataFrame({'identifier': identifier, 'dt': dt}, index=list(range(1,4))) + +donations.set_index('dt', inplace=True) +publicity.set_index('dt', inplace=True) + +# we wish to label each donation according to +# what publicity campaign most recently preceded it. +new_index = pd.date_range('2019-01-01', periods=270) +df = donations.join(publicity.reindex(new_index, method='ffill')) +print(df) + +# identify the missing data +# Hey, it's already done by resample. + +idx = slice(350, 400) +unemp_idx = unemp.iloc[idx] +rand_unemp_ff = rand_unemp.iloc[idx].fillna(method='ffill') + +# plot a sample graph showing the flat portions. +fig, ax = plt.subplots() +unemp_idx.plot(ax=ax, linestyle='-', marker='o', markerfacecolor='none') +rand_unemp_ff.plot(ax=ax) +rand_unemp_ff.loc[rand_unemp_ff.rpt, 'UNRATE'].plot(linestyle='none', marker='o', color='r') +ax.legend(['UNRATE', 'Missing', 'FFILL'])