-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataframe_basics.py
More file actions
executable file
·56 lines (30 loc) · 1.37 KB
/
dataframe_basics.py
File metadata and controls
executable file
·56 lines (30 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
'''
To be run in IPython.
API for Dask DataFrames, which implement a smaller subset of the pandas API:
https://docs.dask.org/en/latest/dataframe-api.html
Dataframe IO
https://docs.dask.org/en/latest/dataframe-create.html
'''
import makedata
# Load the sample data set as a Dask DataFrame.
data = makedata.data()
data.dtypes  # inspect the column datatypes
data.head(5)  # inspect the first five rows

# Common Dask DataFrame operations are identical to pandas; results are only
# materialized when .compute() (or a helper such as head/len) is called.

# Filter operation: builds a new Dask DataFrame, kept for interactive use.
data2 = data[data.age > 60]

# Group-by operation: mean income per occupation.
data.groupby('occupation').income.mean().compute()

# Sort operation: the five most frequent occupations.
data.occupation.value_counts().nlargest(5).compute()

# Write to CSV: materialize the filtered rows, then write a single file.
data[data.city == 'Madison Heights'].compute().to_csv('Madison.csv')

# Exercise 1: What occupation is Meggan Mayo?
data[data.name == 'Meggan Mayo'].occupation.compute()

# Exercise 2: How many people in the city 'Sun Prairie' are over the age of 35?
# The question asks for a count, so take the length of the filtered frame
# instead of materializing every matching row.
len(data[(data.city == 'Sun Prairie') & (data.age > 35)])

# Exercise 3: Find the 10 most populous cities.
# NOTE(review): count() skips rows whose name is missing -- confirm the data
# has no missing names, otherwise groupby('city').size() is the safer count.
data.groupby('city').name.count().nlargest(10).compute()


# Exercise 4: Write a function that adds a new column that gives everyone a
# one dollar raise.
def give_raise(df, amount=1):
    """Return a copy of *df* with an ``income_increase`` column.

    The new column holds ``df.income + amount``; *df* itself is not modified
    because ``assign`` returns a new DataFrame.
    """
    return df.assign(income_increase=df.income + amount)


give_raise(data).compute()

# Exercise 5: What is the standard deviation of income grouped by age?
# This should reveal a data secret.
data.groupby('age').income.std().compute()