link to source blog post
import pandas as pd
import numpy as np
import janitor
url = "https://raw.githubusercontent.com/samukweku/data-wrangling-blog/master/_notebooks/Data_files/iris.csv"
iris = pd .read_csv (url )
iris .head ()
sepal_length
sepal_width
petal_length
petal_width
species
0
5.1
3.5
1.4
0.2
setosa
1
4.9
3.0
1.4
0.2
setosa
2
4.7
3.2
1.3
0.2
setosa
3
4.6
3.1
1.5
0.2
setosa
4
5.0
3.6
1.4
0.2
setosa
iris .groupby ('species' ).sepal_length .mean ()
species
setosa 5.006
versicolor 5.936
virginica 6.588
Name: sepal_length, dtype: float64
Aggregation with Multiple Input variables
temp = iris .assign (sepal_length = iris .sepal_length * iris .sepal_width )
temp = temp .groupby ('species' )
temp .sepal_length .sum () / temp .sepal_width .sum ()
species
setosa 5.034364
versicolor 5.966137
virginica 6.618897
dtype: float64
# method chaining
(iris
.assign (sepal_length = iris .sepal_length * iris .sepal_width )
.groupby ('species' )
# pipe allows reuse of the groupby object
.pipe (lambda df : df .sepal_length .sum () / df .sepal_width .sum ())
)
species
setosa 5.034364
versicolor 5.966137
virginica 6.618897
dtype: float64
# abstract into a function for reuse
# pandas pipe comes in handy here,
# allows usage of function within Pandas chains
def weighted_average (df , data , weights , by ):
"""
Custom function to calculate weighted average.
Args:
data - array to be averaged.
weights - weights associated with each value in data.
by - grouping column(s)
Returns a Series, with the unique values in by as the index
"""
temp = df .assign (** {data : df [data ] * df [weights ]})
temp = temp .groupby (by )
return temp [data ].sum () / temp [weights ].sum ()
weighted_mean = iris .pipe (weighted_average , 'sepal_length' , 'sepal_width' , 'species' )
weighted_mean
species
setosa 5.034364
versicolor 5.966137
virginica 6.618897
dtype: float64
# reuse the weighted mean value
iris .assign (mean_sepal = iris .species .map (weighted_mean ))
sepal_length
sepal_width
petal_length
petal_width
species
mean_sepal
0
5.1
3.5
1.4
0.2
setosa
5.034364
1
4.9
3.0
1.4
0.2
setosa
5.034364
2
4.7
3.2
1.3
0.2
setosa
5.034364
3
4.6
3.1
1.5
0.2
setosa
5.034364
4
5.0
3.6
1.4
0.2
setosa
5.034364
...
...
...
...
...
...
...
145
6.7
3.0
5.2
2.3
virginica
6.618897
146
6.3
2.5
5.0
1.9
virginica
6.618897
147
6.5
3.0
5.2
2.0
virginica
6.618897
148
6.2
3.4
5.4
2.3
virginica
6.618897
149
5.9
3.0
5.1
1.8
virginica
6.618897
150 rows × 6 columns
# without chains
sorted_iris = iris .sort_values (['species' , 'sepal_width' ])
sorted_grouper = sorted_iris .groupby ('species' )
expand_sepal_length = (sorted_grouper
.expanding ()
.sepal_length
.sum ()
.droplevel (0 )
)
sorted_iris .assign (expanding_sepal_sum = expand_sepal_length )
sepal_length
sepal_width
petal_length
petal_width
species
expanding_sepal_sum
41
4.5
2.3
1.3
0.3
setosa
4.5
8
4.4
2.9
1.4
0.2
setosa
8.9
1
4.9
3.0
1.4
0.2
setosa
13.8
12
4.8
3.0
1.4
0.1
setosa
18.6
13
4.3
3.0
1.1
0.1
setosa
22.9
...
...
...
...
...
...
...
136
6.3
3.4
5.6
2.4
virginica
300.4
148
6.2
3.4
5.4
2.3
virginica
306.6
109
7.2
3.6
6.1
2.5
virginica
313.8
117
7.7
3.8
6.7
2.2
virginica
321.5
131
7.9
3.8
6.4
2.0
virginica
329.4
150 rows × 6 columns
# chaining
(iris
.sort_values (['species' , 'sepal_width' ])
.assign (expanding_sepal_sum = lambda df : df .groupby ('species' )
.sepal_length
.expanding ()
.sum ()
.array )
)
sepal_length
sepal_width
petal_length
petal_width
species
expanding_sepal_sum
41
4.5
2.3
1.3
0.3
setosa
4.5
8
4.4
2.9
1.4
0.2
setosa
8.9
1
4.9
3.0
1.4
0.2
setosa
13.8
12
4.8
3.0
1.4
0.1
setosa
18.6
13
4.3
3.0
1.1
0.1
setosa
22.9
...
...
...
...
...
...
...
136
6.3
3.4
5.6
2.4
virginica
300.4
148
6.2
3.4
5.4
2.3
virginica
306.6
109
7.2
3.6
6.1
2.5
virginica
313.8
117
7.7
3.8
6.7
2.2
virginica
321.5
131
7.9
3.8
6.4
2.0
virginica
329.4
150 rows × 6 columns
rolling_sepal = (sorted_grouper
.rolling (window = 5 , center = True ,min_periods = 1 )
.sepal_length .mean ()
.droplevel (0 )
)
sorted_iris .assign (moving_sepal_length = rolling_sepal )
sepal_length
sepal_width
petal_length
petal_width
species
moving_sepal_length
41
4.5
2.3
1.3
0.3
setosa
4.60
8
4.4
2.9
1.4
0.2
setosa
4.65
1
4.9
3.0
1.4
0.2
setosa
4.58
12
4.8
3.0
1.4
0.1
setosa
4.68
13
4.3
3.0
1.1
0.1
setosa
4.68
...
...
...
...
...
...
...
136
6.3
3.4
5.6
2.4
virginica
6.62
148
6.2
3.4
5.4
2.3
virginica
6.82
109
7.2
3.6
6.1
2.5
virginica
7.06
117
7.7
3.8
6.7
2.2
virginica
7.25
131
7.9
3.8
6.4
2.0
virginica
7.60
150 rows × 6 columns
#chaining
(iris
.sort_values (['species' , 'sepal_width' ])
.assign (moving_sepal_length = lambda df : df .groupby ('species' )
.sepal_length
.rolling (window = 5 , center = True , min_periods = 1 )
.mean ()
.array )
)
sepal_length
sepal_width
petal_length
petal_width
species
moving_sepal_length
41
4.5
2.3
1.3
0.3
setosa
4.60
8
4.4
2.9
1.4
0.2
setosa
4.65
1
4.9
3.0
1.4
0.2
setosa
4.58
12
4.8
3.0
1.4
0.1
setosa
4.68
13
4.3
3.0
1.1
0.1
setosa
4.68
...
...
...
...
...
...
...
136
6.3
3.4
5.6
2.4
virginica
6.62
148
6.2
3.4
5.4
2.3
virginica
6.82
109
7.2
3.6
6.1
2.5
virginica
7.06
117
7.7
3.8
6.7
2.2
virginica
7.25
131
7.9
3.8
6.4
2.0
virginica
7.60
150 rows × 6 columns
df = pd .DataFrame ({'s1' :[np .nan ,np .nan ,6 ,9 ,9 ],'s2' :[np .nan ,8 ,7 ,9 ,9 ]})
df
s1
s2
0
NaN
NaN
1
NaN
8.0
2
6.0
7.0
3
9.0
9.0
4
9.0
9.0
# combination of bfill and ffill on axis = 1 suffices
# usually faster than combine_first
df .assign (s3 = lambda df : df .bfill (axis = 1 ).ffill (axis = 1 ).fillna (0 ).s1 )
s1
s2
s3
0
NaN
NaN
0.0
1
NaN
8.0
8.0
2
6.0
7.0
6.0
3
9.0
9.0
9.0
4
9.0
9.0
9.0
# pyjanitor has a coalesce function
df .coalesce ('s1' , 's2' ,
target_column_name = 's3' ,
default_value = 0 )
s1
s2
s3
0
NaN
NaN
0.0
1
NaN
8.0
8.0
2
6.0
7.0
6.0
3
9.0
9.0
9.0
4
9.0
9.0
9.0
# pyjanitor has a case_when function
iris .case_when (
iris .species .eq ('setosa' ) & iris .sepal_length .lt (5 ), 'value C' ,
iris .species .eq ('versicolor' ) & iris .sepal_length .lt (5 ), 'value A' ,
iris .species .eq ('viriginica' ) & iris .sepal_length .lt (5 ), 'value G' ,
'high' ,
column_name = 'wow'
)
sepal_length
sepal_width
petal_length
petal_width
species
wow
0
5.1
3.5
1.4
0.2
setosa
high
1
4.9
3.0
1.4
0.2
setosa
value C
2
4.7
3.2
1.3
0.2
setosa
value C
3
4.6
3.1
1.5
0.2
setosa
value C
4
5.0
3.6
1.4
0.2
setosa
high
...
...
...
...
...
...
...
145
6.7
3.0
5.2
2.3
virginica
high
146
6.3
2.5
5.0
1.9
virginica
high
147
6.5
3.0
5.2
2.0
virginica
high
148
6.2
3.4
5.4
2.3
virginica
high
149
5.9
3.0
5.1
1.8
virginica
high
150 rows × 6 columns
# string like queries are supported for the conditions
iris .case_when (
"species == 'setosa' and sepal_length < 5 " , 'value C' ,
"species == 'versicolor' and sepal_length < 5 " , 'value A' ,
"species == 'virginica' and sepal_length < 5 " , 'value G' ,
'high' ,
column_name = 'wow'
)
sepal_length
sepal_width
petal_length
petal_width
species
wow
0
5.1
3.5
1.4
0.2
setosa
high
1
4.9
3.0
1.4
0.2
setosa
value C
2
4.7
3.2
1.3
0.2
setosa
value C
3
4.6
3.1
1.5
0.2
setosa
value C
4
5.0
3.6
1.4
0.2
setosa
high
...
...
...
...
...
...
...
145
6.7
3.0
5.2
2.3
virginica
high
146
6.3
2.5
5.0
1.9
virginica
high
147
6.5
3.0
5.2
2.0
virginica
high
148
6.2
3.4
5.4
2.3
virginica
high
149
5.9
3.0
5.1
1.8
virginica
high
150 rows × 6 columns
Thank you so much for this gist! I've taken many examples from it and added to the repo. Also made sure to give you credit (: