Skip to content

Instantly share code, notes, and snippets.

@samukweku
Last active November 15, 2021 06:15
Show Gist options
  • Save samukweku/88272c539743e9507dd275e1d2d71018 to your computer and use it in GitHub Desktop.
Save samukweku/88272c539743e9507dd275e1d2d71018 to your computer and use it in GitHub Desktop.
Solutions to some challenges on `Dont believe the python hype...`

link to source blog post

import pandas as pd
import numpy as np
import janitor
url = "https://raw.githubusercontent.com/samukweku/data-wrangling-blog/master/_notebooks/Data_files/iris.csv"
iris = pd.read_csv(url)
iris.head()
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

Aggregation

iris.groupby('species').sepal_length.mean()
species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

Aggregation with Multiple Input variables

temp = iris.assign(sepal_length = iris.sepal_length * iris.sepal_width)
temp = temp.groupby('species')
temp.sepal_length.sum() / temp.sepal_width.sum()
species
setosa        5.034364
versicolor    5.966137
virginica     6.618897
dtype: float64
# method chaining

(iris
.assign(sepal_length = iris.sepal_length * iris.sepal_width)
.groupby('species')
# pipe allows reuse of the groupby object
.pipe(lambda df: df.sepal_length.sum() / df.sepal_width.sum())
)
species
setosa        5.034364
versicolor    5.966137
virginica     6.618897
dtype: float64
# abstract into a function for reuse
# pandas pipe comes in handy here, 
# allows usage of function within Pandas chains
def weighted_average(df, data, weights, by):
    """
    Custom function to calculate weighted average.

    Args:
    data - array to be averaged.
    weights - weights associated with each value in data.
    by - grouping column(s)

    Returns a Series, with the unique values in by as the index
    """
    temp = df.assign(**{data : df[data] * df[weights]})
    temp = temp.groupby(by)
    return temp[data].sum() / temp[weights].sum()


weighted_mean = iris.pipe(weighted_average, 'sepal_length', 'sepal_width', 'species')

weighted_mean
species
setosa        5.034364
versicolor    5.966137
virginica     6.618897
dtype: float64

Windows Function

# reuse the weighted mean value
iris.assign(mean_sepal = iris.species.map(weighted_mean))
sepal_length sepal_width petal_length petal_width species mean_sepal
0 5.1 3.5 1.4 0.2 setosa 5.034364
1 4.9 3.0 1.4 0.2 setosa 5.034364
2 4.7 3.2 1.3 0.2 setosa 5.034364
3 4.6 3.1 1.5 0.2 setosa 5.034364
4 5.0 3.6 1.4 0.2 setosa 5.034364
... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica 6.618897
146 6.3 2.5 5.0 1.9 virginica 6.618897
147 6.5 3.0 5.2 2.0 virginica 6.618897
148 6.2 3.4 5.4 2.3 virginica 6.618897
149 5.9 3.0 5.1 1.8 virginica 6.618897

150 rows × 6 columns

Expanding Windows

# without chains
sorted_iris = iris.sort_values(['species', 'sepal_width'])
sorted_grouper = sorted_iris.groupby('species')
expand_sepal_length = (sorted_grouper
                        .expanding()
                        .sepal_length
                        .sum()
                        .droplevel(0)
                        )
                        
sorted_iris.assign(expanding_sepal_sum = expand_sepal_length)
sepal_length sepal_width petal_length petal_width species expanding_sepal_sum
41 4.5 2.3 1.3 0.3 setosa 4.5
8 4.4 2.9 1.4 0.2 setosa 8.9
1 4.9 3.0 1.4 0.2 setosa 13.8
12 4.8 3.0 1.4 0.1 setosa 18.6
13 4.3 3.0 1.1 0.1 setosa 22.9
... ... ... ... ... ... ...
136 6.3 3.4 5.6 2.4 virginica 300.4
148 6.2 3.4 5.4 2.3 virginica 306.6
109 7.2 3.6 6.1 2.5 virginica 313.8
117 7.7 3.8 6.7 2.2 virginica 321.5
131 7.9 3.8 6.4 2.0 virginica 329.4

150 rows × 6 columns

# chaining
(iris
.sort_values(['species', 'sepal_width'])
.assign(expanding_sepal_sum = lambda df: df.groupby('species')
                                           .sepal_length
                                           .expanding()
                                           .sum()
                                           .array)
)
sepal_length sepal_width petal_length petal_width species expanding_sepal_sum
41 4.5 2.3 1.3 0.3 setosa 4.5
8 4.4 2.9 1.4 0.2 setosa 8.9
1 4.9 3.0 1.4 0.2 setosa 13.8
12 4.8 3.0 1.4 0.1 setosa 18.6
13 4.3 3.0 1.1 0.1 setosa 22.9
... ... ... ... ... ... ...
136 6.3 3.4 5.6 2.4 virginica 300.4
148 6.2 3.4 5.4 2.3 virginica 306.6
109 7.2 3.6 6.1 2.5 virginica 313.8
117 7.7 3.8 6.7 2.2 virginica 321.5
131 7.9 3.8 6.4 2.0 virginica 329.4

150 rows × 6 columns

Moving Windows

rolling_sepal = (sorted_grouper
                .rolling(window=5, center=True,min_periods=1)
                .sepal_length.mean()
                .droplevel(0)
                )
                
sorted_iris.assign(moving_sepal_length = rolling_sepal)
sepal_length sepal_width petal_length petal_width species moving_sepal_length
41 4.5 2.3 1.3 0.3 setosa 4.60
8 4.4 2.9 1.4 0.2 setosa 4.65
1 4.9 3.0 1.4 0.2 setosa 4.58
12 4.8 3.0 1.4 0.1 setosa 4.68
13 4.3 3.0 1.1 0.1 setosa 4.68
... ... ... ... ... ... ...
136 6.3 3.4 5.6 2.4 virginica 6.62
148 6.2 3.4 5.4 2.3 virginica 6.82
109 7.2 3.6 6.1 2.5 virginica 7.06
117 7.7 3.8 6.7 2.2 virginica 7.25
131 7.9 3.8 6.4 2.0 virginica 7.60

150 rows × 6 columns

#chaining
(iris
 .sort_values(['species', 'sepal_width'])
 .assign(moving_sepal_length = lambda df: df.groupby('species')
                                            .sepal_length
                                            .rolling(window=5, center=True, min_periods=1)
                                            .mean()
                                            .array)
)
sepal_length sepal_width petal_length petal_width species moving_sepal_length
41 4.5 2.3 1.3 0.3 setosa 4.60
8 4.4 2.9 1.4 0.2 setosa 4.65
1 4.9 3.0 1.4 0.2 setosa 4.58
12 4.8 3.0 1.4 0.1 setosa 4.68
13 4.3 3.0 1.1 0.1 setosa 4.68
... ... ... ... ... ... ...
136 6.3 3.4 5.6 2.4 virginica 6.62
148 6.2 3.4 5.4 2.3 virginica 6.82
109 7.2 3.6 6.1 2.5 virginica 7.06
117 7.7 3.8 6.7 2.2 virginica 7.25
131 7.9 3.8 6.4 2.0 virginica 7.60

150 rows × 6 columns

Coalesce

df=pd.DataFrame({'s1':[np.nan,np.nan,6,9,9],'s2':[np.nan,8,7,9,9]})
df
s1 s2
0 NaN NaN
1 NaN 8.0
2 6.0 7.0
3 9.0 9.0
4 9.0 9.0
# combination of bfill and ffill on axis = 1 suffices
# usually faster than combine_first
df.assign(s3 = lambda df: df.bfill(axis=1).ffill(axis=1).fillna(0).s1)
s1 s2 s3
0 NaN NaN 0.0
1 NaN 8.0 8.0
2 6.0 7.0 6.0
3 9.0 9.0 9.0
4 9.0 9.0 9.0
# pyjanitor has a coalesce function
df.coalesce('s1', 's2',
            target_column_name = 's3',
            default_value = 0)
s1 s2 s3
0 NaN NaN 0.0
1 NaN 8.0 8.0
2 6.0 7.0 6.0
3 9.0 9.0 9.0
4 9.0 9.0 9.0

Case When

# pyjanitor has a case_when function
iris.case_when(
    iris.species.eq('setosa') & iris.sepal_length.lt(5), 'value C',
    iris.species.eq('versicolor') & iris.sepal_length.lt(5), 'value A',
    iris.species.eq('viriginica') & iris.sepal_length.lt(5), 'value G',
    'high', 
    column_name = 'wow'
)
sepal_length sepal_width petal_length petal_width species wow
0 5.1 3.5 1.4 0.2 setosa high
1 4.9 3.0 1.4 0.2 setosa value C
2 4.7 3.2 1.3 0.2 setosa value C
3 4.6 3.1 1.5 0.2 setosa value C
4 5.0 3.6 1.4 0.2 setosa high
... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica high
146 6.3 2.5 5.0 1.9 virginica high
147 6.5 3.0 5.2 2.0 virginica high
148 6.2 3.4 5.4 2.3 virginica high
149 5.9 3.0 5.1 1.8 virginica high

150 rows × 6 columns

# string like queries are supported for the conditions
iris.case_when(
    "species == 'setosa' and sepal_length < 5 ", 'value C',
    "species == 'versicolor' and sepal_length < 5 ", 'value A',
    "species == 'virginica' and sepal_length < 5 ", 'value G',
    'high', 
    column_name = 'wow'
)
sepal_length sepal_width petal_length petal_width species wow
0 5.1 3.5 1.4 0.2 setosa high
1 4.9 3.0 1.4 0.2 setosa value C
2 4.7 3.2 1.3 0.2 setosa value C
3 4.6 3.1 1.5 0.2 setosa value C
4 5.0 3.6 1.4 0.2 setosa high
... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica high
146 6.3 2.5 5.0 1.9 virginica high
147 6.5 3.0 5.2 2.0 virginica high
148 6.2 3.4 5.4 2.3 virginica high
149 5.9 3.0 5.1 1.8 virginica high

150 rows × 6 columns

@IyarLin
Copy link

IyarLin commented May 9, 2021

Thank you so much for this gist! I've taken many examples from it and added to the repo. Also made sure to give you credit (:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment