Skip to content

Instantly share code, notes, and snippets.

@shaypal5
Last active August 5, 2022 15:08
Show Gist options
  • Save shaypal5/8a4d03696744b7f7339ba3e362b355c2 to your computer and use it in GitHub Desktop.
Save shaypal5/8a4d03696744b7f7339ba3e362b355c2 to your computer and use it in GitHub Desktop.
An example of an advanced initialization of a complex pdpipe pipeline for processing pandas dataframes. 🐼🚿
from typing import Optional
import pdpipe as pdp
from pdpipe import df
from sklearn.linear_model import LogisticRegression
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator
class MyPipelineAndModel(PdPipelineAndSklearnEstimator):
    """A pdpipe preprocessing pipeline bundled with a logistic regression.

    The constructor stores every argument as a same-named attribute,
    assembles a list of pdpipe stages that depends on those arguments,
    and hands the resulting ``pdp.PdPipeline`` together with a
    ``LogisticRegression`` estimator to the parent class.

    Parameters
    ----------
    savings_max_val : int, default 100
        Threshold used by the ``drop_rows_where['Savings'] > ...`` stage.
    drop_gender : bool, default False
        If true, 'Gender' is added to the columns dropped at the end of
        the pipeline; otherwise it is added to the columns encoded by
        ``pdp.Encode``.
    standardize : bool, default False
        If true, a ``pdp.Scale('StandardScaler')`` stage is appended.
    ohencode_country : bool, default True
        If true, 'Country' is one-hot encoded; otherwise it is added to
        the columns dropped at the end of the pipeline.
    savings_bin_val : int, optional
        When not None, a ``pdp.Bin`` stage is added for 'Savings' with
        this value as the only entry of its bin list, and 'Savings_bin'
        is added to the columns encoded by ``pdp.Encode``.
    pca_threshold : int, default 20
        ``n`` of the ``HasAtLeastNQualifyingColumns`` precondition that
        guards the PCA decomposition of the columns starting with
        'Quote'.
    fit_intercept : bool, default True
        Forwarded to ``LogisticRegression(fit_intercept=...)``.
    """

    def __init__(
        self,
        savings_max_val: Optional[int] = 100,
        drop_gender: Optional[bool] = False,
        standardize: Optional[bool] = False,
        ohencode_country: Optional[bool] = True,
        savings_bin_val: Optional[int] = None,
        pca_threshold: Optional[int] = 20,
        fit_intercept: Optional[bool] = True,
    ):
        """Build the pipeline and the estimator from the given parameters."""
        # save pipeline parameters
        self.savings_max_val = savings_max_val
        self.drop_gender = drop_gender
        self.standardize = standardize
        self.ohencode_country = ohencode_country
        self.savings_bin_val = savings_bin_val
        self.pca_threshold = pca_threshold
        self.fit_intercept = fit_intercept

        # init helper lists; they are filled by the parameter-dependent
        # sections below and consumed by the Encode / ColDrop stages
        cols_to_drop = ['Bearded']
        cols_to_encode = []

        # start with a prefix of non-optional stages
        stages = [
            # standard pipeline stages
            pdp.ColDrop(columns=pdp.cq.WithAtLeastMissingValueRate(0.2)),
            pdp.DropLabelsByValues(not_in_set=['Smoking', 'Non-Smoking']),
            pdp.EncodeLabel(),
            pdp.ColDrop(['Name'], errors='ignore'),
            # using pdpipe fly-handles 🚀
            df.set_index(keys='id'),
            pdp.drop_rows_where['Savings'] > savings_max_val,
            df['Viking'] << (
                df['Country'].isin(['Denmark', 'Finland']) & ~df['Bearded']
            ),
            # '*' and '/' bind tighter than '<<', so the whole quotient
            # is what gets assigned to the new column
            df['YearlyGrands'] << (df['Savings'] * 1000) / df['Age'],
        ]

        # a few parameter-dependent pipeline stages
        # NOTE: compare against None explicitly so that a bin value of 0
        # is not mistaken for "binning disabled"
        if savings_bin_val is not None:
            stages.append(pdp.Bin({'Savings': [savings_bin_val]}, drop=False))
            cols_to_encode.append('Savings_bin')
        if drop_gender:
            cols_to_drop.append('Gender')
        else:
            cols_to_encode.append('Gender')
        if ohencode_country:
            stages.append(pdp.OneHotEncode('Country'))
        else:
            cols_to_drop.append('Country')

        # processing the text column:
        # 1. we do this before standardization so tf-idf
        #    representation is also standardized
        # 2. we do this after everything else, so all tf-idf
        #    columns are last in column order (for ease of presentation)
        stages.extend([
            pdp.TokenizeText('Quote'),
            pdp.SnowballStem('EnglishStemmer', columns=['Quote']),
            pdp.RemoveStopwords('English', 'Quote'),
            pdp.TfidfVectorizeTokenLists('Quote', hierarchical_labels=True),
        ])

        # PCA all tf-idf columns if there are too many of them
        stages.append(
            pdp.Decompose(
                transformer='PCA',
                columns=pdp.cq.StartsWith('Quote'),
                prec=pdp.cond.HasAtLeastNQualifyingColumns(
                    n=pca_threshold,
                    qualifier=pdp.cq.StartsWith('Quote'),
                ),
                exraise=False,
            )
        )

        # more parameter-dependent pipeline stages
        if cols_to_encode:
            stages.append(pdp.Encode(cols_to_encode))
        if standardize:
            stages.append(pdp.Scale('StandardScaler'))

        # the suffix of non-optional pipeline stages
        stages.extend([
            pdp.ColDrop(cols_to_drop, errors='ignore'),
            pdp.Schematize(),
            pdp.ConditionValidator([
                pdp.cond.HasAtMostNQualifyingColumns(
                    n=150,
                    qualifier=pdp.cq.AllColumns(fittable=False),
                ),
                pdp.cond.HasNoMissingValues(),
            ]),
        ])

        pipeline = pdp.PdPipeline(stages)
        model = LogisticRegression(fit_intercept=fit_intercept)
        super().__init__(pipeline=pipeline, estimator=model)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment