Last active
August 5, 2022 15:08
-
-
Save shaypal5/8a4d03696744b7f7339ba3e362b355c2 to your computer and use it in GitHub Desktop.
An example of an advanced initialization of a complex pdpipe pipeline for processing pandas dataframes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Optional | |
import pdpipe as pdp | |
from pdpipe import df | |
from sklearn.linear_model import LogisticRegression | |
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator | |
class MyPipelineAndModel(PdPipelineAndSklearnEstimator):
    """A pdpipe preprocessing pipeline coupled with a logistic regression model.

    The full pipeline is assembled in ``__init__`` from the constructor
    parameters and handed, together with a ``LogisticRegression`` estimator,
    to ``PdPipelineAndSklearnEstimator``.

    Parameters
    ----------
    savings_max_val : int, default 100
        Rows where the 'Savings' column exceeds this value are dropped.
    drop_gender : bool, default False
        If True, the 'Gender' column is dropped; otherwise it is encoded.
    standardize : bool, default False
        If True, a 'StandardScaler' scaling stage is appended.
    ohencode_country : bool, default True
        If True, the 'Country' column is one-hot encoded; otherwise dropped.
    savings_bin_val : int, optional
        If given, 'Savings' is binned at this value (keeping the original
        column) and the resulting 'Savings_bin' column is encoded.
    pca_threshold : int, default 20
        PCA is applied to the tf-idf ('Quote'-prefixed) columns only when
        at least this many such columns exist.
    fit_intercept : bool, default True
        Forwarded to ``sklearn.linear_model.LogisticRegression``.
    """

    def __init__(
        self,
        savings_max_val: int = 100,
        drop_gender: bool = False,
        standardize: bool = False,
        ohencode_country: bool = True,
        savings_bin_val: Optional[int] = None,
        pca_threshold: int = 20,
        fit_intercept: bool = True,
    ):
        # Save pipeline parameters verbatim, as required by the sklearn
        # estimator convention (get_params/set_params round-tripping).
        self.savings_max_val = savings_max_val
        self.drop_gender = drop_gender
        self.standardize = standardize
        self.ohencode_country = ohencode_country
        self.savings_bin_val = savings_bin_val
        self.pca_threshold = pca_threshold
        self.fit_intercept = fit_intercept
        # Helper lists accumulated while building the stage sequence;
        # consumed by the ColDrop/Encode stages near the end.
        cols_to_drop = ['Bearded']
        cols_to_encode = []
        # Start with a prefix of non-optional stages.
        stages = [
            # Standard pipeline stages: drop sparse columns, restrict and
            # encode the label, and drop the identifying 'Name' column.
            pdp.ColDrop(columns=pdp.cq.WithAtLeastMissingValueRate(0.2)),
            pdp.DropLabelsByValues(not_in_set=['Smoking', 'Non-Smoking']),
            pdp.EncodeLabel(),
            pdp.ColDrop(['Name'], errors='ignore'),
            # Using pdpipe fly-handles for concise stage definitions.
            df.set_index(keys='id'),
            pdp.drop_rows_where['Savings'] > savings_max_val,
            df['Viking'] << (df['Country'].isin(['Denmark', 'Finland']) & ~df['Bearded']),
            df['YearlyGrands'] << (df['Savings'] * 1000) / df['Age']
        ]
        # A few parameter-dependent pipeline stages.
        if savings_bin_val:
            stages.append(pdp.Bin({'Savings': [savings_bin_val]}, drop=False))
            cols_to_encode.append('Savings_bin')
        if drop_gender:
            cols_to_drop.append('Gender')
        else:
            cols_to_encode.append('Gender')
        if ohencode_country:
            stages.append(pdp.OneHotEncode('Country'))
        else:
            cols_to_drop.append('Country')
        # Processing the text column:
        # 1. We do this before standardization so the tf-idf
        #    representation is also standardized.
        # 2. We do this after everything else, so all tf-idf
        #    columns are last in column order (for ease of presentation).
        stages.extend([
            pdp.TokenizeText('Quote'),
            pdp.SnowballStem('EnglishStemmer', columns=['Quote']),
            pdp.RemoveStopwords('English', 'Quote'),
            pdp.TfidfVectorizeTokenLists('Quote', hierarchical_labels=True),
        ])
        # PCA all tf-idf columns if there are too many of them; the prec
        # condition gates the stage, and exraise=False lets it no-op
        # instead of raising when the condition does not hold.
        stages.append(
            pdp.Decompose(
                transformer='PCA',
                columns=pdp.cq.StartsWith('Quote'),
                prec=pdp.cond.HasAtLeastNQualifyingColumns(
                    n=pca_threshold,
                    qualifier=pdp.cq.StartsWith('Quote'),
                ),
                exraise=False,
            )
        )
        # More parameter-dependent pipeline stages.
        if len(cols_to_encode) > 0:
            stages.append(pdp.Encode(cols_to_encode))
        if standardize:
            stages.append(pdp.Scale('StandardScaler'))
        # The suffix of non-optional pipeline stages: drop accumulated
        # columns, freeze the schema, and validate the final frame.
        stages.extend([
            pdp.ColDrop(cols_to_drop, errors='ignore'),
            pdp.Schematize(),
            pdp.ConditionValidator([
                pdp.cond.HasAtMostNQualifyingColumns(
                    n=150,
                    qualifier=pdp.cq.AllColumns(fittable=False),
                ),
                pdp.cond.HasNoMissingValues(),
            ]),
        ])
        pipeline = pdp.PdPipeline(stages)
        model = LogisticRegression(fit_intercept=fit_intercept)
        super().__init__(pipeline=pipeline, estimator=model)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment