Skip to content

Instantly share code, notes, and snippets.

@MCRE-BE
Last active November 3, 2023 11:24
Show Gist options
  • Save MCRE-BE/048671bb2a66eb7d16cad6455c613f83 to your computer and use it in GitHub Desktop.
Save MCRE-BE/048671bb2a66eb7d16cad6455c613f83 to your computer and use it in GitHub Desktop.
Column or Feature selection algorithms
"""CramersV detection and suppression algorithm for OHE columns.
See Also
--------
`Multicollinearity impact <https://www.kaggle.com/code/ffisegydd/sklearn-multicollinearity-class/notebook>`_
"""
# %%
#############
# Libraries #
#############
from itertools import combinations
from typing import Self
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from tqdm.auto import tqdm
############
# CramersV #
############
class CramersV(BaseEstimator, TransformerMixin):
    """Cramér's V detection and suppression algorithm for OHE columns.

    In statistics, Cramér's V (sometimes referred to as Cramér's phi and
    denoted as φc) is a measure of association between two nominal variables,
    giving a value between 0 and +1 (inclusive). It is based on Pearson's
    chi-squared statistic and was published by Harald Cramér in 1946.

    Following the scikit-learn fit/transform pattern, ``fit`` computes the
    pairwise associations and decides which columns to drop, and ``transform``
    drops them.

    See Also
    --------
    `Wikipedia Cramer's V <https://www.wikiwand.com/en/Cram%C3%A9r's_V>`_
    `Association Metrics on Github <https://github.com/HeberTU/association_metrics>`_

    Parameters
    ----------
    thresh : float, by default 0.5

    Attributes
    ----------
    thresh : float
        The threshold above which two features are assumed to be related;
        one of them is then marked for dropping during fitting. The following
        thresholds are often assumed :

        * Unrelated : 0 to 0.33
        * Medium relation : 0.33 to 0.5
        * Strongly related : 0.5 to 1
    matrix : Dict[Tuple[str, str], float]
        Dict with the calculated Cramér's V associations. The dictionary is
        filled in both directions, ``(i, j)`` and ``(j, i)``, with tuples of
        column labels as keys. Pairs involving an already dropped column are
        not calculated and therefore absent.
    data : pd.DataFrame
        Copy of the data used during fitting.
    features_to_drop_ : List[str]
        All features to drop, in the order in which they were selected.
    _fitted : bool
        Whether the .fit function has been called.
    """

    def __init__(
        self: Self,
        thresh: float = 0.5,
    ):
        self.thresh = thresh
        self.features_to_drop_ = []
        self._fitted = False
        self.data = None
        self.matrix = None

    def fit(
        self: Self,
        X: pd.DataFrame,
        y: pd.Series = None,
    ) -> Self:
        """Fit the object based on the Cramér's V selection method.

        Calculates the Cramér's V association value for each pair of
        categorical columns in ``X`` and stores the results in ``matrix``.
        Whenever a pair exceeds ``thresh``, the first column of the pair is
        marked for dropping and is skipped in all later pairs.

        Only columns with dtype "category" are considered.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        y : pd.Series, by default None
            The target variable. Not used, only added for SKLearn
            compatibility.

        Returns
        -------
        Self
            The fitted object. ``matrix``, ``data``, ``features_to_drop_``
            and ``_fitted`` are (re)set by this call.

        Raises
        ------
        KeyError
            In case no columns with dtype "category" are present in the
            passed dataframe.
        """
        # --- Import ---
        from scipy.stats.contingency import association

        # --- Variables ---
        data = X.copy()
        thresh = self.thresh

        # --- Script ---
        # Select only categorical variables
        cat_cols = data.select_dtypes(include=['category']).columns
        if len(cat_cols) == 0:
            raise KeyError("No categorical variables found")
        pairs = list(combinations(cat_cols, r=2))

        # Fill the matrix. `dropped` keeps the selection order for the public
        # attribute, `dropped_lookup` gives O(1) membership tests in the loop.
        dropped = []
        dropped_lookup = set()
        matrix = {}
        progress = tqdm(pairs, desc="Combo", leave=False)
        for i, j in progress:
            progress.set_postfix_str(f"{i} : {j}")

            # If either column is already dropped, don't calculate
            if i in dropped_lookup or j in dropped_lookup:
                continue

            input_tab = pd.crosstab(data[i], data[j])
            res_cramer = association(input_tab, method='cramer')
            matrix[(i, j)] = matrix[(j, i)] = res_cramer

            if res_cramer > thresh:
                dropped.append(i)
                dropped_lookup.add(i)

        # Save
        self._fitted = True
        self.matrix = matrix
        self.data = data  # already a private copy of X, never mutated above
        self.features_to_drop_ = dropped
        return self

    def transform(
        self: Self,
        X: pd.DataFrame,
    ) -> pd.DataFrame:
        """Transform the dataframe based on the Cramér's V selection.

        Drops every column listed in ``features_to_drop_`` from ``X``.
        Columns that are not present in ``X`` are silently ignored, so the
        columns of ``X`` are not checked against those seen during fitting.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.

        Returns
        -------
        pd.DataFrame
            A copy of ``X`` without the dropped columns.

        Raises
        ------
        NotFittedError
            If the object has not been fitted.
        """
        # --- Check ---
        if not self._fitted:
            raise NotFittedError("Object has not been fitted")

        return X.drop(columns=self.features_to_drop_, errors="ignore").copy()
"""Different kinds of transformers used in the FeatureTransformer.
Module containing several kinds of SKLearn compatible transformers to select
and filter out specific types of features that could potentially reduce the
accuracy of our forecasts.
See Also
--------
`Multicollinearity impact <https://www.kaggle.com/code/ffisegydd/sklearn-multicollinearity-class/notebook>`_
"""
# %%
#############
# Libraries #
#############
from typing import List, Self, Tuple
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
#############
# ReduceVIF #
#############
class ReduceVIF(BaseEstimator, TransformerMixin):
    """Variance Inflation Factors detection and suppression algorithm.

    Algorithm to iteratively calculate the Variance Inflation Factor (VIF)
    for each of the features in the provided dataframe and drop, one at a
    time, the feature with the highest VIF until no feature exceeds the
    threshold. This is meant to reduce multicollinearity in the model and
    should improve the accuracy or training speed.

    See Also
    --------
    statsmodels.stats.outliers_influence.variance_inflation_factor
    sklearn.impute.SimpleImputer

    Parameters
    ----------
    thresh : float, by default 5.0
    impute : bool, by default False
    impute_strategy : str, {'median', 'mean', 'most_frequent'}

    Attributes
    ----------
    thresh : float
        The threshold above which a column is dropped. From documentation
        5 to 10 : is ok / Above 10 : to drop.
    impute : bool
        Whether to impute the NaN values before calculating the VIF.
    impute_strategy : str
        The strategy passed to the SimpleImputer to fill any NaN.
    imputer : SimpleImputer
        SimpleImputer initialized with the chosen impute_strategy. Only
        present when imputation is enabled.
    data : pd.DataFrame
        Copy of the data used during fitting.
    features_to_drop_ : List[str]
        All features to drop, in the order in which they were eliminated.
    _fitted : bool
        Whether the .fit function has been called.
    """

    def __init__(
        self: Self,
        thresh: float = 5.0,
        impute: bool = False,
        impute_strategy: str = 'median',
    ):
        self.thresh = thresh
        self.impute_strategy = impute_strategy
        self.features_to_drop_ = []
        self._fitted = False
        self.impute = impute
        if impute:
            self.imputer = SimpleImputer(strategy=impute_strategy)

    @staticmethod
    def calculate_vif(
        X: pd.DataFrame,
        thresh=5.0,
    ) -> Tuple[pd.DataFrame, List[str]]:
        """Iteratively drop the column with the highest VIF.

        On every pass the VIF of each remaining column is calculated. If the
        highest one exceeds ``thresh`` that column is dropped and the VIFs
        are recalculated; otherwise the elimination stops.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        thresh : float, by default 5.0
            The threshold over which we decide to drop the chosen column.

        See Also
        --------
        `Inspiration of function <https://stats.stackexchange.com/a/253620/53565>`_

        Returns
        -------
        pd.DataFrame
            The transformed DataFrame with all high VIF columns dropped.
        List[str]
            The list of all dropped columns, in elimination order.
        """
        # --- Import ---
        from statsmodels.stats.outliers_influence import variance_inflation_factor

        # --- Setting Variables ---
        to_drop = []

        # --- Script ---
        # At most one column is dropped per pass, so the initial number of
        # columns bounds the number of passes. The bound is evaluated once,
        # up front, so it does not shrink while columns are being removed.
        for _ in range(len(X.columns)):
            # Calculate the VIF for each and every column. Columns are
            # addressed by position, which also works with duplicate labels.
            values = X.values
            vif = [
                variance_inflation_factor(values, position)
                for position in range(values.shape[1])
            ]

            # Find the highest VIF and drop it, then continue if one was dropped.
            max_vif = max(vif)
            if max_vif > thresh:
                col = X.columns[vif.index(max_vif)]
                to_drop.append(col)
                X = X.drop(columns=col, errors="ignore")
            else:
                break

        # The VIF values themselves are deliberately not post-processed:
        # perfectly collinear columns yield an infinite VIF, which cannot be
        # cast to an integer dtype.
        return X, to_drop

    def fit(
        self: Self,
        X: pd.DataFrame,
        y: pd.Series = None,
    ) -> Self:
        """Determine which columns to drop based on their VIF.

        Optionally imputes missing values first, then runs
        ``ReduceVIF.calculate_vif`` and stores the columns it eliminated.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        y : pd.Series, by default None
            The target variable. Not used, only added for SKLearn
            compatibility.

        Returns
        -------
        Self
            The fitted object. ``data``, ``features_to_drop_`` and
            ``_fitted`` are (re)set by this call.

        See Also
        --------
        ReduceVIF.calculate_vif
        """
        # --- Variables ---
        self.data = X.copy()
        columns = X.columns.tolist()

        # --- Script ---
        # The imputer is normally created in __init__; create it here as well
        # when `impute` was switched on afterwards (e.g. through set_params).
        imputer = getattr(self, 'imputer', None)
        if imputer is None and self.impute:
            imputer = self.imputer = SimpleImputer(strategy=self.impute_strategy)

        if imputer is not None:
            imputer.fit(X)
            X = pd.DataFrame(imputer.transform(X), columns=columns)

        _, to_drop = ReduceVIF.calculate_vif(X, self.thresh)
        self.features_to_drop_ = list(to_drop)
        self._fitted = True
        return self

    def transform(
        self: Self,
        X: pd.DataFrame,
    ) -> pd.DataFrame:
        """Drop the columns selected during fitting.

        Columns listed in ``features_to_drop_`` that are not present in ``X``
        are silently ignored. No imputation is applied here.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.

        Returns
        -------
        pd.DataFrame
            A copy of ``X`` without the dropped columns.

        Raises
        ------
        NotFittedError
            If the object has not been fitted.
        """
        # --- Checks ---
        if not self._fitted:
            raise NotFittedError("Model has not been fitted yet")

        return X.drop(columns=self.features_to_drop_, errors="ignore").copy()
"""RegexSelector selection and suppression algorithm."""
# %%
#############
# Libraries #
#############
import re
from typing import List, Self
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
#################
# RegexSelector #
#################
# ? : to remove as unused?
class RegexSelector(BaseEstimator, TransformerMixin):
    """Regex based feature selection and suppression algorithm.

    Selects groups of feature columns with regular expressions and, within
    each group, keeps only the columns containing a given token. Columns
    that match none of the expressions are left untouched.

    Parameters
    ----------
    regex : List[str]

    Attributes
    ----------
    regex : List[str]
        Despite the annotation, each entry is consumed as a two-item list
        or tuple with the following pattern:

        * Regex string selecting all relevant features (``re.search``).
        * Token to look for in the selected features. Features containing
          the token are kept, the other selected features are dropped.
          When the token is ``None`` all selected features are dropped.
    features : List[str]
        List of all features seen during fitting.
    data : pd.DataFrame
        Copy of the data used during fitting.
    features_to_drop_ : List[str]
        List of all features to drop (no guaranteed order).
    _fitted : bool
        Whether the .fit function has been called.

    Example
    -------
    Expected formatting of ``regex`` ::

        regex = [
            ["^(SIX_Lead_time_|SIX_t_btwn_o_).*", "100"],
            ["^(Holidays_).*(_Cat_).*", "001"],
            ["^(Holidays_DC_|Holidays_XPO_).*", "DCA"],
        ]
    """

    def __init__(
        self: Self,
        regex: List[str],
    ):
        self._fitted = False
        self.features_to_drop_ = []
        self.features = None
        self.data = None
        self.regex = regex

    def fit(
        self: Self,
        X: pd.DataFrame,
        y: pd.Series = None,
    ) -> Self:
        """Identify the columns to drop from the configured regex list.

        For every ``(pattern, token)`` entry, the columns of ``X`` matching
        the pattern are collected; those not containing the token (or all of
        them when the token is ``None``) are marked for dropping.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        y : pd.Series, by default None
            The target variable. Not used, only added for SKLearn
            compatibility.

        Returns
        -------
        Self
            The fitted object. ``features``, ``data``, ``features_to_drop_``
            and ``_fitted`` are (re)set by this call.
        """
        # --- Variables ---
        column_names = list(X.columns)
        self.features = column_names
        self.data = X.copy()

        # --- Script ---
        flagged = []
        for expression, keep_token in self.regex:
            matching = [
                name for name in column_names if re.search(expression, name)
            ]
            if keep_token is not None:
                # Only the matches lacking the token are to be dropped
                matching = [
                    name for name in matching if str(keep_token) not in name
                ]
            flagged.extend(matching)

        # Saving (de-duplicated, a column may be flagged by several entries)
        self.features_to_drop_ = list(set(flagged))
        self._fitted = True
        return self

    def transform(
        self: Self,
        X: pd.DataFrame,
    ) -> pd.DataFrame:
        """Remove the columns chosen during fitting from the dataframe.

        Columns listed in ``features_to_drop_`` that are not present in ``X``
        are silently ignored, so the columns of ``X`` are not checked against
        those seen during fitting.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.

        Returns
        -------
        pd.DataFrame
            A copy of ``X`` without the dropped columns.

        Raises
        ------
        NotFittedError
            If the object has not been fitted.
        """
        # --- Check ---
        if self._fitted:
            return X.drop(columns=self.features_to_drop_, errors="ignore").copy()

        raise NotFittedError("Object has not been fitted")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment