Last active
November 3, 2023 11:24
-
-
Save MCRE-BE/048671bb2a66eb7d16cad6455c613f83 to your computer and use it in GitHub Desktop.
Column or Feature selection algorithms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""CramersV detection and suppression algorithm for OHE columns. | |
See Also | |
-------- | |
`Multicollinearity impact <https://www.kaggle.com/code/ffisegydd/sklearn-multicollinearity-class/notebook>`_
""" | |
# %% | |
############# | |
# Libraries # | |
############# | |
from itertools import combinations | |
from typing import Self | |
import pandas as pd | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.exceptions import NotFittedError | |
from tqdm.auto import tqdm | |
############ | |
# CramersV # | |
############ | |
class CramersV(BaseEstimator, TransformerMixin):
    """Cramér's V detection and suppression algorithm for OHE columns.

    In statistics, Cramér's V (sometimes referred to as Cramér's phi and
    denoted as φc) is a measure of association between two nominal variables,
    giving a value between 0 and +1 (inclusive). It is based on Pearson's
    chi-squared statistic and was published by Harald Cramér in 1946.

    This class wraps the calculation in a scikit-learn style ``fit`` /
    ``transform`` pair: ``fit`` works out which columns to drop, ``transform``
    drops them.

    See Also
    --------
    `Wikipedia Cramer's V <https://www.wikiwand.com/en/Cram%C3%A9r's_V>`_
    `Association Metrics on Github <https://github.com/HeberTU/association_metrics>`_

    Parameters
    ----------
    thresh : float, by default 0.5

    Attributes
    ----------
    thresh : float
        The threshold above which two features are assumed to be related,
        in which case one of them is marked for dropping during fitting.
        The following thresholds are often assumed :
            * Unrelated : 0 to 0.33
            * Medium relation : 0.33 to 0.5
            * Strongly related : 0.5 to 1
    matrix : Dict[Tuple[str, str], float]
        Dict with the calculated Cramér's V associations. The dictionary is
        filled both ways, ``(i, j)`` and ``(j, i)``, with tuples as keys.
        Pairs involving an already dropped column are not calculated.
    data : pd.DataFrame
        A copy of the data used during fitting.
    features_to_drop_ : List[str]
        List of all features to drop, in the column order of the fitted data.
    _fitted : bool
        Whether the .fit function has been called.
    """

    def __init__(
        self: Self,
        thresh: float = 0.5,
    ):
        self.thresh = thresh
        self.features_to_drop_ = []
        self._fitted = False
        self.data = None
        self.matrix = None

    def fit(
        self: Self,
        X: pd.DataFrame,
        y: pd.Series = None,
    ) -> Self:
        """Fit the object based on Cramér's V selection method.

        Calculates the Cramér's V association value for each pair of
        categorical columns in X and stores the results in ``matrix``. When a
        pair's association exceeds ``thresh``, the first column of the pair is
        marked for dropping and is skipped in all later pairs.

        Only columns with dtype "category" are considered.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        y : pd.Series, by default None
            The target variable. Not used, only added for SKLearn compatibility.

        Returns
        -------
        Self
            The fitted object.

        Raises
        ------
        KeyError
            In case no columns with dtype "category" are present in the passed
            dataframe.
        """
        # --- Import ---
        from scipy.stats.contingency import association

        # --- Variables ---
        data = X.copy()
        thresh = self.thresh

        # --- Script ---
        # Select only categorical variables
        columns = data.select_dtypes(include=['category']).columns
        if len(columns) == 0:
            raise KeyError("No categorical variables found")
        pairs = list(combinations(columns, r=2))

        # Fill the matrix. A set keeps the "already dropped" test O(1).
        dropped = set()
        matrix = {}
        progress = tqdm(pairs, desc="Combo", leave=False)
        for i, j in progress:
            progress.set_postfix_str(f"{i} : {j}")

            # If either column is already dropped, don't calculate
            if i in dropped or j in dropped:
                continue

            input_tab = pd.crosstab(data[i], data[j])
            res_cramer = association(input_tab, method='cramer')
            matrix[(i, j)] = matrix[(j, i)] = res_cramer
            if res_cramer > thresh:
                dropped.add(i)

        # Save
        self._fitted = True
        self.matrix = matrix
        # `data` is already a private, unmodified copy of X
        self.data = data
        # Report in column order so the result is reproducible between runs
        self.features_to_drop_ = [c for c in columns if c in dropped]
        return self

    def transform(
        self: Self,
        X: pd.DataFrame,
    ) -> pd.DataFrame:
        """Transform the dataframe based on Cramér's V selection.

        Drops the columns selected during fitting. Columns that are listed in
        ``features_to_drop_`` but absent from X are silently ignored; no check
        is made that X has the same columns as the fitted data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.

        Returns
        -------
        pd.DataFrame
            A copy of X without the dropped columns.

        Raises
        ------
        NotFittedError
            If the object has not been fitted.
        """
        # --- Check ---
        if not self._fitted:
            raise NotFittedError("Object has not been fitted")
        return X.drop(columns=self.features_to_drop_, errors="ignore").copy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Different kinds of transformers used in the FeatureTransformer. | |
Module containing several kinds of SKLearn compatible transformers to select | |
and filter out specific types of features that could potentially reduce the | |
accuracy of our forecasts. | |
See Also | |
-------- | |
`Multicollinearity impact <https://www.kaggle.com/code/ffisegydd/sklearn-multicollinearity-class/notebook>`_
""" | |
# %% | |
############# | |
# Libraries # | |
############# | |
from typing import List, Self, Tuple | |
import numpy as np | |
import pandas as pd | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.exceptions import NotFittedError | |
from sklearn.impute import SimpleImputer | |
############# | |
# ReduceVIF # | |
############# | |
class ReduceVIF(BaseEstimator, TransformerMixin):
    """Variance Inflation Factors detection and suppression algorithm.

    Algorithm to iteratively calculate the Variance Inflation Factor (VIF) of
    each feature in the provided dataframe and drop, one at a time, the
    feature with the highest VIF until no remaining feature exceeds the
    threshold. The aim is to reduce multicollinearity in the model.

    See Also
    --------
    statsmodels.stats.outliers_influence.variance_inflation_factor
    sklearn.impute.SimpleImputer

    Parameters
    ----------
    thresh : float, by default 5.0
    impute : bool, by default False
    impute_strategy : str, {'median', 'mean', 'most_frequent'}

    Attributes
    ----------
    thresh : float
        The threshold above which a column is dropped. Author's guidance:
        5 to 10 is ok / above 10 : to drop.
    impute : bool
        Whether to impute the NaN values before calculating the VIF.
    impute_strategy : str
        The strategy handed to the SimpleImputer used to fill NaN values.
    imputer : SimpleImputer
        SimpleImputer initialized with the chosen impute_strategy. Only
        present when imputing is enabled.
    data : pd.DataFrame
        A copy of the data used during fitting.
    features_to_drop_ : List[str]
        List of all features to drop, in the order they were eliminated.
    _fitted : bool
        Whether the .fit function has been called.
    """

    def __init__(
        self: Self,
        thresh: float = 5.0,
        impute: bool = False,
        impute_strategy: str = 'median',
    ):
        self.thresh = thresh
        self.impute_strategy = impute_strategy
        self.features_to_drop_ = []
        self._fitted = False
        self.impute = impute
        if impute:
            self.imputer = SimpleImputer(strategy=impute_strategy)

    @staticmethod
    def calculate_vif(
        X: pd.DataFrame,
        thresh: float = 5.0,
    ) -> Tuple[pd.DataFrame, List[str]]:
        """Iteratively drop the column with the highest VIF.

        On every pass the VIF of each remaining column is calculated and, if
        the highest one exceeds ``thresh``, that column is dropped. The
        process stops as soon as no column exceeds the threshold or only one
        column is left (a lone column has nothing to be collinear with).

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        thresh : float, by default 5.0
            The threshold over which we decide to drop the chosen column.

        See Also
        --------
        `Inspiration of function <https://stats.stackexchange.com/a/253620/53565>`_

        Returns
        -------
        pd.DataFrame
            The transformed DataFrame with all high VIF columns dropped.
        List[str]
            The list of all dropped columns, in elimination order.
        """
        # --- Import ---
        from statsmodels.stats.outliers_influence import variance_inflation_factor

        # --- Setting Variables ---
        to_drop = []

        # --- Script ---
        # Every pass either removes one column or stops, so this terminates.
        while X.shape[1] > 1:
            feat = X.columns
            values = X.values  # hoisted: identical for every column below

            # Calculate the VIF for each and every column (by position, so
            # duplicated column labels cannot confuse the lookup)
            vif = [
                variance_inflation_factor(values, position)
                for position in range(values.shape[1])
            ]

            # Find the highest VIF; stop when it is within the threshold.
            # An infinite VIF (perfect collinearity) is handled like any
            # other value above the threshold.
            max_vif = max(vif)
            if not max_vif > thresh:
                break

            # Find column to drop and save it
            col = feat[vif.index(max_vif)]
            to_drop.append(col)
            X = X.drop(columns=col, errors="ignore")

        return X, to_drop

    def fit(
        self: Self,
        X: pd.DataFrame,
        y: pd.Series = None,
    ) -> Self:
        """Fit the object by selecting the high VIF columns to drop.

        Optionally imputes NaN values first (when ``impute`` is true), then
        delegates the column selection to ``calculate_vif``.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        y : pd.Series, by default None
            The target variable. Not used, only added for SKLearn compatibility.

        Returns
        -------
        Self
            The fitted object.

        See Also
        --------
        ReduceVIF.calculate_vif
        """
        # --- Variables ---
        self.data = X.copy()

        # --- Script ---
        # Read the current parameters rather than relying on what __init__
        # built, so values changed through set_params are honoured.
        if self.impute:
            self.imputer = SimpleImputer(strategy=self.impute_strategy)
            X = pd.DataFrame(
                self.imputer.fit_transform(X),
                columns=X.columns.tolist(),
                index=X.index,
            )

        _, to_drop = ReduceVIF.calculate_vif(X, self.thresh)
        self.features_to_drop_ = list(to_drop)
        self._fitted = True
        return self

    def transform(
        self: Self,
        X: pd.DataFrame,
    ) -> pd.DataFrame:
        """Drop the columns selected during fitting.

        Columns listed in ``features_to_drop_`` but absent from X are
        silently ignored. No imputing is applied here.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.

        Returns
        -------
        pd.DataFrame
            A copy of X without the dropped columns.

        Raises
        ------
        NotFittedError
            If the object has not been fitted.
        """
        # --- Checks ---
        if not self._fitted:
            raise NotFittedError("Model has not been fitted yet")
        return X.drop(columns=self.features_to_drop_, errors="ignore").copy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""RegexSelector selection and suppression algorithm.""" | |
# %% | |
############# | |
# Libraries # | |
############# | |
import re | |
from typing import List, Self | |
import pandas as pd | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.exceptions import NotFittedError | |
################# | |
# RegexSelector # | |
################# | |
# ? : to remove as unused? | |
class RegexSelector(BaseEstimator, TransformerMixin):
    """Regex driven column selection and suppression algorithm.

    Selects groups of feature columns with regular expressions and, within
    each group, keeps only the columns containing a given marker string.
    Everything else in the group is dropped.

    Parameters
    ----------
    regex : List[str]

    Attributes
    ----------
    regex : List[str]
        A list of lists or tuples, each with two entries:
            * Regex string selecting the group of features.
            * String to look for in the selected names. Names containing it
              are kept; with ``None`` the whole group is dropped.
    features : List[str]
        List of all features seen during fitting.
    data : pd.DataFrame
        A copy of the data used during fitting.
    features_to_drop_ : List[str]
        List of all features to drop.
    _fitted : bool
        Whether the .fit function has been called.

    Example
    -------
    Formatting of the regex list ::

        regex = [
            ["^(SIX_Lead_time_|SIX_t_btwn_o_).*", "100"],
            ["^(Holidays_).*(_Cat_).*", "001"],
            ["^(Holidays_DC_|Holidays_XPO_).*", "DCA"],
        ]
    """

    def __init__(
        self: Self,
        regex: List[str],
    ):
        self.regex = regex
        self._fitted = False
        self.data = None
        self.features = None
        self.features_to_drop_ = []

    def fit(
        self: Self,
        X: pd.DataFrame,
        y: pd.Series = None,
    ) -> Self:
        """Fit the object and record which columns must be dropped.

        Walks over the (pattern, marker) pairs given at initialization and
        collects, for each pattern, the matching column names that do not
        contain the marker.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.
        y : pd.Series, by default None
            The target variable. Not used, only added for SKLearn compatibility.

        Returns
        -------
        Self
            The fitted object.
        """
        # --- Variables ---
        rules = self.regex
        column_names = list(X.columns)
        self.features = column_names
        self.data = X.copy()

        # --- Script ---
        rejected = []
        for pattern, keep_marker in rules:
            # Columns belonging to the group described by the pattern
            candidates = [
                name for name in column_names if re.search(pattern, name)
            ]
            # With a marker, spare the candidates that contain it
            if keep_marker is not None:
                candidates = [
                    name for name in candidates if str(keep_marker) not in name
                ]
            rejected.extend(candidates)

        # Saving (a column hit by several rules is listed once)
        self.features_to_drop_ = list(set(rejected))
        self._fitted = True
        return self

    def transform(
        self: Self,
        X: pd.DataFrame,
    ) -> pd.DataFrame:
        """Remove the columns chosen in fit from the dataframe.

        Columns listed in ``features_to_drop_`` but absent from X are
        silently ignored; no check is made that X has the same columns as the
        fitted data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature dataframe with all columns.

        Returns
        -------
        pd.DataFrame
            A copy of X without the dropped columns.

        Raises
        ------
        NotFittedError
            If the object has not been fitted.
        """
        # --- Check ---
        if self._fitted:
            return X.drop(columns=self.features_to_drop_, errors="ignore").copy()
        raise NotFittedError("Object has not been fitted")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment