@ElissandroMendes
Last active August 29, 2019 01:44
Tutorial code - Intermediate Machine Learning micro-course from Kaggle
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)
# First approach: drop categorical columns from the training and validation data
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
print("Unique values in Condition2 column in training data:", X_train['Condition2'].unique())
print("\nUnique values in Condition2 column in validation data:", X_valid['Condition2'].unique())
## There are values in the validation data that don't appear in the training data,
## so an encoder fit on the training data cannot handle them
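# Illustrative sketch (not part of the original tutorial code): fitting a label
# encoder on the training column and then transforming the validation column
# fails when the validation data contains categories never seen in training.
# 'Condition2' is the column inspected above; the try/except only demonstrates
# the error.
from sklearn.preprocessing import LabelEncoder

demo_encoder = LabelEncoder()
demo_encoder.fit(X_train['Condition2'])
try:
    demo_encoder.transform(X_valid['Condition2'])
except ValueError as err:
    print("Label encoding failed on unseen categories:", err)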
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if
                   set(X_train[col]) == set(X_valid[col])]
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
from sklearn.preprocessing import LabelEncoder
# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)
encoder = LabelEncoder()
# Apply label encoder to each safely encodable column
for col in good_label_cols:
    label_X_train[col] = encoder.fit_transform(X_train[col])
    label_X_valid[col] = encoder.transform(X_valid[col])
print("MAE from Approach 2 - Label Encoding:")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))
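# Alternative sketch (not in the original tutorial code): scikit-learn's
# OrdinalEncoder is designed for feature columns and can encode all of the
# good_label_cols in a single call. Both encoders assign codes in sorted
# category order, so the MAE should match Approach 2 above. The names
# ord_X_train / ord_X_valid are introduced here only for illustration.
from sklearn.preprocessing import OrdinalEncoder

ord_X_train = X_train.drop(bad_label_cols, axis=1)
ord_X_valid = X_valid.drop(bad_label_cols, axis=1)

ordinal_encoder = OrdinalEncoder()
ord_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
ord_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

print("MAE from Approach 2 - Ordinal Encoding (alternative):")
print(score_dataset(ord_X_train, ord_X_valid, y_train, y_valid))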
## Investigate cardinality
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))
# Print number of unique entries by column, in ascending order
print(sorted(d.items(), key=lambda x: x[1]))
## Only apply one-hot encoding to low-cardinality columns
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))
print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)
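# Sketch of the one-hot encoding step (Approach 3); this part is not in the
# gist as saved, so treat it as one possible way to finish the exercise.
# Assumes scikit-learn's OneHotEncoder; older releases accept sparse=False,
# newer ones (>=1.2) use sparse_output=False instead.
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each low-cardinality categorical column
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed the index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Newer scikit-learn versions expect all column names to be strings
OH_cols_train.columns = OH_cols_train.columns.astype(str)
OH_cols_valid.columns = OH_cols_valid.columns.astype(str)

# Remove all categorical columns and replace them with the one-hot columns
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))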