Last active
August 29, 2019 01:44
-
-
Save ElissandroMendes/8101499571b3d61a02f95a00045e24b2 to your computer and use it in GitHub Desktop.
Tutorial code - Intermediate Machine Learning - Micro course from Kaggle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
# Load the training and test tables, indexed by their 'Id' column
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')

# Rows without a target value cannot be used for fitting: discard them,
# then move the target out of the predictor table
X = X.dropna(subset=['SalePrice'], axis=0)
y = X.SalePrice
X = X.drop(columns=['SalePrice'])

# Keep things simple: remove every predictor column that has a missing value
# in the training table, and remove the same columns from the test table
cols_with_missing = [col for col, has_gap in X.isnull().any().items() if has_gap]
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Hold out 20% of the training rows as a validation set (fixed seed so the
# split is reproducible)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.metrics import mean_absolute_error | |
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A ``RandomForestRegressor`` (100 trees, ``random_state=0``) is fitted on
    ``X_train``/``y_train``; its predictions for ``X_valid`` are compared
    against ``y_valid``.  Used to compare different preprocessing approaches
    on the same train/validation split.
    """
    # fit() returns the fitted estimator itself, so construction and fitting
    # can be chained
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))
# First approach: discard every categorical (object-dtype) column from both
# the training and the validation predictors
drop_X_train = X_train.select_dtypes(exclude='object')
drop_X_valid = X_valid.select_dtypes(exclude='object')

print("MAE from Approach 1 (Drop categorical variables):")
approach_1_mae = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
print(approach_1_mae)

# Show the categories of one example column as seen in each split
train_condition2 = X_train['Condition2'].unique()
valid_condition2 = X_valid['Condition2'].unique()
print("Unique values in Condition2 column in training data:", train_condition2)
print("\nUnique values in Condition2 column in validation data:", valid_condition2)
# Original author's observation: the validation data contains values that do
# not appear in the training data
# All categorical (object-dtype) columns of the training predictors
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded.  The encoder is fitted on the
# training split only and rejects labels it has not seen, so a column is safe
# exactly when every category in the validation split also occurs in the
# training split.  Requiring the two sets to be *equal* would needlessly
# discard columns whose extra categories occur only in training.
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.  Filtering
# object_cols (instead of taking a set difference) keeps the original column
# order, so the result is identical from run to run.  The lists hold only a
# handful of column names, so list membership is cheap enough here.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
from sklearn.preprocessing import LabelEncoder | |
# Start from copies of the predictors without the columns that will not be
# encoded
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Label-encode each remaining categorical column: learn the mapping from the
# training split, then apply that same mapping to both splits
encoder = LabelEncoder()
for col in good_label_cols:
    encoder.fit(X_train[col])
    label_X_train[col] = encoder.transform(X_train[col])
    label_X_valid[col] = encoder.transform(X_valid[col])

print("MAE from Approach 2 - Label Encoding:")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))
## Investigate cardinality
# Number of distinct values in each categorical column of the training data,
# in the same order as object_cols
object_nunique = [X_train[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Print the (column, count) pairs in ascending order of count.  The sorted
# list is passed to print() explicitly: as a bare expression it is only
# echoed by an interactive notebook and is silently discarded in a script.
print(sorted(d.items(), key=lambda item: item[1]))
## Only apply one-hot encoding to low-cardinality columns
# Columns that will be one-hot encoded: categorical columns with fewer than
# 10 distinct values in the training data
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset.  Filtering object_cols
# (instead of taking a set difference) keeps the original column order, so
# the result is identical from run to run.  The lists hold only a handful of
# column names, so list membership is cheap enough here.
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment