Notice
Recent Posts
Recent Comments
Link
«   2026/02   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
Tags
more
Archives
Today
Total
관리 메뉴

Silver bullet

Pipeline for StandardScaler & OneHotEncoder 본문

AI/AI

Pipeline for StandardScaler & OneHotEncoder

밀크쌀과자 2024. 7. 12. 04:42

1. Feature engineering & Feature selection (+ 데이터 읽어들이기 & Binary label 만들어주기)

* (중요) 데이터 누수(data leakage)를 막기 위해 train_test_split을 먼저 한 다음, train 데이터에만 fit한 pipeline을 적용해야 한다.

# Load the Boston housing data, binarize the target around its mean price,
# and hold out 30% of the rows for testing.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, model_selection, linear_model
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

# Feature matrix: the 13 standard Boston-housing columns.
df_data = pd.read_excel('boston_house_data.xlsx', index_col=0)
df_data.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                   'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# Target: house price, turned into a binary label (1 = above average).
df_target = pd.read_excel('boston_house_target.xlsx', index_col=0)
df_target.columns = ['Price']

mean_price = df_target['Price'].mean()
df_target['Price'] = (df_target['Price'] > mean_price).astype(int)

# 70/30 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    df_data, df_target, test_size=0.3, random_state=0)

Make Pipeline for feature-transformer (StandardScaler & OneHotEncoder)

# Reference:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Quick demo: one-hot encode the 'RAD' column on its own.
# OneHotEncoder expects 2-D input, hence the reshape into a single column.
rad_column = np.array(df_data['RAD']).reshape(-1, 1)

ohe = OneHotEncoder(categories='auto', handle_unknown='ignore')
ohe.fit(rad_column)
pd.DataFrame(ohe.transform(rad_column).todense())  # dense view of the encoding

# Alternative way to build the numeric-column list:
# numeric_features = list(df_data.columns)
# numeric_features.remove('CHAS')
# numeric_features.remove('RAD')

# Column groups: continuous features get z-score scaling, the two
# discrete features get one-hot encoding.
numeric_features = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE',
                    'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
categorical_features = ['CHAS', 'RAD']

numeric_transformer = StandardScaler()  # RobustScaler is a drop-in alternative
# categories='auto': just silences the old category-inference warning.
# handle_unknown='ignore': an unseen category at transform time produces
# an all-zero one-hot row for that feature instead of raising.
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# ColumnTransformer routes each column group to its own transformer;
# every entry is a (name, transformer, columns) triple.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

handle_unknown='ignore' : if an unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will be all zeros. (변환 중 알 수 없는 범주가 발견되면 이 기능에 대한 결과적인 원 핫 인코딩 열은 모두 0이 됩니다. )

 


Pipeline usage - 1) Preprocessing-only (fit & transform)

가장 권장하는 방법

# Preprocessing-only pipeline: fit the transformers on the training split
# only, then apply that same fitted transformation to both splits.
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
preprocessor_pipe.fit(x_train)

x_train_transformed = preprocessor_pipe.transform(x_train)
x_test_transformed = preprocessor_pipe.transform(x_test)

# NOTE: if any column listed in categorical_features holds text (string)
# values rather than numbers, .transform() may return a sparse csr_matrix
# instead of an np.array; in that case call .todense() right after
# .transform(), e.g.:
#   preprocessor_pipe.transform(x_train).todense()
from sklearn.ensemble import GradientBoostingClassifier

# Train on the transformed features (x_train_transformed, not x_train).
model = GradientBoostingClassifier(n_estimators=200, random_state=0)
# y_train is an (n, 1) DataFrame; flatten it to the 1-D array sklearn
# expects, instead of relying on the implicit ravel that normally emits
# a DataConversionWarning (hidden here by filterwarnings above).
model.fit(x_train_transformed, y_train.values.ravel())

accuracy = model.score(x_test_transformed, y_test)
print("model score:", round(accuracy, 4))
model score: 0.8553

Pipeline usage - 2) Preprocessing + Training (at once)

from sklearn.ensemble import GradientBoostingClassifier

# Preprocessing + training in one pipeline: fit() takes the RAW x_train
# and runs the preprocessor internally before the classifier step.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=200, random_state=0)),
])

# Flatten the (n, 1) target DataFrame to the 1-D array sklearn expects
# (avoids the implicit-ravel DataConversionWarning).
model.fit(x_train, y_train.values.ravel())

accuracy = model.score(x_test, y_test)
print("model score:", round(accuracy, 4))
model score: 0.8553

Pipeline usage - 3) Preprocessing + Training + Tuning hyper-params (at once)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Bare pipeline: the classifier's hyper-parameters come from the grid
# below, so none are set here.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', GradientBoostingClassifier())])

model.get_params().keys()  # lists the 'step__param' names usable in the grid

# NOTE(review): 'deviance' was renamed to 'log_loss' in scikit-learn 1.1
# and removed in 1.3 — on recent versions replace it accordingly.
param_grid = {
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__learning_rate': [0.01, 0.001],
    'classifier__n_estimators': [200, 400],  # larger options: 500, 1000, 1500
    'classifier__min_samples_split': [2, 4],
    'classifier__max_depth': [2, 4],
    'classifier__random_state': [0],
}

# 3-fold CV over all 2*2*2*2*2 = 32 combinations; refit=True retrains the
# best combination on the full training set so grid_search can score directly.
grid_search = GridSearchCV(model, param_grid,
                           refit=True, cv=3, n_jobs=1, verbose=1,
                           scoring='accuracy')

# Flatten the (n, 1) target DataFrame to the 1-D array sklearn expects
# (avoids the implicit-ravel DataConversionWarning).
grid_search.fit(x_train, y_train.values.ravel())
print("Best params:", grid_search.best_params_)

accuracy = grid_search.score(x_test, y_test)
print("\nmodel score:", round(accuracy, 4))
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best params: {'classifier__learning_rate': 0.01, 'classifier__loss': 'deviance', 'classifier__max_depth': 4, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 400, 'classifier__random_state': 0}

model score: 0.8421