Notice
Recent Posts
Recent Comments
Link
«   2026/02   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
Tags
more
Archives
Today
Total
관리 메뉴

Silver bullet

Bayesian-Search HPO with Scikit-Optimize 본문

AI/AI

Bayesian-Search HPO with Scikit-Optimize

밀크쌀과자 2024. 7. 13. 08:14

유방암 데이터 셋 사용

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset and hold out a test partition.
cancer = load_breast_cancer()

# stratify=cancer.target would preserve the per-class ratio in both splits
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)


# Standardize features; the scaler is fitted on the training set only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tree-based ensembles do not require feature scaling, so the baseline
# LightGBM model is trained on the raw (unscaled) features.
clf = LGBMClassifier(verbose=-1)
clf.fit(X_train, y_train)

print("Accuracy on Training set: {:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy on Test set: {:.3f}".format(clf.score(X_test, y_test)))
Accuracy on Training set: 1.000
Accuracy on Test set: 0.972

Hyper-Parameter Optimization

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

%%time

param_grid = {'n_estimators' : [100, 300, 500, 1000],
              'min_child_samples' : [10, 20],
              'max_depth':[-1, 3, 5], 
              'random_state':[42]}

hpo = GridSearchCV(LGBMClassifier(), param_grid, refit=True, verbose=1, n_jobs=-1)

hpo.fit(X_train, y_train)
print('The best parameters are ', hpo.best_params_, '\n')

print("Accuracy on Training set: {:.6f}".format(hpo.score(X_train, y_train)))
print("Accuracy on Test set: {:.6f}".format(hpo.score(X_test, y_test)))

# pd.DataFrame.from_dict(hpo.cv_results_)
%%time

param_grid = {'n_estimators' : stats.randint(100, 1000),
              'min_child_samples' : stats.randint(10, 20),
              'max_depth': stats.randint(-1, 5), 
              'random_state':[42]}

hpo = RandomizedSearchCV(LGBMClassifier(), param_grid, refit=True, verbose=1, n_jobs=-1, 
                         n_iter=72)

hpo.fit(X_train, y_train)
print('The best parameters are ', hpo.best_params_, '\n')

print("Accuracy on Training set: {:.6f}".format(hpo.score(X_train, y_train)))
print("Accuracy on Test set: {:.6f}".format(hpo.score(X_test, y_test)))

# pd.DataFrame.from_dict(hpo.cv_results_)

3) Bayesian-Search (scikit-optimize 활용)

# scikit-optimize provides BayesSearchCV, a drop-in replacement for
# GridSearchCV / RandomizedSearchCV that chooses the next candidates
# via Bayesian optimization instead of exhaustive or random sampling.
!pip install scikit-optimize==0.9.0
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer

%%time

# Search space: skopt Integer(low, high) bounds are INCLUSIVE on both ends.
param_grid = {'n_estimators' : Integer(100, 1000),
              'min_child_samples' : Integer(10, 20),
              'max_depth': Integer(-1, 5), 
              'random_state':[42]}

# 72 Bayesian-optimization iterations with 5-fold cross-validation;
# refit=True retrains the best model on the full training set.
hpo = BayesSearchCV(LGBMClassifier(verbose=-1), param_grid, refit=True, n_jobs=-1,
                    n_iter=72, cv=5)

hpo.fit(X_train, y_train)
print('The best parameters are ', hpo.best_params_, '\n')

print("Accuracy on Training set: {:.6f}".format(hpo.score(X_train, y_train)))
print("Accuracy on Test set: {:.6f}".format(hpo.score(X_test, y_test)))

# pd.DataFrame.from_dict(hpo.cv_results_)

 

딥러닝 : Bayesian-Search > Randomized-Search > Grid-Search