from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, model_selection, ensemble
from sklearn.metrics import accuracy_score, roc_curve, auc
data = datasets.load_breast_cancer()
x_data = data.data
y_data = data.target
def create_model():
    # Tree-based model, so standard scaling or min-max scaling is not required
    model = ensemble.GradientBoostingClassifier(n_estimators=1000,
                                                max_depth=4,
                                                min_samples_split=5,
                                                learning_rate=0.01,
                                                random_state=42)
    return model
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data,
                                                                    y_data,
                                                                    test_size=0.3,
                                                                    random_state=0)
model = create_model()
model.fit(x_train, y_train)
pred_test = model.predict_proba(x_test)
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=pred_test[:,1])
roc_auc = auc(fpr, tpr)
print('AUC: ', roc_auc)
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
AUC: 0.9961787184009406
Accuracy: 0.9590643274853801
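matplotlib.pyplot is imported above but never used; as a minimal sketch (reusing the fpr, tpr, and roc_auc values computed above), the ROC curve behind that AUC can be plotted like this:
# Sketch: plot the ROC curve from the fpr/tpr arrays computed above
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label='GradientBoosting (AUC = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance level')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()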
K-Fold CV
from sklearn import model_selection
kf = model_selection.KFold(n_splits=10, shuffle=False) # shuffle=False so the result matches the cross_val_score run used later
kf.split(x_data, y_data) # Generate indices to split data into training and test set
for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(x_data, y_data)):
    train_data, train_label = x_data[train_idx, :], y_data[train_idx]
    valid_data, valid_label = x_data[valid_idx, :], y_data[valid_idx]
    print('[{} Fold] \n Selected validation data : \n {} \n {} \n'.format(fold_idx, valid_label, Counter(valid_label)))
Stratified K-Fold CV
from sklearn import model_selection
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)
for fold_idx, (train_idx, valid_idx) in enumerate(stratified_kf.split(x_data, y_data)):
    train_data, train_label = x_data[train_idx, :], y_data[train_idx]
    valid_data, valid_label = x_data[valid_idx, :], y_data[valid_idx]
    print('[{} Fold] \n Selected validation data : \n {} \n {} \n'.format(fold_idx, valid_label, Counter(valid_label)))
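To make the effect of stratification explicit, the sketch below (reusing the kf and stratified_kf splitters defined above) prints the positive-class ratio of each validation fold; Stratified K-Fold keeps it close to the overall ratio, while plain K-Fold with shuffle=False can drift from fold to fold.
# Sketch: positive-class ratio per validation fold, K-Fold vs. Stratified K-Fold
for name, splitter in [('KFold', kf), ('StratifiedKFold', stratified_kf)]:
    fold_ratios = [y_data[valid_idx].mean() for _, valid_idx in splitter.split(x_data, y_data)]
    print('{} : {}'.format(name, np.round(fold_ratios, 3)))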
Stratified K-Fold CV → Model-training
valid_scores = []
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)
for fold_idx, (train_idx, valid_idx) in enumerate(stratified_kf.split(x_data, y_data)):
    train_data, train_label = x_data[train_idx, :], y_data[train_idx]
    valid_data, valid_label = x_data[valid_idx, :], y_data[valid_idx]
    model = create_model()
    model.fit(train_data, train_label)
    train_acc = accuracy_score(train_label, model.predict(train_data)) * 100
    valid_acc = accuracy_score(valid_label, model.predict(valid_data)) * 100
    print('[{} Fold] \n Accuracy-Training : {:.2f}% \n Accuracy-Validation : {:.2f}% \n'.format(fold_idx,
                                                                                                train_acc,
                                                                                                valid_acc))
    valid_scores.append(valid_acc)
print('Cross-Validation Score : {:.2f}%'.format(np.mean(valid_scores)))
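Reporting only the mean hides how much the score varies between folds; a small extension of the loop above (reusing valid_scores) also prints the standard deviation.
# Sketch: fold-to-fold spread of the validation accuracy
print('Cross-Validation Score : {:.2f}% (+/- {:.2f}%)'.format(np.mean(valid_scores), np.std(valid_scores)))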
cross_val_score → Model-training (much more concise and easier code)
1) 'cv' parameter <- int
from sklearn import model_selection
model = create_model()
# When the estimator is a classifier and y_data is binary or multiclass, Stratified K-Fold is applied automatically
valid_scores = model_selection.cross_val_score(model, x_data, y_data,
                                               cv=10, verbose=1,
                                               n_jobs=-1) # Number of jobs to run in parallel. '-1' == use all processors
valid_scores
print('Cross-Validation Score : {:.2f}%'.format(np.mean(valid_scores * 100)))
array([0.96491228, 0.92982456, 0.92982456, 0.92982456, 1. ,
0.96491228, 0.98245614, 0.98245614, 0.98245614, 0.98214286])
Cross-Validation Score : 96.49%
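The comment above claims that an integer cv combined with a classifier falls back to Stratified K-Fold; this can be checked with sklearn's check_cv helper. A minimal sketch:
# Sketch: confirm that cv=10 resolves to StratifiedKFold for a classification target
cv_obj = model_selection.check_cv(cv=10, y=y_data, classifier=True)
print(type(cv_obj))  # expected: StratifiedKFold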
2) 'cv' parameter <- KFold or StratifiedKFold
# K-Fold
model = create_model()
kf = model_selection.KFold(n_splits=10, shuffle=False)
valid_scores_kf = model_selection.cross_val_score(model, x_data, y_data,
                                                  cv=kf, verbose=1, n_jobs=-1)
# Stratified K-Fold
model = create_model()
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)
valid_scores_s_kf = model_selection.cross_val_score(model, x_data, y_data,
                                                    cv=stratified_kf, verbose=1, n_jobs=-1)
print('Cross-Validation Score (K-Fold): {:.2f}%'.format(np.mean(valid_scores_kf * 100)))
print('Cross-Validation Score (Stratified K-Fold): {:.2f}%'.format(np.mean(valid_scores_s_kf * 100)))
Cross-Validation Score (K-Fold): 96.32%
Cross-Validation Score (Stratified K-Fold): 96.49%
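To collect both of the metrics used in the first hold-out experiment (accuracy and ROC-AUC) per fold in a single call, cross_validate accepts a list of scorers. A minimal sketch, assuming the same x_data, y_data, and create_model as above:
# Sketch: cross_validate with multiple scorers
cv_results = model_selection.cross_validate(create_model(), x_data, y_data,
                                            cv=10, scoring=['accuracy', 'roc_auc'],
                                            n_jobs=-1)
print('Accuracy : {:.2f}%'.format(np.mean(cv_results['test_accuracy']) * 100))
print('ROC-AUC  : {:.4f}'.format(np.mean(cv_results['test_roc_auc'])))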