Notice
Recent Posts
Recent Comments
Link
«   2026/02   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
Tags
more
Archives
Today
Total
관리 메뉴

Silver bullet

Stratified K-Fold CV & cross_val_score 본문

AI/AI

Stratified K-Fold CV & cross_val_score

밀크쌀과자 2024. 7. 12. 09:55
from collections import Counter 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, model_selection, ensemble
from sklearn.metrics import accuracy_score, roc_curve, auc

# Load the breast-cancer dataset (a standard binary-classification benchmark).
breast_cancer = datasets.load_breast_cancer()

# Feature matrix and target vector used throughout the rest of the script.
x_data, y_data = breast_cancer.data, breast_cancer.target

def create_model():
    """Return a fresh GradientBoostingClassifier with fixed hyper-parameters.

    A new, unfitted estimator is built on every call so that each CV fold
    can train from scratch. Tree-based models do not require standard or
    min-max feature scaling, so no scaler is attached.
    """
    hyper_params = dict(
        n_estimators=1000,
        max_depth=4,
        min_samples_split=5,
        learning_rate=0.01,
        random_state=42,
    )
    return ensemble.GradientBoostingClassifier(**hyper_params)
from sklearn import model_selection

# Hold out 30% of the samples as a test set; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data,
                                                                    y_data,
                                                                    test_size=0.3,
                                                                    random_state=0)

model = create_model()
model.fit(x_train, y_train)

# Class-1 probabilities are required to build the ROC curve.
pred_test = model.predict_proba(x_test)
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=pred_test[:, 1])
roc_auc = auc(fpr, tpr)

print('AUC: ', roc_auc)
# FIX: sklearn's signature is accuracy_score(y_true, y_pred); the original
# call had the two arguments swapped. Accuracy is symmetric so the printed
# value is unchanged, but the corrected order matches the API contract and
# avoids silent errors if this line is copied for an asymmetric metric.
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
AUC:  0.9961787184009406
Accuracy:  0.9590643274853801

 

K-Fold CV

from sklearn import model_selection

# shuffle=False so these folds match the cross_val_score(cv=KFold(...)) result
# computed later in the script.
kf = model_selection.KFold(n_splits=10, shuffle=False)

# KFold.split yields one (train_indices, valid_indices) pair per fold.
# FIX: removed a stray `kf.split(x_data, y_data)` statement whose returned
# generator was immediately discarded — dead code with no effect.
for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(x_data, y_data)):
    train_data, train_label = x_data[train_idx, :], y_data[train_idx]
    valid_data, valid_label = x_data[valid_idx, :], y_data[valid_idx]

    # Plain K-Fold ignores labels, so per-fold class counts may be unbalanced.
    print('[{} Fold] \n Selected validation data : \n {} \n {} \n'.format(fold_idx, valid_label, Counter(valid_label)))

 

Stratified K-Fold CV

from sklearn import model_selection

# StratifiedKFold preserves the class ratio of y_data inside every fold.
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)

fold_iterator = stratified_kf.split(x_data, y_data)
for fold_idx, (train_idx, valid_idx) in enumerate(fold_iterator):
    train_data = x_data[train_idx, :]
    train_label = y_data[train_idx]
    valid_data = x_data[valid_idx, :]
    valid_label = y_data[valid_idx]

    # Counter shows that each validation fold keeps a near-identical class mix.
    print('[{} Fold] \n Selected validation data : \n {} \n {} \n'.format(fold_idx, valid_label, Counter(valid_label)))

 

Stratified K-Fold CV → Model-training

# Accumulates per-fold validation accuracy (in percent) for the final average.
valid_scores = []

stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)

for fold_idx, (train_idx, valid_idx) in enumerate(stratified_kf.split(x_data, y_data)):
    train_data, train_label = x_data[train_idx, :], y_data[train_idx]
    valid_data, valid_label = x_data[valid_idx, :], y_data[valid_idx]

    # A fresh model per fold so no fitted state leaks between folds.
    model = create_model()
    model.fit(train_data, train_label)

    # FIX: sklearn's signature is accuracy_score(y_true, y_pred); the original
    # calls had the arguments swapped. The accuracy value is unchanged
    # (the metric is symmetric), but the corrected order matches the API.
    train_acc = accuracy_score(train_label, model.predict(train_data)) * 100
    valid_acc = accuracy_score(valid_label, model.predict(valid_data)) * 100
    print('[{} Fold] \n Accuracy-Training : {:.2f}% \n Accuracy-Validation : {:.2f}% \n'.format(fold_idx, 
                                                                                                train_acc, 
                                                                                                valid_acc))
    valid_scores.append(valid_acc)

print('Cross-Validation Score : {:.2f}%'.format(np.mean(valid_scores)))

cross_val_score → Model-training (훨씬 간결하고 쉬운 코드)

1) 'cv' parameter <- int

from sklearn import model_selection

model = create_model()

# When the estimator is a classifier and y is binary or multiclass, an
# integer `cv` makes cross_val_score apply Stratified K-Fold automatically.
# FIX: removed a bare `valid_scores` expression statement — it only echoes
# a value in a REPL/notebook and is a no-op in a script.
valid_scores = model_selection.cross_val_score(model, x_data, y_data,
                                               cv=10, verbose=1,
                                               n_jobs=-1)  # n_jobs=-1: run folds on all processors

print('Cross-Validation Score : {:.2f}%'.format(np.mean(valid_scores * 100)))
array([0.96491228, 0.92982456, 0.92982456, 0.92982456, 1.        ,
       0.96491228, 0.98245614, 0.98245614, 0.98245614, 0.98214286])
Cross-Validation Score : 96.49%

2) 'cv' parameter <- KFold or StratifiedKFold

# K-Fold: pass an explicit splitter object as `cv` instead of an integer.
model = create_model()
kf = model_selection.KFold(n_splits=10, shuffle=False)
valid_scores_kf = model_selection.cross_val_score(
    model, x_data, y_data, cv=kf, verbose=1, n_jobs=-1)

# Stratified K-Fold: same idea, but folds preserve the class ratio.
model = create_model()
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)
valid_scores_s_kf = model_selection.cross_val_score(
    model, x_data, y_data, cv=stratified_kf, verbose=1, n_jobs=-1)

# Report both mean accuracies as percentages for comparison.
print('Cross-Validation Score (K-Fold): {:.2f}%'.format(np.mean(valid_scores_kf * 100)))
print('Cross-Validation Score (Stratified K-Fold): {:.2f}%'.format(np.mean(valid_scores_s_kf * 100)))
Cross-Validation Score (K-Fold): 96.32%
Cross-Validation Score (Stratified K-Fold): 96.49%