%matplotlib inline
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn


from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a synthetic dataset.
X, y = make_blobs(random_state=0)
# Split the data and target labels into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Create the model object first, then fit it on the training set.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# Evaluate the fitted model on the held-out test set.
test_accuracy = logreg.score(X_test, y_test)
print('테스트 세트 점수: {:.2f}'.format(test_accuracy))

테스트 세트 점수: 0.88


# Render mglearn's bundled cross-validation figure (presumably the diagram of
# how the data is split across folds -- confirm against the rendered output).
mglearn.plots.plot_cross_validation()


from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris

# Load the iris dataset and prepare a logistic-regression estimator.
iris = load_iris()
logreg = LogisticRegression(max_iter=1000)

# Cross-validate the estimator, leaving `cv` at the library default.
scores = cross_val_score(estimator=logreg, X=iris.data, y=iris.target)
print('CV 점수:', scores)

CV 점수: [0.96666667 1.         0.93333333 0.96666667 1.        ]


# Same evaluation, with the number of folds stated explicitly.
scores = cross_val_score(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=5,
)
print('CV 점수:', scores)

CV 점수: [0.96666667 1.         0.93333333 0.96666667 1.        ]


# Collapse the per-fold scores into a single mean value.
mean_cv_score = scores.mean()
print('CV 평균 점수: {:.2f}'.format(mean_cv_score))

CV 평균 점수: 0.97


# Use cross_validate, asking for the training scores as well.
from sklearn.model_selection import cross_validate

res = cross_validate(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=5,
    return_train_score=True,
)
display(res)

{'fit_time': array([0.03673053, 0.03090334, 0.02393627, 0.02393579, 0.02293849]),
 'score_time': array([0.        , 0.        , 0.        , 0.0009973 , 0.00099778]),
 'test_score': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'train_score': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ])}


# Put the cross_validate result dict into a table for display.
res_df = pd.DataFrame(data=res)
display(res_df)
# Average each column (timings and scores).
column_means = res_df.mean()
print('평균 시간과 점수:\n', column_means)

평균 시간과 점수:
 fit_time       0.027689
score_time     0.000399
test_score     0.973333
train_score    0.975000
dtype: float64


from sklearn.model_selection import cross_val_predict

# Collect the cross-validated prediction for every iris sample.
pred = cross_val_predict(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=5,
)
# With CV, each sample ends up in a test set exactly once.
print(pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# Render mglearn's bundled figure for stratified cross-validation
# (presumably contrasting it with plain k-fold -- confirm against the output).
mglearn.plots.plot_stratified_cross_validation()


from sklearn.model_selection import KFold

# n_splits: number of folds
kfold = KFold(n_splits=5)

print(
    'CV 점수:\n',
    cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=kfold),
)

CV 점수:
 [1.         1.         0.86666667 0.93333333 0.83333333]


# Print the label vector to show how the iris targets are ordered.
print('Iris 레이블:\n', iris.target)

# Three folds, taken without shuffling.
kfold = KFold(n_splits=3)
print(
    '\nCV 점수:\n',
    cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=kfold),
)

Iris 레이블:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

CV 점수:
 [0. 0. 0.]


# Besides stratified k-fold CV, another way around this is to shuffle the
# data -> shuffle=True
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
print(
    'CV 점수:\n',
    cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=kfold),
)

CV 점수:
 [0.98 0.96 0.96]


from sklearn.model_selection import LeaveOneOut

# Leave-one-out CV: use the LeaveOneOut splitter as the cv strategy.
loo = LeaveOneOut()
scores = cross_val_score(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=loo,
)
# One score is returned per split, so the length is the number of splits.
split_count = len(scores)
print('CV 분할 횟수:', split_count)
mean_accuracy = scores.mean()
print('평균 정확도: {:.2f}'.format(mean_accuracy))

CV 분할 횟수: 150
평균 정확도: 0.97


# Render mglearn's bundled figure for shuffle-split cross-validation
# (exact contents not visible here -- confirm against the rendered output).
mglearn.plots.plot_shuffle_split()


from sklearn.model_selection import ShuffleSplit

# Use 50% of the dataset for training and 50% for testing in each split.
# NOTE(review): no random_state is passed, so the scores are presumably
# different on every run.
shuffle_split = ShuffleSplit(n_splits=10, train_size=.5, test_size=.5)
scores = cross_val_score(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=shuffle_split,
)
print('CV 점수:\n', scores)

# StratifiedShuffleSplit

CV 점수:
 [0.96       0.98666667 0.96       0.90666667 0.97333333 0.97333333
 0.96       0.94666667 0.94666667 0.93333333]


from sklearn.model_selection import GroupKFold

# Build a small synthetic dataset.
X, y = make_blobs(n_samples=12, random_state=0)
# One group id per sample: the first three samples share a group, the next
# four form the second group, then two, then three.
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
# `groups` must be passed by keyword: passing it positionally raises a
# FutureWarning in scikit-learn 0.23 and is an error in later releases.
scores = cross_val_score(
    logreg,
    X,
    y,
    groups=groups,
    cv=GroupKFold(n_splits=3),
)
print('CV 점수:', scores)

CV 점수: [0.75       0.6        0.66666667]

C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\validation.py:67: FutureWarning: Pass groups=[0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3] as keyword args. From version 0.25 passing these as positional arguments will result in an error
  warnings.warn("Pass {} as keyword args. From version 0.25 "


# Render mglearn's bundled figure for group-based k-fold splitting
# (exact contents not visible here -- confirm against the rendered output).
mglearn.plots.plot_group_kfold()


from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RepeatedStratifiedKFold

# Reload the data and recreate the estimator for this section.
iris = load_iris()
logreg = LogisticRegression(max_iter=1000)

# Library defaults (n_splits=5, n_repeats=10) written out explicitly.
rskfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
scores = cross_val_score(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=rskfold,
)

print('CV 점수:\n', scores)
mean_score = scores.mean()
print('CV 평균 점수: {:.3f}'.format(mean_score))

# RepeatedKFold

CV 점수:
 [1.         0.96666667 0.93333333 1.         0.93333333 0.96666667
 0.96666667 0.93333333 1.         0.96666667 0.93333333 1.
 1.         0.96666667 0.96666667 0.9        1.         1.
 0.93333333 0.96666667 0.93333333 0.96666667 0.96666667 1.
 0.96666667 1.         0.96666667 0.96666667 0.9        1.
 0.96666667 0.96666667 0.96666667 0.96666667 0.93333333 0.96666667
 0.96666667 1.         1.         0.9        0.96666667 1.
 0.9        0.96666667 0.96666667 0.9        0.96666667 0.96666667
 1.         0.96666667]
CV 평균 점수: 0.965

티스토리

[Python] 교차검증(Cross Validation, CV) 실습

[Python] 교차검증(Cross Validation, CV) 실습

패키지 불러오기¶

CHAPTER 5 모델 평가와 성능 향상¶

모델 평가 과정¶

5.1 교차 검증(CV, Cross-Validation)¶

5-fold CV에서의 데이터 분할¶

5.1.1 scikit-learn의 교차 검증¶

iris 데이터셋에 적용한 로지스틱 회귀 모형 평가¶

5.1.2 교차 검증의 장점¶

CV가 Test set에 대해 예측한 결과 구하기¶

5.1.3 계층별 k-겹 교차 검증과 그 외 전략들¶

1) 교차 검증 상세 옵션¶

2) LOOCV(Leave-one-out CV)¶

3) 임의 분할 교차 검증(shuffle-split CV)¶

4) 그룹별 교차 검증¶

5.1.4 반복 교차 검증¶

	fit_time	score_time	test_score	train_score
0	0.036731	0.000000	0.966667	0.966667
1	0.030903	0.000000	1.000000	0.966667
2	0.023936	0.000000	0.933333	0.983333
3	0.023936	0.000997	0.966667	0.983333
4	0.022938	0.000998	1.000000	0.975000