#!pip install autogluon.multimodal
14wk-60: 자전거대여 / 하이퍼파라메터 튜닝
1. 강의영상
2. Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
#---#
from autogluon.tabular import TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.common import space
#---#
import IPython
import os
import warnings
# Silence all warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')
3. Data
- 자료 다운로드
!kaggle competitions download -c bike-sharing-demand
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
Downloading bike-sharing-demand.zip to /root/Dropbox
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 830kB/s]
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 830kB/s]
!unzip bike-sharing-demand.zip -d data
Archive: bike-sharing-demand.zip
inflating: data/sampleSubmission.csv
inflating: data/test.csv
inflating: data/train.csv
# Load the three CSV files extracted into ./data by the unzip step.
sampleSubmission = pd.read_csv('data/sampleSubmission.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
!rm -rf data
!rm bike-sharing-demand.zip
4. 기본전처리 및 분석 프로세스
- 전처리
def _to_features(df):
    """Add ``hour``/``weekday`` derived from ``datetime``, then drop ``datetime`` and ``atemp``."""
    # Parse the column once (vectorised) and reuse it for both derived features.
    timestamps = pd.to_datetime(df['datetime'])
    featured = df.assign(hour=timestamps.dt.hour, weekday=timestamps.dt.weekday)
    return featured.drop(['datetime', 'atemp'], axis=1)


def preprocessing(df_train, df_test):
    """Build the model-ready train and test frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``datetime``, ``atemp``, ``casual`` and ``registered``.
    df_test : pandas.DataFrame
        Must contain ``datetime`` and ``atemp``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_featured, df_test_featured)``. Both gain ``hour`` and
        ``weekday`` columns and lose ``datetime`` and ``atemp``; the train
        frame additionally loses ``casual`` and ``registered``. The input
        frames are not modified.
    """
    # `casual` and `registered` are dropped from the train frame only;
    # the test frame is never asked to drop them.
    df_train_featured = _to_features(df_train.drop(['casual', 'registered'], axis=1))
    df_test_featured = _to_features(df_test)
    return df_train_featured, df_test_featured
- 함수들
def plot(yhat, yyhat):
    """Overlay observed and predicted counts for the earliest 24*28 rows.

    Reads the module-level ``df_train`` and ``df_test`` frames, attaches the
    predictions as ``count_hat`` and tags each row with its ``dataset_type``
    ('train' or 'test').

    Parameters
    ----------
    yhat : array-like
        Predictions aligned with the rows of ``df_train``.
    yyhat : array-like
        Predictions aligned with the rows of ``df_test``.
    """
    df = pd.concat([
        df_train.assign(count_hat=yhat, dataset_type='train'),
        df_test.assign(count_hat=yyhat, dataset_type='test'),
    ])
    df['datetime'] = pd.to_datetime(df['datetime'])
    # Both line plots use the same window, so sort and slice only once.
    window = df.sort_values('datetime')[:(24 * 28)]
    # Observed counts: thin dashed line.
    sns.lineplot(
        data=window,
        x='datetime', y='count',
        hue='dataset_type',
        linestyle='--',
        lw=0.8,
    )
    # Predicted counts: thick, semi-transparent line.
    sns.lineplot(
        data=window,
        x='datetime', y='count_hat',
        hue='dataset_type',
        alpha=0.5,
        lw=3,
    )
    fig = plt.gcf()
    fig.set_size_inches(8, 2)
    plt.xticks(rotation=15)
    fig.show()
def submit(yyhat):
    """Write the predictions to ``submission.csv`` and submit them to Kaggle.

    Stores ``yyhat`` in the ``count`` column of the module-level
    ``sampleSubmission`` frame (modifying it in place), replaces every value
    that is not strictly positive with 0, writes the CSV, runs the Kaggle CLI
    submit command and finally removes the CSV file.

    Parameters
    ----------
    yyhat : array-like
        Predicted counts aligned with the rows of ``sampleSubmission``.
    """
    sampleSubmission['count'] = yyhat
    # Anything that is not > 0 becomes 0.
    sampleSubmission['count'] = sampleSubmission['count'].apply(lambda x: x if x > 0 else 0)
    sampleSubmission.to_csv("submission.csv", index=False)
    # Same shell commands as the notebook's `!` escapes, written as the
    # explicit IPython call so the function body is plain Python.
    # Must be run inside an IPython/Jupyter session.
    shell = IPython.get_ipython()
    shell.system('kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "Message"')
    shell.system('rm submission.csv')
def auto(df_train, df_test):
    """Run the whole pipeline: preprocess, fit/predict, plot and submit.

    The target ``count`` is modelled on the ``log1p`` scale; predictions are
    mapped back with ``expm1`` before plotting and submission. The model is
    whatever the currently defined ``fit_predict`` builds.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Raw training data, passed to ``preprocessing``.
    df_test : pandas.DataFrame
        Raw test data, passed to ``preprocessing``.
    """
    # step1
    df_train_featured, df_test_featured = preprocessing(df_train, df_test)  # preprocessing
    df_train_featured['count'] = np.log1p(df_train_featured['count'])  # transform
    # step2~4
    yhat, yyhat = fit_predict(df_train_featured, df_test_featured)
    yhat = np.expm1(yhat)  # inverse_trans
    yyhat = np.expm1(yyhat)  # inverse_trans
    # visualize
    plot(yhat, yyhat)
    # submit
    submit(yyhat)
5. 하이퍼파라메터 튜닝
- 기본 HP
# Default hyperparameter configuration, shown for reference.
{
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge",
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}},
    ],
    "XT": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}},
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}},
    ],
}
- fit_predict 함수 수정
def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor restricted to one random-forest config and predict.

    Parameters
    ----------
    df_train_featured : pandas.DataFrame
        Training features including the ``count`` label column.
    df_test_featured : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the train frame and the test frame.
    """
    # step1
    # step2
    # verbosity=0 is the integer spelling of the original `verbosity=False`.
    predictr = TabularPredictor(label='count', verbosity=0)
    # step3
    hp = {
        "RF": [
            {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}},
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters=hp,
    )
    # step4
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat
auto(df_train,df_test)
 | model | score_val | eval_metric | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|---|
0 | RandomForestMSE | -0.401983 | root_mean_squared_error | 0.078551 | 0.73439 | 0.078551 | 0.734390 | 1 | True | 1 |
1 | WeightedEnsemble_L2 | -0.401983 | root_mean_squared_error | 0.078805 | 0.73621 | 0.000255 | 0.001819 | 2 | True | 2 |
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 107kB/s]
Successfully submitted to Bike Sharing Demand
ref: https://auto.gluon.ai/0.8.1/api/autogluon.tabular.models.html
- LightGBM model: https://lightgbm.readthedocs.io/en/latest/
- CatBoost model: https://catboost.ai/
- XGBoost model: https://xgboost.readthedocs.io/en/latest/
- Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
- Extra Trees model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
- Linear model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- 방금 돌린것은 아래와 결과가 동일함.
def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor with one random-forest config (explicit ``n_estimators=300``) and predict.

    Parameters
    ----------
    df_train_featured : pandas.DataFrame
        Training features including the ``count`` label column.
    df_test_featured : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the train frame and the test frame.
    """
    # step1
    # step2
    # verbosity=0 is the integer spelling of the original `verbosity=False`.
    predictr = TabularPredictor(label='count', verbosity=0)
    # step3
    hp = {
        "RF": [
            {"n_estimators": 300, "criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}},
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters=hp,
    )
    # step4
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat
auto(df_train,df_test)
 | model | score_val | eval_metric | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|---|
0 | RandomForestMSE | -0.401983 | root_mean_squared_error | 0.075658 | 0.596290 | 0.075658 | 0.596290 | 1 | True | 1 |
1 | WeightedEnsemble_L2 | -0.401983 | root_mean_squared_error | 0.075915 | 0.598079 | 0.000257 | 0.001789 | 2 | True | 2 |
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 117kB/s]
Successfully submitted to Bike Sharing Demand
- 알아낸 방법?
# Fit the single random-forest config outside of `auto` so the fitted
# predictor stays available for inspection in the following cells.
df_train_featured, df_test_featured = preprocessing(df_train, df_test)
# verbosity=0 is the integer spelling of the original `verbosity=False`.
predictr = TabularPredictor(label='count', verbosity=0)
# step3
hp = {
    "RF": [
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}},
    ]
}
predictr.fit(
    df_train_featured,
    hyperparameters=hp,
)
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7efc9570f490>
# Look up the hyperparameters recorded for the fitted RandomForestMSE model.
predictr.info()['model_info']['RandomForestMSE']['hyperparameters']
{'n_estimators': 300,
'max_leaf_nodes': 15000,
'n_jobs': -1,
'random_state': 0,
'bootstrap': True,
'criterion': 'squared_error'}
- RF에서 더 다양한 파라메터를 실험해보자.
def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor over a small random-forest grid and predict.

    The grid crosses ``n_estimators`` in [300, 400, 500] with
    ``max_leaf_nodes`` in [10000, 15000]; each config gets the name suffix
    ``(n_estimators,max_leaf_nodes)``.

    Parameters
    ----------
    df_train_featured : pandas.DataFrame
        Training features including the ``count`` label column.
    df_test_featured : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the train frame and the test frame.
    """
    # step1
    # step2
    # verbosity=0 is the integer spelling of the original `verbosity=False`.
    predictr = TabularPredictor(label='count', verbosity=0)
    # step3
    hp = {
        "RF": [
            {
                "criterion": "squared_error",
                "n_estimators": i,
                "max_leaf_nodes": j,
                "ag_args": {"name_suffix": f"({i},{j})"},
            }
            for i in [300, 400, 500]
            for j in [10000, 15000]
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters=hp,
    )
    # step4
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat
auto(df_train,df_test)
 | model | score_val | eval_metric | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|---|
0 | RandomForest(500,15000) | -0.401733 | root_mean_squared_error | 0.109799 | 0.841268 | 0.109799 | 0.841268 | 1 | True | 6 |
1 | RandomForest(500,10000) | -0.401733 | root_mean_squared_error | 0.110465 | 0.801023 | 0.110465 | 0.801023 | 1 | True | 5 |
2 | WeightedEnsemble_L2 | -0.401733 | root_mean_squared_error | 0.110721 | 0.892063 | 0.000255 | 0.091039 | 2 | True | 7 |
3 | RandomForest(300,15000) | -0.401983 | root_mean_squared_error | 0.067139 | 0.556841 | 0.067139 | 0.556841 | 1 | True | 2 |
4 | RandomForest(300,10000) | -0.401983 | root_mean_squared_error | 0.077653 | 0.563708 | 0.077653 | 0.563708 | 1 | True | 1 |
5 | RandomForest(400,15000) | -0.402192 | root_mean_squared_error | 0.087821 | 0.706382 | 0.087821 | 0.706382 | 1 | True | 4 |
6 | RandomForest(400,10000) | -0.402192 | root_mean_squared_error | 0.088674 | 0.664156 | 0.088674 | 0.664156 | 1 | True | 3 |
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 242k/242k [00:01<00:00, 136kB/s]
Successfully submitted to Bike Sharing Demand