14wk-60: 자전거대여 / 하이퍼파라메터 튜닝

Author

최규빈

Published

December 1, 2023

1. 강의영상

2. Imports

#!pip install autogluon.multimodal 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
#---#}
from autogluon.tabular import TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.common import space
#---#
import IPython
import os
import warnings
warnings.filterwarnings('ignore')

3. Data

- 자료 다운로드

!kaggle competitions download -c bike-sharing-demand
Downloading bike-sharing-demand.zip to /home/cgb2/Dropbox/07_lectures/2023-09-MP2023/posts
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 763kB/s]
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 762kB/s]
!unzip bike-sharing-demand.zip -d data
Archive:  bike-sharing-demand.zip
  inflating: data/sampleSubmission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          
sampleSubmission = pd.read_csv('data/sampleSubmission.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv') 
!rm -rf data
!rm bike-sharing-demand.zip

4. 기본전처리 및 분석 프로세스

- 전처리

def preprocessing(df_train,df_test):
    df_train_featured = df_train.copy()
    df_test_featured = df_test.copy()
    #----# 
    df_train_featured = df_train_featured.drop(['casual','registered'],axis=1)
    #--#
    df_train_featured['hour'] = df_train_featured['datetime'].apply(pd.to_datetime).dt.hour
    df_test_featured['hour'] = df_test_featured['datetime'].apply(pd.to_datetime).dt.hour
    df_train_featured['weekday'] = df_train_featured['datetime'].apply(pd.to_datetime).dt.weekday
    df_test_featured['weekday'] = df_test_featured['datetime'].apply(pd.to_datetime).dt.weekday
    #--#
    df_train_featured = df_train_featured.drop(['datetime'],axis=1)
    df_test_featured = df_test_featured.drop(['datetime'],axis=1)
    #--#
    df_train_featured = df_train_featured.drop(['atemp'],axis=1)
    df_test_featured = df_test_featured.drop(['atemp'],axis=1)
    return df_train_featured, df_test_featured

- 함수들

def plot(yhat,yyhat):
    df = pd.concat([
        df_train.assign(count_hat = yhat, dataset_type = 'train'),
        df_test.assign(count_hat = yyhat, dataset_type = 'test')
    ]).reset_index(drop=True)
    df['datetime'] = pd.to_datetime(df['datetime'])
    sns.lineplot(
        df.sort_values('datetime')[:(24*28)],
        x='datetime',y='count',
        hue='dataset_type',
        linestyle='--',
        lw=0.8
    )
    sns.lineplot(
        df.sort_values('datetime')[:(24*28)],
        x='datetime',y='count_hat',
        hue='dataset_type',
        alpha=0.5,
        lw=3
    )
    fig = plt.gcf()
    fig.set_size_inches(8,2)
    plt.xticks(rotation=15); 
    fig.show()
def submit(yyhat):
    sampleSubmission['count'] = yyhat 
    sampleSubmission['count'] = sampleSubmission['count'].apply(lambda x: x if x>0 else 0)
    sampleSubmission.to_csv("submission.csv",index=False)
    !kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "Message"
    !rm submission.csv
def auto(df_train, df_test):
    # step1 
    df_train_featured, df_test_featured = preprocessing(df_train, df_test) # preprocessing
    df_train_featured['count'] = np.log1p(df_train_featured['count']) # transform 
    # step2~4 
    yhat,yyhat = fit_predict(df_train_featured,df_test_featured)
    yhat = np.expm1(yhat) # inverse_trans
    yyhat = np.expm1(yyhat) # inverse_trans
    # 시각화 
    plot(yhat,yyhat)
    # 제출 
    submit(yyhat)

5. 하이퍼파라메터 튜닝

- 기본 HP

{
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge"
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "XT": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}}
    ]
}

- fit_predict 함수 수정

def fit_predict(df_train_featured, df_test_featured):
    # step1 
    # step2
    predictr= TabularPredictor(label='count',verbosity=False)
    # step3 
    hp = {
        "RF": [
            {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters = hp
    )
    # step4 
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat 
auto(df_train,df_test)
model score_val eval_metric pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 root_mean_squared_error 0.044126 0.869338 0.044126 0.869338 1 True 1
1 WeightedEnsemble_L2 -0.401983 root_mean_squared_error 0.044359 0.870885 0.000233 0.001547 2 True 2
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 122kB/s]
Successfully submitted to Bike Sharing Demand

ref: https://auto.gluon.ai/0.8.1/api/autogluon.tabular.models.html

- 방금 돌린것은 아래와 결과가 동일함.

def fit_predict(df_train_featured, df_test_featured):
    # step1 
    # step2
    predictr= TabularPredictor(label='count',verbosity=False)
    # step3 
    hp = {
        "RF": [
            {"n_estimators":300, "criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters = hp
    )
    # step4 
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat 
auto(df_train,df_test)
model score_val eval_metric pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 root_mean_squared_error 0.034022 0.712419 0.034022 0.712419 1 True 1
1 WeightedEnsemble_L2 -0.401983 root_mean_squared_error 0.034240 0.713953 0.000218 0.001534 2 True 2
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 121kB/s]
Successfully submitted to Bike Sharing Demand

- 알아낸 방법?

df_train_featured, df_test_featured = preprocessing(df_train,df_test)
predictr= TabularPredictor(label='count',verbosity=False)
# step3 
hp = {
    "RF": [
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ]
}
predictr.fit(
    df_train_featured,
    hyperparameters = hp
)
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f5f8525b2d0>
predictr.info()['model_info']['RandomForestMSE']['hyperparameters']
{'n_estimators': 300,
 'max_leaf_nodes': 15000,
 'n_jobs': -1,
 'random_state': 0,
 'bootstrap': True,
 'criterion': 'squared_error'}

- RF에서 더 다양한 파라메터를 실험해보자.

def fit_predict(df_train_featured, df_test_featured):
    # step1 
    # step2
    predictr= TabularPredictor(label='count',verbosity=False)
    # step3 
    hp = {
        "RF": [ {"criterion": "squared_error", "n_estimators":i, "max_leaf_nodes":j, "ag_args": {"name_suffix": f"({i},{j})"}} for i in [300,400,500] for j in [10000,15000]]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters = hp
    )
    # step4 
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat 
auto(df_train,df_test)
model score_val eval_metric pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForest(500,10000) -0.401733 root_mean_squared_error 0.054719 1.484488 0.054719 1.484488 1 True 5
1 WeightedEnsemble_L2 -0.401733 root_mean_squared_error 0.054930 1.555554 0.000211 0.071066 2 True 7
2 RandomForest(500,15000) -0.401733 root_mean_squared_error 0.064742 1.486831 0.064742 1.486831 1 True 6
3 RandomForest(300,15000) -0.401983 root_mean_squared_error 0.034378 0.778114 0.034378 0.778114 1 True 2
4 RandomForest(300,10000) -0.401983 root_mean_squared_error 0.044908 0.723808 0.044908 0.723808 1 True 1
5 RandomForest(400,10000) -0.402192 root_mean_squared_error 0.054852 1.019239 0.054852 1.019239 1 True 3
6 RandomForest(400,15000) -0.402192 root_mean_squared_error 0.054879 1.250339 0.054879 1.250339 1 True 4
100%|█████████████████████████████████████████| 242k/242k [00:01<00:00, 139kB/s]
Successfully submitted to Bike Sharing Demand