14wk-60: 자전거대여 / 하이퍼파라메터 튜닝

Author

최규빈

Published

December 1, 2023

1. 강의영상

2. Imports

#!pip install autogluon.multimodal 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
#---#
from autogluon.tabular import TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.common import space
#---#
import IPython
import os
import warnings
warnings.filterwarnings('ignore')

3. Data

- 자료 다운로드

# Shell escape: download the bike-sharing-demand competition archive with the Kaggle CLI.
!kaggle competitions download -c bike-sharing-demand
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
Downloading bike-sharing-demand.zip to /root/Dropbox
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 830kB/s]
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 830kB/s]
# Shell escape: extract the downloaded archive into the ./data directory.
!unzip bike-sharing-demand.zip -d data
Archive:  bike-sharing-demand.zip
  inflating: data/sampleSubmission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          
# Load the three CSV files into memory as module-level DataFrames; the
# functions defined later in this notebook read df_train, df_test and
# sampleSubmission as globals.
sampleSubmission = pd.read_csv('data/sampleSubmission.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv') 
# The frames are already in memory, so remove the extracted files and the archive.
!rm -rf data
!rm bike-sharing-demand.zip

4. 기본전처리 및 분석 프로세스

- 전처리

def _add_time_features(df):
    """Return a new frame with ``hour``/``weekday`` added and raw columns dropped.

    The ``datetime`` column is parsed once with the vectorized
    ``pd.to_datetime`` and used to derive both features. ``datetime`` and
    ``atemp`` are then removed. The input frame is not modified.
    """
    stamps = pd.to_datetime(df['datetime'])
    featured = df.assign(hour=stamps.dt.hour, weekday=stamps.dt.weekday)
    return featured.drop(['datetime', 'atemp'], axis=1)


def preprocessing(df_train,df_test):
    """Build the feature frames used for fitting and prediction.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data. Must contain ``datetime``, ``atemp``, ``casual`` and
        ``registered`` columns.
    df_test : pandas.DataFrame
        Test data. Must contain ``datetime`` and ``atemp`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_featured, df_test_featured)``. Both frames gain ``hour``
        and ``weekday`` columns and lose ``datetime`` and ``atemp``; the
        training frame additionally loses ``casual`` and ``registered``.
        Neither input frame is mutated.
    """
    # casual/registered are dropped from the training frame only.
    df_train_featured = _add_time_features(
        df_train.drop(['casual', 'registered'], axis=1)
    )
    df_test_featured = _add_time_features(df_test)
    return df_train_featured, df_test_featured

- 함수들

def plot(yhat,yyhat):
    """Draw observed and predicted counts for the earliest 24*28 rows.

    Reads the module-level ``df_train`` and ``df_test`` frames, attaches
    ``yhat`` / ``yyhat`` to them as ``count_hat``, stacks the two frames and
    plots the first ``24*28`` rows in ``datetime`` order.

    Parameters
    ----------
    yhat : array-like
        Predictions aligned with ``df_train``.
    yyhat : array-like
        Predictions aligned with ``df_test``.
    """
    train_part = df_train.assign(count_hat = yhat, dataset_type = 'train')
    test_part = df_test.assign(count_hat = yyhat, dataset_type = 'test')
    combined = pd.concat([train_part, test_part])
    combined['datetime'] = pd.to_datetime(combined['datetime'])
    window = combined.sort_values('datetime')[:(24*28)]
    # Observed counts: thin dashed line.
    sns.lineplot(
        window,
        x='datetime',y='count',
        hue='dataset_type',
        linestyle='--',
        lw=0.8
    )
    # Predicted counts: thick translucent line drawn on the same axes.
    sns.lineplot(
        window,
        x='datetime',y='count_hat',
        hue='dataset_type',
        alpha=0.5,
        lw=3
    )
    figure = plt.gcf()
    figure.set_size_inches(8,2)
    plt.xticks(rotation=15)
    figure.show()
def submit(yyhat):
    sampleSubmission['count'] = yyhat 
    sampleSubmission['count'] = sampleSubmission['count'].apply(lambda x: x if x>0 else 0)
    sampleSubmission.to_csv("submission.csv",index=False)
    !kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "Message"
    !rm submission.csv
def auto(df_train, df_test):
    """Run the whole pipeline: preprocess, fit, predict, plot and submit.

    The target ``count`` is modelled on the ``log1p`` scale; predictions
    returned by ``fit_predict`` are mapped back with ``expm1`` before being
    plotted and submitted.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Raw training data.
    df_test : pandas.DataFrame
        Raw test data.
    """
    # Feature engineering, then move the target to the log1p scale.
    train_feat, test_feat = preprocessing(df_train, df_test)
    train_feat['count'] = np.log1p(train_feat['count'])
    # Fit and predict on the transformed scale.
    fitted_log, predicted_log = fit_predict(train_feat, test_feat)
    # Undo the log1p transform.
    yhat = np.expm1(fitted_log)
    yyhat = np.expm1(predicted_log)
    # Visualize.
    plot(yhat, yyhat)
    # Submit.
    submit(yyhat)

5. 하이퍼파라메터 튜닝

- 기본 HP

# Hyperparameter dictionary shown under this section's "default HP" heading.
# Each key names a model type; each value is either a single config dict or a
# list of entries. Within a config, "ag_args" carries a "name_suffix" and,
# where present, the "problem_types" that config is restricted to.
# NOTE(review): the bare string "GBMLarge" is presumably a named preset —
# confirm against the AutoGluon documentation.
{
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge"
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "XT": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}}
    ]
}

- fit_predict 함수 수정

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor limited to one random-forest config and predict.

    Only the "RF" entry with ``criterion="squared_error"`` is passed as
    hyperparameters; everything else is left to the predictor.

    Parameters
    ----------
    df_train_featured : pandas.DataFrame
        Training features including the ``count`` label column.
    df_test_featured : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the training frame and for the
        test frame. The leaderboard is displayed as a side effect.
    """
    rf_config = {
        "criterion": "squared_error",
        "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
    }
    predictor = TabularPredictor(label='count',verbosity=False)
    predictor.fit(
        df_train_featured,
        hyperparameters = {"RF": [rf_config]},
    )
    fitted = predictor.predict(df_train_featured)
    forecast = predictor.predict(df_test_featured)
    display(predictor.leaderboard())
    return fitted, forecast
# Run the full pipeline (preprocess, fit, plot, submit) with the fit_predict defined above.
auto(df_train,df_test)
model score_val eval_metric pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 root_mean_squared_error 0.078551 0.73439 0.078551 0.734390 1 True 1
1 WeightedEnsemble_L2 -0.401983 root_mean_squared_error 0.078805 0.73621 0.000255 0.001819 2 True 2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 107kB/s]
Successfully submitted to Bike Sharing Demand

ref: https://auto.gluon.ai/0.8.1/api/autogluon.tabular.models.html

- 방금 돌린것은 아래와 결과가 동일함.

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor with one random forest of 300 trees and predict.

    Same as the single-"RF" configuration with ``criterion="squared_error"``,
    except that ``n_estimators`` is set explicitly to 300.

    Parameters
    ----------
    df_train_featured : pandas.DataFrame
        Training features including the ``count`` label column.
    df_test_featured : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the training frame and for the
        test frame. The leaderboard is displayed as a side effect.
    """
    rf_config = {
        "n_estimators": 300,
        "criterion": "squared_error",
        "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
    }
    predictor = TabularPredictor(label='count',verbosity=False)
    predictor.fit(
        df_train_featured,
        hyperparameters = {"RF": [rf_config]},
    )
    fitted = predictor.predict(df_train_featured)
    forecast = predictor.predict(df_test_featured)
    display(predictor.leaderboard())
    return fitted, forecast
# Re-run the full pipeline with the fit_predict that sets n_estimators=300 explicitly.
auto(df_train,df_test)
model score_val eval_metric pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 root_mean_squared_error 0.075658 0.596290 0.075658 0.596290 1 True 1
1 WeightedEnsemble_L2 -0.401983 root_mean_squared_error 0.075915 0.598079 0.000257 0.001789 2 True 2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 117kB/s]
Successfully submitted to Bike Sharing Demand

- 알아낸 방법?

# Fit the single-config random forest outside of fit_predict so the fitted
# predictor stays available for inspection in the following cell.
# Unlike auto(), the count column is not log1p-transformed here.
df_train_featured, df_test_featured = preprocessing(df_train,df_test)
predictr= TabularPredictor(label='count',verbosity=False)
# Hyperparameters: a single "RF" config that sets only the split criterion.
hp = {
    "RF": [
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ]
}
predictr.fit(
    df_train_featured,
    hyperparameters = hp
)
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7efc9570f490>
# Look up the hyperparameters recorded for the fitted RandomForestMSE model.
predictr.info()['model_info']['RandomForestMSE']['hyperparameters']
{'n_estimators': 300,
 'max_leaf_nodes': 15000,
 'n_jobs': -1,
 'random_state': 0,
 'bootstrap': True,
 'criterion': 'squared_error'}

- RF에서 더 다양한 파라메터를 실험해보자.

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor over a small random-forest grid and predict.

    Builds one "RF" config for every combination of ``n_estimators`` in
    (300, 400, 500) and ``max_leaf_nodes`` in (10000, 15000). Each config is
    given the name suffix ``"(n_estimators,max_leaf_nodes)"``.

    Parameters
    ----------
    df_train_featured : pandas.DataFrame
        Training features including the ``count`` label column.
    df_test_featured : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the training frame and for the
        test frame. The leaderboard is displayed as a side effect.
    """
    # n_estimators varies in the outer loop and max_leaf_nodes in the inner
    # one, which fixes the order of the configs in the list.
    rf_grid = []
    for n_trees in [300,400,500]:
        for leaf_cap in [10000,15000]:
            rf_grid.append({
                "criterion": "squared_error",
                "n_estimators": n_trees,
                "max_leaf_nodes": leaf_cap,
                "ag_args": {"name_suffix": f"({n_trees},{leaf_cap})"},
            })
    predictor = TabularPredictor(label='count',verbosity=False)
    predictor.fit(
        df_train_featured,
        hyperparameters = {"RF": rf_grid},
    )
    fitted = predictor.predict(df_train_featured)
    forecast = predictor.predict(df_test_featured)
    display(predictor.leaderboard())
    return fitted, forecast
# Run the full pipeline with the fit_predict that tries the random-forest grid.
auto(df_train,df_test)
model score_val eval_metric pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForest(500,15000) -0.401733 root_mean_squared_error 0.109799 0.841268 0.109799 0.841268 1 True 6
1 RandomForest(500,10000) -0.401733 root_mean_squared_error 0.110465 0.801023 0.110465 0.801023 1 True 5
2 WeightedEnsemble_L2 -0.401733 root_mean_squared_error 0.110721 0.892063 0.000255 0.091039 2 True 7
3 RandomForest(300,15000) -0.401983 root_mean_squared_error 0.067139 0.556841 0.067139 0.556841 1 True 2
4 RandomForest(300,10000) -0.401983 root_mean_squared_error 0.077653 0.563708 0.077653 0.563708 1 True 1
5 RandomForest(400,15000) -0.402192 root_mean_squared_error 0.087821 0.706382 0.087821 0.706382 1 True 4
6 RandomForest(400,10000) -0.402192 root_mean_squared_error 0.088674 0.664156 0.088674 0.664156 1 True 3
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 242k/242k [00:01<00:00, 136kB/s]
Successfully submitted to Bike Sharing Demand