import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.linear_model
import sklearn.tree
#---#
import warnings

# Silence warnings (e.g. LassoCV convergence warnings) for cleaner notebook output.
warnings.filterwarnings('ignore')
07wk-33: 취업(다중공선성) / 의사결정나무
1. 강의영상
2. Imports
3. Data
# Fix the RNG seed so the synthetic noise in employment_score is reproducible.
np.random.seed(43052)
df = pd.read_csv("https://raw.githubusercontent.com/guebin/MP2023/main/posts/employment_multicollinearity.csv")
# Synthetic target: gpa contributes with weight 1.0, toeic with weight 1/100,
# plus standard-normal noise (500 rows in the CSV).
df['employment_score'] = df.gpa * 1.0 + df.toeic * 1/100 + np.random.randn(500)
df
employment_score | gpa | toeic | toeic0 | toeic1 | toeic2 | toeic3 | toeic4 | toeic5 | toeic6 | ... | toeic490 | toeic491 | toeic492 | toeic493 | toeic494 | toeic495 | toeic496 | toeic497 | toeic498 | toeic499 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.784955 | 0.051535 | 135 | 129.566309 | 133.078481 | 121.678398 | 113.457366 | 133.564200 | 136.026566 | 141.793547 | ... | 132.014696 | 140.013265 | 135.575816 | 143.863346 | 152.162740 | 132.850033 | 115.956496 | 131.842126 | 125.090801 | 143.568527 |
1 | 10.789671 | 0.355496 | 935 | 940.563187 | 935.723570 | 939.190519 | 938.995672 | 945.376482 | 927.469901 | 952.424087 | ... | 942.251184 | 923.241548 | 939.924802 | 921.912261 | 953.250300 | 931.743615 | 940.205853 | 930.575825 | 941.530348 | 934.221055 |
2 | 8.221213 | 2.228435 | 485 | 493.671390 | 493.909118 | 475.500970 | 480.363752 | 478.868942 | 493.321602 | 490.059102 | ... | 484.438233 | 488.101275 | 485.626742 | 475.330715 | 485.147363 | 468.553780 | 486.870976 | 481.640957 | 499.340808 | 488.197332 |
3 | 2.137594 | 1.179701 | 65 | 62.272565 | 55.957257 | 68.521468 | 76.866765 | 51.436321 | 57.166824 | 67.834920 | ... | 67.653225 | 65.710588 | 64.146780 | 76.662194 | 66.837839 | 82.379018 | 69.174745 | 64.475993 | 52.647087 | 59.493275 |
4 | 8.650144 | 3.962356 | 445 | 449.280637 | 438.895582 | 433.598274 | 444.081141 | 437.005100 | 434.761142 | 443.135269 | ... | 455.940348 | 435.952854 | 441.521145 | 443.038886 | 433.118847 | 466.103355 | 430.056944 | 423.632873 | 446.973484 | 442.793633 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
495 | 9.057243 | 4.288465 | 280 | 276.680902 | 274.502675 | 277.868536 | 292.283300 | 277.476630 | 281.671647 | 296.307373 | ... | 269.541846 | 278.220546 | 278.484758 | 284.901284 | 272.451612 | 265.784490 | 275.795948 | 280.465992 | 268.528889 | 283.638470 |
496 | 4.108020 | 2.601212 | 310 | 296.940263 | 301.545000 | 306.725610 | 314.811407 | 311.935810 | 309.695838 | 301.979914 | ... | 304.680578 | 295.476836 | 316.582100 | 319.412132 | 312.984039 | 312.372112 | 312.106944 | 314.101927 | 309.409533 | 297.429968 |
497 | 2.430590 | 0.042323 | 225 | 206.793217 | 228.335345 | 222.115146 | 216.479498 | 227.469560 | 238.710310 | 233.797065 | ... | 233.469238 | 235.160919 | 228.517306 | 228.349646 | 224.153606 | 230.860484 | 218.683195 | 232.949484 | 236.951938 | 227.997629 |
498 | 5.343171 | 1.041416 | 320 | 327.461442 | 323.019899 | 329.589337 | 313.312233 | 315.645050 | 324.448247 | 314.271045 | ... | 326.297700 | 309.893822 | 312.873223 | 322.356584 | 319.332809 | 319.405283 | 324.021917 | 312.363694 | 318.493866 | 310.973930 |
499 | 6.505106 | 3.626883 | 375 | 370.966595 | 364.668477 | 371.853566 | 373.574930 | 376.701708 | 356.905085 | 354.584022 | ... | 382.278782 | 379.460816 | 371.031640 | 370.272639 | 375.618182 | 369.252740 | 376.925543 | 391.863103 | 368.735260 | 368.520844 |
500 rows × 503 columns
# 70/30 train-test split; fixed random_state so the split is reproducible.
df_train, df_test = sklearn.model_selection.train_test_split(df, test_size=0.3, random_state=42)
# Features: every column from 'gpa' onward (gpa, toeic, toeic0..toeic499 — the
# toeic* columns are near-duplicates, which is the multicollinearity in this data).
X, y = df_train.loc[:, 'gpa':], df_train['employment_score']
XX, yy = df_test.loc[:, 'gpa':], df_test['employment_score']
- 실제 kaggle에서는 yy를 모르는 상황임
4. 분석
-
분석1: 의사결정나무
## step1 -- pass
## step2: instantiate the predictor (fixed random_state for reproducible splits in the tree)
predictr = sklearn.tree.DecisionTreeRegressor(random_state=42)
## step3: fit on the training data
predictr.fit(X, y)
## step4: predict on train and test sets
df_train['employment_score_hat'] = predictr.predict(X)
df_test['employment_score_hat'] = predictr.predict(XX)
#---#
# R^2 scores; an unconstrained tree memorizes the training set (train score 1.0).
print(f'train score: {predictr.score(X,y):.4f}')
print(f'test score: {predictr.score(XX,yy):.4f}')
train score: 1.0000
test score: 0.8300
- 오버핏이긴한데 나쁘지 않음..
-
분석2: Lasso
## step1 -- pass
## step2: LassoCV picks the regularization strength alpha by cross-validation;
## the L1 penalty shrinks the redundant toeic* coefficients, handling the multicollinearity.
predictr = sklearn.linear_model.LassoCV()
## step3: fit on the training data
predictr.fit(X, y)
## step4: predict on train and test sets
df_train['employment_score_hat'] = predictr.predict(X)
df_test['employment_score_hat'] = predictr.predict(XX)
#---#
# R^2 scores on train and test.
print(f'train score: {predictr.score(X,y):.4f}')
print(f'test score: {predictr.score(XX,yy):.4f}')
train score: 0.8994
test score: 0.8587
-
총평: Lasso가 좋긴해요. 그런데 의사결정나무도 나쁘지 않아요.
- 참고로 Lasso는 엄청 발전된 모델
- 의사결정나무는 아주 초기모델임