import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.model_selection
06wk-24: 취업(다중공선성) / RidgeCV
1. 강의영상
2. Imports
3. Data
# Load the employment dataset (gpa, toeic and the toeic0..toeic499 columns).
df = pd.read_csv("https://raw.githubusercontent.com/guebin/MP2023/main/posts/employment_multicollinearity.csv")
# Fix the seed so the simulated noise below is reproducible.
np.random.seed(43052)
# Rebuild the target from gpa and toeic plus standard-normal noise
# (one draw per row; the frame is displayed with 500 rows).
df['employment_score'] = df.gpa * 1.0 + df.toeic* 1/100 + np.random.randn(500)
df
| | employment_score | gpa | toeic | toeic0 | toeic1 | toeic2 | toeic3 | toeic4 | toeic5 | toeic6 | ... | toeic490 | toeic491 | toeic492 | toeic493 | toeic494 | toeic495 | toeic496 | toeic497 | toeic498 | toeic499 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.784955 | 0.051535 | 135 | 129.566309 | 133.078481 | 121.678398 | 113.457366 | 133.564200 | 136.026566 | 141.793547 | ... | 132.014696 | 140.013265 | 135.575816 | 143.863346 | 152.162740 | 132.850033 | 115.956496 | 131.842126 | 125.090801 | 143.568527 |
1 | 10.789671 | 0.355496 | 935 | 940.563187 | 935.723570 | 939.190519 | 938.995672 | 945.376482 | 927.469901 | 952.424087 | ... | 942.251184 | 923.241548 | 939.924802 | 921.912261 | 953.250300 | 931.743615 | 940.205853 | 930.575825 | 941.530348 | 934.221055 |
2 | 8.221213 | 2.228435 | 485 | 493.671390 | 493.909118 | 475.500970 | 480.363752 | 478.868942 | 493.321602 | 490.059102 | ... | 484.438233 | 488.101275 | 485.626742 | 475.330715 | 485.147363 | 468.553780 | 486.870976 | 481.640957 | 499.340808 | 488.197332 |
3 | 2.137594 | 1.179701 | 65 | 62.272565 | 55.957257 | 68.521468 | 76.866765 | 51.436321 | 57.166824 | 67.834920 | ... | 67.653225 | 65.710588 | 64.146780 | 76.662194 | 66.837839 | 82.379018 | 69.174745 | 64.475993 | 52.647087 | 59.493275 |
4 | 8.650144 | 3.962356 | 445 | 449.280637 | 438.895582 | 433.598274 | 444.081141 | 437.005100 | 434.761142 | 443.135269 | ... | 455.940348 | 435.952854 | 441.521145 | 443.038886 | 433.118847 | 466.103355 | 430.056944 | 423.632873 | 446.973484 | 442.793633 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
495 | 9.057243 | 4.288465 | 280 | 276.680902 | 274.502675 | 277.868536 | 292.283300 | 277.476630 | 281.671647 | 296.307373 | ... | 269.541846 | 278.220546 | 278.484758 | 284.901284 | 272.451612 | 265.784490 | 275.795948 | 280.465992 | 268.528889 | 283.638470 |
496 | 4.108020 | 2.601212 | 310 | 296.940263 | 301.545000 | 306.725610 | 314.811407 | 311.935810 | 309.695838 | 301.979914 | ... | 304.680578 | 295.476836 | 316.582100 | 319.412132 | 312.984039 | 312.372112 | 312.106944 | 314.101927 | 309.409533 | 297.429968 |
497 | 2.430590 | 0.042323 | 225 | 206.793217 | 228.335345 | 222.115146 | 216.479498 | 227.469560 | 238.710310 | 233.797065 | ... | 233.469238 | 235.160919 | 228.517306 | 228.349646 | 224.153606 | 230.860484 | 218.683195 | 232.949484 | 236.951938 | 227.997629 |
498 | 5.343171 | 1.041416 | 320 | 327.461442 | 323.019899 | 329.589337 | 313.312233 | 315.645050 | 324.448247 | 314.271045 | ... | 326.297700 | 309.893822 | 312.873223 | 322.356584 | 319.332809 | 319.405283 | 324.021917 | 312.363694 | 318.493866 | 310.973930 |
499 | 6.505106 | 3.626883 | 375 | 370.966595 | 364.668477 | 371.853566 | 373.574930 | 376.701708 | 356.905085 | 354.584022 | ... | 382.278782 | 379.460816 | 371.031640 | 370.272639 | 375.618182 | 369.252740 | 376.925543 | 391.863103 | 368.735260 | 368.520844 |
500 rows × 503 columns
4. RidgeCV
- RidgeCV 클래스에서 모형을 선택해보자.
## step1 -- hold out 30% of the rows, then separate features from the target
df_train, df_test = sklearn.model_selection.train_test_split(df,test_size=0.3,random_state=42)
X = df_train.loc[:,'gpa':'toeic499']
y = df_train.loc[:,'employment_score']
XX = df_test.loc[:,'gpa':'toeic499']
yy = df_test.loc[:,'employment_score']
## step2 -- RidgeCV with its default alpha candidates
predictr = sklearn.linear_model.RidgeCV()
## step3 -- fit on the training split only
predictr.fit(X,y)
## step4 -- pass
RidgeCV()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RidgeCV()
predictr.score(X,y)  # R^2 on the training split
0.9999996840224911
predictr.score(XX,yy)  # R^2 on the held-out test split
0.1191494594976158
- alpha들의 후보를 우리가 직접 선정하자.
## step1 -- hold out 30% of the rows, then separate features from the target
df_train, df_test = sklearn.model_selection.train_test_split(df,test_size=0.3,random_state=42)
X = df_train.loc[:,'gpa':'toeic499']
y = df_train.loc[:,'employment_score']
XX = df_test.loc[:,'gpa':'toeic499']
yy = df_test.loc[:,'employment_score']
## step2 -- RidgeCV choosing among hand-picked alpha candidates
predictr = sklearn.linear_model.RidgeCV(alphas=[5e2, 5e3, 5e4, 5e5, 5e6, 5e7, 5e8])
## step3 -- fit on the training split only
predictr.fit(X,y)
## step4 -- pass
RidgeCV(alphas=[500.0, 5000.0, 50000.0, 500000.0, 5000000.0, 50000000.0, 500000000.0])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RidgeCV(alphas=[500.0, 5000.0, 50000.0, 500000.0, 5000000.0, 50000000.0, 500000000.0])
predictr.score(X,y)  # R^2 on the training split
0.7521268560159359
predictr.score(XX,yy)  # R^2 on the held-out test split
0.7450309251010895
predictr.alpha_  # the alpha RidgeCV selected from the supplied candidates
50000000.0
참고로 이 적합결과는 아래의 코드를 실행한 것과 같다.
## step1 -- hold out 30% of the rows, then separate features from the target
df_train, df_test = sklearn.model_selection.train_test_split(df,test_size=0.3,random_state=42)
X = df_train.loc[:,'gpa':'toeic499']
y = df_train.loc[:,'employment_score']
XX = df_test.loc[:,'gpa':'toeic499']
yy = df_test.loc[:,'employment_score']
## step2 -- plain Ridge with alpha fixed to 50000000.0
predictr = sklearn.linear_model.Ridge(alpha=50000000.0)
## step3 -- fit on the training split only
predictr.fit(X,y)
## step4 -- pass
Ridge(alpha=50000000.0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Ridge(alpha=50000000.0)
predictr.score(X,y)  # R^2 on the training split
0.752126856015936
predictr.score(XX,yy)  # R^2 on the held-out test split
0.7450309251010895