import numpy as np
import pandas as pd
import sklearn.linear_model
03wk-11: Medical Cost / 회귀분석
1. 강의영상
2. Import
3. Data 불러오기
- 캐글에서 Medical Cost Personal Datasets download
- Data Load
df_train = pd.read_csv('https://raw.githubusercontent.com/guebin/MP2023/main/posts/insurance.csv')
df_train
age | sex | bmi | children | smoker | region | charges | |
---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
4. 분석
A. Data 정리
df_train.columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
X = pd.get_dummies(df_train.drop(['charges'],axis=1))
y = df_train[['charges']]
X
age | bmi | children | sex_female | sex_male | smoker_no | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 19 | 27.900 | 0 | True | False | False | True | False | False | False | True |
1 | 18 | 33.770 | 1 | False | True | True | False | False | False | True | False |
2 | 28 | 33.000 | 3 | False | True | True | False | False | False | True | False |
3 | 33 | 22.705 | 0 | False | True | True | False | False | True | False | False |
4 | 32 | 28.880 | 0 | False | True | True | False | False | True | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | 30.970 | 3 | False | True | True | False | False | True | False | False |
1334 | 18 | 31.920 | 0 | True | False | True | False | True | False | False | False |
1335 | 18 | 36.850 | 0 | True | False | True | False | False | False | True | False |
1336 | 21 | 25.800 | 0 | True | False | True | False | False | False | False | True |
1337 | 61 | 29.070 | 0 | True | False | False | True | False | True | False | False |
1338 rows × 11 columns
y
charges | |
---|---|
0 | 16884.92400 |
1 | 1725.55230 |
2 | 4449.46200 |
3 | 21984.47061 |
4 | 3866.85520 |
... | ... |
1333 | 10600.54830 |
1334 | 2205.98080 |
1335 | 1629.83350 |
1336 | 2007.94500 |
1337 | 29141.36030 |
1338 rows × 1 columns
B. Predictor 생성
predictr = sklearn.linear_model.LinearRegression()
C. 학습
predictr.fit(X,y)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
D. 예측
df_train.assign(yhat = predictr.predict(X))
age | sex | bmi | children | smoker | region | charges | yhat | |
---|---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 25293.713028 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 3448.602834 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 6706.988491 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 3754.830163 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 5592.493386 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 12351.323686 |
1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 3511.930809 |
1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 4149.132486 |
1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 1246.584939 |
1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 37085.623268 |
1338 rows × 8 columns
E. 평가
predictr.score(X,y) # R^2
0.7509130345985207
0.7 이상이면 망한모형까지는 아님 (대회용으로는 부적절할 수 있으나 대충 쓸 수는 있는 정도)
5. 계수해석
- 상수항 해석
predictr.intercept_
array([-666.93771994])
- 기본적인 보험료는 -666이라는 의미 (모든 설명변수가 0일 때의 예측값이므로, 실제 보험료가 음수라는 뜻은 아님)
- 계수해석
pd.DataFrame({'name':list(X.columns), 'coef':predictr.coef_.reshape(-1)})
name | coef | |
---|---|---|
0 | age | 256.856353 |
1 | bmi | 339.193454 |
2 | children | 475.500545 |
3 | sex_female | 65.657180 |
4 | sex_male | -65.657180 |
5 | smoker_no | -11924.267271 |
6 | smoker_yes | 11924.267271 |
7 | region_northeast | 587.009235 |
8 | region_northwest | 234.045336 |
9 | region_southeast | -448.012814 |
10 | region_southwest | -373.041756 |
- 지역은 잘 모르겠으나 나머지는 꽤 그럴듯해 보임