import numpy as np
import pandas as pd
import sklearn.linear_model
03wk-11: Medical Cost / 회귀분석
1. 강의영상
2. Import
3. Data 불러오기
- 캐글에서 Medical Cost Personal Datasets download
- Data Load
df_train = pd.read_csv('https://raw.githubusercontent.com/guebin/MP2023/main/posts/insurance.csv')
df_train
| | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
4. 분석
A. Data 정리
df_train.columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
X = pd.get_dummies(df_train.drop(['charges'],axis=1))
y = df_train[['charges']]
X
| | age | bmi | children | sex_female | sex_male | smoker_no | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 27.900 | 0 | True | False | False | True | False | False | False | True |
| 1 | 18 | 33.770 | 1 | False | True | True | False | False | False | True | False |
| 2 | 28 | 33.000 | 3 | False | True | True | False | False | False | True | False |
| 3 | 33 | 22.705 | 0 | False | True | True | False | False | True | False | False |
| 4 | 32 | 28.880 | 0 | False | True | True | False | False | True | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | 30.970 | 3 | False | True | True | False | False | True | False | False |
| 1334 | 18 | 31.920 | 0 | True | False | True | False | True | False | False | False |
| 1335 | 18 | 36.850 | 0 | True | False | True | False | False | False | True | False |
| 1336 | 21 | 25.800 | 0 | True | False | True | False | False | False | False | True |
| 1337 | 61 | 29.070 | 0 | True | False | False | True | False | True | False | False |
1338 rows × 11 columns
y
| | charges |
|---|---|
| 0 | 16884.92400 |
| 1 | 1725.55230 |
| 2 | 4449.46200 |
| 3 | 21984.47061 |
| 4 | 3866.85520 |
| ... | ... |
| 1333 | 10600.54830 |
| 1334 | 2205.98080 |
| 1335 | 1629.83350 |
| 1336 | 2007.94500 |
| 1337 | 29141.36030 |
1338 rows × 1 columns
B. Predictor 생성
predictr = sklearn.linear_model.LinearRegression()
C. 학습
predictr.fit(X,y)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
D. 예측
df_train.assign(yhat = predictr.predict(X))
| | age | sex | bmi | children | smoker | region | charges | yhat |
|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 25293.713028 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 3448.602834 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 6706.988491 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 3754.830163 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 5592.493386 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 12351.323686 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 3511.930809 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 4149.132486 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 1246.584939 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 37085.623268 |
1338 rows × 8 columns
E. 평가
predictr.score(X,y) # R^2
0.7509130345985207
$R^2$가 0.7 이상이면 망한모형까지는 아님 (대회용으로는 부적절할 수 있으나 대충 쓸 수는 있는 정도). 단, 위 값은 학습에 사용한 자료로 계산한 $R^2$임.
5. 계수해석
- 상수항 해석
predictr.intercept_
array([-666.93771994])
- 모든 설명변수의 값이 0일 때의 예측 보험료가 약 -666.94라는 의미 (기본적인 보험료로 읽을 수 있으나, 범주마다 더미변수를 모두 포함하였으므로 상수항의 값 자체를 문자 그대로 해석하기는 어려움)
- 계수해석
pd.DataFrame({'name':list(X.columns), 'coef':predictr.coef_.reshape(-1)})
| | name | coef |
|---|---|---|
| 0 | age | 256.856353 |
| 1 | bmi | 339.193454 |
| 2 | children | 475.500545 |
| 3 | sex_female | 65.657180 |
| 4 | sex_male | -65.657180 |
| 5 | smoker_no | -11924.267271 |
| 6 | smoker_yes | 11924.267271 |
| 7 | region_northeast | 587.009235 |
| 8 | region_northwest | 234.045336 |
| 9 | region_southeast | -448.012814 |
| 10 | region_southwest | -373.041756 |
- 지역은 잘 모르겠으나 나머지는 꽤 그럴듯해 보임