import numpy as np
import pandas as pd
import sklearn.linear_model
03wk-11: Medical Cost / 회귀분석
1. 강의영상
2. Import
3. Data 불러오기
캐글에서 Medical Cost Personal Datasets
Data Load
df_train = pd.read_csv('')
df_train
| age | sex | bmi | children | smoker | region | charges |
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
4. 분석
A. Data 정리
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
X = pd.get_dummies(df_train.drop(['charges'],axis=1))
y = df_train[['charges']]
| age | bmi | children | sex_female | sex_male | smoker_no | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest |
0 | 19 | 27.900 | 0 | True | False | False | True | False | False | False | True |
1 | 18 | 33.770 | 1 | False | True | True | False | False | False | True | False |
2 | 28 | 33.000 | 3 | False | True | True | False | False | False | True | False |
3 | 33 | 22.705 | 0 | False | True | True | False | False | True | False | False |
4 | 32 | 28.880 | 0 | False | True | True | False | False | True | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | 30.970 | 3 | False | True | True | False | False | True | False | False |
1334 | 18 | 31.920 | 0 | True | False | True | False | True | False | False | False |
1335 | 18 | 36.850 | 0 | True | False | True | False | False | False | True | False |
1336 | 21 | 25.800 | 0 | True | False | True | False | False | False | False | True |
1337 | 61 | 29.070 | 0 | True | False | False | True | False | True | False | False |
1338 rows × 11 columns
| charges |
0 | 16884.92400 |
1 | 1725.55230 |
2 | 4449.46200 |
3 | 21984.47061 |
4 | 3866.85520 |
... | ... |
1333 | 10600.54830 |
1334 | 2205.98080 |
1335 | 1629.83350 |
1336 | 2007.94500 |
1337 | 29141.36030 |
1338 rows × 1 columns
B. Predictor 생성
predictr = sklearn.linear_model.LinearRegression()
C. 학습
predictr.fit(X,y)
D. 예측
df_train.assign(yhat = predictr.predict(X))
| age | sex | bmi | children | smoker | region | charges | yhat |
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 25293.713028 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 3448.602834 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 6706.988491 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 3754.830163 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 5592.493386 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 12351.323686 |
1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 3511.930809 |
1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 4149.132486 |
1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 1246.584939 |
1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 37085.623268 |
1338 rows × 8 columns
E. 평가
predictr.score(X,y) # R^2
0.7 이상이면 망한모형까지는 아님 (대회용으로는 부적절할 수 있으나 대충 쓸 수는 있는 정도)
5. 계수해석
상수항 해석
- 기본적인 보험료는 -666이라는 의미
pd.DataFrame({
    'name':list(X.columns),
    'coef':predictr.coef_.reshape(-1)
})
| name | coef |
0 | age | 256.856353 |
1 | bmi | 339.193454 |
2 | children | 475.500545 |
3 | sex_female | 65.657180 |
4 | sex_male | -65.657180 |
5 | smoker_no | -11924.267271 |
6 | smoker_yes | 11924.267271 |
7 | region_northeast | 587.009235 |
8 | region_northwest | 234.045336 |
9 | region_southeast | -448.012814 |
10 | region_southwest | -373.041756 |
- 지역은 잘 모르겠으나 나머지는 꽤 그럴듯해 보임