import pandas as pd
import numpy as np
from plotnine import *
08wk-1: 에너지사용량 시각화
1. 강의영상
2. Imports
3. 예비학습
A. pd.concat()
# 예시1
– 위아래로
= pd.DataFrame({'A':[1,2,3],'B':[2,3,4]})
df1 = pd.DataFrame({'A':[-1,-2,-3],'B':[-2,-3,-4]}) df2
"df1",df1)
display("df2",df2) display(
'df1'
A | B | |
---|---|---|
0 | 1 | 2 |
1 | 2 | 3 |
2 | 3 | 4 |
'df2'
A | B | |
---|---|---|
0 | -1 | -2 |
1 | -2 | -3 |
2 | -3 | -4 |
#pd.concat([df1,df2],axis=0) # (3,2) concat (3,2) -> (6,4)
pd.concat([df1,df2])
A | B | |
---|---|---|
0 | 1 | 2 |
1 | 2 | 3 |
2 | 3 | 4 |
0 | -1 | -2 |
1 | -2 | -3 |
2 | -3 | -4 |
인덱스를 재정리하고 싶다면?
=True) pd.concat([df1,df2]).reset_index(drop
A | B | |
---|---|---|
0 | 1 | 2 |
1 | 2 | 3 |
2 | 3 | 4 |
3 | -1 | -2 |
4 | -2 | -3 |
5 | -3 | -4 |
#
# 예시2
– 좌우로
= pd.DataFrame({'A':[1,2,3],'B':[2,3,4]})
df1 = pd.DataFrame({'C':[-1,-2,-3],'D':[-2,-3,-4]}) df2
"df1",df1)
display("df2",df2) display(
'df1'
A | B | |
---|---|---|
0 | 1 | 2 |
1 | 2 | 3 |
2 | 3 | 4 |
'df2'
C | D | |
---|---|---|
0 | -1 | -2 |
1 | -2 | -3 |
2 | -3 | -4 |
=1) # (3,2) concat (3,2) -> (3,4) pd.concat([df1,df2],axis
A | B | C | D | |
---|---|---|---|---|
0 | 1 | 2 | -1 | -2 |
1 | 2 | 3 | -2 | -3 |
2 | 3 | 4 | -3 | -4 |
#
B. pd.merge()
= pd.DataFrame({'department':['A','A','B','B'], 'gender':['male','female','male','female'],'count':[1,2,3,1]})
big = pd.DataFrame({'department':['A','B'], 'total':[3,4]}) small
"big",big)
display("small",small) display(
'big'
department | gender | count | |
---|---|---|---|
0 | A | male | 1 |
1 | A | female | 2 |
2 | B | male | 3 |
3 | B | female | 1 |
'small'
department | total | |
---|---|---|
0 | A | 3 |
1 | B | 4 |
big.merge(small)
department | gender | count | total | |
---|---|---|---|---|
0 | A | male | 1 | 3 |
1 | A | female | 2 | 3 |
2 | B | male | 3 | 4 |
3 | B | female | 1 | 4 |
small.merge(big)
department | total | gender | count | |
---|---|---|---|---|
0 | A | 3 | male | 1 |
1 | A | 3 | female | 2 |
2 | B | 4 | male | 3 |
3 | B | 4 | female | 1 |
C. .applymap()
# 예시1
43052)
np.random.seed(= pd.DataFrame({'A':np.random.rand(3), 'B':np.random.rand(3)})
df df
A | B | |
---|---|---|
0 | 0.817682 | 0.619777 |
1 | 0.049532 | 0.122541 |
2 | 0.838686 | 0.117128 |
lambda x: 'Yes' if x>0.5 else 'No') df.applymap(
/tmp/ipykernel_288950/1665022133.py:1: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
A | B | |
---|---|---|
0 | Yes | Yes |
1 | No | No |
2 | Yes | No |
#
D. df.astype()
-
예시1
= pd.DataFrame({'A':[0,1,2],'B':[4,5,6]})
df df
A | B | |
---|---|---|
0 | 0 | 4 |
1 | 1 | 5 |
2 | 2 | 6 |
float) df.astype(
A | B | |
---|---|---|
0 | 0.0 | 4.0 |
1 | 1.0 | 5.0 |
2 | 2.0 | 6.0 |
E. 제 아픈기억
# 예제1
– object형은 일괄적으로 문자형임을 의미하는게 아님
43052)
np.random.seed(= pd.DataFrame({'A':['1','2','0','1',2], 'B':['2','3','0','0',0]})
df df
A | B | |
---|---|---|
0 | 1 | 2 |
1 | 2 | 3 |
2 | 0 | 0 |
3 | 1 | 0 |
4 | 2 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 5 non-null object
1 B 5 non-null object
dtypes: object(2)
memory usage: 208.0+ bytes
0],df.A[4],df.B[0],df.B[4] df.A[
('1', 2, '2', 0)
#
# 예제2
– column이름이 이상하게 들어가 있는 경우가 있음.
= pd.DataFrame({('A',''):[0,0,0], ('B',''):[1,1,1]})
df df
A | B | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
2 | 0 | 1 |
'A'] df[
0 0
1 0
2 0
Name: A, dtype: int64
'A','')] df[(
0 0
1 0
2 0
Name: (A, ), dtype: int64
#
4. 에너지사용량 시각화
(1)
아래는 2019년 서울의 에너지사용량을 불러오는 예시코드이다.
'https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Seoul2019.csv') pd.read_csv(
지역 | 건물동수 | 연면적 | 에너지사용량(TOE)/전기 | 에너지사용량(TOE)/도시가스 | 에너지사용량(TOE)/지역난방 | |
---|---|---|---|---|---|---|
0 | 종로구 | 17,851 | 9,204,140 | 63,492 | 76,653 | 799 |
1 | 중구 | 10,383 | 10,078,848 | 79,223 | 68,210 | 497 |
2 | 용산구 | 17,138 | 10,756,612 | 51,229 | 79,805 | 11,128 |
3 | 성동구 | 13,980 | 11,804,313 | 59,832 | 99,986 | 0 |
4 | 광진구 | 21,556 | 12,272,738 | 68,756 | 123,447 | 0 |
5 | 동대문구 | 21,794 | 12,664,554 | 65,913 | 111,420 | 0 |
6 | 중랑구 | 23,950 | 15,182,802 | 59,370 | 109,284 | 7,442 |
7 | 성북구 | 27,112 | 15,938,807 | 77,007 | 148,376 | 0 |
8 | 강북구 | 23,334 | 9,458,987 | 47,731 | 100,045 | 0 |
9 | 도봉구 | 13,168 | 10,644,704 | 44,985 | 90,379 | 5,268 |
10 | 노원구 | 9,704 | 17,197,086 | 77,010 | 94,340 | 50,859 |
11 | 은평구 | 25,200 | 14,735,131 | 75,914 | 130,159 | 14,370 |
12 | 서대문구 | 17,651 | 12,559,425 | 65,164 | 111,542 | 6,330 |
13 | 마포구 | 18,844 | 15,024,186 | 92,453 | 114,931 | 20,148 |
14 | 양천구 | 14,690 | 15,428,339 | 70,721 | 82,857 | 49,258 |
15 | 강서구 | 20,446 | 20,641,866 | 86,809 | 128,786 | 35,896 |
16 | 구로구 | 17,204 | 13,509,894 | 59,916 | 120,457 | 2,963 |
17 | 금천구 | 12,135 | 7,420,441 | 34,791 | 69,814 | 732 |
18 | 영등포구 | 18,133 | 14,914,027 | 87,480 | 114,238 | 13,531 |
19 | 동작구 | 20,102 | 13,612,946 | 66,811 | 132,285 | 899 |
20 | 관악구 | 26,460 | 14,997,859 | 85,416 | 158,543 | 0 |
21 | 서초구 | 12,856 | 21,560,285 | 135,491 | 121,437 | 38,866 |
22 | 강남구 | 16,129 | 29,961,585 | 180,121 | 149,045 | 83,459 |
23 | 송파구 | 19,331 | 26,573,343 | 139,117 | 143,601 | 71,954 |
24 | 강동구 | 16,636 | 15,048,315 | 70,341 | 121,931 | 11,921 |
에너지 사용량은 2018년부터 2021년까지의 기간 동안 서울, 부산 등 여러 지역에 대해 정리되어 있으며, 아래 주소 형식으로 저장되어 있다.
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Seoul2018.csv
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Seoul2019.csv
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Seoul2020.csv
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Seoul2021.csv
...
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Busan2018.csv
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Busan2019.csv
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Busan2020.csv
https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/Busan2021.csv
아래의 url
, prov
를 참고하여 모든 자료를 불러온 뒤 pd.concat()
을 이용하여 하나의 df로 합쳐라.
= 'https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/{}.csv'
url
= ['Seoul', 'Busan', 'Daegu', 'Incheon',
prov 'Gwangju', 'Daejeon', 'Ulsan', 'Sejongsi',
'Gyeonggi-do', 'Gangwon-do', 'Chungcheongbuk-do',
'Chungcheongnam-do', 'Jeollabuk-do', 'Jeollanam-do',
'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do']
올바르게 정리된 데이터프레임의 예시는 아래와 같다.
(풀이)
= pd.concat([pd.read_csv(url.format(p+y)).assign(년도=y, 시도=p) for p in prov for y in ['2018', '2019', '2020', '2021']]).reset_index(drop=True)
df df
지역 | 건물동수 | 연면적 | 에너지사용량(TOE)/전기 | 에너지사용량(TOE)/도시가스 | 에너지사용량(TOE)/지역난방 | 년도 | 시도 | |
---|---|---|---|---|---|---|---|---|
0 | 종로구 | 17,929 | 9,141,777 | 64,818 | 82,015 | 111 | 2018 | Seoul |
1 | 중구 | 10,598 | 10,056,233 | 81,672 | 75,260 | 563 | 2018 | Seoul |
2 | 용산구 | 17,201 | 10,639,652 | 52,659 | 85,220 | 12,043 | 2018 | Seoul |
3 | 성동구 | 14,180 | 11,631,770 | 60,559 | 107,416 | 0 | 2018 | Seoul |
4 | 광진구 | 21,520 | 12,054,796 | 70,609 | 130,308 | 0 | 2018 | Seoul |
... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 서귀포시 | 34,729 | 7,233,931 | 34,641 | 1,306 | 0 | 2019 | Jeju-do |
996 | 제주시 | 66,504 | 19,819,923 | 99,212 | 22,179 | 0 | 2020 | Jeju-do |
997 | 서귀포시 | 34,880 | 7,330,040 | 35,510 | 1,639 | 0 | 2020 | Jeju-do |
998 | 제주시 | 67,053 | 20,275,738 | 103,217 | 25,689 | 0 | 2021 | Jeju-do |
999 | 서귀포시 | 35,230 | 7,512,206 | 37,884 | 2,641 | 0 | 2021 | Jeju-do |
1000 rows × 8 columns
(2)
의미상 숫자형이지만 문자형으로 입력이된 자료를 모두 전처리하라.
(풀이)
= df.년도.astype(int))\
df.assign(년도 '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([ .reset_index()
/tmp/ipykernel_322791/847505444.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
년도 | 시도 | 지역 | 건물동수 | 연면적 | 에너지사용량(TOE)/전기 | 에너지사용량(TOE)/도시가스 | 에너지사용량(TOE)/지역난방 | |
---|---|---|---|---|---|---|---|---|
0 | 2018 | Seoul | 종로구 | 17929 | 9141777 | 64818 | 82015 | 111 |
1 | 2018 | Seoul | 중구 | 10598 | 10056233 | 81672 | 75260 | 563 |
2 | 2018 | Seoul | 용산구 | 17201 | 10639652 | 52659 | 85220 | 12043 |
3 | 2018 | Seoul | 성동구 | 14180 | 11631770 | 60559 | 107416 | 0 |
4 | 2018 | Seoul | 광진구 | 21520 | 12054796 | 70609 | 130308 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 2019 | Jeju-do | 서귀포시 | 34729 | 7233931 | 34641 | 1306 | 0 |
996 | 2020 | Jeju-do | 제주시 | 66504 | 19819923 | 99212 | 22179 | 0 |
997 | 2020 | Jeju-do | 서귀포시 | 34880 | 7330040 | 35510 | 1639 | 0 |
998 | 2021 | Jeju-do | 제주시 | 67053 | 20275738 | 103217 | 25689 | 0 |
999 | 2021 | Jeju-do | 서귀포시 | 35230 | 7512206 | 37884 | 2641 | 0 |
1000 rows × 8 columns
(3)
열의 이름을 아래와 같이 바꾸라.
= {
name_dict '년도': 'Year',
'시도': 'Prov',
'지역': 'Reg',
'건물동수': 'BldgCount',
'연면적': 'Area',
'에너지사용량(TOE)/전기': 'Elec',
'에너지사용량(TOE)/도시가스': 'Gas',
'에너지사용량(TOE)/지역난방': 'Heat'
}
(풀이)
= df.년도.astype(int))\
df.assign(년도 '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([=1) .reset_index().rename(name_dict,axis
/tmp/ipykernel_322791/1695199233.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
Year | Prov | Reg | BldgCount | Area | Elec | Gas | Heat | |
---|---|---|---|---|---|---|---|---|
0 | 2018 | Seoul | 종로구 | 17929 | 9141777 | 64818 | 82015 | 111 |
1 | 2018 | Seoul | 중구 | 10598 | 10056233 | 81672 | 75260 | 563 |
2 | 2018 | Seoul | 용산구 | 17201 | 10639652 | 52659 | 85220 | 12043 |
3 | 2018 | Seoul | 성동구 | 14180 | 11631770 | 60559 | 107416 | 0 |
4 | 2018 | Seoul | 광진구 | 21520 | 12054796 | 70609 | 130308 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 2019 | Jeju-do | 서귀포시 | 34729 | 7233931 | 34641 | 1306 | 0 |
996 | 2020 | Jeju-do | 제주시 | 66504 | 19819923 | 99212 | 22179 | 0 |
997 | 2020 | Jeju-do | 서귀포시 | 34880 | 7330040 | 35510 | 1639 | 0 |
998 | 2021 | Jeju-do | 제주시 | 67053 | 20275738 | 103217 | 25689 | 0 |
999 | 2021 | Jeju-do | 서귀포시 | 35230 | 7512206 | 37884 | 2641 | 0 |
1000 rows × 8 columns
(4)
아래와 같은 그림을 시각화 하라.
# 시각화예시
(풀이)
= df.assign(년도 = df.년도.astype(int))\
tidydata '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([=1)\
.reset_index().rename(name_dict,axis'BldgCount','Area'],axis=1)\
.drop(['Year','Prov','Reg']).stack().reset_index()\
.set_index(['level_3':'Type',0:'EnergyUse'},axis=1)\
.rename({=['Year','Prov','Type'],values='EnergyUse',aggfunc='sum')\
.pivot_table(index\
.reset_index()eval('logEnergyUse = log(EnergyUse)')
.
tidydata # --- #
= ggplot(tidydata)
fig = geom_line(aes(x='Year',y='logEnergyUse',color='Type',linetype='Type'))
line = (fig + line + facet_wrap('Prov',scales='free')).draw()
fig fig
/tmp/ipykernel_322791/2236388880.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: divide by zero encountered in log
10,6)
fig.set_size_inches(150)
fig.set_dpi( fig
(5)
Prov
별로 총 에너지사용량이 많은 상위5개의 Reg
을 찾고 아래와 같이 시각화 하라.
# 시각화예시
(풀이)
= df.년도.astype(int))\
df.assign(년도 '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([=1)\
.reset_index().rename(name_dict,axiseval('EnergyUse = Elec + Gas + Heat')\
.'BldgCount','Area','Elec','Gas','Heat'],axis=1)\
.drop([=['Prov','Reg'],columns='Year',values='EnergyUse').sum(axis=1)\
.pivot_table(index0:'EnergyUse'},axis=1) .reset_index().rename({
/tmp/ipykernel_322791/1149148944.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
Prov | Reg | EnergyUse | |
---|---|---|---|
0 | Busan | 강서구 | 200386.0 |
1 | Busan | 금정구 | 451212.0 |
2 | Busan | 기장군 | 287926.0 |
3 | Busan | 남구 | 491030.0 |
4 | Busan | 동구 | 156302.0 |
... | ... | ... | ... |
245 | Ulsan | 남구 | 607820.0 |
246 | Ulsan | 동구 | 281094.0 |
247 | Ulsan | 북구 | 334844.0 |
248 | Ulsan | 울주군 | 394217.0 |
249 | Ulsan | 중구 | 395158.0 |
250 rows × 3 columns
= df.assign(년도 = df.년도.astype(int))\
g '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([=1)\
.reset_index().rename(name_dict,axiseval('EnergyUse = Elec + Gas + Heat')\
.'BldgCount','Area','Elec','Gas','Heat'],axis=1)\
.drop([=['Prov','Reg'],columns='Year',values='EnergyUse').sum(axis=1)\
.pivot_table(index0:'EnergyUse'},axis=1)\
.reset_index().rename({'Prov') .groupby(
/tmp/ipykernel_322791/1748878604.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
= pd.concat([df.sort_values('EnergyUse',ascending=False).reset_index(drop=True).reset_index().rename({'index':'Rank'},axis=1).iloc[:5] for k,df in g])\
tidydata =True)
.reset_index(drop
tidydata #---#
= (ggplot(tidydata) + geom_col(aes(x='Rank',y='EnergyUse',fill='Prov')) + facet_wrap('Prov')).draw()
fig 10,5)
fig.set_size_inches(150)
fig.set_dpi( fig
(5)
(Prov
,Year
)별 전기에너지 사용량 비율을 구하고 아래와 같이 시각화 하라. + 제주를 제외한 지역으로 한정하고 시각화하라.
# 시각화예시
(풀이1) – 수업시간에 한 풀이
= df.년도.astype(int))\
df.assign(년도 '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([=1)\
.reset_index().rename(name_dict,axis'BldgCount','Area'],axis=1).melt(id_vars=['Year','Prov','Reg'])\
.drop(['variable':'Type','value':'EnergyUse'},axis=1)\
.rename({=['Prov','Year'],columns='Type',values='EnergyUse',aggfunc='sum')\
.pivot_table(indexeval('ElecRate = Elec/(Elec+Gas+Heat)').loc[:,'ElecRate'].reset_index() .
/tmp/ipykernel_322791/3362619229.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
Prov | Year | ElecRate | |
---|---|---|---|
0 | Busan | 2018 | 0.455996 |
1 | Busan | 2019 | 0.463324 |
2 | Busan | 2020 | 0.457401 |
3 | Busan | 2021 | 0.534566 |
4 | Chungcheongbuk-do | 2018 | 0.512447 |
... | ... | ... | ... |
63 | Seoul | 2021 | 0.455675 |
64 | Ulsan | 2018 | 0.384385 |
65 | Ulsan | 2019 | 0.392067 |
66 | Ulsan | 2020 | 0.387118 |
67 | Ulsan | 2021 | 0.386115 |
68 rows × 3 columns
= df.assign(년도 = df.년도.astype(int))\
tidydata '년도','시도','지역']).applymap(lambda x: int(str(x).replace(',','')))\
.set_index([=1)\
.reset_index().rename(name_dict,axis'BldgCount','Area'],axis=1).melt(id_vars=['Year','Prov','Reg'])\
.drop(['variable':'Type','value':'EnergyUse'},axis=1)\
.rename({=['Prov','Year'],columns='Type',values='EnergyUse',aggfunc='sum')\
.pivot_table(indexeval('ElecRate = Elec/(Elec+Gas+Heat)').loc[:,'ElecRate'].reset_index()
.#--#
= ggplot(tidydata)
fig = geom_point(aes(x='Year',y='ElecRate',color='Prov',shape='Prov'))
point = geom_line(aes(x='Year',y='ElecRate',color='Prov'),linetype='dashed')
line = (fig + point + line).draw()
fig 150)
fig.set_dpi( fig
/tmp/ipykernel_322791/3057913787.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/mizani/palettes.py:706: UserWarning: Palette can return a maximum of 13 values. 17 values requested.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/plotnine/layer.py:364: PlotnineWarning: geom_point : Removed 16 rows containing missing values.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/mizani/palettes.py:706: UserWarning: Palette can return a maximum of 13 values. 17 values requested.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/plotnine/guides/guides.py:259: PlotnineWarning: geom_point legend : Removed 4 rows containing missing values.
= ggplot(tidydata.query("Prov != 'Jeju-do'"))
fig = geom_point(aes(x='Year',y='ElecRate',color='Prov',shape='Prov'))
point = geom_line(aes(x='Year',y='ElecRate',color='Prov'),linetype='dashed')
line = (fig + point + line).draw()
fig 150)
fig.set_dpi( fig
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/mizani/palettes.py:706: UserWarning: Palette can return a maximum of 13 values. 16 values requested.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/plotnine/layer.py:364: PlotnineWarning: geom_point : Removed 12 rows containing missing values.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/mizani/palettes.py:706: UserWarning: Palette can return a maximum of 13 values. 16 values requested.
/home/cgb2/anaconda3/envs/ag/lib/python3.10/site-packages/plotnine/guides/guides.py:259: PlotnineWarning: geom_point legend : Removed 3 rows containing missing values.
(풀이2) – draft에 있던 풀이, 수업시간에 풀이한것과 약간다름
= df.년도.astype(int))\
df.assign(년도 '년도','시도','지역']).applymap(lambda x: str(x).replace(',','')).astype(int).reset_index()\
.set_index([=1)\
.rename(name_dict,axis'BldgCount','Area'],axis=1)\
.drop([=['Year','Prov','Reg']).rename({'variable':'Type','value':'EnergyUse'},axis=1)\
.melt(id_vars=['Year','Prov','Type'],values='EnergyUse',aggfunc='sum').reset_index()\
.pivot_table(index'Year','Prov','Type']).unstack()\
.set_index([= lambda df: df[('EnergyUse','Elec')]/(df[('EnergyUse','Elec')]+df[('EnergyUse','Gas')]+df[('EnergyUse','Heat')]))\
.assign(ElecRate -1].reset_index().set_axis(['Year','Prov','ElecRate'],axis=1) .iloc[:,
/tmp/ipykernel_322791/2924150206.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
Year | Prov | ElecRate | |
---|---|---|---|
0 | 2018 | Busan | 0.455996 |
1 | 2018 | Chungcheongbuk-do | 0.512447 |
2 | 2018 | Chungcheongnam-do | 0.506471 |
3 | 2018 | Daegu | 0.403464 |
4 | 2018 | Daejeon | 0.418136 |
... | ... | ... | ... |
63 | 2021 | Jeollabuk-do | 0.466878 |
64 | 2021 | Jeollanam-do | 0.537464 |
65 | 2021 | Sejongsi | 0.435457 |
66 | 2021 | Seoul | 0.455675 |
67 | 2021 | Ulsan | 0.386115 |
68 rows × 3 columns
이후의 과정은 풀이1와 같음.
5. pd.merge()
의 이용
before
에서 after
로 가는 일반적인 방법이 있을까?
= df.assign(년도 = df.년도.astype(int))\
before '년도','시도','지역']).applymap(lambda x: str(x).replace(',','')).astype(int).reset_index()\
.set_index([=1)\
.rename(name_dict,axis'BldgCount','Area'],axis=1)
.drop([= before\
after =['Year','Prov','Reg']).rename({'variable':'Type','value':'EnergyUse'},axis=1)\
.melt(id_vars=['Year','Prov','Type'],values='EnergyUse',aggfunc='sum').reset_index()\
.pivot_table(index'Year','Prov','Type']).unstack()\
.set_index([= lambda df: df[('EnergyUse','Elec')]/(df[('EnergyUse','Elec')]+df[('EnergyUse','Gas')]+df[('EnergyUse','Heat')]))\
.assign(ElecRate -1].reset_index().set_axis(['Year','Prov','ElecRate'],axis=1)
.iloc[:,'before',before)
display('after',after) display(
/tmp/ipykernel_322791/2643872.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
'before'
Year | Prov | Reg | Elec | Gas | Heat | |
---|---|---|---|---|---|---|
0 | 2018 | Seoul | 종로구 | 64818 | 82015 | 111 |
1 | 2018 | Seoul | 중구 | 81672 | 75260 | 563 |
2 | 2018 | Seoul | 용산구 | 52659 | 85220 | 12043 |
3 | 2018 | Seoul | 성동구 | 60559 | 107416 | 0 |
4 | 2018 | Seoul | 광진구 | 70609 | 130308 | 0 |
... | ... | ... | ... | ... | ... | ... |
995 | 2019 | Jeju-do | 서귀포시 | 34641 | 1306 | 0 |
996 | 2020 | Jeju-do | 제주시 | 99212 | 22179 | 0 |
997 | 2020 | Jeju-do | 서귀포시 | 35510 | 1639 | 0 |
998 | 2021 | Jeju-do | 제주시 | 103217 | 25689 | 0 |
999 | 2021 | Jeju-do | 서귀포시 | 37884 | 2641 | 0 |
1000 rows × 6 columns
'after'
Year | Prov | ElecRate | |
---|---|---|---|
0 | 2018 | Busan | 0.455996 |
1 | 2018 | Chungcheongbuk-do | 0.512447 |
2 | 2018 | Chungcheongnam-do | 0.506471 |
3 | 2018 | Daegu | 0.403464 |
4 | 2018 | Daejeon | 0.418136 |
... | ... | ... | ... |
63 | 2021 | Jeollabuk-do | 0.466878 |
64 | 2021 | Jeollanam-do | 0.537464 |
65 | 2021 | Sejongsi | 0.435457 |
66 | 2021 | Seoul | 0.455675 |
67 | 2021 | Ulsan | 0.386115 |
68 rows × 3 columns
(해법) – 머리가 안돌아가면 일단 tidydata로 만들고 시작..
= before.melt(id_vars=['Year','Prov','Reg'])\
small 'Year','Prov']).agg({'value':'sum'}).reset_index()\
.groupby(['value':'EnergyUseTotal'},axis=1) .rename({
= before.melt(id_vars=['Year','Prov','Reg'])\
big 'Year','Prov','variable']).agg({'value':'sum'}).reset_index()\
.groupby(['variable':'Type','value':'EnergyUse'},axis=1) .rename({
'Type=="Elec"')\
big.merge(small).query(eval('ElecRate = EnergyUse/EnergyUseTotal')\
.'Year','Prov','ElecRate']].reset_index(drop=True) .loc[:,[
Year | Prov | ElecRate | |
---|---|---|---|
0 | 2018 | Busan | 0.455996 |
1 | 2018 | Chungcheongbuk-do | 0.512447 |
2 | 2018 | Chungcheongnam-do | 0.506471 |
3 | 2018 | Daegu | 0.403464 |
4 | 2018 | Daejeon | 0.418136 |
... | ... | ... | ... |
63 | 2021 | Jeollabuk-do | 0.466878 |
64 | 2021 | Jeollanam-do | 0.537464 |
65 | 2021 | Sejongsi | 0.435457 |
66 | 2021 | Seoul | 0.455675 |
67 | 2021 | Ulsan | 0.386115 |
68 rows × 3 columns