강의영상

- (1/5) group by (1)

- (2/5) group by (2)

- (3/5) pd.cut (1)

- (4/5) pd.cut (2)

- (5/5) 과제설명

data

import pandas as pd 
import numpy as np 
df=pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/flights.csv')
df
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
0 1 1 4 WN LAX SLC 1625 58.0 94.0 590 1905 65.0 0 0
1 1 1 4 UA DEN IAD 823 7.0 154.0 1452 1333 -13.0 0 0
2 1 1 4 MQ DFW VPS 1305 36.0 85.0 641 1453 35.0 0 0
3 1 1 4 AA DFW DCA 1555 7.0 126.0 1192 1935 -7.0 0 0
4 1 1 4 WN LAX MCI 1720 48.0 166.0 1363 2225 39.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58487 12 31 4 AA SFO DFW 515 5.0 166.0 1464 1045 -19.0 0 0
58488 12 31 4 F9 LAS SFO 1910 13.0 71.0 414 2050 4.0 0 0
58489 12 31 4 OO SFO SBA 1846 -6.0 46.0 262 1956 -5.0 0 0
58490 12 31 4 WN MSP ATL 525 39.0 124.0 907 855 34.0 0 0
58491 12 31 4 OO SFO BOI 859 5.0 73.0 522 1146 -1.0 0 0

58492 rows × 14 columns

df.columns
Index(['MONTH', 'DAY', 'WEEKDAY', 'AIRLINE', 'ORG_AIR', 'DEST_AIR',
       'SCHED_DEP', 'DEP_DELAY', 'AIR_TIME', 'DIST', 'SCHED_ARR', 'ARR_DELAY',
       'DIVERTED', 'CANCELLED'],
      dtype='object')

groupby

- 데이터프레임을 여러개의 서브데이터프레임으로 나누는 기능

- 단독으로 쓸 이유는 별로 없다. $\to$ 그룹을 나누고 어떠한 "연산"을 하기 위함

df.groupby(by='AIRLINE')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f58ee1d7b20>
  • 데이터프레임을 각 항공사 별로 나눔

- 확인

grouped_df = df.groupby(by='AIRLINE')
grouped_df.groups
{'AA': [3, 6, 8, 15, 26, 32, 33, 36, 37, 41, 46, 47, 48, 55, 62, 66, 73, 75, 76, 87, 89, 97, 98, 107, 120, 122, 128, 131, 133, 146, 160, 171, 174, 178, 179, 188, 200, 201, 206, 221, 267, 270, 279, 305, 309, 311, 321, 330, 332, 342, 343, 347, 348, 380, 381, 382, 390, 393, 398, 402, 404, 405, 419, 427, 430, 431, 441, 449, 451, 455, 457, 466, 476, 491, 493, 497, 509, 514, 521, 527, 528, 537, 550, 551, 556, 560, 568, 572, 587, 589, 592, 612, 613, 618, 624, 626, 628, 634, 640, 652, ...], 'AS': [38, 198, 241, 277, 397, 450, 453, 500, 518, 591, 718, 737, 741, 808, 820, 867, 872, 1005, 1106, 1146, 1292, 1437, 1530, 1559, 1609, 1636, 1747, 1786, 1810, 1826, 1892, 1921, 2010, 2151, 2269, 2446, 2526, 2576, 2586, 2941, 3112, 3125, 3171, 3176, 3413, 3483, 3494, 3596, 3678, 3734, 3891, 3910, 3937, 3989, 4099, 4128, 4326, 4397, 4432, 4587, 4743, 4799, 4841, 4850, 4912, 4987, 5099, 5154, 5176, 5254, 5261, 5297, 5356, 5372, 5496, 5667, 5685, 5688, 5877, 5976, 6016, 6079, 6122, 6245, 6255, 6393, 6529, 6816, 6824, 6861, 6945, 6988, 7065, 7150, 7153, 7196, 7225, 7235, 7389, 7478, ...], 'B6': [123, 127, 239, 333, 548, 549, 696, 757, 945, 1175, 1361, 1453, 1547, 1561, 1564, 1650, 1670, 1691, 1800, 1939, 1998, 2251, 2253, 2256, 2330, 2385, 2434, 2607, 2650, 2692, 2721, 2781, 2800, 2884, 2936, 3060, 3306, 3648, 3793, 3983, 4347, 4552, 4578, 4639, 4779, 4820, 4885, 5036, 5125, 5131, 5205, 5211, 5228, 5260, 5351, 5483, 5577, 5585, 5777, 5956, 6126, 6151, 6418, 6901, 6952, 6977, 7792, 7798, 7853, 7877, 7904, 8030, 8082, 8091, 8118, 8145, 8177, 8364, 8507, 8541, 8543, 8558, 8732, 8828, 8912, 9093, 9173, 9275, 9371, 9389, 9447, 9482, 9574, 9686, 9896, 10066, 10254, 10416, 10600, 10727, ...], 'DL': [53, 57, 77, 79, 85, 90, 91, 101, 116, 117, 121, 129, 130, 137, 142, 152, 154, 155, 158, 159, 162, 165, 169, 172, 181, 182, 184, 185, 189, 190, 204, 207, 211, 215, 216, 222, 235, 238, 244, 245, 248, 249, 250, 253, 254, 258, 286, 289, 304, 307, 308, 310, 313, 323, 326, 328, 329, 336, 353, 354, 364, 373, 385, 395, 407, 411, 412, 413, 420, 433, 435, 439, 445, 456, 468, 481, 485, 490, 492, 508, 517, 526, 529, 531, 535, 543, 546, 559, 570, 579, 584, 588, 593, 602, 606, 607, 616, 617, 631, 642, ...], 'EV': [11, 13, 29, 40, 69, 100, 106, 118, 136, 143, 147, 164, 167, 170, 194, 196, 199, 205, 208, 219, 220, 225, 231, 233, 236, 251, 252, 261, 264, 265, 268, 282, 291, 293, 298, 312, 319, 337, 352, 362, 363, 369, 371, 383, 403, 414, 415, 424, 429, 437, 442, 443, 459, 460, 465, 470, 495, 499, 503, 505, 506, 522, 525, 530, 541, 567, 574, 580, 583, 632, 645, 654, 659, 677, 683, 689, 693, 695, 697, 714, 721, 723, 725, 730, 740, 742, 761, 764, 765, 768, 772, 776, 777, 778, 785, 787, 790, 791, 794, 799, ...], 'F9': [7, 93, 209, 232, 247, 290, 301, 386, 444, 483, 553, 563, 596, 598, 599, 605, 665, 685, 738, 844, 870, 948, 985, 1103, 1167, 1181, 1264, 1374, 1382, 1407, 1421, 1435, 1489, 1590, 1611, 1697, 1746, 1760, 1775, 1776, 1807, 1830, 1925, 1962, 2014, 2036, 2069, 2078, 2111, 2125, 2208, 2335, 2405, 2515, 2557, 2579, 2600, 2683, 2693, 2708, 2731, 2733, 2805, 2853, 2861, 3024, 3241, 3288, 3374, 3382, 3395, 3418, 3555, 3570, 3575, 3628, 3638, 3716, 3735, 3777, 3843, 3923, 3962, 3977, 4016, 4031, 4046, 4069, 4111, 4151, 4172, 4287, 4356, 4371, 4420, 4469, 4542, 4641, 4671, 4720, ...], 'HA': [582, 712, 878, 1053, 1269, 1345, 1544, 1864, 2316, 2644, 2681, 3002, 3165, 3227, 4359, 4414, 4926, 5187, 5281, 5320, 5363, 7267, 7372, 7744, 7808, 9360, 9761, 10163, 11119, 11626, 11743, 11809, 12275, 12752, 12780, 13466, 13755, 13956, 14566, 15064, 15316, 15959, 16501, 17632, 18467, 18720, 20461, 21558, 21761, 22454, 22584, 22592, 22618, 22858, 22892, 23177, 23374, 23773, 25763, 26127, 27745, 28864, 29608, 29762, 29875, 30610, 31164, 31505, 31530, 31675, 32030, 32037, 32987, 33311, 33747, 34884, 35959, 36068, 36319, 36425, 37119, 37128, 37664, 37983, 38440, 39475, 39482, 39625, 41664, 42165, 42485, 42846, 44068, 45330, 46713, 47772, 48279, 48722, 49667, 49782, ...], 'MQ': [2, 10, 18, 24, 50, 60, 71, 78, 81, 103, 105, 115, 125, 149, 180, 187, 210, 214, 228, 285, 296, 314, 320, 334, 335, 339, 341, 357, 458, 472, 484, 486, 488, 502, 516, 524, 534, 547, 581, 601, 622, 638, 639, 646, 661, 707, 711, 722, 727, 751, 769, 774, 802, 805, 830, 851, 854, 856, 865, 876, 915, 939, 955, 968, 977, 980, 1002, 1006, 1036, 1085, 1087, 1104, 1113, 1150, 1158, 1163, 1168, 1169, 1204, 1229, 1231, 1234, 1238, 1246, 1287, 1296, 1303, 1306, 1328, 1329, 1336, 1393, 1398, 1410, 1423, 1428, 1436, 1448, 1449, 1454, ...], 'NK': [17, 74, 95, 109, 166, 345, 358, 401, 434, 436, 446, 452, 482, 507, 510, 519, 533, 544, 610, 615, 620, 729, 735, 845, 858, 887, 898, 916, 970, 1017, 1095, 1097, 1153, 1188, 1208, 1257, 1342, 1392, 1536, 1579, 1580, 1657, 1732, 1761, 1863, 1943, 1948, 1990, 2026, 2068, 2071, 2118, 2124, 2149, 2177, 2196, 2214, 2244, 2299, 2334, 2378, 2424, 2425, 2447, 2523, 2537, 2592, 2613, 2639, 2643, 2740, 2749, 2752, 2764, 2765, 2770, 2786, 2875, 2886, 3018, 3061, 3065, 3072, 3151, 3153, 3174, 3248, 3315, 3381, 3435, 3566, 3598, 3674, 3696, 3701, 3708, 3840, 3877, 3894, 3934, ...], 'OO': [12, 16, 22, 25, 27, 34, 39, 42, 51, 52, 54, 58, 61, 63, 65, 82, 86, 99, 102, 113, 124, 126, 135, 138, 139, 151, 157, 161, 168, 175, 213, 223, 237, 240, 242, 255, 257, 278, 281, 284, 288, 297, 315, 317, 318, 322, 324, 327, 331, 359, 361, 366, 367, 370, 376, 378, 379, 387, 392, 418, 421, 447, 448, 540, 552, 554, 565, 569, 576, 590, 600, 603, 604, 609, 625, 629, 637, 649, 653, 658, 660, 663, 666, 671, 672, 673, 676, 679, 681, 682, 684, 691, 692, 708, 734, 754, 755, 762, 789, 793, ...], 'UA': [1, 5, 9, 14, 21, 44, 45, 67, 70, 72, 94, 108, 110, 134, 140, 141, 144, 145, 163, 177, 183, 192, 197, 246, 260, 262, 263, 266, 269, 273, 275, 283, 295, 325, 338, 340, 349, 368, 374, 375, 377, 389, 394, 396, 408, 423, 425, 438, 440, 462, 463, 469, 471, 474, 479, 501, 511, 512, 513, 520, 523, 536, 539, 542, 555, 557, 566, 573, 577, 586, 594, 595, 608, 621, 623, 630, 635, 636, 647, 650, 651, 667, 686, 687, 698, 699, 700, 701, 713, 717, 719, 745, 756, 773, 784, 801, 829, 836, 842, 843, ...], 'US': [31, 35, 49, 96, 104, 111, 112, 119, 148, 150, 153, 191, 202, 203, 224, 229, 355, 360, 372, 384, 391, 467, 473, 477, 478, 487, 494, 538, 545, 558, 561, 564, 575, 578, 619, 690, 715, 726, 728, 760, 795, 806, 881, 896, 901, 910, 954, 1028, 1033, 1068, 1134, 1135, 1143, 1155, 1164, 1201, 1210, 1245, 1274, 1288, 1290, 1291, 1335, 1355, 1377, 1399, 1406, 1446, 1459, 1462, 1470, 1471, 1476, 1481, 1493, 1499, 1503, 1552, 1554, 1565, 1589, 1593, 1603, 1661, 1680, 1716, 1731, 1733, 1735, 1743, 1749, 1754, 1755, 1758, 1767, 1773, 1790, 1821, 1904, 1905, ...], 'VX': [56, 227, 243, 417, 432, 464, 614, 641, 703, 759, 812, 825, 943, 1058, 1092, 1093, 1192, 1302, 1312, 1343, 1369, 1452, 1586, 1599, 1644, 1696, 1705, 1805, 1930, 1957, 2048, 2219, 2220, 2270, 2302, 2324, 2484, 2572, 2620, 2646, 2735, 2753, 2808, 2880, 2898, 2919, 3110, 3111, 3120, 3210, 3246, 3261, 3302, 3427, 3501, 3510, 3664, 3680, 3694, 3731, 3860, 3938, 3968, 4005, 4029, 4035, 4056, 4088, 4270, 4351, 4531, 4536, 4540, 4692, 4712, 4728, 4746, 4825, 4856, 4860, 4869, 4876, 4890, 4901, 4928, 4994, 5075, 5078, 5115, 5143, 5167, 5206, 5245, 5265, 5379, 5383, 5413, 5502, 5512, 5522, ...], 'WN': [0, 4, 19, 20, 23, 28, 30, 43, 59, 64, 68, 80, 83, 84, 88, 92, 114, 132, 156, 173, 176, 186, 193, 195, 212, 217, 218, 226, 230, 234, 256, 259, 271, 272, 274, 276, 280, 287, 292, 294, 299, 300, 302, 303, 306, 316, 344, 346, 350, 351, 356, 365, 388, 399, 400, 406, 409, 410, 416, 422, 426, 428, 454, 461, 475, 480, 489, 496, 498, 504, 515, 532, 562, 571, 585, 597, 611, 627, 633, 648, 657, 669, 670, 674, 678, 710, 720, 724, 731, 744, 747, 752, 753, 788, 797, 804, 807, 813, 815, 817, ...]}
  • 너무 보기 힘듬

- 보기좋은 형태로 확인

list(grouped_df.groups)
['AA',
 'AS',
 'B6',
 'DL',
 'EV',
 'F9',
 'HA',
 'MQ',
 'NK',
 'OO',
 'UA',
 'US',
 'VX',
 'WN']
grouped_df.get_group('AA')
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
3 1 1 4 AA DFW DCA 1555 7.0 126.0 1192 1935 -7.0 0 0
6 1 1 4 AA DFW MSY 1250 84.0 64.0 447 1410 83.0 0 0
8 1 1 4 AA ORD STL 1845 -5.0 44.0 258 1950 -5.0 0 0
15 1 1 4 AA DEN DFW 1445 -6.0 93.0 641 1745 4.0 0 0
26 1 1 4 AA LAX AUS 1430 33.0 157.0 1242 1925 41.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58470 12 31 4 AA DFW FAT 1020 -3.0 196.0 1313 1156 -2.0 0 0
58475 12 31 4 AA IAH CLT 710 1.0 113.0 912 1037 -12.0 0 0
58476 12 31 4 AA DFW TPA 1020 -3.0 121.0 929 1340 -6.0 0 0
58479 12 31 4 AA DFW ELP 1200 3.0 94.0 551 1250 13.0 0 0
58487 12 31 4 AA SFO DFW 515 5.0 166.0 1464 1045 -19.0 0 0

8900 rows × 14 columns

for g in grouped_df.groups:
    print(g)
    display(grouped_df.get_group(g))
AA
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
3 1 1 4 AA DFW DCA 1555 7.0 126.0 1192 1935 -7.0 0 0
6 1 1 4 AA DFW MSY 1250 84.0 64.0 447 1410 83.0 0 0
8 1 1 4 AA ORD STL 1845 -5.0 44.0 258 1950 -5.0 0 0
15 1 1 4 AA DEN DFW 1445 -6.0 93.0 641 1745 4.0 0 0
26 1 1 4 AA LAX AUS 1430 33.0 157.0 1242 1925 41.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58470 12 31 4 AA DFW FAT 1020 -3.0 196.0 1313 1156 -2.0 0 0
58475 12 31 4 AA IAH CLT 710 1.0 113.0 912 1037 -12.0 0 0
58476 12 31 4 AA DFW TPA 1020 -3.0 121.0 929 1340 -6.0 0 0
58479 12 31 4 AA DFW ELP 1200 3.0 94.0 551 1250 13.0 0 0
58487 12 31 4 AA SFO DFW 515 5.0 166.0 1464 1045 -19.0 0 0

8900 rows × 14 columns

AS
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
38 1 1 4 AS PHX SEA 1505 -2.0 155.0 1107 1702 -3.0 0 0
198 1 2 5 AS LAX SEA 2110 5.0 145.0 954 2352 8.0 0 0
241 1 2 5 AS LAS PDX 650 -5.0 117.0 763 906 -3.0 0 0
277 1 2 5 AS ORD ANC 935 -1.0 402.0 2846 1339 -6.0 0 0
397 1 3 6 AS LAS SEA 1300 48.0 137.0 867 1535 47.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58305 12 30 3 AS LAX SEA 1325 -2.0 134.0 954 1608 -7.0 0 0
58355 12 31 4 AS PHX SEA 1200 -5.0 145.0 1107 1407 -24.0 0 0
58404 12 31 4 AS SFO SLC 2110 -2.0 80.0 599 2358 -4.0 0 0
58407 12 31 4 AS SFO PDX 645 -2.0 81.0 550 832 -3.0 0 0
58428 12 31 4 AS LAX SEA 1420 -8.0 127.0 954 1709 -25.0 0 0

768 rows × 14 columns

B6
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
123 1 1 4 B6 LAS BOS 1230 0.0 246.0 2381 2026 -27.0 0 0
127 1 1 4 B6 LAS BOS 2359 68.0 247.0 2381 749 46.0 0 0
239 1 2 5 B6 ORD BOS 540 -8.0 96.0 867 856 -22.0 0 0
333 1 3 6 B6 LAX FLL 2237 32.0 270.0 2342 619 42.0 0 0
548 1 4 7 B6 SFO FLL 2307 -4.0 298.0 2583 724 -1.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58262 12 30 3 B6 SFO LGB 1921 -6.0 57.0 354 2038 -14.0 0 0
58301 12 30 3 B6 LAX JFK 630 4.0 285.0 2475 1445 -6.0 0 0
58425 12 31 4 B6 ORD SJU 700 239.0 250.0 2072 1335 239.0 0 0
58477 12 31 4 B6 DFW BOS 1145 12.0 161.0 1562 1608 -14.0 0 0
58483 12 31 4 B6 PHX BOS 2236 -12.0 231.0 2300 515 -45.0 0 0

543 rows × 14 columns

DL
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
53 1 1 4 DL LAS MSP 713 -5.0 156.0 1299 1220 -18.0 0 0
57 1 1 4 DL MSP RSW 700 -1.0 169.0 1416 1130 -20.0 0 0
77 1 1 4 DL LAX ATL 1130 24.0 217.0 1947 1840 16.0 0 0
79 1 1 4 DL LAX CMH 2146 -3.0 223.0 1995 459 -13.0 0 0
85 1 1 4 DL ATL OKC 2059 -4.0 116.0 761 2227 -12.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58440 12 31 4 DL ATL CVG 1611 -4.0 61.0 373 1736 -6.0 0 0
58448 12 31 4 DL ATL SRQ 1610 0.0 61.0 444 1740 -13.0 0 0
58464 12 31 4 DL LAX SFO 700 108.0 54.0 337 825 105.0 0 0
58467 12 31 4 DL ATL IND 1235 -3.0 63.0 432 1407 -13.0 0 0
58485 12 31 4 DL ATL CMH 2206 2.0 64.0 447 2338 -8.0 0 0

10601 rows × 14 columns

EV
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
11 1 1 4 EV ORD JAN 1155 6.0 113.0 677 1403 5.0 0 0
13 1 1 4 EV ORD CMH 1010 -2.0 46.0 296 1228 -9.0 0 0
29 1 1 4 EV ORD IND 1025 -6.0 29.0 177 1228 -19.0 0 0
40 1 1 4 EV IAH CLE 1038 -3.0 126.0 1091 1425 -18.0 0 0
69 1 1 4 EV ATL RAP 1930 -5.0 181.0 1230 2104 -15.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58445 12 31 4 EV DFW TXK 850 -5.0 30.0 181 948 -17.0 0 0
58452 12 31 4 EV DFW SHV 1650 -4.0 32.0 190 1746 -12.0 0 0
58459 12 31 4 EV MSP ORD 1435 18.0 61.0 334 1609 3.0 0 0
58463 12 31 4 EV ORD MSN 1220 18.0 32.0 108 1319 27.0 0 0
58486 12 31 4 EV DFW LFT 850 21.0 52.0 351 1012 14.0 0 0

5858 rows × 14 columns

F9
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
7 1 1 4 F9 SFO PHX 1020 -7.0 91.0 651 1315 -6.0 0 0
93 1 1 4 F9 ATL DEN 859 16.0 181.0 1199 1026 10.0 0 0
209 1 2 5 F9 MSP DEN 1025 -6.0 97.0 680 1134 -13.0 0 0
232 1 2 5 F9 DEN PHX 2040 -7.0 83.0 602 2228 -18.0 0 0
247 1 2 5 F9 ORD ATL 730 10.0 86.0 606 1020 23.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58288 12 30 3 F9 DEN ORD 625 -4.0 136.0 888 1000 14.0 0 0
58331 12 30 3 F9 ORD PHX 825 18.0 207.0 1440 1127 14.0 0 0
58447 12 31 4 F9 DEN LAS 1245 13.0 94.0 628 1340 13.0 0 0
58449 12 31 4 F9 DEN MCO 645 11.0 169.0 1546 1224 -11.0 0 0
58488 12 31 4 F9 LAS SFO 1910 13.0 71.0 414 2050 4.0 0 0

1317 rows × 14 columns

HA
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
582 1 4 7 HA LAX OGG 1115 -11.0 310.0 2486 1500 -27.0 0 0
712 1 5 1 HA LAS HNL 900 -5.0 357.0 2762 1315 5.0 0 0
878 1 6 2 HA PHX HNL 800 1.0 374.0 2917 1140 3.0 0 0
1053 1 7 3 HA LAX HNL 1705 0.0 332.0 2556 2055 -2.0 0 0
1269 1 8 4 HA LAX HNL 1000 -1.0 335.0 2556 1350 0.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
55883 12 16 3 HA LAX HNL 835 1.0 314.0 2556 1235 -18.0 0 0
56174 12 18 5 HA LAX HNL 835 -5.0 342.0 2556 1235 -4.0 0 0
56350 12 19 6 HA PHX HNL 800 -5.0 363.0 2917 1155 -34.0 0 0
56816 12 21 1 HA LAX LIH 740 20.0 303.0 2615 1145 -11.0 0 0
58391 12 31 4 HA LAX HNL 1000 0.0 324.0 2556 1350 -9.0 0 0

112 rows × 14 columns

MQ
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
2 1 1 4 MQ DFW VPS 1305 36.0 85.0 641 1453 35.0 0 0
10 1 1 4 MQ DFW DRO 1335 28.0 104.0 674 1438 28.0 0 0
18 1 1 4 MQ ORD DAY 2220 19.0 37.0 240 23 20.0 0 0
24 1 1 4 MQ DFW BTR 730 NaN NaN 383 853 NaN 0 1
50 1 1 4 MQ ORD CID 1135 -7.0 37.0 196 1238 -15.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58415 12 31 4 MQ ORD FWA 845 -2.0 37.0 157 1045 -4.0 0 0
58426 12 31 4 MQ DFW FAR 1154 4.0 124.0 968 1437 -13.0 0 0
58468 12 31 4 MQ DFW OKC 1720 -3.0 31.0 175 1819 -10.0 0 0
58474 12 31 4 MQ ORD FNT 829 4.0 40.0 223 1034 -4.0 0 0
58484 12 31 4 MQ ORD DSM 1333 1.0 57.0 299 1455 -7.0 0 0

3471 rows × 14 columns

NK
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
17 1 1 4 NK DEN DTW 1952 37.0 124.0 1123 31 54.0 0 0
74 1 1 4 NK PHX DFW 159 -1.0 103.0 868 502 1.0 0 0
95 1 1 4 NK LAS OAK 1115 22.0 62.0 407 1246 10.0 0 0
109 1 1 4 NK MSP ORD 616 2.0 49.0 334 745 -19.0 0 0
166 1 2 5 NK LAS PDX 1535 -8.0 123.0 763 1754 -4.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58160 12 29 2 NK MSP MCO 740 0.0 171.0 1310 1158 33.0 0 0
58197 12 30 3 NK IAH ORD 755 -8.0 136.0 925 1030 -2.0 0 0
58437 12 31 4 NK ORD DFW 1952 15.0 135.0 802 2225 23.0 0 0
58461 12 31 4 NK ORD LGA 1801 -5.0 84.0 733 2109 -26.0 0 0
58469 12 31 4 NK LAS MSY 1950 124.0 163.0 1500 112 101.0 0 0

1516 rows × 14 columns

OO
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
12 1 1 4 OO ORD MSP 1510 2.0 65.0 334 1646 4.0 0 0
16 1 1 4 OO DEN SGU 1105 21.0 66.0 517 1249 20.0 0 0
22 1 1 4 OO LAS LAX 1544 -4.0 39.0 236 1655 -12.0 0 0
25 1 1 4 OO ORD SPI 2110 -4.0 31.0 174 2205 5.0 0 0
27 1 1 4 OO IAH JAC 1104 -1.0 161.0 1265 1316 -1.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58451 12 31 4 OO ATL FWA 1905 -3.0 72.0 508 2051 -14.0 0 0
58480 12 31 4 OO MSP BIS 1310 -2.0 65.0 386 1449 -9.0 0 0
58482 12 31 4 OO DEN CPR 1850 -2.0 38.0 230 1956 1.0 0 0
58489 12 31 4 OO SFO SBA 1846 -6.0 46.0 262 1956 -5.0 0 0
58491 12 31 4 OO SFO BOI 859 5.0 73.0 522 1146 -1.0 0 0

6588 rows × 14 columns

UA
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
1 1 1 4 UA DEN IAD 823 7.0 154.0 1452 1333 -13.0 0 0
5 1 1 4 UA IAH SAN 1450 1.0 178.0 1303 1620 -14.0 0 0
9 1 1 4 UA IAH SJC 925 3.0 215.0 1608 1136 -14.0 0 0
14 1 1 4 UA IAH IND 1426 -1.0 102.0 844 1742 -20.0 0 0
21 1 1 4 UA ORD CLE 2102 48.0 47.0 315 2320 41.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58422 12 31 4 UA DEN SAN 1535 0.0 124.0 853 1704 -13.0 0 0
58432 12 31 4 UA ORD SAN 1915 7.0 238.0 1723 2143 -3.0 0 0
58457 12 31 4 UA ORD LAX 659 -1.0 241.0 1744 946 0.0 0 0
58460 12 31 4 UA SFO PHL 2235 -6.0 265.0 2521 700 -42.0 0 0
58481 12 31 4 UA IAH LAX 1433 1.0 197.0 1379 1625 -13.0 0 0

7792 rows × 14 columns

US
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
31 1 1 4 US PHX DEN 1810 29.0 94.0 602 1954 49.0 0 0
35 1 1 4 US ORD PHL 1600 -2.0 80.0 678 1857 -9.0 0 0
49 1 1 4 US IAH PHX 1445 -1.0 147.0 1009 1638 -7.0 0 0
96 1 1 4 US ATL PHL 1445 -4.0 90.0 666 1644 -11.0 0 0
104 1 1 4 US MSP PHX 730 -3.0 174.0 1276 1010 -20.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
31514 6 30 2 US DEN PHL 705 -4.0 188.0 1558 1240 1.0 0 0
31523 6 30 2 US PHX DEN 1451 6.0 85.0 602 1738 7.0 0 0
31535 6 30 2 US PHX AUS 840 -3.0 116.0 872 1304 -11.0 0 0
31561 6 30 2 US ORD PHX 710 -5.0 170.0 1440 901 -50.0 0 0
31582 6 30 2 US PHX OGG 800 -4.0 356.0 2845 1127 -13.0 0 0

1615 rows × 14 columns

VX
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
56 1 1 4 VX LAS SFO 900 23.0 65.0 414 1035 11.0 0 0
227 1 2 5 VX SFO LAS 1220 -5.0 68.0 414 1350 -5.0 0 0
243 1 2 5 VX SFO SEA 700 -4.0 104.0 679 905 -1.0 0 0
417 1 3 6 VX SFO LAS 900 -2.0 62.0 414 1030 -11.0 0 0
432 1 3 6 VX SFO SEA 2035 -2.0 106.0 679 2240 -2.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58332 12 30 3 VX SFO LAS 1950 -3.0 58.0 414 2120 -4.0 0 0
58383 12 31 4 VX SFO PSP 1630 -7.0 65.0 421 1755 -12.0 0 0
58400 12 31 4 VX SFO LAX 1125 -4.0 54.0 337 1245 -10.0 0 0
58471 12 31 4 VX SFO LAX 700 6.0 51.0 337 820 3.0 0 0
58478 12 31 4 VX SFO LAX 1530 29.0 52.0 337 1650 22.0 0 0

993 rows × 14 columns

WN
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
0 1 1 4 WN LAX SLC 1625 58.0 94.0 590 1905 65.0 0 0
4 1 1 4 WN LAX MCI 1720 48.0 166.0 1363 2225 39.0 0 0
19 1 1 4 WN PHX LAX 1640 51.0 58.0 370 1700 59.0 0 0
20 1 1 4 WN ATL BWI 1115 1.0 76.0 577 1305 -15.0 0 0
23 1 1 4 WN ATL HOU 1555 30.0 113.0 696 1720 18.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58455 12 31 4 WN LAX SMF 1420 -2.0 64.0 373 1540 -7.0 0 0
58458 12 31 4 WN LAS SFO 1825 25.0 67.0 414 1955 17.0 0 0
58472 12 31 4 WN PHX HOU 845 5.0 119.0 1020 1210 7.0 0 0
58473 12 31 4 WN DEN PDX 1205 4.0 130.0 991 1400 -13.0 0 0
58490 12 31 4 WN MSP ATL 525 39.0 124.0 907 855 34.0 0 0

8418 rows × 14 columns

AIRLINE을 기준으로 데이터프레임을 나누고 $\to$ ARR_DELAY에 mean함수를 적용: (AIRLINE $\to$ {ARR_DELAY: mean})

- 방법1

df.groupby(by='AIRLINE').agg({'ARR_DELAY':'mean'})
ARR_DELAY
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
F9 13.630651
HA 4.972973
MQ 6.860591
NK 18.436070
OO 7.593463
UA 7.765755
US 1.681105
VX 5.348884
WN 6.397353

- 방법2 ($\star\star\star$)

df.groupby(by='AIRLINE').agg({'ARR_DELAY':np.mean})
ARR_DELAY
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
F9 13.630651
HA 4.972973
MQ 6.860591
NK 18.436070
OO 7.593463
UA 7.765755
US 1.681105
VX 5.348884
WN 6.397353

- 방법3

df.groupby(by='AIRLINE')['ARR_DELAY'].agg('mean')
AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64

- 방법4 ($\star$)

df.groupby(by='AIRLINE')['ARR_DELAY'].agg(np.mean)
AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64

- 방법5

df.groupby(by='AIRLINE')['ARR_DELAY'].mean()
AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64

- 방법2와 방법4는 사용자정의 함수를 쓸 수 있다는 장점이 있음

def f(x): return -np.mean(x) 
df.groupby(by='AIRLINE').agg({'ARR_DELAY':f})
ARR_DELAY
AIRLINE
AA -5.542661
AS 0.833333
B6 -8.692593
DL -0.339691
EV -7.034580
F9 -13.630651
HA -4.972973
MQ -6.860591
NK -18.436070
OO -7.593463
UA -7.765755
US -1.681105
VX -5.348884
WN -6.397353
df.groupby(by='AIRLINE').agg({'ARR_DELAY':lambda x: -np.mean(x)})
ARR_DELAY
AIRLINE
AA -5.542661
AS 0.833333
B6 -8.692593
DL -0.339691
EV -7.034580
F9 -13.630651
HA -4.972973
MQ -6.860591
NK -18.436070
OO -7.593463
UA -7.765755
US -1.681105
VX -5.348884
WN -6.397353
df.groupby(by='AIRLINE')['ARR_DELAY'].agg(lambda x: -np.mean(x))
AIRLINE
AA    -5.542661
AS     0.833333
B6    -8.692593
DL    -0.339691
EV    -7.034580
F9   -13.630651
HA    -4.972973
MQ    -6.860591
NK   -18.436070
OO    -7.593463
UA    -7.765755
US    -1.681105
VX    -5.348884
WN    -6.397353
Name: ARR_DELAY, dtype: float64

- 입력이 여러개인 사용자 정의 함수도 사용가능함

def f(x,y): return np.mean(x)**y 
df.groupby(by='AIRLINE')['ARR_DELAY'].agg(f,2)
AIRLINE
AA     30.721086
AS      0.694444
B6     75.561166
DL      0.115390
EV     49.485310
F9    185.794656
HA     24.730460
MQ     47.067715
NK    339.888677
OO     57.660681
UA     60.306954
US      2.826113
VX     28.610564
WN     40.926120
Name: ARR_DELAY, dtype: float64
df.groupby(by='AIRLINE').agg({'ARR_DELAY': lambda x: f(x,2)})
ARR_DELAY
AIRLINE
AA 30.721086
AS 0.694444
B6 75.561166
DL 0.115390
EV 49.485310
F9 185.794656
HA 24.730460
MQ 47.067715
NK 339.888677
OO 57.660681
UA 60.306954
US 2.826113
VX 28.610564
WN 40.926120

AIRLINE,WEEKDAY $\to$ {CANCELLED: sum}

- 방법1~5

df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':'sum'})
CANCELLED
AIRLINE WEEKDAY
AA 1 41
2 9
3 16
4 20
5 18
... ... ...
WN 3 18
4 10
5 7
6 10
7 7

98 rows × 1 columns

df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':np.sum})
CANCELLED
AIRLINE WEEKDAY
AA 1 41
2 9
3 16
4 20
5 18
... ... ...
WN 3 18
4 10
5 7
6 10
7 7

98 rows × 1 columns

df.groupby(by=['AIRLINE','WEEKDAY'])['CANCELLED'].agg('sum')
AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64
df.groupby(by=['AIRLINE','WEEKDAY'])['CANCELLED'].agg(np.sum)
AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64
df.groupby(by=['AIRLINE','WEEKDAY'])['CANCELLED'].sum()
AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64

AIRLINE,WEEKDAY $\to$ {CANCELLED: sum, mean} , {DIVERTED: sum, mean}

- 방법 1~4 (5번은 쓸 수 없다)

df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':['sum','mean'],'DIVERTED':['sum','mean']})
CANCELLED DIVERTED
sum mean sum mean
AIRLINE WEEKDAY
AA 1 41 0.032106 6 0.004699
2 9 0.007341 2 0.001631
3 16 0.011949 2 0.001494
4 20 0.015004 5 0.003751
5 18 0.014151 1 0.000786
... ... ... ... ... ...
WN 3 18 0.014118 2 0.001569
4 10 0.007911 4 0.003165
5 7 0.005828 0 0.000000
6 10 0.010132 3 0.003040
7 7 0.006066 3 0.002600

98 rows × 4 columns

df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':[np.sum,np.mean],'DIVERTED':[np.sum,np.mean]})
CANCELLED DIVERTED
sum mean sum mean
AIRLINE WEEKDAY
AA 1 41 0.032106 6 0.004699
2 9 0.007341 2 0.001631
3 16 0.011949 2 0.001494
4 20 0.015004 5 0.003751
5 18 0.014151 1 0.000786
... ... ... ... ... ...
WN 3 18 0.014118 2 0.001569
4 10 0.007911 4 0.003165
5 7 0.005828 0 0.000000
6 10 0.010132 3 0.003040
7 7 0.006066 3 0.002600

98 rows × 4 columns

df.groupby(by=['AIRLINE','WEEKDAY'])[['CANCELLED','DIVERTED']].agg(['sum','mean'])
CANCELLED DIVERTED
sum mean sum mean
AIRLINE WEEKDAY
AA 1 41 0.032106 6 0.004699
2 9 0.007341 2 0.001631
3 16 0.011949 2 0.001494
4 20 0.015004 5 0.003751
5 18 0.014151 1 0.000786
... ... ... ... ... ...
WN 3 18 0.014118 2 0.001569
4 10 0.007911 4 0.003165
5 7 0.005828 0 0.000000
6 10 0.010132 3 0.003040
7 7 0.006066 3 0.002600

98 rows × 4 columns

df.groupby(by=['AIRLINE','WEEKDAY'])[['CANCELLED','DIVERTED']].agg([np.sum,np.mean])
CANCELLED DIVERTED
sum mean sum mean
AIRLINE WEEKDAY
AA 1 41 0.032106 6 0.004699
2 9 0.007341 2 0.001631
3 16 0.011949 2 0.001494
4 20 0.015004 5 0.003751
5 18 0.014151 1 0.000786
... ... ... ... ... ...
WN 3 18 0.014118 2 0.001569
4 10 0.007911 4 0.003165
5 7 0.005828 0 0.000000
6 10 0.010132 3 0.003040
7 7 0.006066 3 0.002600

98 rows × 4 columns

 

AIRLINE,WEEKDAY $\to$ {CANCELLED: sum, mean, size} , {AIR_TIME: mean,var}

df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':['sum','mean','size'],'AIR_TIME':['mean','var']})
CANCELLED AIR_TIME
sum mean size mean var
AIRLINE WEEKDAY
AA 1 41 0.032106 1277 147.610569 5393.806723
2 9 0.007341 1226 143.851852 5359.890719
3 16 0.011949 1339 144.514005 5378.854539
4 20 0.015004 1333 141.124618 4791.524627
5 18 0.014151 1272 145.430966 5884.592076
... ... ... ... ... ... ...
WN 3 18 0.014118 1275 104.219920 2901.873447
4 10 0.007911 1264 107.200800 2966.568935
5 7 0.005828 1201 107.893635 3268.717093
6 10 0.010132 987 109.247433 3152.753719
7 7 0.006066 1154 107.602273 3183.126889

98 rows × 5 columns

df.groupby(by=['AIRLINE','WEEKDAY'])\
.agg({'CANCELLED':[np.sum,np.mean,len],'AIR_TIME':[np.mean,lambda x: np.std(x,ddof=1)**2]})
CANCELLED AIR_TIME
sum mean len mean <lambda_0>
AIRLINE WEEKDAY
AA 1 41 0.032106 1277 147.610569 5393.806723
2 9 0.007341 1226 143.851852 5359.890719
3 16 0.011949 1339 144.514005 5378.854539
4 20 0.015004 1333 141.124618 4791.524627
5 18 0.014151 1272 145.430966 5884.592076
... ... ... ... ... ... ...
WN 3 18 0.014118 1275 104.219920 2901.873447
4 10 0.007911 1264 107.200800 2966.568935
5 7 0.005828 1201 107.893635 3268.717093
6 10 0.010132 987 109.247433 3152.753719
7 7 0.006066 1154 107.602273 3183.126889

98 rows × 5 columns

grouping by continuous variable

df
MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
0 1 1 4 WN LAX SLC 1625 58.0 94.0 590 1905 65.0 0 0
1 1 1 4 UA DEN IAD 823 7.0 154.0 1452 1333 -13.0 0 0
2 1 1 4 MQ DFW VPS 1305 36.0 85.0 641 1453 35.0 0 0
3 1 1 4 AA DFW DCA 1555 7.0 126.0 1192 1935 -7.0 0 0
4 1 1 4 WN LAX MCI 1720 48.0 166.0 1363 2225 39.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58487 12 31 4 AA SFO DFW 515 5.0 166.0 1464 1045 -19.0 0 0
58488 12 31 4 F9 LAS SFO 1910 13.0 71.0 414 2050 4.0 0 0
58489 12 31 4 OO SFO SBA 1846 -6.0 46.0 262 1956 -5.0 0 0
58490 12 31 4 WN MSP ATL 525 39.0 124.0 907 855 34.0 0 0
58491 12 31 4 OO SFO BOI 859 5.0 73.0 522 1146 -1.0 0 0

58492 rows × 14 columns

- 목표: DIST를 적당한 구간으로 나누어 카테고리화 하고 그것을 바탕으로 groupby를 수행하자.

df.DIST.hist()
<AxesSubplot:>
df.DIST.describe()
count    58492.000000
mean       872.900072
std        624.996805
min         67.000000
25%        391.000000
50%        690.000000
75%       1199.000000
max       4502.000000
Name: DIST, dtype: float64

- 구간을 아래와 같이 설정한다.

bins=[-np.inf, 400, 700, 1200, np.inf] 

- pd.cut()을 이용하여 각 구간의 obs를 카테고리화 하자.

cuts=pd.cut(df.DIST,bins=bins)
cuts
0         (400.0, 700.0]
1          (1200.0, inf]
2         (400.0, 700.0]
3        (700.0, 1200.0]
4          (1200.0, inf]
              ...       
58487      (1200.0, inf]
58488     (400.0, 700.0]
58489      (-inf, 400.0]
58490    (700.0, 1200.0]
58491     (400.0, 700.0]
Name: DIST, Length: 58492, dtype: category
Categories (4, interval[float64, right]): [(-inf, 400.0] < (400.0, 700.0] < (700.0, 1200.0] < (1200.0, inf]]

- cuts, AIRLINE $\to$ {DIVERTED: sum}

df.groupby([cuts,'AIRLINE']).agg({'DIVERTED':sum})
DIVERTED
DIST AIRLINE
(-inf, 400.0] AA 0
AS 0
B6 0
DL 1
EV 3
F9 0
HA 0
MQ 0
NK 0
OO 5
UA 2
US 0
VX 0
WN 1
(400.0, 700.0] AA 3
AS 0
B6 0
DL 12
EV 8
F9 1
HA 0
MQ 4
NK 1
OO 7
UA 1
US 0
VX 0
WN 2
(700.0, 1200.0] AA 10
AS 0
B6 1
DL 6
EV 4
F9 0
HA 0
MQ 1
NK 1
OO 5
UA 4
US 0
VX 0
WN 4
(1200.0, inf] AA 13
AS 0
B6 1
DL 5
EV 0
F9 1
HA 1
MQ 0
NK 3
OO 4
UA 12
US 1
VX 1
WN 8

- 아래와 비교해보자.

df.groupby(['AIRLINE']).agg({'AIRLINE':len})
AIRLINE
AIRLINE
AA 8900
AS 768
B6 543
DL 10601
EV 5858
F9 1317
HA 112
MQ 3471
NK 1516
OO 6588
UA 7792
US 1615
VX 993
WN 8418

- cuts을 이용하여 추가그룹핑을 하면 조금 다른 특징들을 데이터에서 발견할 수 있다.

  • AA항공사와 DL항공사는 모두 비슷한 우회횟수를 가지고 있음.
  • AA항공사는 700회이상의 구간에서 우회를 많이하고 DL항공사는 400~700사이에서 우회를 많이 한다. (패턴이 다름)

- 구간이름에 label을 붙이는 방법

bins
[-inf, 400, 700, 1200, inf]
cuts2=pd.cut(df.DIST,bins=bins,labels=['Q1','Q2','Q3','Q4'])
cuts2
0        Q2
1        Q4
2        Q2
3        Q3
4        Q4
         ..
58487    Q4
58488    Q2
58489    Q1
58490    Q3
58491    Q2
Name: DIST, Length: 58492, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
df.groupby(by=[cuts2,'AIRLINE']).agg({'DIVERTED':sum})
DIVERTED
DIST AIRLINE
Q1 AA 0
AS 0
B6 0
DL 1
EV 3
F9 0
HA 0
MQ 0
NK 0
OO 5
UA 2
US 0
VX 0
WN 1
Q2 AA 3
AS 0
B6 0
DL 12
EV 8
F9 1
HA 0
MQ 4
NK 1
OO 7
UA 1
US 0
VX 0
WN 2
Q3 AA 10
AS 0
B6 1
DL 6
EV 4
F9 0
HA 0
MQ 1
NK 1
OO 5
UA 4
US 0
VX 0
WN 4
Q4 AA 13
AS 0
B6 1
DL 5
EV 0
F9 1
HA 1
MQ 0
NK 3
OO 4
UA 12
US 1
VX 1
WN 8
df.groupby(cuts2).agg({'DIVERTED':len})
DIVERTED
DIST
Q1 15027
Q2 14697
Q3 14417
Q4 14351

숙제

구간을

bins=[-np.inf, 400, 700, 1200, np.inf]

이 아니라

bins=[-np.inf, 400, 600, 800, 1000, 1200, np.inf]

와 같이 나누고 적당한 각구간별로 해당하는 관측치의 수를 구하라.