(8주차) 10월27일
groupby
• 최규빈 • 34 min read
-
(1/5) group by (1)
-
(2/5) group by (2)
-
(3/5) pd.cut (1)
-
(4/5) pd.cut (2)
-
(5/5) 과제설명
import pandas as pd
import numpy as np
df=pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/flights.csv')
df
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4 | WN | LAX | SLC | 1625 | 58.0 | 94.0 | 590 | 1905 | 65.0 | 0 | 0 |
1 | 1 | 1 | 4 | UA | DEN | IAD | 823 | 7.0 | 154.0 | 1452 | 1333 | -13.0 | 0 | 0 |
2 | 1 | 1 | 4 | MQ | DFW | VPS | 1305 | 36.0 | 85.0 | 641 | 1453 | 35.0 | 0 | 0 |
3 | 1 | 1 | 4 | AA | DFW | DCA | 1555 | 7.0 | 126.0 | 1192 | 1935 | -7.0 | 0 | 0 |
4 | 1 | 1 | 4 | WN | LAX | MCI | 1720 | 48.0 | 166.0 | 1363 | 2225 | 39.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58487 | 12 | 31 | 4 | AA | SFO | DFW | 515 | 5.0 | 166.0 | 1464 | 1045 | -19.0 | 0 | 0 |
58488 | 12 | 31 | 4 | F9 | LAS | SFO | 1910 | 13.0 | 71.0 | 414 | 2050 | 4.0 | 0 | 0 |
58489 | 12 | 31 | 4 | OO | SFO | SBA | 1846 | -6.0 | 46.0 | 262 | 1956 | -5.0 | 0 | 0 |
58490 | 12 | 31 | 4 | WN | MSP | ATL | 525 | 39.0 | 124.0 | 907 | 855 | 34.0 | 0 | 0 |
58491 | 12 | 31 | 4 | OO | SFO | BOI | 859 | 5.0 | 73.0 | 522 | 1146 | -1.0 | 0 | 0 |
58492 rows × 14 columns
df.columns
Index(['MONTH', 'DAY', 'WEEKDAY', 'AIRLINE', 'ORG_AIR', 'DEST_AIR', 'SCHED_DEP', 'DEP_DELAY', 'AIR_TIME', 'DIST', 'SCHED_ARR', 'ARR_DELAY', 'DIVERTED', 'CANCELLED'], dtype='object')
-
데이터프레임을 여러개의 서브데이터프레임으로 나누는 기능
-
단독으로 쓸 이유는 별로 없다. $\to$ 그룹을 나누고 어떠한 "연산"을 하기 위함
df.groupby(by='AIRLINE')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f58ee1d7b20>
- 데이터프레임을 각 항공사 별로 나눔
-
확인
grouped_df = df.groupby(by='AIRLINE')
grouped_df.groups
{'AA': [3, 6, 8, 15, 26, 32, 33, 36, 37, 41, 46, 47, 48, 55, 62, 66, 73, 75, 76, 87, 89, 97, 98, 107, 120, 122, 128, 131, 133, 146, 160, 171, 174, 178, 179, 188, 200, 201, 206, 221, 267, 270, 279, 305, 309, 311, 321, 330, 332, 342, 343, 347, 348, 380, 381, 382, 390, 393, 398, 402, 404, 405, 419, 427, 430, 431, 441, 449, 451, 455, 457, 466, 476, 491, 493, 497, 509, 514, 521, 527, 528, 537, 550, 551, 556, 560, 568, 572, 587, 589, 592, 612, 613, 618, 624, 626, 628, 634, 640, 652, ...], 'AS': [38, 198, 241, 277, 397, 450, 453, 500, 518, 591, 718, 737, 741, 808, 820, 867, 872, 1005, 1106, 1146, 1292, 1437, 1530, 1559, 1609, 1636, 1747, 1786, 1810, 1826, 1892, 1921, 2010, 2151, 2269, 2446, 2526, 2576, 2586, 2941, 3112, 3125, 3171, 3176, 3413, 3483, 3494, 3596, 3678, 3734, 3891, 3910, 3937, 3989, 4099, 4128, 4326, 4397, 4432, 4587, 4743, 4799, 4841, 4850, 4912, 4987, 5099, 5154, 5176, 5254, 5261, 5297, 5356, 5372, 5496, 5667, 5685, 5688, 5877, 5976, 6016, 6079, 6122, 6245, 6255, 6393, 6529, 6816, 6824, 6861, 6945, 6988, 7065, 7150, 7153, 7196, 7225, 7235, 7389, 7478, ...], 'B6': [123, 127, 239, 333, 548, 549, 696, 757, 945, 1175, 1361, 1453, 1547, 1561, 1564, 1650, 1670, 1691, 1800, 1939, 1998, 2251, 2253, 2256, 2330, 2385, 2434, 2607, 2650, 2692, 2721, 2781, 2800, 2884, 2936, 3060, 3306, 3648, 3793, 3983, 4347, 4552, 4578, 4639, 4779, 4820, 4885, 5036, 5125, 5131, 5205, 5211, 5228, 5260, 5351, 5483, 5577, 5585, 5777, 5956, 6126, 6151, 6418, 6901, 6952, 6977, 7792, 7798, 7853, 7877, 7904, 8030, 8082, 8091, 8118, 8145, 8177, 8364, 8507, 8541, 8543, 8558, 8732, 8828, 8912, 9093, 9173, 9275, 9371, 9389, 9447, 9482, 9574, 9686, 9896, 10066, 10254, 10416, 10600, 10727, ...], 'DL': [53, 57, 77, 79, 85, 90, 91, 101, 116, 117, 121, 129, 130, 137, 142, 152, 154, 155, 158, 159, 162, 165, 169, 172, 181, 182, 184, 185, 189, 190, 204, 207, 211, 215, 216, 222, 235, 238, 244, 245, 248, 249, 250, 253, 254, 258, 286, 289, 304, 307, 308, 310, 313, 323, 326, 328, 329, 336, 353, 354, 364, 373, 385, 395, 407, 411, 412, 413, 420, 433, 435, 439, 445, 456, 468, 481, 485, 490, 492, 508, 517, 526, 529, 531, 535, 543, 546, 559, 570, 579, 584, 588, 593, 602, 606, 607, 616, 617, 631, 642, ...], 'EV': [11, 13, 29, 40, 69, 100, 106, 118, 136, 143, 147, 164, 167, 170, 194, 196, 199, 205, 208, 219, 220, 225, 231, 233, 236, 251, 252, 261, 264, 265, 268, 282, 291, 293, 298, 312, 319, 337, 352, 362, 363, 369, 371, 383, 403, 414, 415, 424, 429, 437, 442, 443, 459, 460, 465, 470, 495, 499, 503, 505, 506, 522, 525, 530, 541, 567, 574, 580, 583, 632, 645, 654, 659, 677, 683, 689, 693, 695, 697, 714, 721, 723, 725, 730, 740, 742, 761, 764, 765, 768, 772, 776, 777, 778, 785, 787, 790, 791, 794, 799, ...], 'F9': [7, 93, 209, 232, 247, 290, 301, 386, 444, 483, 553, 563, 596, 598, 599, 605, 665, 685, 738, 844, 870, 948, 985, 1103, 1167, 1181, 1264, 1374, 1382, 1407, 1421, 1435, 1489, 1590, 1611, 1697, 1746, 1760, 1775, 1776, 1807, 1830, 1925, 1962, 2014, 2036, 2069, 2078, 2111, 2125, 2208, 2335, 2405, 2515, 2557, 2579, 2600, 2683, 2693, 2708, 2731, 2733, 2805, 2853, 2861, 3024, 3241, 3288, 3374, 3382, 3395, 3418, 3555, 3570, 3575, 3628, 3638, 3716, 3735, 3777, 3843, 3923, 3962, 3977, 4016, 4031, 4046, 4069, 4111, 4151, 4172, 4287, 4356, 4371, 4420, 4469, 4542, 4641, 4671, 4720, ...], 'HA': [582, 712, 878, 1053, 1269, 1345, 1544, 1864, 2316, 2644, 2681, 3002, 3165, 3227, 4359, 4414, 4926, 5187, 5281, 5320, 5363, 7267, 7372, 7744, 7808, 9360, 9761, 10163, 11119, 11626, 11743, 11809, 12275, 12752, 12780, 13466, 13755, 13956, 14566, 15064, 15316, 15959, 16501, 17632, 18467, 18720, 20461, 21558, 21761, 22454, 22584, 22592, 22618, 22858, 22892, 23177, 23374, 23773, 25763, 26127, 27745, 28864, 29608, 29762, 29875, 30610, 31164, 31505, 31530, 31675, 32030, 32037, 32987, 33311, 33747, 34884, 35959, 36068, 36319, 36425, 37119, 37128, 37664, 37983, 38440, 39475, 39482, 39625, 41664, 42165, 42485, 42846, 44068, 45330, 46713, 47772, 48279, 48722, 49667, 49782, ...], 'MQ': [2, 10, 18, 24, 50, 60, 71, 78, 81, 103, 105, 115, 125, 149, 180, 187, 210, 214, 228, 285, 296, 314, 320, 334, 335, 339, 341, 357, 458, 472, 484, 486, 488, 502, 516, 524, 534, 547, 581, 601, 622, 638, 639, 646, 661, 707, 711, 722, 727, 751, 769, 774, 802, 805, 830, 851, 854, 856, 865, 876, 915, 939, 955, 968, 977, 980, 1002, 1006, 1036, 1085, 1087, 1104, 1113, 1150, 1158, 1163, 1168, 1169, 1204, 1229, 1231, 1234, 1238, 1246, 1287, 1296, 1303, 1306, 1328, 1329, 1336, 1393, 1398, 1410, 1423, 1428, 1436, 1448, 1449, 1454, ...], 'NK': [17, 74, 95, 109, 166, 345, 358, 401, 434, 436, 446, 452, 482, 507, 510, 519, 533, 544, 610, 615, 620, 729, 735, 845, 858, 887, 898, 916, 970, 1017, 1095, 1097, 1153, 1188, 1208, 1257, 1342, 1392, 1536, 1579, 1580, 1657, 1732, 1761, 1863, 1943, 1948, 1990, 2026, 2068, 2071, 2118, 2124, 2149, 2177, 2196, 2214, 2244, 2299, 2334, 2378, 2424, 2425, 2447, 2523, 2537, 2592, 2613, 2639, 2643, 2740, 2749, 2752, 2764, 2765, 2770, 2786, 2875, 2886, 3018, 3061, 3065, 3072, 3151, 3153, 3174, 3248, 3315, 3381, 3435, 3566, 3598, 3674, 3696, 3701, 3708, 3840, 3877, 3894, 3934, ...], 'OO': [12, 16, 22, 25, 27, 34, 39, 42, 51, 52, 54, 58, 61, 63, 65, 82, 86, 99, 102, 113, 124, 126, 135, 138, 139, 151, 157, 161, 168, 175, 213, 223, 237, 240, 242, 255, 257, 278, 281, 284, 288, 297, 315, 317, 318, 322, 324, 327, 331, 359, 361, 366, 367, 370, 376, 378, 379, 387, 392, 418, 421, 447, 448, 540, 552, 554, 565, 569, 576, 590, 600, 603, 604, 609, 625, 629, 637, 649, 653, 658, 660, 663, 666, 671, 672, 673, 676, 679, 681, 682, 684, 691, 692, 708, 734, 754, 755, 762, 789, 793, ...], 'UA': [1, 5, 9, 14, 21, 44, 45, 67, 70, 72, 94, 108, 110, 134, 140, 141, 144, 145, 163, 177, 183, 192, 197, 246, 260, 262, 263, 266, 269, 273, 275, 283, 295, 325, 338, 340, 349, 368, 374, 375, 377, 389, 394, 396, 408, 423, 425, 438, 440, 462, 463, 469, 471, 474, 479, 501, 511, 512, 513, 520, 523, 536, 539, 542, 555, 557, 566, 573, 577, 586, 594, 595, 608, 621, 623, 630, 635, 636, 647, 650, 651, 667, 686, 687, 698, 699, 700, 701, 713, 717, 719, 745, 756, 773, 784, 801, 829, 836, 842, 843, ...], 'US': [31, 35, 49, 96, 104, 111, 112, 119, 148, 150, 153, 191, 202, 203, 224, 229, 355, 360, 372, 384, 391, 467, 473, 477, 478, 487, 494, 538, 545, 558, 561, 564, 575, 578, 619, 690, 715, 726, 728, 760, 795, 806, 881, 896, 901, 910, 954, 1028, 1033, 1068, 1134, 1135, 1143, 1155, 1164, 1201, 1210, 1245, 1274, 1288, 1290, 1291, 1335, 1355, 1377, 1399, 1406, 1446, 1459, 1462, 1470, 1471, 1476, 1481, 1493, 1499, 1503, 1552, 1554, 1565, 1589, 1593, 1603, 1661, 1680, 1716, 1731, 1733, 1735, 1743, 1749, 1754, 1755, 1758, 1767, 1773, 1790, 1821, 1904, 1905, ...], 'VX': [56, 227, 243, 417, 432, 464, 614, 641, 703, 759, 812, 825, 943, 1058, 1092, 1093, 1192, 1302, 1312, 1343, 1369, 1452, 1586, 1599, 1644, 1696, 1705, 1805, 1930, 1957, 2048, 2219, 2220, 2270, 2302, 2324, 2484, 2572, 2620, 2646, 2735, 2753, 2808, 2880, 2898, 2919, 3110, 3111, 3120, 3210, 3246, 3261, 3302, 3427, 3501, 3510, 3664, 3680, 3694, 3731, 3860, 3938, 3968, 4005, 4029, 4035, 4056, 4088, 4270, 4351, 4531, 4536, 4540, 4692, 4712, 4728, 4746, 4825, 4856, 4860, 4869, 4876, 4890, 4901, 4928, 4994, 5075, 5078, 5115, 5143, 5167, 5206, 5245, 5265, 5379, 5383, 5413, 5502, 5512, 5522, ...], 'WN': [0, 4, 19, 20, 23, 28, 30, 43, 59, 64, 68, 80, 83, 84, 88, 92, 114, 132, 156, 173, 176, 186, 193, 195, 212, 217, 218, 226, 230, 234, 256, 259, 271, 272, 274, 276, 280, 287, 292, 294, 299, 300, 302, 303, 306, 316, 344, 346, 350, 351, 356, 365, 388, 399, 400, 406, 409, 410, 416, 422, 426, 428, 454, 461, 475, 480, 489, 496, 498, 504, 515, 532, 562, 571, 585, 597, 611, 627, 633, 648, 657, 669, 670, 674, 678, 710, 720, 724, 731, 744, 747, 752, 753, 788, 797, 804, 807, 813, 815, 817, ...]}
- 너무 보기 힘듬
-
보기좋은 형태로 확인
list(grouped_df.groups)
['AA', 'AS', 'B6', 'DL', 'EV', 'F9', 'HA', 'MQ', 'NK', 'OO', 'UA', 'US', 'VX', 'WN']
grouped_df.get_group('AA')
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 1 | 1 | 4 | AA | DFW | DCA | 1555 | 7.0 | 126.0 | 1192 | 1935 | -7.0 | 0 | 0 |
6 | 1 | 1 | 4 | AA | DFW | MSY | 1250 | 84.0 | 64.0 | 447 | 1410 | 83.0 | 0 | 0 |
8 | 1 | 1 | 4 | AA | ORD | STL | 1845 | -5.0 | 44.0 | 258 | 1950 | -5.0 | 0 | 0 |
15 | 1 | 1 | 4 | AA | DEN | DFW | 1445 | -6.0 | 93.0 | 641 | 1745 | 4.0 | 0 | 0 |
26 | 1 | 1 | 4 | AA | LAX | AUS | 1430 | 33.0 | 157.0 | 1242 | 1925 | 41.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58470 | 12 | 31 | 4 | AA | DFW | FAT | 1020 | -3.0 | 196.0 | 1313 | 1156 | -2.0 | 0 | 0 |
58475 | 12 | 31 | 4 | AA | IAH | CLT | 710 | 1.0 | 113.0 | 912 | 1037 | -12.0 | 0 | 0 |
58476 | 12 | 31 | 4 | AA | DFW | TPA | 1020 | -3.0 | 121.0 | 929 | 1340 | -6.0 | 0 | 0 |
58479 | 12 | 31 | 4 | AA | DFW | ELP | 1200 | 3.0 | 94.0 | 551 | 1250 | 13.0 | 0 | 0 |
58487 | 12 | 31 | 4 | AA | SFO | DFW | 515 | 5.0 | 166.0 | 1464 | 1045 | -19.0 | 0 | 0 |
8900 rows × 14 columns
for g in grouped_df.groups:
print(g)
display(grouped_df.get_group(g))
AA
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 1 | 1 | 4 | AA | DFW | DCA | 1555 | 7.0 | 126.0 | 1192 | 1935 | -7.0 | 0 | 0 |
6 | 1 | 1 | 4 | AA | DFW | MSY | 1250 | 84.0 | 64.0 | 447 | 1410 | 83.0 | 0 | 0 |
8 | 1 | 1 | 4 | AA | ORD | STL | 1845 | -5.0 | 44.0 | 258 | 1950 | -5.0 | 0 | 0 |
15 | 1 | 1 | 4 | AA | DEN | DFW | 1445 | -6.0 | 93.0 | 641 | 1745 | 4.0 | 0 | 0 |
26 | 1 | 1 | 4 | AA | LAX | AUS | 1430 | 33.0 | 157.0 | 1242 | 1925 | 41.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58470 | 12 | 31 | 4 | AA | DFW | FAT | 1020 | -3.0 | 196.0 | 1313 | 1156 | -2.0 | 0 | 0 |
58475 | 12 | 31 | 4 | AA | IAH | CLT | 710 | 1.0 | 113.0 | 912 | 1037 | -12.0 | 0 | 0 |
58476 | 12 | 31 | 4 | AA | DFW | TPA | 1020 | -3.0 | 121.0 | 929 | 1340 | -6.0 | 0 | 0 |
58479 | 12 | 31 | 4 | AA | DFW | ELP | 1200 | 3.0 | 94.0 | 551 | 1250 | 13.0 | 0 | 0 |
58487 | 12 | 31 | 4 | AA | SFO | DFW | 515 | 5.0 | 166.0 | 1464 | 1045 | -19.0 | 0 | 0 |
8900 rows × 14 columns
AS
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
38 | 1 | 1 | 4 | AS | PHX | SEA | 1505 | -2.0 | 155.0 | 1107 | 1702 | -3.0 | 0 | 0 |
198 | 1 | 2 | 5 | AS | LAX | SEA | 2110 | 5.0 | 145.0 | 954 | 2352 | 8.0 | 0 | 0 |
241 | 1 | 2 | 5 | AS | LAS | PDX | 650 | -5.0 | 117.0 | 763 | 906 | -3.0 | 0 | 0 |
277 | 1 | 2 | 5 | AS | ORD | ANC | 935 | -1.0 | 402.0 | 2846 | 1339 | -6.0 | 0 | 0 |
397 | 1 | 3 | 6 | AS | LAS | SEA | 1300 | 48.0 | 137.0 | 867 | 1535 | 47.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58305 | 12 | 30 | 3 | AS | LAX | SEA | 1325 | -2.0 | 134.0 | 954 | 1608 | -7.0 | 0 | 0 |
58355 | 12 | 31 | 4 | AS | PHX | SEA | 1200 | -5.0 | 145.0 | 1107 | 1407 | -24.0 | 0 | 0 |
58404 | 12 | 31 | 4 | AS | SFO | SLC | 2110 | -2.0 | 80.0 | 599 | 2358 | -4.0 | 0 | 0 |
58407 | 12 | 31 | 4 | AS | SFO | PDX | 645 | -2.0 | 81.0 | 550 | 832 | -3.0 | 0 | 0 |
58428 | 12 | 31 | 4 | AS | LAX | SEA | 1420 | -8.0 | 127.0 | 954 | 1709 | -25.0 | 0 | 0 |
768 rows × 14 columns
B6
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
123 | 1 | 1 | 4 | B6 | LAS | BOS | 1230 | 0.0 | 246.0 | 2381 | 2026 | -27.0 | 0 | 0 |
127 | 1 | 1 | 4 | B6 | LAS | BOS | 2359 | 68.0 | 247.0 | 2381 | 749 | 46.0 | 0 | 0 |
239 | 1 | 2 | 5 | B6 | ORD | BOS | 540 | -8.0 | 96.0 | 867 | 856 | -22.0 | 0 | 0 |
333 | 1 | 3 | 6 | B6 | LAX | FLL | 2237 | 32.0 | 270.0 | 2342 | 619 | 42.0 | 0 | 0 |
548 | 1 | 4 | 7 | B6 | SFO | FLL | 2307 | -4.0 | 298.0 | 2583 | 724 | -1.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58262 | 12 | 30 | 3 | B6 | SFO | LGB | 1921 | -6.0 | 57.0 | 354 | 2038 | -14.0 | 0 | 0 |
58301 | 12 | 30 | 3 | B6 | LAX | JFK | 630 | 4.0 | 285.0 | 2475 | 1445 | -6.0 | 0 | 0 |
58425 | 12 | 31 | 4 | B6 | ORD | SJU | 700 | 239.0 | 250.0 | 2072 | 1335 | 239.0 | 0 | 0 |
58477 | 12 | 31 | 4 | B6 | DFW | BOS | 1145 | 12.0 | 161.0 | 1562 | 1608 | -14.0 | 0 | 0 |
58483 | 12 | 31 | 4 | B6 | PHX | BOS | 2236 | -12.0 | 231.0 | 2300 | 515 | -45.0 | 0 | 0 |
543 rows × 14 columns
DL
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
53 | 1 | 1 | 4 | DL | LAS | MSP | 713 | -5.0 | 156.0 | 1299 | 1220 | -18.0 | 0 | 0 |
57 | 1 | 1 | 4 | DL | MSP | RSW | 700 | -1.0 | 169.0 | 1416 | 1130 | -20.0 | 0 | 0 |
77 | 1 | 1 | 4 | DL | LAX | ATL | 1130 | 24.0 | 217.0 | 1947 | 1840 | 16.0 | 0 | 0 |
79 | 1 | 1 | 4 | DL | LAX | CMH | 2146 | -3.0 | 223.0 | 1995 | 459 | -13.0 | 0 | 0 |
85 | 1 | 1 | 4 | DL | ATL | OKC | 2059 | -4.0 | 116.0 | 761 | 2227 | -12.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58440 | 12 | 31 | 4 | DL | ATL | CVG | 1611 | -4.0 | 61.0 | 373 | 1736 | -6.0 | 0 | 0 |
58448 | 12 | 31 | 4 | DL | ATL | SRQ | 1610 | 0.0 | 61.0 | 444 | 1740 | -13.0 | 0 | 0 |
58464 | 12 | 31 | 4 | DL | LAX | SFO | 700 | 108.0 | 54.0 | 337 | 825 | 105.0 | 0 | 0 |
58467 | 12 | 31 | 4 | DL | ATL | IND | 1235 | -3.0 | 63.0 | 432 | 1407 | -13.0 | 0 | 0 |
58485 | 12 | 31 | 4 | DL | ATL | CMH | 2206 | 2.0 | 64.0 | 447 | 2338 | -8.0 | 0 | 0 |
10601 rows × 14 columns
EV
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11 | 1 | 1 | 4 | EV | ORD | JAN | 1155 | 6.0 | 113.0 | 677 | 1403 | 5.0 | 0 | 0 |
13 | 1 | 1 | 4 | EV | ORD | CMH | 1010 | -2.0 | 46.0 | 296 | 1228 | -9.0 | 0 | 0 |
29 | 1 | 1 | 4 | EV | ORD | IND | 1025 | -6.0 | 29.0 | 177 | 1228 | -19.0 | 0 | 0 |
40 | 1 | 1 | 4 | EV | IAH | CLE | 1038 | -3.0 | 126.0 | 1091 | 1425 | -18.0 | 0 | 0 |
69 | 1 | 1 | 4 | EV | ATL | RAP | 1930 | -5.0 | 181.0 | 1230 | 2104 | -15.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58445 | 12 | 31 | 4 | EV | DFW | TXK | 850 | -5.0 | 30.0 | 181 | 948 | -17.0 | 0 | 0 |
58452 | 12 | 31 | 4 | EV | DFW | SHV | 1650 | -4.0 | 32.0 | 190 | 1746 | -12.0 | 0 | 0 |
58459 | 12 | 31 | 4 | EV | MSP | ORD | 1435 | 18.0 | 61.0 | 334 | 1609 | 3.0 | 0 | 0 |
58463 | 12 | 31 | 4 | EV | ORD | MSN | 1220 | 18.0 | 32.0 | 108 | 1319 | 27.0 | 0 | 0 |
58486 | 12 | 31 | 4 | EV | DFW | LFT | 850 | 21.0 | 52.0 | 351 | 1012 | 14.0 | 0 | 0 |
5858 rows × 14 columns
F9
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 1 | 1 | 4 | F9 | SFO | PHX | 1020 | -7.0 | 91.0 | 651 | 1315 | -6.0 | 0 | 0 |
93 | 1 | 1 | 4 | F9 | ATL | DEN | 859 | 16.0 | 181.0 | 1199 | 1026 | 10.0 | 0 | 0 |
209 | 1 | 2 | 5 | F9 | MSP | DEN | 1025 | -6.0 | 97.0 | 680 | 1134 | -13.0 | 0 | 0 |
232 | 1 | 2 | 5 | F9 | DEN | PHX | 2040 | -7.0 | 83.0 | 602 | 2228 | -18.0 | 0 | 0 |
247 | 1 | 2 | 5 | F9 | ORD | ATL | 730 | 10.0 | 86.0 | 606 | 1020 | 23.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58288 | 12 | 30 | 3 | F9 | DEN | ORD | 625 | -4.0 | 136.0 | 888 | 1000 | 14.0 | 0 | 0 |
58331 | 12 | 30 | 3 | F9 | ORD | PHX | 825 | 18.0 | 207.0 | 1440 | 1127 | 14.0 | 0 | 0 |
58447 | 12 | 31 | 4 | F9 | DEN | LAS | 1245 | 13.0 | 94.0 | 628 | 1340 | 13.0 | 0 | 0 |
58449 | 12 | 31 | 4 | F9 | DEN | MCO | 645 | 11.0 | 169.0 | 1546 | 1224 | -11.0 | 0 | 0 |
58488 | 12 | 31 | 4 | F9 | LAS | SFO | 1910 | 13.0 | 71.0 | 414 | 2050 | 4.0 | 0 | 0 |
1317 rows × 14 columns
HA
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
582 | 1 | 4 | 7 | HA | LAX | OGG | 1115 | -11.0 | 310.0 | 2486 | 1500 | -27.0 | 0 | 0 |
712 | 1 | 5 | 1 | HA | LAS | HNL | 900 | -5.0 | 357.0 | 2762 | 1315 | 5.0 | 0 | 0 |
878 | 1 | 6 | 2 | HA | PHX | HNL | 800 | 1.0 | 374.0 | 2917 | 1140 | 3.0 | 0 | 0 |
1053 | 1 | 7 | 3 | HA | LAX | HNL | 1705 | 0.0 | 332.0 | 2556 | 2055 | -2.0 | 0 | 0 |
1269 | 1 | 8 | 4 | HA | LAX | HNL | 1000 | -1.0 | 335.0 | 2556 | 1350 | 0.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
55883 | 12 | 16 | 3 | HA | LAX | HNL | 835 | 1.0 | 314.0 | 2556 | 1235 | -18.0 | 0 | 0 |
56174 | 12 | 18 | 5 | HA | LAX | HNL | 835 | -5.0 | 342.0 | 2556 | 1235 | -4.0 | 0 | 0 |
56350 | 12 | 19 | 6 | HA | PHX | HNL | 800 | -5.0 | 363.0 | 2917 | 1155 | -34.0 | 0 | 0 |
56816 | 12 | 21 | 1 | HA | LAX | LIH | 740 | 20.0 | 303.0 | 2615 | 1145 | -11.0 | 0 | 0 |
58391 | 12 | 31 | 4 | HA | LAX | HNL | 1000 | 0.0 | 324.0 | 2556 | 1350 | -9.0 | 0 | 0 |
112 rows × 14 columns
MQ
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 1 | 1 | 4 | MQ | DFW | VPS | 1305 | 36.0 | 85.0 | 641 | 1453 | 35.0 | 0 | 0 |
10 | 1 | 1 | 4 | MQ | DFW | DRO | 1335 | 28.0 | 104.0 | 674 | 1438 | 28.0 | 0 | 0 |
18 | 1 | 1 | 4 | MQ | ORD | DAY | 2220 | 19.0 | 37.0 | 240 | 23 | 20.0 | 0 | 0 |
24 | 1 | 1 | 4 | MQ | DFW | BTR | 730 | NaN | NaN | 383 | 853 | NaN | 0 | 1 |
50 | 1 | 1 | 4 | MQ | ORD | CID | 1135 | -7.0 | 37.0 | 196 | 1238 | -15.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58415 | 12 | 31 | 4 | MQ | ORD | FWA | 845 | -2.0 | 37.0 | 157 | 1045 | -4.0 | 0 | 0 |
58426 | 12 | 31 | 4 | MQ | DFW | FAR | 1154 | 4.0 | 124.0 | 968 | 1437 | -13.0 | 0 | 0 |
58468 | 12 | 31 | 4 | MQ | DFW | OKC | 1720 | -3.0 | 31.0 | 175 | 1819 | -10.0 | 0 | 0 |
58474 | 12 | 31 | 4 | MQ | ORD | FNT | 829 | 4.0 | 40.0 | 223 | 1034 | -4.0 | 0 | 0 |
58484 | 12 | 31 | 4 | MQ | ORD | DSM | 1333 | 1.0 | 57.0 | 299 | 1455 | -7.0 | 0 | 0 |
3471 rows × 14 columns
NK
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
17 | 1 | 1 | 4 | NK | DEN | DTW | 1952 | 37.0 | 124.0 | 1123 | 31 | 54.0 | 0 | 0 |
74 | 1 | 1 | 4 | NK | PHX | DFW | 159 | -1.0 | 103.0 | 868 | 502 | 1.0 | 0 | 0 |
95 | 1 | 1 | 4 | NK | LAS | OAK | 1115 | 22.0 | 62.0 | 407 | 1246 | 10.0 | 0 | 0 |
109 | 1 | 1 | 4 | NK | MSP | ORD | 616 | 2.0 | 49.0 | 334 | 745 | -19.0 | 0 | 0 |
166 | 1 | 2 | 5 | NK | LAS | PDX | 1535 | -8.0 | 123.0 | 763 | 1754 | -4.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58160 | 12 | 29 | 2 | NK | MSP | MCO | 740 | 0.0 | 171.0 | 1310 | 1158 | 33.0 | 0 | 0 |
58197 | 12 | 30 | 3 | NK | IAH | ORD | 755 | -8.0 | 136.0 | 925 | 1030 | -2.0 | 0 | 0 |
58437 | 12 | 31 | 4 | NK | ORD | DFW | 1952 | 15.0 | 135.0 | 802 | 2225 | 23.0 | 0 | 0 |
58461 | 12 | 31 | 4 | NK | ORD | LGA | 1801 | -5.0 | 84.0 | 733 | 2109 | -26.0 | 0 | 0 |
58469 | 12 | 31 | 4 | NK | LAS | MSY | 1950 | 124.0 | 163.0 | 1500 | 112 | 101.0 | 0 | 0 |
1516 rows × 14 columns
OO
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | 1 | 1 | 4 | OO | ORD | MSP | 1510 | 2.0 | 65.0 | 334 | 1646 | 4.0 | 0 | 0 |
16 | 1 | 1 | 4 | OO | DEN | SGU | 1105 | 21.0 | 66.0 | 517 | 1249 | 20.0 | 0 | 0 |
22 | 1 | 1 | 4 | OO | LAS | LAX | 1544 | -4.0 | 39.0 | 236 | 1655 | -12.0 | 0 | 0 |
25 | 1 | 1 | 4 | OO | ORD | SPI | 2110 | -4.0 | 31.0 | 174 | 2205 | 5.0 | 0 | 0 |
27 | 1 | 1 | 4 | OO | IAH | JAC | 1104 | -1.0 | 161.0 | 1265 | 1316 | -1.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58451 | 12 | 31 | 4 | OO | ATL | FWA | 1905 | -3.0 | 72.0 | 508 | 2051 | -14.0 | 0 | 0 |
58480 | 12 | 31 | 4 | OO | MSP | BIS | 1310 | -2.0 | 65.0 | 386 | 1449 | -9.0 | 0 | 0 |
58482 | 12 | 31 | 4 | OO | DEN | CPR | 1850 | -2.0 | 38.0 | 230 | 1956 | 1.0 | 0 | 0 |
58489 | 12 | 31 | 4 | OO | SFO | SBA | 1846 | -6.0 | 46.0 | 262 | 1956 | -5.0 | 0 | 0 |
58491 | 12 | 31 | 4 | OO | SFO | BOI | 859 | 5.0 | 73.0 | 522 | 1146 | -1.0 | 0 | 0 |
6588 rows × 14 columns
UA
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | 1 | 4 | UA | DEN | IAD | 823 | 7.0 | 154.0 | 1452 | 1333 | -13.0 | 0 | 0 |
5 | 1 | 1 | 4 | UA | IAH | SAN | 1450 | 1.0 | 178.0 | 1303 | 1620 | -14.0 | 0 | 0 |
9 | 1 | 1 | 4 | UA | IAH | SJC | 925 | 3.0 | 215.0 | 1608 | 1136 | -14.0 | 0 | 0 |
14 | 1 | 1 | 4 | UA | IAH | IND | 1426 | -1.0 | 102.0 | 844 | 1742 | -20.0 | 0 | 0 |
21 | 1 | 1 | 4 | UA | ORD | CLE | 2102 | 48.0 | 47.0 | 315 | 2320 | 41.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58422 | 12 | 31 | 4 | UA | DEN | SAN | 1535 | 0.0 | 124.0 | 853 | 1704 | -13.0 | 0 | 0 |
58432 | 12 | 31 | 4 | UA | ORD | SAN | 1915 | 7.0 | 238.0 | 1723 | 2143 | -3.0 | 0 | 0 |
58457 | 12 | 31 | 4 | UA | ORD | LAX | 659 | -1.0 | 241.0 | 1744 | 946 | 0.0 | 0 | 0 |
58460 | 12 | 31 | 4 | UA | SFO | PHL | 2235 | -6.0 | 265.0 | 2521 | 700 | -42.0 | 0 | 0 |
58481 | 12 | 31 | 4 | UA | IAH | LAX | 1433 | 1.0 | 197.0 | 1379 | 1625 | -13.0 | 0 | 0 |
7792 rows × 14 columns
US
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
31 | 1 | 1 | 4 | US | PHX | DEN | 1810 | 29.0 | 94.0 | 602 | 1954 | 49.0 | 0 | 0 |
35 | 1 | 1 | 4 | US | ORD | PHL | 1600 | -2.0 | 80.0 | 678 | 1857 | -9.0 | 0 | 0 |
49 | 1 | 1 | 4 | US | IAH | PHX | 1445 | -1.0 | 147.0 | 1009 | 1638 | -7.0 | 0 | 0 |
96 | 1 | 1 | 4 | US | ATL | PHL | 1445 | -4.0 | 90.0 | 666 | 1644 | -11.0 | 0 | 0 |
104 | 1 | 1 | 4 | US | MSP | PHX | 730 | -3.0 | 174.0 | 1276 | 1010 | -20.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
31514 | 6 | 30 | 2 | US | DEN | PHL | 705 | -4.0 | 188.0 | 1558 | 1240 | 1.0 | 0 | 0 |
31523 | 6 | 30 | 2 | US | PHX | DEN | 1451 | 6.0 | 85.0 | 602 | 1738 | 7.0 | 0 | 0 |
31535 | 6 | 30 | 2 | US | PHX | AUS | 840 | -3.0 | 116.0 | 872 | 1304 | -11.0 | 0 | 0 |
31561 | 6 | 30 | 2 | US | ORD | PHX | 710 | -5.0 | 170.0 | 1440 | 901 | -50.0 | 0 | 0 |
31582 | 6 | 30 | 2 | US | PHX | OGG | 800 | -4.0 | 356.0 | 2845 | 1127 | -13.0 | 0 | 0 |
1615 rows × 14 columns
VX
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
56 | 1 | 1 | 4 | VX | LAS | SFO | 900 | 23.0 | 65.0 | 414 | 1035 | 11.0 | 0 | 0 |
227 | 1 | 2 | 5 | VX | SFO | LAS | 1220 | -5.0 | 68.0 | 414 | 1350 | -5.0 | 0 | 0 |
243 | 1 | 2 | 5 | VX | SFO | SEA | 700 | -4.0 | 104.0 | 679 | 905 | -1.0 | 0 | 0 |
417 | 1 | 3 | 6 | VX | SFO | LAS | 900 | -2.0 | 62.0 | 414 | 1030 | -11.0 | 0 | 0 |
432 | 1 | 3 | 6 | VX | SFO | SEA | 2035 | -2.0 | 106.0 | 679 | 2240 | -2.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58332 | 12 | 30 | 3 | VX | SFO | LAS | 1950 | -3.0 | 58.0 | 414 | 2120 | -4.0 | 0 | 0 |
58383 | 12 | 31 | 4 | VX | SFO | PSP | 1630 | -7.0 | 65.0 | 421 | 1755 | -12.0 | 0 | 0 |
58400 | 12 | 31 | 4 | VX | SFO | LAX | 1125 | -4.0 | 54.0 | 337 | 1245 | -10.0 | 0 | 0 |
58471 | 12 | 31 | 4 | VX | SFO | LAX | 700 | 6.0 | 51.0 | 337 | 820 | 3.0 | 0 | 0 |
58478 | 12 | 31 | 4 | VX | SFO | LAX | 1530 | 29.0 | 52.0 | 337 | 1650 | 22.0 | 0 | 0 |
993 rows × 14 columns
WN
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4 | WN | LAX | SLC | 1625 | 58.0 | 94.0 | 590 | 1905 | 65.0 | 0 | 0 |
4 | 1 | 1 | 4 | WN | LAX | MCI | 1720 | 48.0 | 166.0 | 1363 | 2225 | 39.0 | 0 | 0 |
19 | 1 | 1 | 4 | WN | PHX | LAX | 1640 | 51.0 | 58.0 | 370 | 1700 | 59.0 | 0 | 0 |
20 | 1 | 1 | 4 | WN | ATL | BWI | 1115 | 1.0 | 76.0 | 577 | 1305 | -15.0 | 0 | 0 |
23 | 1 | 1 | 4 | WN | ATL | HOU | 1555 | 30.0 | 113.0 | 696 | 1720 | 18.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58455 | 12 | 31 | 4 | WN | LAX | SMF | 1420 | -2.0 | 64.0 | 373 | 1540 | -7.0 | 0 | 0 |
58458 | 12 | 31 | 4 | WN | LAS | SFO | 1825 | 25.0 | 67.0 | 414 | 1955 | 17.0 | 0 | 0 |
58472 | 12 | 31 | 4 | WN | PHX | HOU | 845 | 5.0 | 119.0 | 1020 | 1210 | 7.0 | 0 | 0 |
58473 | 12 | 31 | 4 | WN | DEN | PDX | 1205 | 4.0 | 130.0 | 991 | 1400 | -13.0 | 0 | 0 |
58490 | 12 | 31 | 4 | WN | MSP | ATL | 525 | 39.0 | 124.0 | 907 | 855 | 34.0 | 0 | 0 |
8418 rows × 14 columns
-
방법1
df.groupby(by='AIRLINE').agg({'ARR_DELAY':'mean'})
ARR_DELAY | |
---|---|
AIRLINE | |
AA | 5.542661 |
AS | -0.833333 |
B6 | 8.692593 |
DL | 0.339691 |
EV | 7.034580 |
F9 | 13.630651 |
HA | 4.972973 |
MQ | 6.860591 |
NK | 18.436070 |
OO | 7.593463 |
UA | 7.765755 |
US | 1.681105 |
VX | 5.348884 |
WN | 6.397353 |
-
방법2 ($\star\star\star$)
df.groupby(by='AIRLINE').agg({'ARR_DELAY':np.mean})
ARR_DELAY | |
---|---|
AIRLINE | |
AA | 5.542661 |
AS | -0.833333 |
B6 | 8.692593 |
DL | 0.339691 |
EV | 7.034580 |
F9 | 13.630651 |
HA | 4.972973 |
MQ | 6.860591 |
NK | 18.436070 |
OO | 7.593463 |
UA | 7.765755 |
US | 1.681105 |
VX | 5.348884 |
WN | 6.397353 |
-
방법3
df.groupby(by='AIRLINE')['ARR_DELAY'].agg('mean')
AIRLINE AA 5.542661 AS -0.833333 B6 8.692593 DL 0.339691 EV 7.034580 F9 13.630651 HA 4.972973 MQ 6.860591 NK 18.436070 OO 7.593463 UA 7.765755 US 1.681105 VX 5.348884 WN 6.397353 Name: ARR_DELAY, dtype: float64
-
방법4 ($\star$)
df.groupby(by='AIRLINE')['ARR_DELAY'].agg(np.mean)
AIRLINE AA 5.542661 AS -0.833333 B6 8.692593 DL 0.339691 EV 7.034580 F9 13.630651 HA 4.972973 MQ 6.860591 NK 18.436070 OO 7.593463 UA 7.765755 US 1.681105 VX 5.348884 WN 6.397353 Name: ARR_DELAY, dtype: float64
-
방법5
df.groupby(by='AIRLINE')['ARR_DELAY'].mean()
AIRLINE AA 5.542661 AS -0.833333 B6 8.692593 DL 0.339691 EV 7.034580 F9 13.630651 HA 4.972973 MQ 6.860591 NK 18.436070 OO 7.593463 UA 7.765755 US 1.681105 VX 5.348884 WN 6.397353 Name: ARR_DELAY, dtype: float64
-
방법2와 방법4는 사용자정의 함수를 쓸 수 있다는 장점이 있음
def f(x): return -np.mean(x)
df.groupby(by='AIRLINE').agg({'ARR_DELAY':f})
ARR_DELAY | |
---|---|
AIRLINE | |
AA | -5.542661 |
AS | 0.833333 |
B6 | -8.692593 |
DL | -0.339691 |
EV | -7.034580 |
F9 | -13.630651 |
HA | -4.972973 |
MQ | -6.860591 |
NK | -18.436070 |
OO | -7.593463 |
UA | -7.765755 |
US | -1.681105 |
VX | -5.348884 |
WN | -6.397353 |
df.groupby(by='AIRLINE').agg({'ARR_DELAY':lambda x: -np.mean(x)})
ARR_DELAY | |
---|---|
AIRLINE | |
AA | -5.542661 |
AS | 0.833333 |
B6 | -8.692593 |
DL | -0.339691 |
EV | -7.034580 |
F9 | -13.630651 |
HA | -4.972973 |
MQ | -6.860591 |
NK | -18.436070 |
OO | -7.593463 |
UA | -7.765755 |
US | -1.681105 |
VX | -5.348884 |
WN | -6.397353 |
df.groupby(by='AIRLINE')['ARR_DELAY'].agg(lambda x: -np.mean(x))
AIRLINE AA -5.542661 AS 0.833333 B6 -8.692593 DL -0.339691 EV -7.034580 F9 -13.630651 HA -4.972973 MQ -6.860591 NK -18.436070 OO -7.593463 UA -7.765755 US -1.681105 VX -5.348884 WN -6.397353 Name: ARR_DELAY, dtype: float64
-
입력이 여러개인 사용자 정의 함수도 사용가능함
def f(x,y): return np.mean(x)**y
df.groupby(by='AIRLINE')['ARR_DELAY'].agg(f,2)
AIRLINE AA 30.721086 AS 0.694444 B6 75.561166 DL 0.115390 EV 49.485310 F9 185.794656 HA 24.730460 MQ 47.067715 NK 339.888677 OO 57.660681 UA 60.306954 US 2.826113 VX 28.610564 WN 40.926120 Name: ARR_DELAY, dtype: float64
df.groupby(by='AIRLINE').agg({'ARR_DELAY': lambda x: f(x,2)})
ARR_DELAY | |
---|---|
AIRLINE | |
AA | 30.721086 |
AS | 0.694444 |
B6 | 75.561166 |
DL | 0.115390 |
EV | 49.485310 |
F9 | 185.794656 |
HA | 24.730460 |
MQ | 47.067715 |
NK | 339.888677 |
OO | 57.660681 |
UA | 60.306954 |
US | 2.826113 |
VX | 28.610564 |
WN | 40.926120 |
-
방법1~5
df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':'sum'})
CANCELLED | ||
---|---|---|
AIRLINE | WEEKDAY | |
AA | 1 | 41 |
2 | 9 | |
3 | 16 | |
4 | 20 | |
5 | 18 | |
... | ... | ... |
WN | 3 | 18 |
4 | 10 | |
5 | 7 | |
6 | 10 | |
7 | 7 |
98 rows × 1 columns
df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':np.sum})
CANCELLED | ||
---|---|---|
AIRLINE | WEEKDAY | |
AA | 1 | 41 |
2 | 9 | |
3 | 16 | |
4 | 20 | |
5 | 18 | |
... | ... | ... |
WN | 3 | 18 |
4 | 10 | |
5 | 7 | |
6 | 10 | |
7 | 7 |
98 rows × 1 columns
df.groupby(by=['AIRLINE','WEEKDAY'])['CANCELLED'].agg('sum')
AIRLINE WEEKDAY AA 1 41 2 9 3 16 4 20 5 18 .. WN 3 18 4 10 5 7 6 10 7 7 Name: CANCELLED, Length: 98, dtype: int64
df.groupby(by=['AIRLINE','WEEKDAY'])['CANCELLED'].agg(np.sum)
AIRLINE WEEKDAY AA 1 41 2 9 3 16 4 20 5 18 .. WN 3 18 4 10 5 7 6 10 7 7 Name: CANCELLED, Length: 98, dtype: int64
df.groupby(by=['AIRLINE','WEEKDAY'])['CANCELLED'].sum()
AIRLINE WEEKDAY AA 1 41 2 9 3 16 4 20 5 18 .. WN 3 18 4 10 5 7 6 10 7 7 Name: CANCELLED, Length: 98, dtype: int64
-
방법 1~4 (5번은 쓸 수 없다)
df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':['sum','mean'],'DIVERTED':['sum','mean']})
CANCELLED | DIVERTED | ||||
---|---|---|---|---|---|
sum | mean | sum | mean | ||
AIRLINE | WEEKDAY | ||||
AA | 1 | 41 | 0.032106 | 6 | 0.004699 |
2 | 9 | 0.007341 | 2 | 0.001631 | |
3 | 16 | 0.011949 | 2 | 0.001494 | |
4 | 20 | 0.015004 | 5 | 0.003751 | |
5 | 18 | 0.014151 | 1 | 0.000786 | |
... | ... | ... | ... | ... | ... |
WN | 3 | 18 | 0.014118 | 2 | 0.001569 |
4 | 10 | 0.007911 | 4 | 0.003165 | |
5 | 7 | 0.005828 | 0 | 0.000000 | |
6 | 10 | 0.010132 | 3 | 0.003040 | |
7 | 7 | 0.006066 | 3 | 0.002600 |
98 rows × 4 columns
df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':[np.sum,np.mean],'DIVERTED':[np.sum,np.mean]})
CANCELLED | DIVERTED | ||||
---|---|---|---|---|---|
sum | mean | sum | mean | ||
AIRLINE | WEEKDAY | ||||
AA | 1 | 41 | 0.032106 | 6 | 0.004699 |
2 | 9 | 0.007341 | 2 | 0.001631 | |
3 | 16 | 0.011949 | 2 | 0.001494 | |
4 | 20 | 0.015004 | 5 | 0.003751 | |
5 | 18 | 0.014151 | 1 | 0.000786 | |
... | ... | ... | ... | ... | ... |
WN | 3 | 18 | 0.014118 | 2 | 0.001569 |
4 | 10 | 0.007911 | 4 | 0.003165 | |
5 | 7 | 0.005828 | 0 | 0.000000 | |
6 | 10 | 0.010132 | 3 | 0.003040 | |
7 | 7 | 0.006066 | 3 | 0.002600 |
98 rows × 4 columns
df.groupby(by=['AIRLINE','WEEKDAY'])[['CANCELLED','DIVERTED']].agg(['sum','mean'])
CANCELLED | DIVERTED | ||||
---|---|---|---|---|---|
sum | mean | sum | mean | ||
AIRLINE | WEEKDAY | ||||
AA | 1 | 41 | 0.032106 | 6 | 0.004699 |
2 | 9 | 0.007341 | 2 | 0.001631 | |
3 | 16 | 0.011949 | 2 | 0.001494 | |
4 | 20 | 0.015004 | 5 | 0.003751 | |
5 | 18 | 0.014151 | 1 | 0.000786 | |
... | ... | ... | ... | ... | ... |
WN | 3 | 18 | 0.014118 | 2 | 0.001569 |
4 | 10 | 0.007911 | 4 | 0.003165 | |
5 | 7 | 0.005828 | 0 | 0.000000 | |
6 | 10 | 0.010132 | 3 | 0.003040 | |
7 | 7 | 0.006066 | 3 | 0.002600 |
98 rows × 4 columns
df.groupby(by=['AIRLINE','WEEKDAY'])[['CANCELLED','DIVERTED']].agg([np.sum,np.mean])
CANCELLED | DIVERTED | ||||
---|---|---|---|---|---|
sum | mean | sum | mean | ||
AIRLINE | WEEKDAY | ||||
AA | 1 | 41 | 0.032106 | 6 | 0.004699 |
2 | 9 | 0.007341 | 2 | 0.001631 | |
3 | 16 | 0.011949 | 2 | 0.001494 | |
4 | 20 | 0.015004 | 5 | 0.003751 | |
5 | 18 | 0.014151 | 1 | 0.000786 | |
... | ... | ... | ... | ... | ... |
WN | 3 | 18 | 0.014118 | 2 | 0.001569 |
4 | 10 | 0.007911 | 4 | 0.003165 | |
5 | 7 | 0.005828 | 0 | 0.000000 | |
6 | 10 | 0.010132 | 3 | 0.003040 | |
7 | 7 | 0.006066 | 3 | 0.002600 |
98 rows × 4 columns
df.groupby(by=['AIRLINE','WEEKDAY']).agg({'CANCELLED':['sum','mean','size'],'AIR_TIME':['mean','var']})
CANCELLED | AIR_TIME | |||||
---|---|---|---|---|---|---|
sum | mean | size | mean | var | ||
AIRLINE | WEEKDAY | |||||
AA | 1 | 41 | 0.032106 | 1277 | 147.610569 | 5393.806723 |
2 | 9 | 0.007341 | 1226 | 143.851852 | 5359.890719 | |
3 | 16 | 0.011949 | 1339 | 144.514005 | 5378.854539 | |
4 | 20 | 0.015004 | 1333 | 141.124618 | 4791.524627 | |
5 | 18 | 0.014151 | 1272 | 145.430966 | 5884.592076 | |
... | ... | ... | ... | ... | ... | ... |
WN | 3 | 18 | 0.014118 | 1275 | 104.219920 | 2901.873447 |
4 | 10 | 0.007911 | 1264 | 107.200800 | 2966.568935 | |
5 | 7 | 0.005828 | 1201 | 107.893635 | 3268.717093 | |
6 | 10 | 0.010132 | 987 | 109.247433 | 3152.753719 | |
7 | 7 | 0.006066 | 1154 | 107.602273 | 3183.126889 |
98 rows × 5 columns
df.groupby(by=['AIRLINE','WEEKDAY'])\
.agg({'CANCELLED':[np.sum,np.mean,len],'AIR_TIME':[np.mean,lambda x: np.std(x,ddof=1)**2]})
CANCELLED | AIR_TIME | |||||
---|---|---|---|---|---|---|
sum | mean | len | mean | <lambda_0> | ||
AIRLINE | WEEKDAY | |||||
AA | 1 | 41 | 0.032106 | 1277 | 147.610569 | 5393.806723 |
2 | 9 | 0.007341 | 1226 | 143.851852 | 5359.890719 | |
3 | 16 | 0.011949 | 1339 | 144.514005 | 5378.854539 | |
4 | 20 | 0.015004 | 1333 | 141.124618 | 4791.524627 | |
5 | 18 | 0.014151 | 1272 | 145.430966 | 5884.592076 | |
... | ... | ... | ... | ... | ... | ... |
WN | 3 | 18 | 0.014118 | 1275 | 104.219920 | 2901.873447 |
4 | 10 | 0.007911 | 1264 | 107.200800 | 2966.568935 | |
5 | 7 | 0.005828 | 1201 | 107.893635 | 3268.717093 | |
6 | 10 | 0.010132 | 987 | 109.247433 | 3152.753719 | |
7 | 7 | 0.006066 | 1154 | 107.602273 | 3183.126889 |
98 rows × 5 columns
df
MONTH | DAY | WEEKDAY | AIRLINE | ORG_AIR | DEST_AIR | SCHED_DEP | DEP_DELAY | AIR_TIME | DIST | SCHED_ARR | ARR_DELAY | DIVERTED | CANCELLED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4 | WN | LAX | SLC | 1625 | 58.0 | 94.0 | 590 | 1905 | 65.0 | 0 | 0 |
1 | 1 | 1 | 4 | UA | DEN | IAD | 823 | 7.0 | 154.0 | 1452 | 1333 | -13.0 | 0 | 0 |
2 | 1 | 1 | 4 | MQ | DFW | VPS | 1305 | 36.0 | 85.0 | 641 | 1453 | 35.0 | 0 | 0 |
3 | 1 | 1 | 4 | AA | DFW | DCA | 1555 | 7.0 | 126.0 | 1192 | 1935 | -7.0 | 0 | 0 |
4 | 1 | 1 | 4 | WN | LAX | MCI | 1720 | 48.0 | 166.0 | 1363 | 2225 | 39.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58487 | 12 | 31 | 4 | AA | SFO | DFW | 515 | 5.0 | 166.0 | 1464 | 1045 | -19.0 | 0 | 0 |
58488 | 12 | 31 | 4 | F9 | LAS | SFO | 1910 | 13.0 | 71.0 | 414 | 2050 | 4.0 | 0 | 0 |
58489 | 12 | 31 | 4 | OO | SFO | SBA | 1846 | -6.0 | 46.0 | 262 | 1956 | -5.0 | 0 | 0 |
58490 | 12 | 31 | 4 | WN | MSP | ATL | 525 | 39.0 | 124.0 | 907 | 855 | 34.0 | 0 | 0 |
58491 | 12 | 31 | 4 | OO | SFO | BOI | 859 | 5.0 | 73.0 | 522 | 1146 | -1.0 | 0 | 0 |
58492 rows × 14 columns
-
목표: DIST를 적당한 구간으로 나누어 카테고리화 하고 그것을 바탕으로 groupby를 수행하자.
df.DIST.hist()
<AxesSubplot:>
df.DIST.describe()
count 58492.000000 mean 872.900072 std 624.996805 min 67.000000 25% 391.000000 50% 690.000000 75% 1199.000000 max 4502.000000 Name: DIST, dtype: float64
-
구간을 아래와 같이 설정한다.
bins=[-np.inf, 400, 700, 1200, np.inf]
-
pd.cut()을 이용하여 각 구간의 obs를 카테고리화 하자.
cuts=pd.cut(df.DIST,bins=bins)
cuts
0 (400.0, 700.0] 1 (1200.0, inf] 2 (400.0, 700.0] 3 (700.0, 1200.0] 4 (1200.0, inf] ... 58487 (1200.0, inf] 58488 (400.0, 700.0] 58489 (-inf, 400.0] 58490 (700.0, 1200.0] 58491 (400.0, 700.0] Name: DIST, Length: 58492, dtype: category Categories (4, interval[float64, right]): [(-inf, 400.0] < (400.0, 700.0] < (700.0, 1200.0] < (1200.0, inf]]
-
cuts, AIRLINE $\to$ {DIVERTED: sum}
df.groupby([cuts,'AIRLINE']).agg({'DIVERTED':sum})
DIVERTED | ||
---|---|---|
DIST | AIRLINE | |
(-inf, 400.0] | AA | 0 |
AS | 0 | |
B6 | 0 | |
DL | 1 | |
EV | 3 | |
F9 | 0 | |
HA | 0 | |
MQ | 0 | |
NK | 0 | |
OO | 5 | |
UA | 2 | |
US | 0 | |
VX | 0 | |
WN | 1 | |
(400.0, 700.0] | AA | 3 |
AS | 0 | |
B6 | 0 | |
DL | 12 | |
EV | 8 | |
F9 | 1 | |
HA | 0 | |
MQ | 4 | |
NK | 1 | |
OO | 7 | |
UA | 1 | |
US | 0 | |
VX | 0 | |
WN | 2 | |
(700.0, 1200.0] | AA | 10 |
AS | 0 | |
B6 | 1 | |
DL | 6 | |
EV | 4 | |
F9 | 0 | |
HA | 0 | |
MQ | 1 | |
NK | 1 | |
OO | 5 | |
UA | 4 | |
US | 0 | |
VX | 0 | |
WN | 4 | |
(1200.0, inf] | AA | 13 |
AS | 0 | |
B6 | 1 | |
DL | 5 | |
EV | 0 | |
F9 | 1 | |
HA | 1 | |
MQ | 0 | |
NK | 3 | |
OO | 4 | |
UA | 12 | |
US | 1 | |
VX | 1 | |
WN | 8 |
-
아래와 비교해보자.
df.groupby(['AIRLINE']).agg({'AIRLINE':len})
AIRLINE | |
---|---|
AIRLINE | |
AA | 8900 |
AS | 768 |
B6 | 543 |
DL | 10601 |
EV | 5858 |
F9 | 1317 |
HA | 112 |
MQ | 3471 |
NK | 1516 |
OO | 6588 |
UA | 7792 |
US | 1615 |
VX | 993 |
WN | 8418 |
-
cuts을 이용하여 추가그룹핑을 하면 조금 다른 특징들을 데이터에서 발견할 수 있다.
- AA항공사와 DL항공사는 모두 비슷한 우회횟수를 가지고 있음.
- AA항공사는 700회이상의 구간에서 우회를 많이하고 DL항공사는 400~700사이에서 우회를 많이 한다. (패턴이 다름)
-
구간이름에 label을 붙이는 방법
bins
[-inf, 400, 700, 1200, inf]
cuts2=pd.cut(df.DIST,bins=bins,labels=['Q1','Q2','Q3','Q4'])
cuts2
0 Q2 1 Q4 2 Q2 3 Q3 4 Q4 .. 58487 Q4 58488 Q2 58489 Q1 58490 Q3 58491 Q2 Name: DIST, Length: 58492, dtype: category Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
df.groupby(by=[cuts2,'AIRLINE']).agg({'DIVERTED':sum})
DIVERTED | ||
---|---|---|
DIST | AIRLINE | |
Q1 | AA | 0 |
AS | 0 | |
B6 | 0 | |
DL | 1 | |
EV | 3 | |
F9 | 0 | |
HA | 0 | |
MQ | 0 | |
NK | 0 | |
OO | 5 | |
UA | 2 | |
US | 0 | |
VX | 0 | |
WN | 1 | |
Q2 | AA | 3 |
AS | 0 | |
B6 | 0 | |
DL | 12 | |
EV | 8 | |
F9 | 1 | |
HA | 0 | |
MQ | 4 | |
NK | 1 | |
OO | 7 | |
UA | 1 | |
US | 0 | |
VX | 0 | |
WN | 2 | |
Q3 | AA | 10 |
AS | 0 | |
B6 | 1 | |
DL | 6 | |
EV | 4 | |
F9 | 0 | |
HA | 0 | |
MQ | 1 | |
NK | 1 | |
OO | 5 | |
UA | 4 | |
US | 0 | |
VX | 0 | |
WN | 4 | |
Q4 | AA | 13 |
AS | 0 | |
B6 | 1 | |
DL | 5 | |
EV | 0 | |
F9 | 1 | |
HA | 1 | |
MQ | 0 | |
NK | 3 | |
OO | 4 | |
UA | 12 | |
US | 1 | |
VX | 1 | |
WN | 8 |
df.groupby(cuts2).agg({'DIVERTED':len})
DIVERTED | |
---|---|
DIST | |
Q1 | 15027 |
Q2 | 14697 |
Q3 | 14417 |
Q4 | 14351 |
구간을
bins=[-np.inf, 400, 700, 1200, np.inf]
이 아니라
bins=[-np.inf, 400, 600, 800, 1000, 1200, np.inf]
와 같이 나누고 적당한 각구간별로 해당하는 관측치의 수를 구하라.