假设我有一些航空公司的航班数据。一个字段是始发机场,另一个字段是目的地机场。我想按路线(独特的起点到目的地组合)对观察进行分组。问题是,对于每条独特的路线,还需要包括相应的返回路线。例如,如果我有机场 A 到机场 B 进行一组观察,但在接下来的几次观察中从机场 B 到机场 A,我希望它们都具有相同的路线 ID。
我可以为此使用 SAS、Stata 或 R。即使是 Python,如果它更容易的话。
代表代码如下:
df1 <- structure(list(airl = c("US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US"), ORIGIN = c("ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL"), DESTINATION = c("CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE"), miles = c(480, 480, 480, 480, 480,
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 54,
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480,
480, 480, 480, 480, 480, 54, 54, 54, 54, 54, 54, 54, 54, 54,
54, 54, 54, 54, 54, 54, 54, 54), orig_area = c(23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23), dest_area = c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23), month = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10,
11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1, 2, 3, 4, 5, 6, 7), freq = c(88, 80, 89, 78, 88, 83, 85, 80,
76, 79, 76, 81, 86, 65, 62, 60, 82, 137, 138, 142, 144, 149,
147, 150, 143, 150, 138, 128, 151, 145, 148, 146, 147, 149, 79,
76, 81, 86, 65, 62, 60, 82, 82, 82, 84, 81, 83, 81, 85, 84, 76,
85, 143, 137, 138, 142, 143, 151, 147, 150, 143, 150, 137, 128,
151, 145, 148, 146, 147), seats = c(8146, 7352, 7599, 6920, 6759,
6060, 6189, 5939, 6137, 6504, 6440, 6804, 6862, 5330, 5242, 5068,
6204, 6460, 6276, 6047, 6095, 6306, 6102, 6265, 7085, 7344, 6809,
6348, 6965, 6626, 6893, 6741, 6765, 6865, 6504, 6440, 6804, 6862,
5330, 5242, 5068, 6204, 6104, 6030, 6278, 6034, 6944, 6816, 6544,
6494, 5872, 6544, 6747, 6460, 6276, 6034, 6058, 6380, 6102, 6278,
7085, 7344, 6759, 6348, 6952, 6613, 6919, 6728, 6765), year = c(2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2009, 2009, 2009, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2011, 2011, 2011, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011,
2011, 2011)), .Names = c("airl", "ORIGIN", "DESTINATION", "miles",
"orig_area", "dest_area", "month", "freq", "seats", "year"), class = "data.frame", row.names = c(NA,
69L))