r - 是否有与 Stata 'order' 命令等效的 R 函数？

Question

R 中的“顺序”看起来像 Stata 中的“排序”。例如，这是一个数据集（仅列出了变量名称）：

v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18

这是我期望的输出：

v1 v2 v3 v4 v5 v7 v8 v9 v10 v11 v12 v17 v18 v13 v14 v15 v6 v16

在 R 中，我有两种方法：

data <- data[,c(1:5,7:12,17:18,13:15,6,16)]

或者

names <- c("v1", "v2", "v3", "v4", "v5", "v7", "v8", "v9", "v10", "v11", "v12",  "v17", "v18", "v13", "v14", "v15", "v6", "v16")
data <- data[names]

为了在 Stata 中获得相同的输出，我可以运行 2 行：

order v17 v18, before(v13)
order v6 v16, last

在上面的理想数据中，我们可以知道我们要处理的变量的位置。但在大多数实际情况下，我们有像“年龄”“性别”这样没有位置指示符的变量，而且我们在一个数据集中可能有超过 50 个变量。那么Stata中“订单”的优势可能会更加明显。我们不需要知道变量的确切位置，只需输入它的名称：

order age, after(gender)

R 中是否有一个基本函数来处理这个问题，或者我可以得到一个包吗？提前致谢。

tweetinfo <- data.frame(uid=1:50, mid=2:51, annotations=3:52, bmiddle_pic=4:53, created_at=5:54, favorited=6:55, geo=7:56, in_reply_to_screen_name=8:57, in_reply_to_status_id=9:58, in_reply_to_user_id=10:59, original_pic=11:60, reTweetId=12:61, reUserId=13:62, source=14:63, thumbnail_pic=15:64, truncated=16:65)
noretweetinfo <- data.frame(uid=21:50, mid=22:51, annotations=23:52, bmiddle_pic=24:53, created_at=25:54, favorited=26:55, geo=27:56, in_reply_to_screen_name=28:57, in_reply_to_status_id=29:58, in_reply_to_user_id=30:59, original_pic=31:60, reTweetId=32:61, reUserId=33:62, source=34:63, thumbnail_pic=35:64, truncated=36:65)
retweetinfo <- data.frame(uid=41:50, mid=42:51, annotations=43:52, bmiddle_pic=44:53, created_at=45:54, deleted=46:55, favorited=47:56, geo=48:57, in_reply_to_screen_name=49:58, in_reply_to_status_id=50:59, in_reply_to_user_id=51:60, original_pic=52:61, source=53:62, thumbnail_pic=54:63, truncated=55:64)
tweetinfo$type <- "ti"
noretweetinfo$type <- "nr"
retweetinfo$type <- "rt"
gtinfo <- rbind(tweetinfo, noretweetinfo)
gtinfo$deleted=""
gtinfo <- gtinfo[,c(1:16,18,17)]
retweetinfo <- transform(retweetinfo, reTweetId="", reUserId="")
retweetinfo <- retweetinfo[,c(1:5,7:12,17:18,13:15,6,16)]
gtinfo <- rbind(gtinfo, retweetinfo)
write.table(gtinfo, file="C:/gtinfo.txt", row.names=F, col.names=T, sep="\t", quote=F)
# rm(list=ls(all=T))

score 3 · Accepted Answer

因为我在拖延和尝试不同的事情，所以这是我掀起的一个功能。最终，它取决于append：

moveme <- function(invec, movecommand) {
  movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\\s+"), 
                        function(x) x[x != ""])
  movelist <- lapply(movecommand, function(x) {
    Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
    ToMove <- setdiff(x, Where)
    list(ToMove, Where)
  })
  myVec <- invec
  for (i in seq_along(movelist)) {
    temp <- setdiff(myVec, movelist[[i]][[1]])
    A <- movelist[[i]][[2]][1]
    if (A %in% c("before", "after")) {
      ba <- movelist[[i]][[2]][2]
      if (A == "before") {
        after <- match(ba, temp)-1
      } else if (A == "after") {
        after <- match(ba, temp)
      }    
    } else if (A == "first") {
      after <- 0
    } else if (A == "last") {
      after <- length(myVec)
    }
    myVec <- append(temp, values = movelist[[i]][[1]], after = after)
  }
  myVec
}

以下是一些代表数据集名称的示例数据：

x <- paste0("v", 1:18)

现在想象一下，我们想要“v17”和“v18”在“v3”之前，最后是“v6”和“v16”，开头是“v5”：

moveme(x, "v17, v18 before v3; v6, v16 last; v5 first")
#  [1] "v5"  "v1"  "v2"  "v17" "v18" "v3"  "v4"  "v7"  "v8"  "v9"  "v10" "v11" "v12"
# [14] "v13" "v14" "v15" "v6"  "v16"

因此，对于data.frame名为“df”的明显用法是：

df[moveme(names(df), "how you want to move the columns")]

而且，对于一个data.table名为“DT”（正如@mnel 指出的那样，它的内存效率会更高）：

setcolorder(DT, moveme(names(DT), "how you want to move the columns"))

请注意，复合移动由分号指定。

公认的动作是：

before（将指定的列移动到另一个命名列之前）
after（将指定的列移动到另一个命名列之后）
first（将指定的列移动到第一个位置）
last（将指定的列移动到最后一个位置）

score 2 · Accepted Answer

我明白你的问题。我现在可以提供代码：

move <- function(data,variable,before) {
  m <- data[variable]
  r <- data[names(data)!=variable]
  i <- match(before,names(data))
  pre <- r[1:i-1]
  post <- r[i:length(names(r))]
  cbind(pre,m,post)
}

# Example.
library(MASS)
data(painters)
str(painters)

# Move 'Expression' variable before 'Drawing' variable.
new <- move(painters,"Expression","Drawing")
View(new)

score 2 · Accepted Answer

您可以编写自己的函数来执行此操作。

以下将使用与 stata 类似的语法为您提供列名的新顺序

where是一个有 4 种可能性的命名列表
- list(last = T)
- list(first = T)
- list(before = x)x有问题的变量名称在哪里
- list(after = x)x有问题的变量名称在哪里
sorted = Tvar_list将按字典顺序排序（命令alphabetic和命令sequential的组合）stata

该函数仅适用于名称，（一旦您将data.frame对象传递为data，并返回重新排序的名称列表

例如

stata.order <- function(var_list, where, sorted = F, data) {
    all_names = names(data)
    # are all the variable names in
    check <- var_list %in% all_names
    if (any(!check)) {
        stop("Not all variables in var_list exist within  data")
    }
    if (names(where) == "before") {
        if (!(where %in% all_names)) {
            stop("before variable not in the data set")
        }
    }
    if (names(where) == "after") {
        if (!(where %in% all_names)) {
            stop("after variable not in the data set")
        }
    }

    if (sorted) {
        var_list <- sort(var_list)
    }
    where_in <- which(all_names %in% var_list)
    full_list <- seq_along(data)
    others <- full_list[-c(where_in)]

    .nwhere <- names(where)
    if (!(.nwhere %in% c("last", "first", "before", "after"))) {
        stop("where must be a list of a named element first, last, before or after")
    }

    do_what <- switch(names(where), last = length(others), first = 0, before = which(all_names[others] == 
        where) - 1, after = which(all_names[others] == where))

    new_order <- append(others, where_in, do_what)
    return(all_names[new_order])
}

tmp <- as.data.frame(matrix(1:100, ncol = 10))

stata.order(var_list = c("V2", "V5"), where = list(last = T), data = tmp)

##  [1] "V1"  "V3"  "V4"  "V6"  "V7"  "V8"  "V9"  "V10" "V2"  "V5" 

stata.order(var_list = c("V2", "V5"), where = list(first = T), data = tmp)

##  [1] "V2"  "V5"  "V1"  "V3"  "V4"  "V6"  "V7"  "V8"  "V9"  "V10"

stata.order(var_list = c("V2", "V5"), where = list(before = "V6"), data = tmp)

##  [1] "V1"  "V3"  "V4"  "V2"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"

stata.order(var_list = c("V2", "V5"), where = list(after = "V4"), data = tmp)

##  [1] "V1"  "V3"  "V4"  "V2"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"

# throws an error
stata.order(var_list = c("V2", "V5"), where = list(before = "v11"), data = tmp)

## Error: before variable not in the data set

如果您想有效地进行内存重新排序（通过引用，无需复制），请使用data.table

DT <- data.table(tmp)
# sets by reference, no copying
setcolorder(DT, stata.order(var_list = c("V2", "V5"), where = list(after = "V4"), 
    data = DT))

DT

##     V1 V3 V4 V2 V5 V6 V7 V8 V9 V10
##  1:  1 21 31 11 41 51 61 71 81  91
##  2:  2 22 32 12 42 52 62 72 82  92
##  3:  3 23 33 13 43 53 63 73 83  93
##  4:  4 24 34 14 44 54 64 74 84  94
##  5:  5 25 35 15 45 55 65 75 85  95
##  6:  6 26 36 16 46 56 66 76 86  96
##  7:  7 27 37 17 47 57 67 77 87  97
##  8:  8 28 38 18 48 58 68 78 88  98
##  9:  9 29 39 19 49 59 69 79 89  99
## 10: 10 30 40 20 50 60 70 80 90 100

score 0 · Accepted Answer

这应该给你相同的文件：

#snip
gtinfo <- rbind(tweetinfo, noretweetinfo)
gtinfo$deleted=""
retweetinfo <- transform(retweetinfo, reTweetId="", reUserId="")
gtinfo <- rbind(gtinfo, retweetinfo)
gtinfo <-gtinfo[,c(1:16,18,17)]
#snip

可以在 R 中实现像 Strata 的 order 函数这样的函数，但我认为对此的需求并不大。

score 0 · Accepted Answer

目前还不清楚你想做什么，但你的第一句话让我假设你想对数据集进行排序。

实际上，有一个内置order函数，它返回有序序列的索引。你在找这个吗？

> x <- c(3,2,1)

> order(x)
[1] 3 2 1

> x[order(x)]
[1] 1 2 3

score 0 · Accepted Answer

packagedplyr和 functiondplyr::relocate是中引入的一个新动词dplyr 1.0.0，完全符合您的要求。

library(dplyr)

data %>% relocate(v17, v18, .before = v13)

data %>% relocate(v6, v16, .after = last_col())

data %>% relocate(age, .after = gender)

r - 是否有与 Stata 'order' 命令等效的 R 函数？

6 回答 6

Related

Reference