我想为随机森林的无监督学习准备数据。程序如下:
- 获取数据并将值为 1 的属性“类”添加到所有示例
- 从原始数据生成合成数据:
- 虽然您没有与原始数据构建示例相同数量的示例:
- 从原始数据中该属性的所有值中采样新的属性值
- 对所有属性执行此操作并将它们组合到新示例中
- 虽然您没有与原始数据构建示例相同数量的示例:
- 分配给合成数据值 2 的属性“类”
- 将两个数据绑定在一起
最后它看起来像这样:
... Class
|1
Original |1
Data |1
|1
--------------
|2
Synthetic |2
Data |2
|2
我的 R 代码如下所示:
library(gtools) #for smartbind()
sample1 <- function(X) { sample(X, replace=T) }
g1 <- function(dat) { apply(dat,2,sample1) }
data$class <- rep(1, times=nrow(data)) #add attribute 'class' with value 1
synthData<-data.frame(g1(data[,1:ncol(data)])) #generate synthetic data with sampling from data
synthData$class <- rep(2, times=nrow(synthData)) #attribute 'class' is 2
colnames(synthData) <- colnames(data)
newData <- smartbind(data, synthData) #bind the data together
很明显,我对 R 真的很陌生,但它确实有效——只有一个问题:合成数据中的属性类型与原始数据中的不同。如果原来它们是 nums,那么现在它们变成了因数。如何在生成合成数据时保留相同的类型?
谢谢!
Data1(数字成为因素):
结构(列表(V2 = c(1.51793, 1.51711, 1.51645, 1.51916, 1.51131), V3 = c(13.21, 12.89, 13.44, 14.15, 13.69), V4 = c(3.48, 3.62, 3.61, 0, 3.2), V5 = c(1.41, 1.57, 1.54, 2.09, 1.81), V6 = c(72.64, 72.96, 72.39, 72.74, 72.81), V7 = c(0.59, 0.61, 0.66, 0, 1.76), V8 = c(8.43, 8.11, 8.03, 10.88, 5.43), V9 = c(0, 0, 0, 0, 1.19), V10 = c(0, 0, 0, 0, 0), realClass = structure(c(1L, 2L, 2L) , 5L, 6L), .Label = c("1", "2", "3", "5", "6", "7"), class = "factor")), .Names = c(" V2”、“V3”、“V4”、“V5”、“V6”、“V7”、“V8”、“V9”、“V10”、“realClass”),row.names = c(27L, 138L, 77L, 183L, 186L), 类 = "data.frame")
Data2(因子变为 chrs):
结构(列表(realClass =结构(c(2L,2L,2L,1L,2L),.Label = c(“e”,“p”),类=“因子”),V2 =结构(c(6L, 3L, 4L, 6L, 6L), .Label = c("b", "c", "f", "k", "s", "x"), class = "factor"), V3 = structure( c(4L, 4L, 3L, 1L, 1L), .Label = c("f", "g", "s", "y"), class = "factor"), V4 = structure(c(5L, 5L, 5L, 3L, 4L), .Label = c("b", "c", "e", "g", "n", "p", "r", "u", "w", "y"), class = "factor"), V5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("f", "t"), class = "factor"), V6 = 结构(c(3L, 9L, 3L, 6L, 3L), .标签 = c("a", "c", "f", "l", "m", "n", "p", "s", "y"), class = "factor"), V7 =结构(c(2L, 2L, 2L, 2L, 2L), .Label = c("a", "f"), class = "factor"), V8 = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("c", "w"), class = "factor"), V9 = structure(c(2L, 2L, 2L, 1L, 1L), .Label = c("b", "n"), class = "factor"), V10 = structure(c(1L, 1L, 1L, 10L, 4L), .Label = c("b", "e", "g", "h", “k”,“n”,“o”,“p”,“r”,“u”,“w”,“y”),类 =“因子”),V11 = 结构(c(2L,2L, 2L, 2L, 1L), .标签 = c("e", "t"), class = "factor"), V12 = structure(c(NA, NA, NA, 1L, 1L), .Label = c("b", "c", "e", "r"), class = "factor"), V13 = structure(c(3L, 2L, 3L, 3L, 2L), .Label = c("f", "k", "s", "y"), class = "factor"), V14 = structure(c(3L, 3L, 2L, 3L, 2L), .Label = c("f", "k", "s", "y") , class = "factor"), V15 = structure(c(7L, 8L, 7L, 4L, 7L), .Label = c("b", "c", "e", "g", "n", “o”、“p”、“w”、“y”)、类 =“因子”)、V16 = 结构(c(7L、7L、8L、4L、1L),.Label = c(“b”, “c”、“e”、“g”、“n”、“o”、“p”、“w”、“y”),类 = “因子”),V17 = 结构(c(1L,1L,1L,1L,1L),.标签 = “p”,类 = “因子”),V18 = 结构(c(3L,3L,3L,3L,3L),.Label = c(“n”,“o”,“w”,“y” ), 类 = "因子"), V19 = 结构 (c(2L, 2L, 2L, 2L, 2L), .Label = c("n", "o", "t"), 类 = "因子") , V20 = 结构(c(1L, 1L, 1L, 5L, 3L), .Label = c("e", "f", "l", "n", "p"), class = "factor") , V21 = 结构(c(8L, 8L, 8L, 4L, 2L), .Label = c("b", "h", "k", "n", "o", "r", "u" , "w","y"), class = "factor"), V22 = structure(c(5L, 5L, 5L, 5L, 6L), .Label = c("a", "c", "n", "s", "v", "y"), class = "factor"), V23 = structure(c(3L, 3L, 5L, 1L, 2L), .Label = c("d", "g", "l", “m”、“p”、“u”、“w”)、class =“因子”))、.Names = c(“realClass”、“V2”、“V3”、“V4”、“V5”、 “V6”、“V7”、“V8”、“V9”、“V10”、“V11”、“V12”、“V13”、“V14”、“V15”、“V16”、“V17”、“V18” ", "V19", "V20", "V21", "V22", "V23"), row.names = c(4105L, 6207L,6696L, 2736L, 3756L), 类 = "data.frame")