0

我想把数据划分成若干组,以了解哪些组更有可能“违约”(Default)。因此我想使用决策树。

我的数据有 809054 个观测值和 8 个变量。如果只取数据的一小部分样本(例如 1%),代码可以正常运行并得到分裂结果;问题出现在使用全部观测值的 70% 作为训练集时。

我的数据示例:

# Sample of the data in dput() form: a tibble with 80 rows and 8 columns.
#   Gender, Card, Leasing, District, product_type, Default -- factors
#   Age, Client_time                                       -- numeric
# NOTE(review): several Age entries are 9999 -- presumably a sentinel for a
# missing age; confirm, and consider recoding to NA before modelling.
# NOTE(review): this object is named `train`, whereas the script further down
# reads from `mydata` -- confirm which name is intended.
train <- structure(list(Gender = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("Man", "Woman"
), class = "factor"), Card = structure(c(1L, 1L, 2L, 1L, 1L, 
1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L), .Label = c("Credit", 
"Debit"), class = "factor"), Age = c(72, 62, 9999, 9999, 66, 
51, 44, 76, 47, 59, 40, 48, 40, 75, 57, 9999, 39, 49, 50, 65, 
67, 84, 58, 50, 50, 43, 45, 55, 64, 9999, 48, 73, 29, 9999, 29, 
63, 29, 9999, 49, 66, 48, 59, 57, 60, 50, 54, 9999, 57, 62, 59, 
62, 42, 50, 62, 9999, 48, 42, 52, 35, 80, 73, 46, 54, 76, 37, 
68, 46, 39, 37, 64, 43, 55, 9999, 33, 59, 66, 9999, 59, 45, 53
), Leasing = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), 
    District = structure(c(3L, 8L, 1L, 8L, 10L, 7L, 9L, 11L, 
    4L, 11L, 7L, 9L, 10L, 8L, 11L, 3L, 6L, 3L, 6L, 3L, 8L, 7L, 
    11L, 8L, 8L, 8L, 9L, 5L, 8L, 10L, 5L, 8L, 5L, 9L, 5L, 6L, 
    6L, 4L, 9L, 5L, 8L, 5L, 7L, 10L, 2L, 5L, 8L, 1L, 10L, 2L, 
    10L, 8L, 7L, 4L, 1L, 1L, 8L, 8L, 3L, 5L, 10L, 3L, 5L, 8L, 
    3L, 5L, 3L, 4L, 5L, 8L, 1L, 7L, 11L, 3L, 10L, 7L, 4L, 10L, 
    2L, 10L), .Label = c("Zona_01", "Zona_02", "Zona_03", "Zona_04", 
    "Zona_05", "Zona_06", "Zona_07", "Zona_08", "Zona_09", "Zona_10", 
    "Zona_11"), class = "factor"), product_type = structure(c(4L, 
    2L, 2L, 1L, 1L, 4L, 3L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 4L, 18L, 
    2L, 1L, 4L, 2L, 4L, 1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 5L, 1L, 
    1L, 4L, 1L, 2L, 2L, 3L, 2L, 1L, 1L, 22L, 1L, 2L, 2L, 1L, 
    1L, 49L, 1L, 2L, 2L, 4L, 2L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 
    4L, 4L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    2L, 53L, 5L, 1L, 1L, 1L), .Label = c("010", "020", "040", 
    "050", "060", "061", "062", "070", "071", "072", "080", "081", 
    "082", "090", "091", "092", "093", "100", "101", "102", "110", 
    "130", "140", "150", "160", "170", "171", "172", "250", "260", 
    "265", "270", "271", "280", "285", "290", "291", "300", "301", 
    "302", "303", "304", "305", "306", "307", "308", "309", "310", 
    "330", "331", "351", "354", "520", "521"), class = "factor"), 
    Client_time = c(10, 17, 7, 8, 23, 21, 4, 24, 2, 20, 19, 21, 
    22, 15, 18, 18, 19, 22, 8, 14, 33, 24, 23, 18, 21, 8, 23, 
    21, 29, 5, 23, 10, 27, 3, 22, 16, 7, 3, 13, 10, 7, 12, 20, 
    17, 14, 17, 19, 26, 18, 11, 21, 6, 12, 6, 22, 17, 19, 10, 
    11, 19, 17, 18, 6, 19, 16, 24, 29, 15, 12, 19, 15, 18, 24, 
    17, 14, 37, 15, 2, 16, 22), Default = structure(c(1L, 1L, 
    1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("N", "Y"), class = "factor")), row.names = c(NA, 
-80L), class = c("tbl_df", "tbl", "data.frame"))

代码:

# Fit a classification tree for `Default` on a random 70/30 split of `mydata`.
# Produces: `dt` (training row indices), `train`, `val`, and the fitted `mtree`.
library(rpart)

# Make the dependent variable and the categorical predictors factors
categorical_cols <- c(
  "Default", "Gender", "Card", "Leasing", "District", "product_type"
)
mydata[categorical_cols] <- lapply(mydata[categorical_cols], as.factor)

# Check attributes of data
str(mydata)

# Split data into training (70%) and validation (30%).
# A fixed seed makes the random split reproducible between runs.
set.seed(123)
dt <- sort(sample(nrow(mydata), floor(nrow(mydata) * 0.7)))
train <- mydata[dt, ]
val <- mydata[-dt, ]

# Check number of rows in training data set
nrow(train)

# Compute decision tree.
# `split` must name exactly ONE splitting rule ("gini" or "information");
# the original passed both in a vector, which is not a supported value.
# cp = 0 with minsplit = minbucket = 1 grows the tree essentially without
# restriction, so expect a very large tree on a big training set.
mtree <- rpart(
  Default ~ .,
  data = train,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(cp = 0, minsplit = 1, minbucket = 1)
)

由于这些参数设置,结果是一棵分裂很多的巨大的树。所以我在代码中加入了 maxdepth:

# Same fit as before, but with the tree limited to five levels below the root.
# `split` must name exactly ONE splitting rule; the original passed
# c("information", "gini"), which is not a supported value.
# NOTE(review): even with cp = 0, rpart appears to collapse any subtree whose
# splits do not lower the misclassification count. With classes this
# imbalanced (about 93% "N" at the root) and only five levels, that likely
# explains a root-only tree. A negative cp (e.g. cp = -1), class priors, or a
# loss matrix in `parms` are the usual ways to retain such splits -- confirm
# against the rpart documentation.
mtree <- rpart(
  Default ~ .,
  data = train,
  parms = list(split = "information"),
  control = rpart.control(cp = 0, minsplit = 1, minbucket = 1, maxdepth = 5)
)

问题是,这样做之后树完全没有分裂:

> mtree
n= 566337 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 566337 38293 N (0.93238478 0.06761522) *

真的很想得到一些帮助!

谢谢!

4

0 回答 0