1

我有一个数据集,想绘制 PCA 图。在此图中,观察值应根据 name 列分组并以相同颜色显示(habillage = a$name)。此外,我希望每个观察值都能标注它所属的 Age 组。我发现 label = "none" 不会显示标签,但即使改成 label = a$Age 也没有任何变化。最后,使用 habillage = a$name 时,如何避免图例中重复出现黑白文本?

# Packages used by the script. plyr is attached before dplyr so that the
# dplyr versions of same-named verbs (e.g. summarise, arrange) are the ones
# found first on the search path.
library(plyr)
library(dplyr)
library(factoextra)
# Literal example data in structure() form (looks like dput() output).
# It is a grouped tibble (class "grouped_df") with 40 rows, grouped by
# effective_status and Age (7 groups). Columns: effective_status, Age and
# name are factors; n_obs, Clicks, Impressions, Reach, Spend, Purchase,
# PurchaseValue and Date_minus_start_time are numeric.
df<-structure(list(effective_status = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), 
    Age = structure(c(3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L, 
    2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
    ), .Label = c("13-17", "18-24", "25-34", "35-44", "45-54", 
    "55-64", "65+", "Unknown"), class = "factor"), name = structure(c(19L, 
    23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L, 
    15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L, 
    19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L, 
    21L, 22L), .Label = c("Automated Boost", "Competitors January", 
    "Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April", 
    "Marketing August", "Marketing December", "Marketing February", 
    "Marketing January", "Marketing July", "Marketing June", 
    "Marketing March", "Marketing May", "Upsell April", "Upsell August", 
    "Upsell Boost", "Upsell February", "Upsell January", "Upsell July", 
    "Upsell June", "Upsell March", "Upsell May"), class = "factor"), 
    n_obs = c(1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), Clicks = c(1364L, 
    0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L, 
    206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L,0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L, 
    0L, 68L, 0L, 0L, 0L), Impressions = c(12409L, 0L, 58222L, 
    30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L, 
    98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L, 
    150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L, 
    0L, 948L, 0L, 2972L, 0L, 0L, 0L), Reach = c(12164L, 0L, 46142L, 
    25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L, 
    58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L, 
    150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L, 
    948L, 0L, 2782L, 0L, 0L, 0L), Spend = c(1153.11, 0, 9663.16, 
    3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22, 
    565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12, 
    0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6, 
    0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05), Purchase = c(140L, 
    163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), PurchaseValue = c(221595.22, 
    173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66, 
    6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    Date_minus_start_time = c(9, 13, 15, 26.3055555555556, 29, 
    5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19, 
    43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19, 
    3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18, 
    21, 5)), row.names = c(NA, -40L), groups = structure(list(
    effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c("13-17", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
    ), class = "factor"), .rows = structure(list(c(8L, 11L, 12L
    ), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L), 
        c(6L, 19L, 20L, 21L), c(3L, 22L, 23L, 24L, 25L, 26L, 
        27L, 28L, 29L, 30L), 31:37, c(2L, 10L, 38L, 39L, 40L)), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -7L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))


# Build the PCA input table: draw 100 random rows from `helmes`, drop three
# campaigns by name, then aggregate the metrics for every
# effective_status / Age / name combination and sort by purchase value.
# NOTE(review): `helmes` is not defined in this snippet; presumably it is the
# raw, un-aggregated data frame -- confirm.
a <- helmes[sample(nrow(helmes), 100), ] %>%
  subset(!(name %in% c("Upsell Boost", "Marketing 0-25", "Dynamic Ad"))) %>%
  group_by(effective_status, Age, name) %>%
  summarise(
    n_obs = n(),
    Clicks = sum(Clicks, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Reach = sum(Reach, na.rm = TRUE),
    Spend = sum(Spend, na.rm = TRUE),
    Purchase = sum(Purchase, na.rm = TRUE),
    PurchaseValue = sum(PurchaseValue, na.rm = TRUE),
    Date_minus_start_time = mean(Date_minus_start_time, na.rm = TRUE),
    # Without this, summarise() only peels off the last grouping variable and
    # `a` stays grouped by effective_status and Age; return a plain tibble so
    # later steps are not silently evaluated per group.
    .groups = "drop"
  ) %>%
  arrange(desc(PurchaseValue))


# Run a PCA on columns 4..ncol(a) of `a` (the summary metrics that follow the
# three key columns effective_status / Age / name), standardising every
# variable to unit variance first.
# `scale.` is written out in full: prcomp() has no argument called `scale`,
# so `scale = TRUE` only worked through partial argument matching.
res.pca <- prcomp(a[4:ncol(a)], scale. = TRUE)

# Plot the individuals of the PCA, coloured by campaign name, with one
# confidence ellipse per group.
fviz_pca_ind(
  res.pca,
  # col.ind = a$name, # color by groups
  label = "none", # "none" suppresses the per-point text labels
  # geom = c("point", "text"),
  habillage = a$name, # color by groups
  # palette = c("#00AFBB", "#FC4E07", "#2CA25F"),
  addEllipses = TRUE, # concentration ellipses
  ellipse.type = "confidence",
  legend.title = "Groups",
  repel = TRUE
)
4

1 回答 1

1

您可以提取计算出的 PCA 得分,然后用 ggplot 自行绘图:

library(tidyverse)
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at

# Literal example data in structure() form (looks like dput() output).
# It is a grouped tibble (class "grouped_df") with 40 rows, grouped by
# effective_status and Age (7 groups). Columns: effective_status, Age and
# name are factors; n_obs, Clicks, Impressions, Reach, Spend, Purchase,
# PurchaseValue and Date_minus_start_time are numeric.
df <- structure(list(
  effective_status = structure(c(
    1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L
  ), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
  Age = structure(c(
    3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
    2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
    6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
  ), .Label = c(
    "13-17", "18-24", "25-34", "35-44", "45-54",
    "55-64", "65+", "Unknown"
  ), class = "factor"), name = structure(c(
    19L,
    23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
    15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
    19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
    21L, 22L
  ), .Label = c(
    "Automated Boost", "Competitors January",
    "Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
    "Marketing August", "Marketing December", "Marketing February",
    "Marketing January", "Marketing July", "Marketing June",
    "Marketing March", "Marketing May", "Upsell April", "Upsell August",
    "Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
    "Upsell June", "Upsell March", "Upsell May"
  ), class = "factor"),
  n_obs = c(
    1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L
  ), Clicks = c(
    1364L,
    0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
    206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
    0L, 68L, 0L, 0L, 0L
  ), Impressions = c(
    12409L, 0L, 58222L,
    30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
    98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
    150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
    0L, 948L, 0L, 2972L, 0L, 0L, 0L
  ), Reach = c(
    12164L, 0L, 46142L,
    25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
    58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
    150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
    948L, 0L, 2782L, 0L, 0L, 0L
  ), Spend = c(
    1153.11, 0, 9663.16,
    3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
    565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
    0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
    0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05
  ), Purchase = c(
    140L,
    163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
  ), PurchaseValue = c(
    221595.22,
    173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
    6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  ),
  Date_minus_start_time = c(
    9, 13, 15, 26.3055555555556, 29,
    5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
    43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
    3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
    21, 5
  )
), row.names = c(NA, -40L), groups = structure(list(
  effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c(
    "13-17",
    "18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
  ), class = "factor"), .rows = structure(list(
    c(8L, 11L, 12L), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
    c(6L, 19L, 20L, 21L), c(
      3L, 22L, 23L, 24L, 25L, 26L,
      27L, 28L, 29L, 30L
    ), 31:37, c(2L, 10L, 38L, 39L, 40L)
  ), ptype = integer(0), class = c(
    "vctrs_list_of",
    "vctrs_vctr", "list"
  ))
), row.names = c(NA, -7L), class = c(
  "tbl_df",
  "tbl", "data.frame"
), .drop = TRUE), class = c(
  "grouped_df",
  "tbl_df", "tbl", "data.frame"
))

# Auto-print the tibble to inspect it.
df
#> # A tibble: 40 x 11
#> # Groups:   effective_status, Age [7]
#>    effective_status Age    name    n_obs Clicks Impressions Reach Spend Purchase
#>    <fct>            <fct>  <fct>   <int>  <int>       <int> <int> <dbl>    <int>
#>  1 ACTIVE           25-34  Upsell…     1   1364       12409 12164 1153.      140
#>  2 ACTIVE           Unkno… Upsell…     1      0           0     0    0       163
#>  3 ACTIVE           55-64  Upsell…     1   4919       58222 46142 9663.      104
#>  4 ACTIVE           35-44  Upsell…     3   2597       30115 25282 3202.       33
#>  5 ACTIVE           35-44  Market…     2   2641       47119 35142 3393.       22
#>  6 ACTIVE           45-54  Market…     2      0           0     0    0        17
#>  7 ACTIVE           35-44  Market…     2    915       18817 14843 1739.       11
#>  8 ACTIVE           18-24  Market…     1   1104       17068 13533 1344.       13
#>  9 ACTIVE           35-44  Upsell…     1     63        4175  3624  502.        2
#> 10 ACTIVE           Unkno… Market…     2      0           0     0    0         0
#> # … with 30 more rows, and 2 more variables: PurchaseValue <dbl>,
#> #   Date_minus_start_time <dbl>

# PCA on the seven metric columns of `df`, each standardised to unit variance.
# `scale.` is written out in full: prcomp() has no argument called `scale`,
# so `scale = TRUE` only worked through partial argument matching.
res.pca <- prcomp(
  df[, c(
    "Clicks", "Impressions", "Reach", "Spend", "Purchase",
    "PurchaseValue", "Date_minus_start_time"
  )],
  scale. = TRUE
)

# Take the per-observation plot data that fviz_pca_ind() computes, attach the
# original columns of `df`, and draw a custom ggplot: one label per
# observation showing its Age, coloured by campaign name.
# The fviz data carries its own `name` column; it is dropped before binding so
# that `df$name` keeps its name instead of being auto-renamed by bind_cols()
# to a position-dependent `name...9`.
fviz_pca_ind(res.pca)$data %>%
  dplyr::select(-name) %>%
  bind_cols(df) %>%
  ggplot(aes(x, y, color = name)) +
  geom_label(aes(label = Age)) +
  labs(color = "Name")

由 reprex 包(v2.0.1)创建于 2021-09-17

于 2021-09-17T12:29:16.107 回答