我正在处理一些数据,并做了层次聚类分析。首先,我用热图进行了聚类;然后出于好奇,我又分别对行变量和列变量单独做了聚类分析。让我惊讶的是:行聚类的结果与热图相似,但列聚类的结果与热图完全不同。为什么会这样?
注意:为了确定最佳簇数,我使用了肘部法则(elbow method)。
我用于热图的数据
# Heatmap input: 21 samples (ids in the character column `X`) by 8 numeric
# variables (Petroleum, Sulfide, PAH, OCP, PES, PET, PP, Rayon).
# NOTE(review): `X` is an ordinary column, not the row names, so dist(hdata),
# t(hdata) and as.matrix(hdata) all see a character column; drop it (or move
# it to the row names) before clustering.
# NOTE(review): the values look standardized per variable (each column sums
# to roughly 0) -- confirm.
hdata = structure(list(X = c("NDDZ91", "NDDZ92", "NDDZ94", "NDDZ95",
"NDDZ96", "NDDZ97", "NDDZ98", "NDDZ99", "NDDZ103", "NDDZ105",
"NDDZ106", "NDDZ107", "NDDZ112", "NDDZ113", "NDDZ114", "NDDZ115",
"NDDZ116", "NDDZ117", "NDDZ119", "NDDZ122", "NDDZ123"), Petroleum = c(-0.647084118412196,
-0.551848107272278, -0.533406396531982, 2.33692841539666, -0.733377890728331,
0.0577903473780798, -0.607675174629647, -0.639353182196633, -0.294786156183172,
-0.718723841791673, -0.662646966254462, -0.0948858367969614,
1.579605650713, -0.604204738608079, -0.507701861100507, -0.664395581635162,
-0.315549005943229, 1.67466171698116, 0.184284540536848, 2.12514781932784,
-0.38277963224928), Sulfide = c(-0.803295548320315, -0.305895923993731,
-0.572726113112637, 0.11301523300602, -0.874440065487914, -0.442370997786199,
-0.70415222117063, -0.484904709125107, 0.839454302923776, -0.92890769583558,
-0.139892050602668, 0.126326142669635, 0.375867449122007, -0.869238100791788,
0.0678805393190489, -0.599806929324819, 0.20695659545958, 1.95726471674415,
-0.384384391320565, 3.20894922200892, 0.214300545618816), PAH = c(-0.712371602367626,
-0.456497163866604, 0.628152971666032, 0.597576780901758, -0.535351550574466,
0.177556476192533, 2.90205600060908, -0.337410947205751, 0.404463997127402,
-0.351894405988827, 1.89143243218995, 0.54608003856193, -0.380861323554981,
-0.202231998563701, 0.694133172788937, -0.913530752132581, -0.963418221274289,
-0.936060576906256, -1.05031897397275, 0.0488146203429623, -1.05031897397275
), OCP = c(-0.0834008874012298, -0.360716951928731, -0.360716951928731,
-0.360716951928731, -0.360716951928731, -0.250814260150284, -0.360716951928731,
-0.360716951928731, -0.360716951928731, -0.308432701236277, -0.360716951928731,
1.08296347658429, -0.265223818590346, 0.132514343053848, -0.360716951928731,
4.12365054859107, -0.0908792841723442, -0.360716951928731, -0.360716951928731,
-0.0117739935339564, -0.360716951928731), PES = c(-1.11151376185314,
0.53341410759024, -0.443794330157326, 0.210060743571714, -1.11151376185314,
1.38202604462594, -0.678334951363158, 0.0253890719124948, 1.01970812092051,
0.196698531810288, 0.730365950542372, 2.01463766969121, -1.11151376185314,
-1.11151376185314, 0.586066086873878, -1.11151376185314, -1.11151376185314,
-0.796267925089677, -0.0053106538477932, 1.73720400498749, 0.157220099050653
), PET = c(-0.378968054988408, -0.378968054988408, -0.378968054988408,
-0.378968054988408, -0.191599038694733, -0.378968054988408, -0.378968054988408,
-0.378968054988408, -0.378968054988408, -0.378968054988408, -0.378968054988408,
-0.378968054988408, 2.93610087592508, -0.378968054988408, 2.86477283549588,
-0.378968054988408, -0.378968054988408, -0.378968054988408, -0.378968054988408,
-0.378968054988408, 0.833182262076715), PP = c(-0.355786691457058,
-0.355786691457058, -0.355786691457058, -0.355786691457058, -0.276113653196189,
-0.355786691457058, -0.355786691457058, -0.355786691457058, -0.355786691457058,
-0.355786691457058, 0.392487584131918, -0.355786691457058, 2.46348000506521,
3.40448459741963, -0.355786691457058, -0.355786691457058, -0.355786691457058,
-0.29175147010764, -0.355786691457058, -0.355786691457058, -0.355786691457058
), Rayon = c(0.0708756309162313, -0.161796156534661, -0.239046496921556,
0.0210534403144277, -0.654635735908907, -1.05315416176279, 0.00314036263036767,
-0.129052144045892, 0.679154337768608, 0.0101923129261644, -0.304592055985073,
-0.417901359196941, -1.05315416176279, -1.05315416176279, 1.01659654582927,
0.647831438976976, -0.503028004906404, -0.796914749643283, -0.15400562102564,
3.57785872291754, 0.493732017177138)), class = "data.frame", row.names = c(NA,
-21L))
和我使用的代码
# Clustered heatmap of the samples (rows) against the measured variables
# (columns), with 3 coloured row clusters and 4 coloured column clusters.
library(ComplexHeatmap) # provides Heatmap(); attaches grid, which has gpar()
library(dendextend)

# Cluster on the numeric measurements only. The id column `X` is character:
# left in place it turns the whole matrix into text, makes dist() coerce it
# to NA, and adds a spurious all-NA "X" row to the column clustering.
hmat <- as.matrix(hdata[, setdiff(names(hdata), "X"), drop = FALSE])
if ("X" %in% names(hdata)) {
  rownames(hmat) <- hdata$X # sample ids become the heatmap row labels
}

row_dend <- hclust(dist(hmat))
col_dend <- hclust(dist(t(hmat)))

# The same numeric matrix feeds both the dendrograms and the heatmap body,
# so the tree leaves and the drawn rows/columns always refer to the same data.
Heatmap(
  hmat,
  name = "Cluster Analysis",
  row_names_gp = gpar(fontsize = 6.5),
  cluster_rows = color_branches(row_dend, k = 3),
  cluster_columns = color_branches(col_dend, k = 4)
)
接着,我不借助热图,分别对行和列单独进行了层次聚类分析。对于列聚类,使用下面的代码可以得到预期的结果:
# Stand-alone hierarchical clustering of the variables (columns of hdata),
# drawn as a horizontal dendrogram whose labels are coloured by a 4-cluster cut.
library(stats) # attached by default; kept explicit as in the original script
library(ggplot2)
library(ggdendro)

# Variables as rows. The id column `X` is dropped first: transposing it along
# with the data would give a character matrix with an extra all-NA "X" row.
num_cols <- setdiff(names(hdata), "X")
de <- t(as.matrix(hdata[, num_cols, drop = FALSE]))

mres.dist <- dist(x = de, method = "euclidean")
mhcl <- hclust(d = mres.dist, method = "complete")

# Segment and label coordinates for ggplot2, plus the cluster id of each leaf.
dendr <- dendro_data(mhcl, type = "rectangle")
clust <- cutree(mhcl, k = 4)
clust.df <- data.frame(label = names(clust), cluster = factor(clust))
dendr[["labels"]] <- merge(dendr[["labels"]], clust.df, by = "label")

ggplot() +
  geom_segment(
    data = segment(dendr),
    aes(x = x, y = y, xend = xend, yend = yend)
  ) +
  geom_text(
    data = label(dendr),
    aes(x, y, label = label, color = cluster),
    hjust = 0, # constant alignment, so it is set outside aes()
    size = 3
  ) +
  coord_flip() +
  scale_y_reverse(expand = c(0.2, 0)) +
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_blank()
  )
用这段代码得到的列聚类与热图中的列聚类相似(见图)。
但是,当我对下面的数据使用相同的代码时:
# Transposed layout: 8 rows (variable names in the character column `X`) by
# 21 sample columns (NDDZ91 ... NDDZ123).
# NOTE(review): these numbers are not t(hdata). For example, Petroleum for
# NDDZ91 is 0.25431048366632 here but -0.647084118412196 in hdata. Each sample
# column here appears to be standardized on its own (values sum to roughly 0
# per column), so clustering this table is not expected to reproduce the
# hdata results -- confirm how the spreadsheet values were scaled.
mdata = structure(list(X = c("Petroleum", "Sulfide", "PAH", "OCP", "PES",
"PET", "PP", "Rayon"), NDDZ91 = c(0.25431048366632, 0.0560811642344239,
-0.530470895747429, -0.53191403831172, -0.532330057854161, -0.532330057854161,
-0.532330057854161, 2.34898345972089), NDDZ92 = c(0.463837639333079,
0.952950069296586, -0.903684657008105, -0.90617284752821, 0.815724803678714,
-0.90617284752821, -0.90617284752821, 1.38969068728436), NDDZ94 = c(0.971226427370999,
0.602793071008862, -0.823240498990531, -0.82945654761652, 0.0196693870567974,
-0.82945654761652, -0.82945654761652, 1.71792125640343), NDDZ95 = c(2.42802579411348,
-0.0506083754332584, -0.502583214461814, -0.503363243835357,
-0.289369238829121, -0.503363243835357, -0.503363243835357, -0.0753752338832128
), NDDZ96 = c(0.00274842857027743, 0.426017873184556, -0.766748988595133,
-0.77347114876572, -0.77347114876572, -0.1727085157315, -0.1727085157315,
2.23034201583474), NDDZ97 = c(2.10848758860433, 0.193448084745251,
-0.615305741309039, -0.617376493884711, 0.783142722825488, -0.617465386993774,
-0.617465386993774, -0.617465386993774), NDDZ98 = c(0.423375944838784,
0.225140759895981, -0.661579713047644, -0.672888070547451, -0.190387590278865,
-0.672888070547451, -0.672888070547451, 2.2221148102341), NDDZ99 = c(0.124694974839785,
0.76096782243398, -0.814698233667559, -0.817823335988372, 0.52156070279018,
-0.817823335988372, -0.817823335988372, 1.86094474156873), NDDZ103 = c(0.54397319352348,
1.42930260927922, -0.862544004633737, -0.864740129772812, 0.206249487125555,
-0.864740129772812, -0.864740129772812, 1.27723910402392), NDDZ105 = c(-0.241620251309,
-0.326398794697822, -0.599198639718797, -0.601906105080565, 0.79037000894708,
-0.601986236431586, -0.601986236431586, 2.18272625472228), NDDZ106 = c(-0.394157377566325,
1.16693979050257, -1.07064905165659, -1.07869604129994, 0.818419573773412,
-1.07869604129994, 0.818419573773412, 0.818419573773412), NDDZ107 = c(1.51774013128984,
0.835479148262337, -0.859050587367988, -0.860622307310844, 1.01408543381883,
-0.861869814240638, -0.861869814240638, 0.0761078097890962),
NDDZ112 = c(2.11364624669633, -0.00153433171657681, -0.709095409475689,
-0.709592975332893, -0.709621311656388, 0.00539926044388821,
0.720419832697712, -0.709621311656388), NDDZ113 = c(-0.10219199491079,
-0.288522475561189, -0.413106229900568, -0.413806467348564,
-0.414026981854573, -0.414026981854573, 2.45970811328483,
-0.414026981854573), NDDZ114 = c(-0.129692980086801, 0.47635959532587,
-0.973467990222615, -0.976193794294955, -0.058232069784503,
0.859729654725949, -0.976193794294955, 1.77769137863201),
NDDZ115 = c(-0.0579841877030005, 0.222032191755277, -0.508574033544108,
-0.505028936233488, -0.509483930618318, -0.509483930618318,
-0.509483930618318, 2.37800675758027), NDDZ116 = c(1.37784174252917,
1.63989361040572, -0.674954611309526, -0.675578422826408,
-0.675875899093204, -0.675875899093204, -0.675875899093204,
0.360425378480659), NDDZ117 = c(2.17720287040877, 0.849407303449914,
-0.539284668957118, -0.539526111444497, -0.479438953631418,
-0.539526111444497, -0.509482532534407, -0.419351795846743
), NDDZ119 = c(2.21495861144316, 0.141523162994869, -0.678761340784713,
-0.679280372639639, -0.106386562727029, -0.679280372639639,
-0.679280372639639, 0.466507246992633), NDDZ122 = c(1.74372941120167,
0.768630096454789, -0.772390802747609, -0.772844893806552,
-0.348008730623262, -0.772919842236302, -0.772919842236302,
0.926724603993572), NDDZ123 = c(0.565107727548649, 1.07534226748155,
-0.991124049814337, -0.991804914993823, -0.129865188790062,
-0.129865188790062, -0.991804914993823, 1.59401426235191)), class = "data.frame", row.names = c(NA,
-8L))
得到的图与热图不匹配。上面的数据是 hdata 的转置,但它是在 Excel 中创建的。
现在我很困惑:如果在 R 中转置 hdata,分析结果是正常的;
但如果我把 hdata 导出,在 Excel 中转置,
然后再到 R 中进行分析,图就变了。为什么?