我正在读取一个包含 26 列和 4288 个实例的数据集“Dummy_data.csv”,其中总共有 17 个参数(列)对我们的数据分析很重要。17 个参数中的 6 个,即(param1、param2、param3、param5、param6、param7)是关键参数,同时超出范围决定了项目是否有缺陷(类标签)。例如,
range1 = (min1, max1) = (0.25, 0.35)
range2 = (min2, max2) = (2.5, 3.1)
range3 = (min3, max3) = (680, 700)
range5 = (min5, max5) = (56, 64)
range6 = (min6, max6) = (40, 60)
range7 = (min7, max7) = (28, 38)
if (param1 out of range1 & param2 out of range2 & param3 out of range3 &
param5 out of range5 & param6 out of range6 & param7 out of range7)
class = 'defective'
else
class = 'ok'
我们需要对上述数据进行两次缺陷分析。首先,我需要从项目总数中找出缺陷的百分比。其次,我需要找出 6 个关键参数中每一个的超出范围值的频率直方图,以了解这些关键参数的哪些超出范围值对缺陷品的影响更大。
我做了什么:由于这 6 个关键参数的范围大多不重叠,首先我缩放了 17 个参数(尽管缩放 6 个关键参数就足够了!)使用(x - min(x))/(max(x)-min(x))
间隔,(0, 1)
以便我可以进行频率分布x 轴统一刻度上 6 个参数的超出范围值。在图形方面,这意味着小于 0 的参数值意味着小于最小值,而大于 1 的参数值意味着大于最大值。因此,我从数据框中的数据集中过滤了所有有缺陷的实例,z
并绘制了一个饼图来显示缺陷和正常项目的百分比。(第一次分析)
对于频率直方图(第二次分析),我将缩放数据集中的所有缺陷实例过滤scaled.dat.df
到defect.dat.df
. 然后我从所有 6 个参数中选择min
和max
来确定缺陷间隔范围。接下来,我将 6 个参数中的每一个的唯一值分箱,p1.bin.defect.dat.df
并使用该函数在同一图上p7.bin.defect.dat.df
绘制各个直方图。plot
多个重叠直方图的问题
我得到了如下所示的多直方图,但问题是条形的宽度因 6 个参数而异。有人知道如何为多直方图设置统一的条形宽度吗?另外,如何在这个多历史情节中添加合适的图例?
任何有用的建议/答案都将受到高度赞赏和相应的奖励。
注意:我在此处的多直方图上关注了另一个线程 how-to-plot-two-histograms-together-in-r 并想要一个与此非常相似但 6 个重叠的直方图而不是 2 个重叠的直方图(如在线程中)
library(RWeka)
library(party)
library(plyr)
library(plotrix)
library(sm)
#read data and class labels
dat <- read.csv("Dummy_data.csv", head=T, sep=",")
datm <- as.matrix(dat[,8:24])
class <- as.matrix(dat[,26])
#center and scale data
center <- c(0.25, 2.5, 680, 1067, 56, 40, 28, -99, -99, 40, 5, 50, 5000, 15000, 11.3, 9.1, 0)
scale <- c(0.1, 0.6, 20, 6, 8, 20, 10, 19, 19, 20, 2, 10, 500, 1000, 3.4, 18.3, 5)
scaled.datm <- scale(datm, center, scale)
write.table(scaled.datm,
file = "C:\\Users\\schakrabarti\\Documents\\Dummy_data_whdr17.csv",
append=FALSE, quote=TRUE, sep=",", eol = "\n", na = "NA", dec = ".",
row.names = FALSE, col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
#filter total non-compliants
scaled.dat.df <- as.data.frame(scaled.datm)
total <- length(scaled.dat.df[,1])
z <- c((scaled.dat.df[,"PARAM1"]<0 | scaled.dat.df[,"PARAM1"]>1) &
(scaled.dat.df[,"PARAM2"]<0 | scaled.dat.df[,"PARAM2"]>1) &
(scaled.dat.df[,"PARAM3"]<0 | scaled.dat.df[,"PARAM3"]>1) &
(scaled.dat.df[,"PARAM5"]<0 | scaled.dat.df[,"PARAM5"]>1) &
(scaled.dat.df[,"PARAM6"]<0 | scaled.dat.df[,"PARAM6"]>1) &
(scaled.dat.df[,"PARAM7"]<0 | scaled.dat.df[,"PARAM7"]>1) )
noncompliant <- length(z[z == TRUE])
slices <- c(noncompliant, total - noncompliant)
labls <- c("NOT OK","OK")
pct <- round(slices/sum(slices)*100, digits=2)
labls <- paste(labls, pct)
labls <- paste(labls, "%", sep="")
#pie3D(slices,labels=labls,explode=0.05, col=c(rgb(0.75,0,0.5),rgb(0,1,0.75)),main="Defect Analysis due to critical parameters")
pie(slices,labels=labls,main="Defect Analysis due to critical parameters")
#filter non-compliants due to individual params
defect.dat.df <- scaled.dat.df[z,]
#select defect interval range
min <- min(as.numeric(sapply(defect.dat.df[,c("PARAM1","PARAM2","PARAM3","PARAM5","PARAM6","PARAM7")], function(x) min(as.numeric(x)))))
max <- max(as.numeric(sapply(defect.dat.df[,c("PARAM1","PARAM2","PARAM3","PARAM5","PARAM6","PARAM7")], function(x) max(as.numeric(x)))))
#plot histogram for param1 defect
#p1.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM1")], breaks=seq(-0.4,0.2,by=0.2))
p1.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM1")], breaks=seq(min,max,by=0.2))
#h1 <- hist(defect.dat.df[,c("PARAM1")])
#plot(h1, col=rgb(1,0,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h1 <- hist(defect.dat.df[,c("PARAM1")], col=rgb(1,0,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
#h1 <- hist(defect.dat.df[,c("PARAM1")], col=rgb(1,0,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max))
box()
p2.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM2")], breaks=seq(min,max,by=0.2))
#h2 <- hist(defect.dat.df[,c("PARAM2")])
#plot(h2, col=rgb(0,0,1,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h2 <- hist(defect.dat.df[,c("PARAM2")], col=rgb(0,0,1,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p2.bin.defect.dat.df$breaks, n=1),tail(p2.bin.defect.dat.df$breaks, n=1)), add=T)
#h2 <- hist(defect.dat.df[,c("PARAM2")], col=rgb(0,0,1,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p3.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM3")], breaks=seq(min,max,by=0.2))
#h3 <- hist(defect.dat.df[,c("PARAM3")])
#plot(h3, col=rgb(0,1,0,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h3 <- hist(defect.dat.df[,c("PARAM3")], col=rgb(0,1,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p3.bin.defect.dat.df$breaks, n=1),tail(p3.bin.defect.dat.df$breaks, n=1)), add=T)
#h3 <- hist(defect.dat.df[,c("PARAM3")], col=rgb(0,1,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p5.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM5")], breaks=seq(min,max,by=0.2))
#h5 <- hist(defect.dat.df[,c("PARAM5")])
#plot(h5, col=rgb(0.5,0.5,0,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h5 <- hist(defect.dat.df[,c("PARAM5")], col=rgb(0.5,0,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p5.bin.defect.dat.df$breaks, n=1),tail(p5.bin.defect.dat.df$breaks, n=1)), add=T)
#h5 <- hist(defect.dat.df[,c("PARAM5")], col=rgb(0.5,0,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p6.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM6")], breaks=seq(min,max,by=0.2))
#h6 <- hist(defect.dat.df[,c("PARAM6")])
#plot(h6, col=rgb(0,0.5,0.5,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h6 <- hist(defect.dat.df[,c("PARAM6")], col=rgb(0,0.5,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p6.bin.defect.dat.df$breaks, n=1),tail(p6.bin.defect.dat.df$breaks, n=1)), add=T)
#h6 <- hist(defect.dat.df[,c("PARAM6")], col=rgb(0,0.5,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p7.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM7")], breaks=seq(min,max,by=0.2))
#h7 <- hist(defect.dat.df[,c("PARAM7")])
#plot(h7, col=rgb(0.5,0,0.5,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h7 <- hist(defect.dat.df[,c("PARAM7")], col=rgb(0.5,0.5,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p7.bin.defect.dat.df$breaks, n=1),tail(p7.bin.defect.dat.df$breaks, n=1)), add=T)
#h7 <- hist(defect.dat.df[,c("PARAM7")], col=rgb(0.5,0.5,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()