关于游泳者瀑布图时间轴的一个问题。
我使用下面的代码生成了游泳者图>
但是,我希望根据数据集中的 responseStartTime 和 responseEndTime 持续时间,让每个主题的条形由 responseType(而不是“阶段”)着色。请建议我如何在响应期间通过 responseType 定义颜色。
谢谢!
资料来源:(http://rpubs.com/alexiswl/swimmer)。
数据代码
library(magrittr)
library(stringi)
library(readr) # Reading in the dataset
library(ggplot2) # Viewing the dataset
library(forcats) # Sorting factors
library(RColorBrewer) # Plot colours
library(dplyr, warn.conflicts=FALSE) # Manipulating the dataframes
library(purrr, warn.conflicts=FALSE) # Manipulating dataframe metadata
library(zoo, warn.conflicts=FALSE) # Filling in NA values
library(reshape2) # Reformmating dataframes
library(editData)
df.data <- df
swimmer_file = "https://blogs.sas.com/content/graphicallyspeaking/files/2014/06/Swimmer_93.txt"
col.names = c("subjectID", "stage", "startTime", "endTime",
"isContinued", "responseType", "responseStartTime", "responseEndTime", "Durable")
df <- readr::read_lines(swimmer_file) %>%
# Split by line recursion (\r\n)
stringi::stri_split(fixed="\r\n", simplify=TRUE) %>%
# Take only lines starting with a number (sample id)
.[grepl("^[0-9]+", .)] %>%
# Remove spaces from response column
gsub(pattern="\\sresponse", replacement="_response") %>%
# Remove spaces from stage column
gsub(pattern="Stage\\s", replacement="Stage_") %>%
# Some lines missing 'Stage' and 'isContinued' column.
# Replace any set of 8 or more spaces with ' . '
gsub(pattern="\\s{8,}", replacement=' . ') %>%
# Split strings by spaces, do not include empty strings as columns
stringi::stri_split(fixed=" ", simplify=TRUE, omit_empty=TRUE) %>%
# Convert to dataframe
as.data.frame(stringsAsFactors=FALSE) %>%
# Set the column names
purrr::set_names(col.names) %>%
# We need to do some more cleaning up of the dataframe
# Convert all . to NAs
dplyr::na_if(".") %>%
# Fill NAs in Stage column
dplyr::mutate(stage=zoo::na.locf(stage)) %>%
# Turn isContinued into boolean
dplyr::mutate(isContinued=dplyr::if_else(isContinued=="FilledArrow", TRUE, FALSE, missing=FALSE)) %>%
# Convert stage variable to factor, remove underscore
dplyr::mutate(stage = as.factor(gsub(pattern="_", replacement=" ", x=stage))) %>%
# Remove underscore from response types
dplyr::mutate(responseType = gsub("_", " ", responseType)) %>%
# Change Durable from character to numeric
dplyr::mutate(Durable = as.numeric(Durable)) %>%
# Change Time variables from character to numeric
dplyr::mutate_at(vars(dplyr::ends_with("Time")), as.numeric)
df.shapes <- df %>%
# Get just the subject and response time columns
dplyr::select(subjectID, responseType, responseStartTime) %>%
# Melt the data frame, so one row per response value.
reshape2::melt(id.vars=c("subjectID", "responseType"), value.name="time") %>%
# Remove na values
dplyr::filter(!is.na(time)) %>%
# Remove response variable column
dplyr::select(-variable) %>%
# Add 'start' to the end of the response type
dplyr::mutate(responseType=paste(responseType, "start", sep=" "))
# Add the end time for each
df.shapes %<>%
dplyr::bind_rows(df %>%
dplyr::select(subjectID, endTime, responseEndTime, isContinued) %>%
# Place endtime as response endtime if not continuing and responseEndTime is NA
dplyr::mutate(responseEndTime=dplyr::if_else(!isContinued & is.na(responseEndTime),
endTime, responseEndTime)) %>%
dplyr::select(-endTime, -isContinued) %>%
# Remove other existing NA responseEndTimes
dplyr::filter(!is.na(responseEndTime)) %>%
dplyr::mutate(responseType="Response end") %>%
dplyr::rename(time=responseEndTime))
# Append on the durable column
df.shapes %<>%
dplyr::bind_rows(df %>%
dplyr::select(subjectID, Durable) %>%
dplyr::filter(!is.na(Durable)) %>%
dplyr::mutate(responseType="Durable") %>%
dplyr::rename(time=Durable))
# Add on the arrow sets
df.shapes %<>%
dplyr::bind_rows(df %>%
dplyr::select(subjectID, endTime, isContinued) %>%
dplyr::filter(isContinued) %>%
dplyr::select(-isContinued) %>%
dplyr::mutate(responseType="Continued Treatment") %>%
dplyr::mutate(endTime=endTime+0.25) %>%
dplyr::rename(time=endTime))
responseLevels = c("Complete response start", "Partial response start",
"Response end", "Durable", "Continued Treatment")
# Convert responseType to factor and set the levels
df.shapes %<>%
dplyr::mutate(responseType = factor(responseType, levels=responseLevels)) %>%
# Order by response type
dplyr::arrange(desc(responseType))
unicode = list(triangle=sprintf('\u25B2'),
circle=sprintf('\u25CF'),
square=sprintf('\u25A0'),
arrow=sprintf('\u2794'))
剧情代码:
df %>%
# Get just the variables we need for the base of the plot
dplyr::select(subjectID, endTime, stage) %>%
# Remove duplicate rows
dplyr::distinct() %>%
# Order subject ID by numeric value
dplyr::mutate(subjectID=forcats::fct_reorder(.f=subjectID, .x=as.numeric(subjectID), .desc = TRUE)) %>%
# Pipe into ggplot
ggplot(aes(subjectID, endTime)) + # Base axis
geom_bar(stat="identity", aes(fill=factor(stage))) + # Bar plot. Colour by stage
geom_point(data=df.shapes, # Use df.shapes to add reponse points
aes(subjectID, time, colour=responseType, shape=responseType), size=5) +
coord_flip() + # Flip to horizonal bar plot.
scale_colour_manual(values=c(RColorBrewer::brewer.pal(3, "Set1")[1:2], # Add colours
rep("black", 3))) + # min of brewerpal is three but we only need 2.
scale_shape_manual(values=c(rep(unicode[["triangle"]], 2), # Add shapes
unicode[["circle"]], unicode[["square"]], unicode[["arrow"]])) +
scale_y_continuous(limits=c(-0.5, 20), breaks=0:20) + # Set time limits
labs(fill="Disease Stage", colour="Symbol Key", shape="Symbol Key", # Add labels
x="Subject ID ", y="Months since diagnosis",
title="Swimmer Plot",
caption="Durable defined as subject with six months or more of confirmed response") +
theme(plot.title = element_text(hjust = 0.5), # Put title in the middle of plot
plot.caption = element_text(size=7, hjust=0)) # Make caption size smaller