r - 根据每日最高价和最低价估算买卖价差（CORWIN & SCHULTZ, 2012）——解释问题

Question

我正在尝试根据每日高价和低价计算买卖价差。CORWIN & SCHULTZ (2012) ( http://onlinelibrary.wiley.com/doi/10.1111/j.1540-6261.2012.01729.x/abstract ) 提出了论文和算法。

这是他们在 SAS 中的实现：https ://www3.nd.edu/~scorwin/HILOW_Estimator_Sample_002.sas

这是我对 R 的翻译：

# ----------------------------------- #
##### 2. CALCULATE BID-ASK-SPREAD ##### 
# ----------------------------------- #

##### 2.1 AGGREGATE TRADES BY DAY

# To calculate the bid-ask-spread, we need the following inputs:
# DATE = date in yyyymmdd format
# PRC = daily closing price
# LOPRC = daily low price
# HIPRC = daily high price

# import transactions data
transactions <- read.csv(file = "transaction_data_filtered.csv", sep = ",", header = T)

##### 2.1.1 Reformatting & Cleaning

transactions$DOLLAR_PRICE <- as.double(transactions$DOLLAR_PRICE)
transactions$PAR_TRADED <- as.numeric(transactions$PAR_TRADED)
# create a comparable representation of time to get closing price
transactions$SECONDS_SINCE_MIDNIGHT <- 0
for (i in 1:3) {
  transactions$SECONDS_SINCE_MIDNIGHT <- transactions$SECONDS_SINCE_MIDNIGHT + as.integer(sapply(strsplit(as.character(transactions$TIME_OF_TRADE), ":"), "[", i)) * 60 ^ (3 - i)
}
# some NAs got introduced; values are corrupt --> drop
nas <- subset(transactions, is.na(transactions$SECONDS_SINCE_MIDNIGHT))
transactions <- subset(transactions, !is.na(transactions$SECONDS_SINCE_MIDNIGHT))
rm(nas, i)
# format date as proper date
transactions$DATE <- anytime(transactions$TRADE_DATE)
# add iq_issuer_ciqid to clearly identify trades from one entity
transactions$IQ_ISSUER_CIQID <- NULL
iq_issuer_ciqid <- read.csv(file = "output_data/cusips_and_ciqid.csv", header = T, sep = "," )
keeps <- c("cusip", "iq_issuer_ciqid")
iq_issuer_ciqid <- iq_issuer_ciqid[keeps]
colnames(iq_issuer_ciqid) <- c("CUSIP", "IQ_ISSUER_CIQID")
rm(keeps)
transactions <- merge(transactions, iq_issuer_ciqid, by = "CUSIP", all.x = T)

##### 2.1.2 Generate high, low and closing prices

# on a daily base; use data.table for higher efficiency
transactions_dt <- as.data.table(transactions)
transactions_daily_by_issuer <- transactions_dt[, .(HIPRC = max(DOLLAR_PRICE), LOPRC = min(DOLLAR_PRICE), PRC = max(ifelse(SECONDS_SINCE_MIDNIGHT == max(SECONDS_SINCE_MIDNIGHT), DOLLAR_PRICE, 0)), VOLUME = sum(PAR_TRADED)), by = .(IQ_ISSUER_CIQID, DATE)]
# investigate the number of trades to get a better feeling for the data
number_of_transactions <- as.data.frame(xtabs(~IQ_ISSUER_CIQID, transactions))
colnames(number_of_transactions) <- c("IQ_ISSUER_CIQID", "NUMBER_OF_TRADES")
summary(as.integer(number_of_transactions$NUMBER_OF_TRADES))
number_of_transactions <- number_of_transactions[order(number_of_transactions$NUMBER_OF_TRADES, decreasing = T),]
# replace all # of trades over 10.000 with 10.000 to get a meaningful histogram
number_of_transactions$NUMBER_OF_TRADES[number_of_transactions$NUMBER_OF_TRADES > 10000] <- 10000
hist(number_of_transactions$NUMBER_OF_TRADES, breaks = 20, col = "blue")
plot_density <- density(number_of_transactions$NUMBER_OF_TRADES)
plot(plot_density,, main = "Density of number of trades per IQ_ISSUER_CIQID")
polygon(plot_density, col = "blue")
# general problem: a lot of entities don't have much volume --> KEEP IN MIND!

write.csv(transactions_daily_by_issuer, "transaction_data.csv")
rm(transactions_daily_by_issuer, transactions, transactions_dt, number_of_transactions)


##### 2.2 Calc bid-ask-spread as liquidity proxy - based on CORWIN(2012) #####
# ------------------------------------------------------------------------- #

##### 2.2.1 Adjust/remove bad prices 

##### 2.2.1.1 IF (LOPRC = HIPRC OR LOPRC <= 0 OR HIPRC <= 0 THEN DO: DROP BAD PRICES ONLY (USE BID / ASK ON ZERO VOLUME DAYS)
# import file to R and remove unnecessary column "X"
transaction_data <- read.csv("transaction_data.csv")
transaction_data$X <- NULL
# basically all bad prices are due to HIPRC == LOPRC --> indicates only one trade at that day, KEEP IN MIND!
bad_prices <- subset(transaction_data, (LOPRC == HIPRC | LOPRC <= 0 | HIPRC <= 0 | PRC <= 0 | VOLUME == 0))
# sorting the dataframe based on CompanyID and Date
sample <- transaction_data[order(transaction_data$IQ_ISSUER_CIQID, transaction_data$DATE),]

sample$LOPRCIN <- sample$LOPRC
sample$HIPRCIN <- sample$HIPRC

##### 2.2.1.2 RETAIN GOOD HIGH-LOW PRICES AND REPLACE IN CASES WHERE HIGH=LOW
sample$ISAMEPRC <- ifelse(sample$LOPRC == sample$HIPRC, 1, 0)
sample$INOTRADE <- ifelse(sample$PRC < 0 | sample$VOLUME == 0, 1, 0) # 0 results with "1"

sample$LOPRC[sample$PRC <= 0 | sample$VOLUME == 0] <- NA
sample$HIPRC[sample$PRC <= 0 | sample$VOLUME == 0] <- NA
sample_na <- subset(sample, is.na(sample$PRC)) # no NAs introduced
rm(sample_na)
sample$PRC <- abs(sample$PRC) # no effect either

sample$LOPRCR <- NA
sample$HIPRCR <- NA

sample[1, "LOPRCR"] <- sample[1, "LOPRC"]
sample[1, "HIPRCR"] <- sample[1, "HIPRC"]

for (i in 2:length(sample$IQ_ISSUER_CIQID)) {
  if (sample[i, "LOPRC"] > 0 && sample[i, "LOPRC"] < sample[i, "HIPRC"]) {
    sample$LOPRCR <- sample$LOPRC
    sample$HIPRCR <- sample$HIPRC
  }
  else {
    sample[i, "LOPRCR"] <- sample[i - 1, "LOPRCR"]
    sample[i, "HIPRCR"] <- sample[i - 1, "HIPRCR"]
    if (sample[i, "PRC"] >= sample[i, "LOPRC"] && sample[i, "PRC"] <= sample[i, "HIPRC"]) {
      #REPLACE IF WITHIN PRIOR DAY'S RANGE
      sample[i, "LOPRC"] <- sample[i - 1, "LOPRCR"]
      sample[i, "HIPRC"] <- sample[i - 1, "HIPRCR"]
      sample$HLRESET <- 1
    }
    if (sample[i, "PRC"] < sample[i - 1, "LOPRCR"]) {
      #REPLACE IF BELOW PRIOR DAY'S RANGE
      sample[i, "LOPRC"] <- sample[i, "PRC"]
      sample[i, "HIPRC"] <- sample[i, "HIPRCR"] - (sample[i, "LOPRCR"] - sample[i, "PRC"])
      sample$HLRESET <- 2
    }
    if (sample[i, "PRC"] > sample[i - 1, "HIPRCR"]) {
      #REPLACE IF ABOVE PRIOR DAY'S RANGE
      sample[i, "LOPRC"] <- sample[i - 1, "LOPRCR"] + (sample[i, "PRC"] - sample[i - 1, "HIPRCR"])
      sample[i, "HIPRC"] <- sample[i, "PRC"]
      sample$HLRESET <- 3
    }
  }
}
rm(i)

##### 2.2.1.3 ADJUST FOR OVERNIGHT RETURNS BASED ON LAGGED CLOSING PRICE
# THIPRC = today high price
# TLOPRC = today day low price
# LHIPRC = lag high price (yesterday)
# LLOPRC = lag low price (yesterday)
# LPRC = lag closing price (yesterday)
sample$RETADJ <- 0
#CURRENT DAY LOW PRICE
sample$TLOPRC <- sample$LOPRC
#CURRENT DAY HIGH PRICE
sample$THIPRC <- sample$HIPRC
#PRIOR DAY LOW PRICE
sample <- sample %>%
  group_by(IQ_ISSUER_CIQID) %>%
  mutate(LLOPRC = lag(LOPRC))
#PRIOR DAY HIGH PRICE
sample <- sample %>%
  group_by(IQ_ISSUER_CIQID) %>%
  mutate(LHIPRC = lag(HIPRC))
#PRIOR DAY PRICE
sample <- sample %>%
  group_by(IQ_ISSUER_CIQID) %>%
  mutate(LPRC = lag(PRC))

f.adjust.overnight.pricechanges <- function(sample) {
  #ADJUST WHEN PRIOR CLOSE IS BELOW CURRENT LOW
  sample$THIPRC[sample$LPRC < sample$LOPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$LOPRC)] <-
    sample$HIPRC[sample$LPRC < sample$LOPRC &
                   sample$LPRC > 0 &
                   !is.na(sample$LPRC) & !is.na(sample$LOPRC)] -
    (sample$LOPRC[sample$LPRC < sample$LOPRC &
                    sample$LPRC > 0 &
                    !is.na(sample$LPRC) &
                    !is.na(sample$LOPRC)] - sample$LPRC[sample$LPRC < sample$LOPRC &
                                                          sample$LPRC > 0 &
                                                          !is.na(sample$LPRC) & !is.na(sample$LOPRC)])

  sample$TLOPRC[sample$LPRC < sample$LOPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$LOPRC)] <-
    sample$LPRC[sample$LPRC < sample$LOPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$LOPRC)]

  sample$RETADJ[sample$LPRC < sample$LOPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$LOPRC)] <- 1

  #ADJUST WHEN PRIOR CLOSE IS ABOVE CURRENT HIGH
  sample$THIPRC[sample$LPRC > sample$HIPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$HIPRC)] <-
    sample$LPRC[sample$LPRC > sample$HIPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$HIPRC)]

  sample$TLOPRC[sample$LPRC > sample$HIPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$HIPRC)] <-
    sample$LOPRC[sample$LPRC > sample$HIPRC &
                   sample$LPRC > 0 &
                   !is.na(sample$LPRC) & !is.na(sample$HIPRC)] +
    (sample$LPRC[sample$LPRC > sample$HIPRC &
                   sample$LPRC > 0 &
                   !is.na(sample$LPRC) &
                   !is.na(sample$HIPRC)] - sample$HIPRC[sample$LPRC > sample$HIPRC &
                                                          sample$LPRC > 0 &
                                                          !is.na(sample$LPRC) & !is.na(sample$HIPRC)])

  sample$RETADJ[sample$LPRC > sample$HIPRC &
                  sample$LPRC > 0 &
                  !is.na(sample$LPRC) & !is.na(sample$HIPRC)] <- 2
  return(sample)
}
sample <- f.adjust.overnight.pricechanges(sample)


##### 2.3 Calculate daily high-low spread estimates #####
# ----------------------------------------------------- #

# declare some variables & constants
v_PI <- pi
v_K <- 1 / (4 * log(2))
v_K1 <- 4 * log(2) # what is this for?? unused
v_K2 <- sqrt(8 / v_PI)
v_CONST <- 3 - (2 * sqrt(2))

sample <- transform(sample, LOPRC2 = pmin(sample$TLOPRC, sample$LLOPRC))
sample <- transform(sample, HIPRC2 = pmin(sample$THIPRC, sample$LHIPRC))

##### 2.3.1 CALCULATE DAILY HIGH-LOW SPREAD ESTIMATES 
f.calculate.spread <- function(sample) {
  sample$BETA <- 0

  # Beta
  sample$BETA[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0] <-
    (log(sample$THIPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0] / sample$TLOPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0])) ^ 2 +
    (log(sample$LHIPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0] / sample$LLOPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0])) ^ 2
  # Gamma
  sample$GAMMA[!is.na(sample$LOPRC2) > 0] <-
    log(sample$HIPRC2[!is.na(sample$LOPRC2) > 0] / sample$LOPRC2[!is.na(sample$LOPRC2) > 0]) ^ 2
  # Alpha
  sample$ALPHA <- (sqrt(2 * sample$BETA) - sqrt(sample$BETA)) / v_CONST - sqrt(sample$GAMMA / v_CONST)
  # Spread
  sample$SPREAD <- 2 * (exp(sample$ALPHA) - 1) / (1 + exp(sample$ALPHA))
  # set negative spread estimates to zero
  sample <- transform(sample, SPREAD_0 = pmax(sample$SPREAD, 0))
  # drop negative spread estimates
  sample$SPREAD_MISS[!is.na(sample$SPREAD) > 0] <- sample$SPREAD[!is.na(sample$SPREAD) > 0]
  # Sigma
  sample$SIGMA <- ((sqrt(sample$BETA / 2) - sqrt(sample$BETA))) / (v_K2 * v_CONST) + sqrt(sample$GAMMA / (v_K2 * v_K2 * v_CONST))
  # set negative sigma estimates to zero
  sample <- transform(sample, SIGMA_0 = pmax(sample$SIGMA, 0))
  # drop negative sigma estimates 

  return(sample)
}
sample <- f.calculate.spread(sample)
rm(v_CONST, v_K, v_K1, v_K2, v_PI)

##### 2.4 Output daily high-low spread estimates #####
# -------------------------------------------------- #

## OUTPUT VARIABLES:                                                                                 **;
## SPREAD = DAILY H-L SPREAD ESTIMATES WITH NEG ESTIMATES INCLUDED                                   **;
## SPREAD_0 = DAILY H-L SPREAD ESTIMATES WITH NEG ESTIMATES SET TO ZERO                              **;
## SPREAD_MISS = DAILY H-L SPREAD ESTIMATES WITH NEG ESTIMATES SET TO MISSING                        **;
## SIGMA = DAILY STD. DEV. ESTIMATE WITH NEG ESTIMATES INCLUDED                                      **;
## SIGMA_0 = DAILY STD. DEV. ESTIMATE WITH NEG ESTIMATES SET TO ZERO
HLSPRD_DAY_SAMPLE <- sample[, colnames(sample) %in% c("IQ_ISSUER_CIQID", "DATE", "SPREAD", "SPREAD_0", "SPREAD_MISS", "SIGMA", "SIGMA_0")]


##### 2.5 Calculate monthly and yearly high-low spread estimates #####
# ------------------------------------------------------------------ #

HLSPRD_DAY_SAMPLE$DATE1 <- as.Date(HLSPRD_DAY_SAMPLE$DATE, "%Y-%m-%d")
HLSPRD_DAY_SAMPLE$YEAR = as.numeric(format(HLSPRD_DAY_SAMPLE$DATE1, "%Y"))
HLSPRD_DAY_SAMPLE$MONTH = as.numeric(format(HLSPRD_DAY_SAMPLE$DATE1, "%m"))

save.image(file = "bid_ask.RData")

##### 2.5.1 Monthly average 

# calculate # of days with trading activity per month
HLSPRD_DAY_SAMPLE <- HLSPRD_DAY_SAMPLE %>%
  group_by(IQ_ISSUER_CIQID, YEAR, MONTH) %>%
  mutate(TRADES_PER_MONTH = length(unique(SPREAD)))
# set all days within month on first of month for easier plotting later
HLSPRD_DAY_SAMPLE$DATE_MONTH <- paste("01-", strftime(as.POSIXct(HLSPRD_DAY_SAMPLE$DATE1), "%m-%Y"), sep = "")
HLSPRD_DAY_SAMPLE$DATE_MONTH <- as.Date(HLSPRD_DAY_SAMPLE$DATE_MONTH, "%d-%m-%Y")

# store all entries where # of trades per month < 12
low_trades_month <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_MONTH < 12)
# 56 % of all months have >= 12 active trading days
length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID[HLSPRD_DAY_SAMPLE$TRADES_PER_MONTH >= 12]) / length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID)
HLSPRD_DAY_SAMPLE_CLEANED <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_MONTH >= 12)
# calculate average monthly spreads & sigmas
spreads_monthly <- merge((aggregate(SPREAD_0 ~ IQ_ISSUER_CIQID + DATE_MONTH, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)), (aggregate(SIGMA_0 ~ IQ_ISSUER_CIQID + DATE_MONTH, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)))
spreads_monthly <- spreads_monthly[with(spreads_monthly, order(IQ_ISSUER_CIQID)),]
colnames(spreads_monthly) <- c("iq_issuer_ciqid", "date", "mean_spread", "mean_sigma")

# example plot
ggplot(spreads_monthly[spreads_monthly$IQ_ISSUER_CIQID == "IQ116231656",], aes(x = DATE_MONTH, y = MEAN_SPREAD_0, colour = "MEAN_SPREAD_0")) +
  geom_line() +
  geom_line(aes(y = MEAN_SIGMA_0, colour = "MEAN_SIGMA_0")) +
  labs(x = "Date", y = "Mean sigma / spread", title = "CIQID: IQ116231656", colour = "Value")

##### 5.2 Yearly average
# calculate # of months with trading activity > 12 days 
HLSPRD_DAY_SAMPLE <- HLSPRD_DAY_SAMPLE %>%
  group_by(IQ_ISSUER_CIQID, YEAR) %>%
  mutate(TRADES_PER_YEAR = length(unique(MONTH)))
# calculate # of active trading days per _year_
HLSPRD_DAY_SAMPLE <- HLSPRD_DAY_SAMPLE %>%
  group_by(IQ_ISSUER_CIQID, YEAR) %>%
  mutate(TRADES_PER_YEAR_IN_DAYS = length(unique(SPREAD)))
# store all entries where # of trades < 60
low_trades_year <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_YEAR_IN_DAYS < 60)
# 76 % of the iq_issuer_ciqids have 60 or more trading days per year
length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID[HLSPRD_DAY_SAMPLE$TRADES_PER_YEAR_IN_DAYS >= 60]) / length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID)
# set all months within year on 01.01. for easier plotting later
HLSPRD_DAY_SAMPLE$DATE_YEAR <- paste("01-01-", strftime(as.POSIXct(HLSPRD_DAY_SAMPLE$DATE1), "%Y"), sep = "")
HLSPRD_DAY_SAMPLE$DATE_YEAR <- as.Date(HLSPRD_DAY_SAMPLE$DATE_YEAR, "%d-%m-%Y")
HLSPRD_DAY_SAMPLE_CLEANED <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_YEAR_IN_DAYS >= 60)
# calculate average yearly spreads & sigmas
spreads_yearly <- merge((aggregate(SPREAD_0 ~ IQ_ISSUER_CIQID + DATE_YEAR, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)), (aggregate(SIGMA_0 ~ IQ_ISSUER_CIQID + DATE_YEAR, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)))
colnames(spreads_yearly) <- c("IQ_ISSUER_CIQID", "DATE_YEAR", "MEAN_SPREAD_0", "MEAN_SIGMA_0")
spreads_yearly <- spreads_yearly[with(spreads_yearly, order(IQ_ISSUER_CIQID)),]
colnames(spreads_yearly) <- c("iq_issuer_ciqid", "date", "mean_spread", "mean_sigma")

我现在遇到的问题是，对于一些罕见的异常值，我得到的点差 > 1（我正在将该算法应用于美国的市政债券数据）。我做错了吗？或者我该如何解释它？

非常感谢你的帮助！

r - 根据每日最高价和最低价估算买卖价差（CORWIN & SCHULTZ, 2012）——解释问题

0 回答 0

Related

Reference