library(plyr);
library(sqldf);
library(data.table)
library(stringi);
library(RODBC);
dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
AsOfDate,
CashFlow FROM dbo.Accounts')
resdatatable = as.data.table(res)
odbcCloseAll();
sppv <- function(i, n) {
return((1 + i / 100) ^ (-n))
}
npv <- function(x, i) {
npv = c()
for (k in 1:length(i)) {
pvs = x * sppv(i[k], 1:length(x))
npv = c(npv, sum(pvs))
}
return(npv)
}
xirr <- function(cashflow, dates) {
if (length(cashflow) != length(dates)) {
stop("length(cashflow) != length(dates)")
}
cashflow_adj <- c(cashflow[1])
for (i in 1:(length(cashflow) - 1)) {
d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")
# There are no checks about the monotone values of dates
# put a check in here if the interval is negative
interval <- as.integer(d2 - d1)
if (length(interval) > 0 && !is.na(interval)) {
cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
}
}
left = -10
right = 10
epsilon = 1e-8
while (abs(right - left) > 2 * epsilon) {
midpoint = (right + left) / 2
if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
left = midpoint
} else {
right = midpoint
}
}
irr = (right + left) / 2 / 100
irr <- irr * 365
# Annualized yield (return) reflecting compounding effect of daily returns
irr <- (1 + irr / 365) ^ 365 - 1
irr
}
groupedCompanyNames <- unique(as.character(resdatatable$Company_ID));
groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID);
groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID);
resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)), XIRR = numeric(length(groupedCompanyNames)));
datalist = result <- vector("list", length(groupedCompanyNames));
for (i in groupedCompanyNames) {
datesForCompany <- groupedDatesPerCompany[i];
dates <- datesForCompany[[i]];
cashFlowsForCompany <- groupedCashFlowsPerCompany[i];
cashFlows <- cashFlowsForCompany[[i]];
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2));
datalist[[i]] <- newRow;
}
resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame);
print(finalDataFrame);
因此,为了提供上下文,我正在尝试执行以下操作:
- 使用 RODBC 连接从数据库中获取数据
- 获取唯一的公司名称
- 拆分每家公司的现金流和日期
- 使用已知行数初始化数据表,使其不需要增量增长。
- 循环遍历唯一的公司名称并调用函数 get xirr 在公司的现金流和日期列表上。
- 将带有公司名称和 XIRR 值的每一行添加到新数据表中。
- 使用 rbindlist。
这是我正在使用的源数据示例
Company_ID CashFlow AsOfDate
3F68D729-D69D-E711-9C98-5065F34B3E7D 368608.0000 2004-11-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 366999.0000 2004-12-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 326174.0000 2005-01-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 345666.0000 2005-02-28 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -1529180.0000 2005-03-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -65259.0000 2005-04-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 514005.0000 2005-05-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 512951.0000 2005-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-07-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-08-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-09-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-10-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-11-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6791.0000 2011-12-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -187375.0000 2012-01-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -215902.0000 2012-02-29 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2012-03-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -217409.0000 2012-04-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -191830.0000 2012-05-31 00:00:00.000
我是 R 新手——大约有 2000 个独特的公司名称,平均有 50 个日期,每个现金流组合 = 100000 条记录,循环大约需要 28 秒来处理。
我已经研究过使用 asParallel 库并使用了 foreach ,但这似乎对速度没有任何影响。如果我取出函数 xirr 的调用,则循环将立即处理并完成。
xirr 需要异常处理,因为有时它不可能迭代地计算 xirr 值。
我知道循环并不是 R 中的最佳实践——关于如何向量化它以获得更好的性能的任何建议?