我正在尝试匹配两个非常大的数据(nsar 和 crsp)集。我的代码运行良好,但需要很多时间。我的程序按以下方式工作:
- 通过ticker尝试匹配(从而控制NAV(只是一个数字)和日期是相同的)
- 通过确切的基金名称尝试匹配(控制资产净值和日期)
- 尝试通过最接近的匹配进行匹配:首先搜索相同的 NAV 和日期 --> 列出并仅考虑与两种匹配措施最接近的公司 --> 获取剩余条目并找到最接近的匹配(但匹配距离受到限制)。
关于如何提高代码效率的任何建议:
#Go through each nsar entry and try to match with crsp
trackchanges = sapply(seq_along(nsar$fund),function(x){
#Define vars
ticker = nsar$ticker[x]
r_date = format(nsar$r_date[x], "%m%Y")
nav1 = nsar$NAV_share[x]
nav2 = nsar$NAV_sshare[x]
searchbyname = 0
if(nav1 == 0) nav1 = -99
if(nav2 == 0) nav2 = -99
########## If ticker is available --> Merge via ticker and NAV
if(is.na(ticker) == F)
{
#Look for same NAV, date and ticker
found = which(crsp$nasdaq == ticker & crsp$caldt2 == r_date & (round(crsp$mnav,1) == round(nav1,1) | round(crsp$mnav,1) == round(nav2,1)))
#If nothing found
if(length(found) == 0)
{
#Mark that you should search by names
searchbyname = 1
} else { #ticker found
#Record crsp_fundno and that match is found
nsar$match[x] = 1
nsar$crsp_fundno[x] = crsp$crsp_fundno[found[1]]
assign("nsar",nsar,envir=.GlobalEnv)
#Return: 1 --> Merged by ticker
return(1)
}
}
###########
########### No Ticker available or found --> Exact name matching
if(is.na(ticker) == T | searchbyname == 1)
{
#Define vars
name = tolower(nsar$fund[x])
company = tolower(nsar$company[x])
#Exact name, date and same NAV
found = which(crsp$fund_name2 == name & crsp$caldt2 == r_date & (round(crsp$mnav,1) == round(nav1,1) | round(crsp$mnav,1) == round(nav2,1)))
#If nothing found
if(length(found) == 0)
{
#####Continue searching by closest match
#First search for nav and date to get list of funds
allfunds = which(crsp$caldt2 == r_date & (round(crsp$mnav,1) == round(nav1,1) | round(crsp$mnav,1) == round(nav2,1)))
allfunds_companies = crsp$company[allfunds]
#Check if anything found
if(length(allfunds) == 0)
{
#Return: 0 --> nothing found
return(0)
}
#Get best match by lev and substring measure for company
levmatch = levenstheinMatch(company, allfunds_companies)
submatch = substringMatch(company, allfunds_companies)
allfunds = levmatch[levmatch %in% submatch]
allfunds_names = crsp$fund_name2[allfunds]
#Check if now anything found
if(length(allfunds) == 0)
{
#Mark match (5=Company not found)
nsar$match[x] = 5
#Save globally
assign("nsar",nsar,envir=.GlobalEnv)
#Return: 5 --> Company not found
return(5)
}
#Get best match by all measures
levmatch = levenstheinMatch(name, allfunds_names)
submatch = substringMatch(name, allfunds_names)
#Only accept if identical
allfunds = levmatch[levmatch %in% submatch]
allfunds_names = crsp$fund_name2[allfunds]
if(length(allfunds) > 0)
{
#Mark match (3=closest name matching)
nsar$match[x] = 3
#Add crsp_fundno to nsar data
nsar$crsp_fundno[x] = crsp$crsp_fundno[allfunds[1]]
#Save globally
assign("nsar",nsar,envir=.GlobalEnv)
#Return 3=closest name matching
return(3)
} else {
#return 0 -> no match
return(0)
}
#####
} else { #If exact name,date,nav found
#Mark match (2=exact name matching)
nsar$match[x] = 2
#Add crsp_fundno to nsar data
nsar$crsp_fundno[x] = crsp$crsp_fundno[found[1]]
#Return 2=exact name matching
return(2)
}
}
})#End sapply
非常感谢您的帮助!劳伦兹