SAS中是否有Jaro-Winkler字符串比较的实现?
看起来Link King有 Jaro-Winkler,但我更喜欢自己调用函数的灵活性。
谢谢!
据我所知,SAS 没有内置的 Jaro-Winkler 距离函数。@Itzy 已经提到了我所知道的唯一一个实现。如果您愿意,可以用 proc fcmp 编写自己的函数。
我甚至可以用下面的代码帮您起个头。我只是照着维基百科上的相关条目写的。无论如何,它肯定不能完全等同于 Bill Winkler 的 strcmp.c 文件,而且可能有不少错误。
proc fcmp outlib=work.jaro.chars;
subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
outargs matchChars;
/* Builds, in matchChars, the characters of string1 that have a match in
   string2, in string1 order.
   Arguments:
     string1, string2 - strings to compare (trailing blanks are ignored)
     matchChars       - output argument (OUTARGS); receives the matches
   A character of string1 at position i matches an equal character of
   string2 at position p when |p - i| <= floor(max(|s1|, |s2|)/2) - 1.
   Each position of string2 can be matched at most once. */
    /* lengthn returns 0 for a blank string, so the loop is skipped then */
    str1_len = lengthn(strip(string1));
    str2_len = lengthn(strip(string2));
    /* clamp at 0: for strings of length <= 1 the formula yields -1 */
    allowedDist = max(0, floor(max(str1_len, str2_len)/2) - 1);
    matchChars = "";
    /* walk through string1 and match characters to string2 */
    do i = 1 to str1_len;
        /* search window in string2, clipped to the string bounds */
        win_start = max(1, i - allowedDist);
        win_end   = min(str2_len, i + allowedDist);
        if win_start <= win_end then do;
            /* first occurrence at or after win_start */
            position = findc(string2, substr(string1, i, 1), win_start);
            if position > 0 and position <= win_end then do;
                /* build list of matched characters */
                matchChars = cats(matchChars, substr(string2, position, 1));
                /* Consume the matched position so it cannot match again.
                   string2 is not an OUTARGS argument, so this assignment
                   is meant to change only the local copy.
                   NOTE(review): '~' is used as the "consumed" marker; input
                   that itself contains '~' can match a consumed position. */
                substr(string2, position, 1) = '~';
            end;
        end;
    end;
    matchChars = strip(matchChars);
endsub;
function jarotrans (string1 $ , string2 $ );
/* Compares string1 and string2 position by position over the length of the
   shorter one (lengths taken after strip) and returns half the number of
   positions whose characters differ. */
    limit = min(length(strip(string1)), length(strip(string2)));
    mismatches = 0;
    do pos = 1 to limit;
        if substr(string1, pos, 1) ne substr(string2, pos, 1)
            then mismatches = mismatches + 1;
    end;
    /* two differing positions count as one transposition */
    return(mismatches / 2);
endsub;
function getPrefixlen( string1 $ , string2 $, maxprelen);
/* Returns the number of leading characters that string1 and string2 have
   in common, counting at most maxprelen characters.
   Arguments:
     string1, string2 - strings to compare
     maxprelen        - upper bound on the prefix length that is counted
   Returns 0 when the first characters differ or either string is blank. */
    /* lengthn gives 0 for a blank string, so the loop is skipped then */
    n = min(maxprelen, lengthn(string1), lengthn(string2));
    do i = 1 to n;
        /* first mismatch at position i: the common prefix has i-1 chars */
        if substr(string1, i, 1) ne substr(string2, i, 1)
            then return(i - 1);
    end;
    /* every compared character matched (or there was nothing to compare) */
    return(max(0, n));
endsub;
function jarodist(string1 $, string2 $);
/* Returns the Jaro similarity of string1 and string2 as
     ( m1/|string1| + m2/|string2| + (m1 - t)/m1 ) / 3
   where m1 and m2 are the numbers of matched characters found by jaromatch
   in each direction and t is the value returned by jarotrans.
   Returns 0 when either direction finds no matched characters. */
    /* Declare the OUTARGS receivers as character explicitly.
       NOTE(review): 200 is an assumed upper bound on input length. */
    length m1 m2 $ 200;
    /* get number of matched characters */
    call jaromatch(string1, string2, m1);
    /* lengthn (not length) so that an empty match list really counts as 0 */
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);
    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);
    /* both strings are non-blank here, so the denominators are non-zero */
    j_dist = (m1_len / lengthn(string1)
            + m2_len / lengthn(string2)
            + (m1_len - ntrans) / m1_len) / 3;
    return(j_dist);
endsub;
function jarowink( string1 $, string2 $, prefixscale);
/* Returns the Jaro-Winkler similarity: the jarodist value raised by
   prelen * prefixscale * (1 - jarodist), where prelen is the common prefix
   length reported by getPrefixlen for at most 4 characters. */
    score  = jarodist(string1, string2);
    prelen = getPrefixlen(string1, string2, 4);
    /* a zero-length common prefix leaves the Jaro value unchanged */
    if prelen ne 0
        then score = score + prelen * prefixscale * (1 - score);
    return(score);
endsub;
run;quit;
/* Point the CMPLIB option at WORK.JARO so the DATA step can resolve the
   functions compiled above */
options cmplib=work.jaro;
/* Demo: write the Jaro (x) and Jaro-Winkler (y, prefix scale 0.1) values
   for one pair of names to the log */
data _null_;
    x = jarodist('DIXON', 'DICKSONX');
    y = jarowink('DIXON', 'DICKSONX', 0.1);
    put x= y=;
run;
我不这么认为。SAS 可以计算 Levenshtein 距离(complev 函数)或广义编辑距离(compged 函数),但我还没有见过其他编辑距离函数。
如果您一定要在 SAS 中完成这件事,可以在 PROC IML 中自行实现。
我修改并更正了 cmjohns 的代码,感谢他/她让我有了起点。Winkler 在他的论文中给出了一些例子:Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions"(记录链接概述和当前研究方向). Research Report Series, RRS(见表 6)。我用这些例子来测试我的代码。
proc fcmp outlib=work.jaro.chars;
/* Returns the characters of string1 that have a match in string2, in
   string1 order. A character of string1 at position i matches an equal
   character of string2 at position p when
       |p - i| <= floor(max(|s1|, |s2|)/2) - 1
   and position p has not already been matched.
   Arguments:
     string1, string2 - strings to compare (trailing blanks are ignored)
   Returns: the matched characters, as a string of at most 40 characters.
   NOTE(review): the $ 40 result length looks like it truncates the match
   list for longer inputs - confirm the expected maximum input length. */
function jaromatch(string1 $, string2 $) $ 40;
    length matchChars $ 40;
    str1_len = lengthn(string1);
    str2_len = lengthn(string2);
    /* clamp at 0: for strings of length <= 1 the formula yields -1, which
       previously produced a negative SUBSTR length */
    allowedDist = max(0, floor(max(str1_len, str2_len) / 2) - 1);
    matchChars = "";
    /* walk through string1 and match characters to string2 */
    do i = 1 to str1_len;
        /* search window in string2, clipped so it never starts or ends
           past the last character of string2 */
        win_start = max(1, i - allowedDist);
        win_end   = min(str2_len, i + allowedDist);
        if win_start <= win_end then do;
            /* first occurrence of the i-th char of string1 at or after
               win_start; accepted only if it lies inside the window */
            position = findc(string2, substr(string1, i, 1), win_start);
            if position > 0 and position <= win_end then do;
                matchChars = cats(matchChars, substr(string2, position, 1));
                /* Once a char is assigned it cannot be assigned again, so
                   overwrite it in string2.
                   NOTE(review): input that itself contains '~' can match
                   a consumed position. */
                substr(string2, position, 1) = '~';
            end;
        end;
    end;
    return(strip(matchChars));
endsub;
/* Returns half the number of positions at which string1 and string2 hold
   different characters, comparing only as many positions as the shorter
   string has (lengths taken after strip). */
function jarotrans(string1 $, string2 $);
    compare_len = min(lengthn(strip(string1)), lengthn(strip(string2)));
    half_count = 0;
    do pos = 1 to compare_len;
        if substr(string1, pos, 1) ne substr(string2, pos, 1)
            then half_count = half_count + 1;
    end;
    /* each differing position is a "half" transposition */
    return(half_count / 2);
endsub;
/* Returns how many leading characters string1 and string2 share, looking
   at no more than maxprelen characters and no further than the end of the
   shorter string. */
function getPrefixlen(string1 $, string2 $, maxprelen);
    limit = min(maxprelen, lengthn(string1), lengthn(string2));
    /* when limit is 0 the loop body never runs and limit (0) is returned */
    do pos = 1 to limit;
        /* first mismatch: everything before pos is the common prefix */
        if substr(string1, pos, 1) ne substr(string2, pos, 1)
            then return(pos - 1);
    end;
    /* no mismatch found within the compared range */
    return(limit);
endsub;
/* Returns the Jaro similarity of string1 and string2:
     ( m1/|string1| + m2/|string2| + (m1 - t)/m1 ) / 3
   with m1, m2 the number of characters jaromatch returns in each direction
   and t the value of jarotrans on the two match lists.
   Returns 0 as soon as one direction yields no matched characters. */
function jarodist(string1 $, string2 $);
    /* characters of string1 matched in string2 */
    fwd = jaromatch(string1, string2);
    fwd_n = lengthn(fwd);
    if fwd_n = 0 then return(0);

    /* characters of string2 matched in string1 */
    bwd = jaromatch(string2, string1);
    bwd_n = lengthn(bwd);
    if bwd_n = 0 then return(0);

    /* transpositions between the two match lists */
    t = jarotrans(fwd, bwd);

    /* the three terms of the Jaro formula, averaged */
    ratio1 = fwd_n / lengthn(string1);
    ratio2 = bwd_n / lengthn(string2);
    ratio3 = (fwd_n - t) / fwd_n;
    return((ratio1 + ratio2 + ratio3) / 3);
endsub;
/* Returns the Jaro-Winkler similarity of string1 and string2.
   Both inputs are stripped and upper-cased first. Equal strings score 1.0;
   otherwise the jarodist value is raised by
   prelen * prefixscale * (1 - jarodist), where prelen is the common prefix
   length from getPrefixlen, capped at 4 characters. */
function jarowink(string1 $, string2 $, prefixscale);
    /* normalise the local copies of the arguments */
    string1 = upcase(strip(string1));
    string2 = upcase(strip(string2));

    /* trivial case: identical after normalisation */
    if string1 = string2 then return(1.0);

    jaro = jarodist(string1, string2);
    prelen = getPrefixlen(string1, string2, 4);
    return(jaro + prelen * prefixscale * (1 - jaro));
endsub;
run;
/* Point the CMPLIB option at WORK.JARO so the DATA step can resolve the
   functions compiled above */
options cmplib=work.jaro;

/* One test case: write the two names and the expected score to the log,
   then the computed Jaro-Winkler score (prefix scale 0.1), then a blank
   line. Generates DATA step statements, so it must be called inside a
   DATA step. */
%macro jw_case(s1, s2, expected);
    put "&s1 &s2 &expected";
    jw = jarowink("&s1", "&s2", 0.1);
    put jw=;
    put;
%mend jw_case;

/* test code: name pairs with the expected score listed next to each */
data _null_;
    %jw_case(SHACKLEFORD, SHACKELFORD, 0.982)
    %jw_case(DUNNINGHAM, CUNNIGHAM, 0.896)
    %jw_case(NICHLESON, NICHULSON, 0.956)
    %jw_case(JONES, JOHNSON, 0.832)
    %jw_case(MASSEY, MASSIE, 0.933)
    %jw_case(ABROMS, ABRAMS, 0.922)
    %jw_case(JERALDINE, GERALDINE, 0.926)
    %jw_case(MARHTA, MARTHA, 0.961)
    %jw_case(MICHELLE, MICHAEL, 0.921)
    %jw_case(JULIES, JULIUS, 0.933)
    %jw_case(TANYA, TONYA, 0.880)
    %jw_case(DWAYNE, DUANE, 0.840)
    %jw_case(SEAN, SUSAN, 0.805)
    %jw_case(JON, JOHN, 0.933)
run;