我正在尝试使用一些代码运行 Jaro Winkler 函数来比较两个字符串的相似性。如果我只是硬编码两个值,john 和 jon,使用下面的逻辑我不会遇到问题。但是我想要的是使用 csv 文件并比较所有名称。当我尝试得到
ValueError:Series 的真值不明确。使用 a.empty、a.bool()、a.item()、a.any() 或 a.all()。
# Python3 implementation of above approach
from math import floor
import pandas as pd
# Function to calculate the
# Jaro Similarity of two strings
def jaro_distance(s1, s2):
# If the strings are equal
if (s1 == s2):
return 1.0;
# Length of two strings
len1 = len(s1);
len2 = len(s2);
if (len1 == 0 or len2 == 0):
return 0.0;
# Maximum distance upto which matching
# is allowed
max_dist = (max(len(s1), len(s2)) // 2) - 1;
# Count of matches
match = 0;
# Hash for matches
hash_s1 = [0] * len(s1);
hash_s2 = [0] * len(s2);
# Traverse through the first string
for i in range(len1):
# Check if there is any matches
for j in range(max(0, i - max_dist),
min(len2, i + max_dist + 1)):
# If there is a match
if (s1[i] == s2[j] and hash_s2[j] == 0):
hash_s1[i] = 1;
hash_s2[j] = 1;
match += 1;
break;
# If there is no match
if (match == 0):
return 0.0;
# Number of transpositions
t = 0;
point = 0;
# Count number of occurrences
# where two characters match but
# there is a third matched character
# in between the indices
for i in range(len1):
if (hash_s1[i]):
# Find the next matched character
# in second string
while (hash_s2[point] == 0):
point += 1;
if (s1[i] != s2[point]):
point += 1;
t += 1;
else:
point += 1;
t /= 2;
# Return the Jaro Similarity
return ((match / len1 + match / len2 +
(match - t) / match) / 3.0);
# Jaro Winkler Similarity
def jaro_Winkler(s1, s2):
jaro_dist = jaro_distance(s1, s2);
# If the jaro Similarity is above a threshold
if (jaro_dist > 0.7):
# Find the length of common prefix
prefix = 0;
for i in range(min(len(s1), len(s2))):
# If the characters match
if (s1[i] == s2[i]):
prefix += 1;
# Else break
else:
break;
# Maximum of 4 characters are allowed in prefix
prefix = min(4, prefix);
# Calculate jaro winkler Similarity
jaro_dist += 0.1 * prefix * (1 - jaro_dist);
return jaro_dist;
# Driver code
if __name__ == "__main__":
df = pd.read_csv('names.csv')
# s1 = 'john' -- this works
# s1 = 'jon' -- this works
s1 = df['name1'] --this doesn't. csv contains header row name1, name2, and a few rows in each
s2 = df['name2'] --this doesn't
print("Jaro-Winkler Similarity =", jaro_Winkler(s1, s2));
Traceback (most recent call last):
File "C:\Users\john\PycharmProjects\heatMap\Jaro.py", line 113, in <module>
print("Jaro-Winkler Similarity =", jaro_Winkler(s1, s2));
File "C:\Users\john\PycharmProjects\heatMap\Jaro.py", line 80, in jaro_Winkler
jaro_dist = jaro_distance(s1, s2);
File "C:\Users\john\PycharmProjects\heatMap\Jaro.py", line 9, in jaro_distance
if (s1 == s2):
File "C:\Users\john\PycharmProjects\heatMap\venv\lib\site-packages\pandas\core\generic.py", line 1537, in __nonzero__
raise ValueError(
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Process finished with exit code 1
来自 csv 的示例 在此处输入图像描述