我需要构建一个 [所有文档中的唯一 shingle 集合] x [文档 ID] 的布尔矩阵。到目前为止,我有一个名为 allshinglesU 的列表,其中包含所有文档中出现的全部唯一 shingle。我还有一个名为 docsAsShingleSetsW 的键值字典,它以文档 ID 作为键,以在该文档中找到的 shingle 集合作为值。如何构建一个布尔矩阵,用来标识每个唯一 shingle 是否出现在文档 n 中?
这是我到目前为止的进展:
# Build per-document shingle sets from `documents`.
#
# Expects `documents` (an iterable whose items support `.split()`, i.e. text
# strings) and the `binascii` module to already be in scope.
#
# Results:
#   docsAsShingleSets   docID -> set of CRC32 hashes of that document's shingles
#   docsAsShingleSetsW  docID -> set of that document's shingles as text
#   docNames            list of document IDs (positions within `documents`)
#   allshingles         each document's distinct shingles in order of first
#                       appearance; a shingle shared by several documents is
#                       listed once per document
#   totalShingles       number of distinct hashes, summed over all documents

# Must exist before the loop stores into docsAsShingleSets[docID]; without
# this initialisation the first store raises NameError.
docsAsShingleSets = {}
docsAsShingleSetsW = {}
# Maintain a list of all document IDs.
docNames = []
shingle_size = 2
totalShingles = 0
shingleNo = 0
allshingles = []

for docID, text in enumerate(documents):
    # Split the document on whitespace to get its words.
    words = text.split()
    docNames.append(docID)

    # Sets hold each shingle at most once per document, no matter how many
    # times it occurs in the text.
    shinglesInDocWords = set()  # shingles as text
    shinglesInDocInts = set()   # shingles as hashes

    # Slide a window of `shingle_size` consecutive words over the document.
    # The range is empty when the document has fewer words than the window.
    for index in range(len(words) - shingle_size + 1):
        shingle = ' '.join(words[index:index + shingle_size])
        # Hash the shingle to a 32-bit integer.
        crc = binascii.crc32(shingle.encode('utf8')) & 0xffffffff

        if shingle not in shinglesInDocWords:
            shinglesInDocWords.add(shingle)
            allshingles.append(shingle)

        # Count each distinct hash once per document. Nothing needs to be
        # undone for a repeated hash: the loop rebinds `index` and `shingle`
        # on the next iteration anyway.
        if crc not in shinglesInDocInts:
            shinglesInDocInts.add(crc)
            shingleNo += 1

    # Store the completed shingle sets for this document.
    docsAsShingleSets[docID] = shinglesInDocInts
    docsAsShingleSetsW[docID] = shinglesInDocWords

totalShingles = shingleNo