在 gensim 库中,有一个 MmReader 类,可以将 Matrix Market(矩阵市场)格式的文件转换为 Python 对象。有时需要对矩阵进行转置,因此 MmReader 中引入了 transposed(转置)参数。
但是,我很困惑:为什么在 https://github.com/piskvorky/gensim/blob/develop/gensim/matutils.py 的第 525-526 行和第 567-568 行中,词项(term)与文档(document)的 id 及数量的交换,是在 transposed == False 时发生的?
有熟悉信息检索中词项-文档矩阵(term-document matrix)的人能为我解惑吗?
class MmReader(object):
    """
    Wrap a sparse matrix on disk (in Matrix Market coordinate format) and
    present it as an object which supports iteration over documents.

    The file is read one document at a time, not the whole matrix at once
    (unlike scipy.io.mmread). This allows us to process corpora which are
    larger than the available RAM.

    Orientation of the stored matrix is controlled by `transposed`:

    * `transposed=True` (default): every data line is read as
      ``doc_id term_id value`` and the size line as
      ``num_docs num_terms num_nnz`` -- documents are the matrix rows.
    * `transposed=False`: every data line is read as
      ``term_id doc_id value`` and the size line as
      ``num_terms num_docs num_nnz`` -- documents are the matrix columns,
      so the two ids (and the two counts) are swapped while reading.

    In both cases the entries must be ordered by ascending document id.
    """

    # String type(s) that mark `input` as a file path rather than a stream.
    try:
        _string_types = basestring  # Python 2: matches both `str` and `unicode`
    except NameError:
        _string_types = str  # Python 3

    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        `input` is either a string (file path) or a file-like object that
        supports `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). It is expected to
        be in the sparse (coordinate) Matrix Market format.

        `transposed` tells which matrix axis holds the documents: if True
        (default), documents are rows and features are columns; if False,
        documents are columns, and the dimensions read from the size line are
        swapped so that `num_docs` and `num_terms` keep their meaning.

        Raises ValueError if the first line is not a
        ``%%MatrixMarket matrix coordinate real general`` header (this
        includes an empty file).
        """
        logger.info("initializing corpus reader from %s" % input)
        self.input, self.transposed = input, transposed
        fin, owned = self._open_input()
        try:
            # An empty file yields '' here and is rejected by the check below.
            header = self._to_text(next(fin, '')).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                 (self.input, header))
            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in fin:
                line = self._to_text(line)
                if not line.startswith('%'):
                    # first non-comment line holds the matrix dimensions
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:  ## line 525
                        # rows are terms and columns are documents: swap the counts
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break
        finally:
            if owned:
                fin.close()

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                    (self.num_docs, self.num_terms, self.num_nnz))

    def __len__(self):
        """Return the number of documents, as declared in the file header."""
        return self.num_docs

    def __str__(self):
        """Return a short summary of the corpus dimensions."""
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    @staticmethod
    def _to_text(line):
        """Return `line` as text; byte strings (binary streams) are decoded as UTF-8."""
        if isinstance(line, bytes):
            return line.decode('utf8')
        return line

    @staticmethod
    def _empty_documents(start, stop):
        """Yield ``(doc_id, [])`` for every document id in ``[start, stop)``."""
        docid = start
        while docid < stop:
            yield docid, []
            docid += 1

    def _open_input(self):
        """
        Return ``(stream, owned)`` for `self.input`.

        A file path is opened in binary mode, so that seek offsets are plain
        byte offsets, and `owned` is True: the caller must close the stream.
        A file-like object is returned as is with `owned` False; it belongs to
        whoever passed it in and is never closed here.
        """
        if isinstance(self.input, self._string_types):
            return open(self.input, 'rb'), True
        return self.input, False

    def skip_headers(self, input_file):
        """
        Skip file headers that appear before the first document.

        Consumes all leading lines starting with ``%`` plus the first line
        that does not (the size line), leaving `input_file` positioned at the
        first matrix entry.
        """
        for line in input_file:
            if self._to_text(line).startswith('%'):
                continue
            break

    def __iter__(self):
        """
        Iteratively yield vectors from the underlying file, in the format
        (doc_no, vector), where vector is a list of (term_no, value) 2-tuples.
        Ids are converted from the 1-based Matrix Market convention to 0-based.

        Note that the total number of vectors returned is always equal to the
        number of documents specified in the header; empty documents are
        inserted and yielded where appropriate, even if they are not
        explicitly stored in the Matrix Market file.
        """
        fin, owned = self._open_input()
        try:
            if not owned:
                # a caller-supplied stream may have been read before: rewind it
                fin.seek(0)
            self.skip_headers(fin)

            previd = -1
            document = []
            for line in fin:
                docid, termid, val = self._to_text(line).split()
                if not self.transposed:
                    # file lines are `term_id doc_id value`: swap the two ids
                    termid, docid = docid, termid
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is previd)
                    if previd >= 0:
                        yield previd, document

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for empty in self._empty_documents(previd + 1, docid):
                        yield empty

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

            # handle the last document, as a special case
            if previd >= 0:
                yield previd, document

            # return empty documents between the last explicit document and the number
            # of documents as specified in the header
            for empty in self._empty_documents(previd + 1, self.num_docs):
                yield empty
        finally:
            # also runs when the generator is closed before exhaustion
            if owned:
                fin.close()

    def docbyoffset(self, offset):
        """
        Return document at file offset `offset` (in bytes).

        The document is returned as a list of (term_no, value) 2-tuples: all
        consecutive entries, starting at `offset`, that share the document id
        of the first entry. An `offset` of -1 denotes an empty document.
        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        fin, owned = self._open_input()
        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                docid, termid, val = self._to_text(line).split()
                if not self.transposed:  ## line 567
                    # file lines are `term_id doc_id value`: swap the two ids
                    termid, docid = docid, termid
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        # reached the next document: the requested one is complete
                        return document
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
            return document
        finally:
            if owned:
                fin.close()
#endclass MmReader