0

我正在使用 PyLucene 构建和搜索倒排文本索引。我写了下面这个类(不必担心这是 Python 代码,PyLucene 暴露的功能与 Java 版 Lucene 相同):

import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

class LuceneCtrl():
    """Small helper around a Lucene index stored in a filesystem directory.

    A fresh ``IndexWriter`` is opened for every call to ``index_documents``
    and a fresh ``DirectoryReader`` for every call to ``query_index``; both
    are always closed again before the method returns.
    """

    def __init__(self, index_dir):
        """Start the JVM if needed and prepare the directory and analyzer.

        Args:
            index_dir: Path of the directory that holds the index.
        """
        # initVM() raises ValueError when it is given vmargs while a JVM is
        # already running, so only start the VM once per process.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper = SimpleFSDirectory(Paths.get(self.index_dir))
        self.analyzer = StandardAnalyzer()
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 5000)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(content, id)`` pairs. ``content`` is stored
                in the analyzed "content" field, ``id`` in the "id" field.
        """
        writer_config = IndexWriterConfig(self.analyzer)
        writer = IndexWriter(self.dir_wrapper, writer_config)
        try:
            for content, doc_id in docs:
                doc = Document()
                # "content" must be analyzed to be searchable through the
                # QueryParser used in query_index, hence TextField.
                doc.add(Field("content", content, TextField.TYPE_STORED))
                doc.add(Field("id", doc_id, StringField.TYPE_STORED))
                writer.addDocument(doc)
            writer.commit()
        finally:
            # Always close the writer so the index write lock is released,
            # even if adding a document failed.
            writer.close()

    def query_index(self, query_terms, n_top=10):
        """Run a query against the "content" field and print the hits.

        All terms are combined with AND. For each hit a dict mapping field
        name to stored string value is printed.

        Args:
            query_terms: Query string in QueryParser syntax.
            n_top: Maximum number of hits to retrieve.
        """
        reader = DirectoryReader.open(self.dir_wrapper)
        try:
            searcher = IndexSearcher(reader)
            parser = QueryParser("content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            query = parser.parse(query_terms)
            scoreDocs = searcher.search(query, n_top).scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                table = {field.name(): field.stringValue()
                         for field in doc.getFields()}
                print(table)
        finally:
            # Release the reader's file handles even if parsing or
            # searching raised.
            reader.close()

我是 Lucene 的新手,我想知道每次运行 index_documents 和 query_index 函数时都重新创建 writer 和 reader 是否足够高效。我不能把更多的对象保存在类里吗?我试过把 reader 和 writer 保存为属性,但这会让进程崩溃。

编辑:这是我最终使用的类

import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

class LuceneCtrl():
    """Helper around a Lucene index that keeps its writer and reader open.

    One ``IndexWriter`` is created in ``__init__`` and reused for every
    ``index_documents`` call. The ``DirectoryReader`` / ``IndexSearcher``
    pair is created on the first ``query_index`` call and replaced only when
    ``DirectoryReader.openIfChanged`` reports a newer view of the index.
    Call ``close()`` when done to release the writer and reader.
    """

    def __init__(self, index_dir):
        """Start the JVM if needed and open the long-lived writer.

        Args:
            index_dir: Path of the directory that holds the index.
        """
        # initVM() raises ValueError when it is given vmargs while a JVM is
        # already running, so only start the VM once per process.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper_reader = SimpleFSDirectory(Paths.get(self.index_dir))
        self.dir_wrapper_writer = SimpleFSDirectory(Paths.get(self.index_dir))
        self.analyzer = StandardAnalyzer()
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 5000)
        # Reader and searcher are created lazily by query_index.
        self.reader = None
        self.searcher = None
        self.writer_config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.dir_wrapper_writer, self.writer_config)

    def index_documents(self, docs):
        """Add documents through the shared writer and commit them.

        Args:
            docs: Iterable of ``(text, id)`` pairs. ``text`` is stored in
                the analyzed "text" field, ``id`` in the "id" field.
        """
        for text, id_ in docs:
            doc = Document()
            doc.add(Field("text", text, TextField.TYPE_STORED))
            doc.add(Field("id", id_, StringField.TYPE_STORED))
            self.writer.addDocument(doc)
        self.writer.commit()

    def _refresh_searcher(self):
        """Make ``self.reader`` / ``self.searcher`` current."""
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir_wrapper_reader)
            self.searcher = IndexSearcher(self.reader)
            return
        # openIfChanged returns null (None) when the index is unchanged.
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            stale_reader = self.reader
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)
            # The replaced reader is no longer referenced by this object;
            # close it so its file handles are released.
            stale_reader.close()

    def query_index(self, tokens, operator='AND',n_top=10):
        """Search the "text" field and return the top hits.

        Args:
            tokens: Query string in QueryParser syntax.
            operator: ``'and'`` or ``'+'`` (case-insensitive) selects AND as
                the default operator; any other value selects OR.
            n_top: Maximum number of hits to retrieve.

        Returns:
            The ``scoreDocs`` array of the search result.
        """
        self._refresh_searcher()
        parser = QueryParser("text", self.analyzer)
        if operator.lower() in ('and', '+'):
            parser.setDefaultOperator(QueryParser.Operator.AND)
        else:
            parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(tokens)
        scoreDocs = self.searcher.search(query, n_top).scoreDocs
        return scoreDocs

    def close(self):
        """Close the reader (if one was opened) and the writer."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
            self.searcher = None
        if self.writer is not None:
            # Closing the writer releases the index write lock.
            self.writer.close()
            self.writer = None
4

0 回答 0