0

我有一个 pdf 文件,从中提取了两个语料库。返回关键字的函数在循环中被调用两次,每个语料库调用一次。对于第一个语料库,它可以很好地提取关键字。但是对于第二个语料库,我收到这样的错误:

File "/Users/rishi/Downloads/Lectures/idp/Rake/Rake/conceptmap/models/unsupervised/graphaware_nlp/graphaware.py", line 82, in process
   Graphaware.create_pipeline()
File "/Users/rishi/Downloads/Lectures/idp/Rake/Rake/conceptmap/models/unsupervised/graphaware_nlp/graphaware.py", line 151, in create_pipeline
   is_customstopwords_present = Graphaware.customStopWords_present()
File "/Users/rishi/Downloads/Lectures/idp/Rake/Rake/conceptmap/models/unsupervised/graphaware_nlp/graphaware.py", line 130, in customStopWords_present
   with Graphaware.driver.session() as session:
File "/Users/rishi/.virtualenvs/keyphrase-extraction-centrality/lib/python3.6/site-packages/neo4j/v1/direct.py", line 81, in session
   return Session(self._pool.acquire, access_mode, **parameters)
AttributeError: 'NoneType' object has no attribute 'acquire'

这些是日志中的尾巴:

Saptwarshis-MacBook-Pro:logs rishi$ tail neo4j.log
2018-08-09 11:31:40.633+0000 INFO  Taking default pipeline from configuration : customStopWords
2018-08-09 11:31:40.642+0000 INFO  Time for pipeline annotation: 8. Text lenght: 5
2018-08-09 11:31:40.643+0000 INFO  Start storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO  end storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO  Notifying listeners for event {}
2018-08-09 11:31:40.646+0000 INFO  Notifying listeners for event {}
2018-08-09 11:31:40.651+0000 WARN  Threshold hit after 1 iterations
2018-08-09 11:31:40.654+0000 INFO  --- Results:
2018-08-09 11:31:40.655+0000 INFO  AnnotatedText with ID 349 processed. Result: true
2018-08-09 11:31:40.661+0000 INFO  Notifying listeners for event {}
Saptwarshis-MacBook-Pro:logs rishi$ tail debug.log
2018-08-09 11:31:40.634+0000 INFO [c.g.n.u.ProcessorUtils] Taking default pipeline from configuration : customStopWords
2018-08-09 11:31:40.643+0000 INFO [c.g.n.p.s.StanfordTextProcessor] Time for pipeline annotation: 8. Text lenght: 5
2018-08-09 11:31:40.643+0000 INFO [c.g.n.p.p.AnnotatedTextPersister] Start storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO [c.g.n.p.p.AnnotatedTextPersister] end storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO [c.g.n.e.EventDispatcher] Notifying listeners for event {}
2018-08-09 11:31:40.647+0000 INFO [c.g.n.e.EventDispatcher] Notifying listeners for event {}
2018-08-09 11:31:40.652+0000 WARN [c.g.n.m.p.PageRank] Threshold hit after 1 iterations
2018-08-09 11:31:40.654+0000 INFO [c.g.n.m.t.TextRank] --- Results:
2018-08-09 11:31:40.655+0000 INFO [c.g.n.m.t.TextRankProcessor] AnnotatedText with ID 349 processed. Result: true
2018-08-09 11:31:40.661+0000 INFO [c.g.n.e.EventDispatcher] Notifying listeners for event {}
Saptwarshis-MacBook-Pro:logs rishi$

这是代码:

from conceptmap.models.base import NLPBaseItem
from neo4j.v1 import GraphDatabase
from operator import itemgetter

class Graphaware(NLPBaseItem):
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))


    def __init__(self, data):
        # self.lecturewise_generated_keywords = [[]] * len(data)
        self.lecturewise_generated_keywords = [[]]
        self.generated_keywords_for_each_lecture = []

        super(Graphaware, self).__init__(data=data)


    def process(self):
        self.lecturewise_generated_keywords = [[]] * len(self.data)
        lecturewise_list_of_text = self.data

        # Initialize pipeline
        Graphaware.create_pipeline()
        Graphaware.set_default_pipeline()
        Graphaware.set_fallback_lang()

        # create index for populating lecturewise_generated_keywords
        i = 0
        # select each lecture from lecturewise_list_of_text
        for list_of_text in lecturewise_list_of_text:
            # each lecture is a list of text: for ex, lec: ['text1', 'text2', 'text3', ...]
            lecture_delim  = ". "
            lecture_corpus = lecture_delim.join(list_of_text)
            # Create a node with property 'text' set to lecture_corpus
            Graphaware.add_text_node(lecture_corpus)
            # Annotate the text & generate the keywords
            Graphaware.annotate()
            Graphaware.generate_keywords()
            keywords = Graphaware.return_keywords()
            # Delete everything in the database
            Graphaware.clean_database()
            for keyword in keywords:
                self.generated_keywords_for_each_lecture.append((keyword, round(len(keyword) / 100, 3)))

            # sort generated_keywords_for_each_lecture according to scores of keyphrases
            self.generated_keywords_for_each_lecture.sort(key=itemgetter(1), reverse=True)
            # store all keywords from a lecture into an element of lecturewise_generated_keywords
            self.lecturewise_generated_keywords[i] = self.generated_keywords_for_each_lecture
            i = i + 1
            self.generated_keywords_for_each_lecture = []

        # Close the connection to the database
        Graphaware.driver.close()

    def score_keyphrases(self):
        keyphrases = self.lecturewise_generated_keywords
        print("keyphrases: {}".format(keyphrases))
        return keyphrases

    @staticmethod
    def customStopWords_present():
        """
        Check if pipeline 'customStopWords' already exists
        :return:
        """
        with Graphaware.driver.session() as session:
            result = session.run("CALL ga.nlp.processor.getPipelines")
            session.close()
            is_customstopwords_present = False
            pipeline_dict = result.data()
            num_pipelines = len(pipeline_dict)
            # print("num_pipelines: {}".format(num_pipelines))
            for i in range(0, num_pipelines):
                pipeline_i = pipeline_dict[i]
                pipeline_name = pipeline_i['name']
                if "customStopWords" == pipeline_name:
                    is_customstopwords_present = True
                    break
            return is_customstopwords_present

    @staticmethod
    def create_pipeline():
        """
        Create a pipeline
        :return:
        """
        is_customstopwords_present = Graphaware.customStopWords_present()
        if is_customstopwords_present:
            print("pipeline 'customStopWords' already exists")
        else:
            with Graphaware.driver.session() as session:
                session.run(
                "CALL ga.nlp.processor.addPipeline({textProcessor: 'com.graphaware.nlp.processor.stanford.StanfordTextProcessor', \
                name: 'customStopWords', processingSteps: {tokenize: true, ner: true, dependency: true, coref: true, relations: true, cleanxml: true, truecase: true}, \
                stopWords: '+, result, all, during', threadNumber: 20})")
                session.close()


    @staticmethod
    def set_default_pipeline():
        """
        Set a  default pipeline
        :return:
        """
        with Graphaware.driver.session() as session:
            session.run("CALL ga.nlp.processor.pipeline.default('customStopWords')")
            session.close()

    @staticmethod
    def set_fallback_lang():
        """
        Set fallback language to English
        :return:
        """
        with Graphaware.driver.session() as session:
            session.run("CALL ga.nlp.config.set('SETTING_fallbackLanguage', 'en')")
            session.close()

    @staticmethod
    def add_text_node(node_text):
        """
        Create a node with label 'Text' and set property 'text' of node to node_text
        :param node_text:
        :return:
        """
        with Graphaware.driver.session() as session:
            session.run("CREATE (n:Text {text: $node_text})", node_text=node_text)
            session.close()

    @staticmethod
    def annotate():
        """
        Annotate the text.
        The extraction is done via the annotate procedure which is the entry point to text information extraction
        :return:
        """
        with Graphaware.driver.session() as session:
            session.run("""\
            MATCH (n:Text)
            CALL ga.nlp.annotate({text: n.text, id: id(n)})
            YIELD result
            MERGE (n)-[:HAS_ANNOTATED_TEXT]->(result)
            RETURN result
            """)
            session.close()

    @staticmethod
    def generate_keywords():
        """
        Generate keywords
        :return:
        """
        with Graphaware.driver.session() as session:
            session.run("""\
            MATCH (a:AnnotatedText)
            CALL ga.nlp.ml.textRank({annotatedText: a, stopwords: '+,other,email', useDependencies: true})
            YIELD result RETURN result
            """)
            session.close()

    @staticmethod
    def return_keywords():
        with Graphaware.driver.session() as session:
            result = session.run("""\
            MATCH (n:Keyword) 
            RETURN n.originalTagValue
            """)
            session.close()
        keywords = result.data()
        keywordlist = []
        for keyword in keywords:
            keywordlist.append(keyword['n.originalTagValue'])
        return keywordlist
        # return df(result.data())

    @staticmethod
    def inspect_multiword_keyphrases():
        """
        Inspect multi-word keywords
        :return:
        """
        with Graphaware.driver.session() as session:
            result = session.run("""\
            MATCH (k:Keyword)-[:DESCRIBES]->(a:AnnotatedText)
            WHERE size(split(k.value, " "))>1
            RETURN k.value AS Keyphrase, count(*) AS n_lessons
            ORDER BY n_lessons DESC
            """)
            session.close()
            print("multi word keyphrases: {}".format(result.data()))

    @staticmethod
    def clean_database():
        """
        Delete everything in the database
        :return:
        """
        with Graphaware.driver.session() as session:
            session.run("""\
            MATCH (n)
            DETACH DELETE n
            """)
            session.close()

这是来自 neo4j/v1/direct.py 的代码:

class DirectDriver(Driver):
    """ A :class:`.DirectDriver` is created from a ``bolt`` URI and addresses
    a single database instance. This provides basic connectivity to any
    database service topology.
    """

    uri_scheme = "bolt"

    def __new__(cls, uri, **config):
        cls._check_uri(uri)
        if SocketAddress.parse_routing_context(uri):
            raise ValueError("Parameters are not supported with scheme 'bolt'. Given URI: '%s'." % uri)
        instance = object.__new__(cls)
        # We keep the address containing the host name or IP address exactly
        # as-is from the original URI. This means that every new connection
        # will carry out DNS resolution, leading to the possibility that
        # the connection pool may contain multiple IP address keys, one for
        # an old address and one for a new address.
        instance.address = SocketAddress.from_uri(uri, DEFAULT_PORT)
        instance.security_plan = security_plan = SecurityPlan.build(**config)
        instance.encrypted = security_plan.encrypted

        def connector(address, error_handler):
            return connect(address, security_plan.ssl_context, error_handler, **config)

        pool = DirectConnectionPool(connector, instance.address, **config)
        print("pool: {}".format(pool))
        pool.release(pool.acquire())
        print("pool after release: {}".format(pool))
        instance._pool = pool
        instance._max_retry_time = config.get("max_retry_time", default_config["max_retry_time"])
        print("max_retry_time: {}".format(instance._max_retry_time))
        return instance

    def session(self, access_mode=None, **parameters):
        if "max_retry_time" not in parameters:
            parameters["max_retry_time"] = self._max_retry_time
        return Session(self._pool.acquire, access_mode, **parameters)

似乎 self._pool 在第二次未初始化。有人可以帮我找出原因吗?

4

1 回答 1

0

好像我的driver对象管理不正确。我driver在错误的地方创建。在类定义时定义driver上述方法创建了一个实例,该实例在我的类的所有实例中共享。在方法内部移动对象创建解决了这个问题。__init__driverdriver_init_

于 2018-08-10T07:45:31.877 回答