我有一个 pdf 文件,从中提取了两个语料库。返回关键字的函数在循环中被调用两次,每个语料库调用一次。对于第一个语料库,它可以很好地提取关键字。但是对于第二个语料库,我收到这样的错误:
File "/Users/rishi/Downloads/Lectures/idp/Rake/Rake/conceptmap/models/unsupervised/graphaware_nlp/graphaware.py", line 82, in process
Graphaware.create_pipeline()
File "/Users/rishi/Downloads/Lectures/idp/Rake/Rake/conceptmap/models/unsupervised/graphaware_nlp/graphaware.py", line 151, in create_pipeline
is_customstopwords_present = Graphaware.customStopWords_present()
File "/Users/rishi/Downloads/Lectures/idp/Rake/Rake/conceptmap/models/unsupervised/graphaware_nlp/graphaware.py", line 130, in customStopWords_present
with Graphaware.driver.session() as session:
File "/Users/rishi/.virtualenvs/keyphrase-extraction-centrality/lib/python3.6/site-packages/neo4j/v1/direct.py", line 81, in session
return Session(self._pool.acquire, access_mode, **parameters)
AttributeError: 'NoneType' object has no attribute 'acquire'
这些是日志中的尾巴:
Saptwarshis-MacBook-Pro:logs rishi$ tail neo4j.log
2018-08-09 11:31:40.633+0000 INFO Taking default pipeline from configuration : customStopWords
2018-08-09 11:31:40.642+0000 INFO Time for pipeline annotation: 8. Text lenght: 5
2018-08-09 11:31:40.643+0000 INFO Start storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO end storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO Notifying listeners for event {}
2018-08-09 11:31:40.646+0000 INFO Notifying listeners for event {}
2018-08-09 11:31:40.651+0000 WARN Threshold hit after 1 iterations
2018-08-09 11:31:40.654+0000 INFO --- Results:
2018-08-09 11:31:40.655+0000 INFO AnnotatedText with ID 349 processed. Result: true
2018-08-09 11:31:40.661+0000 INFO Notifying listeners for event {}
Saptwarshis-MacBook-Pro:logs rishi$ tail debug.log
2018-08-09 11:31:40.634+0000 INFO [c.g.n.u.ProcessorUtils] Taking default pipeline from configuration : customStopWords
2018-08-09 11:31:40.643+0000 INFO [c.g.n.p.s.StanfordTextProcessor] Time for pipeline annotation: 8. Text lenght: 5
2018-08-09 11:31:40.643+0000 INFO [c.g.n.p.p.AnnotatedTextPersister] Start storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO [c.g.n.p.p.AnnotatedTextPersister] end storing annotatedText 342
2018-08-09 11:31:40.645+0000 INFO [c.g.n.e.EventDispatcher] Notifying listeners for event {}
2018-08-09 11:31:40.647+0000 INFO [c.g.n.e.EventDispatcher] Notifying listeners for event {}
2018-08-09 11:31:40.652+0000 WARN [c.g.n.m.p.PageRank] Threshold hit after 1 iterations
2018-08-09 11:31:40.654+0000 INFO [c.g.n.m.t.TextRank] --- Results:
2018-08-09 11:31:40.655+0000 INFO [c.g.n.m.t.TextRankProcessor] AnnotatedText with ID 349 processed. Result: true
2018-08-09 11:31:40.661+0000 INFO [c.g.n.e.EventDispatcher] Notifying listeners for event {}
Saptwarshis-MacBook-Pro:logs rishi$
这是代码:
from conceptmap.models.base import NLPBaseItem
from neo4j.v1 import GraphDatabase
from operator import itemgetter
class Graphaware(NLPBaseItem):
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
def __init__(self, data):
# self.lecturewise_generated_keywords = [[]] * len(data)
self.lecturewise_generated_keywords = [[]]
self.generated_keywords_for_each_lecture = []
super(Graphaware, self).__init__(data=data)
def process(self):
self.lecturewise_generated_keywords = [[]] * len(self.data)
lecturewise_list_of_text = self.data
# Initialize pipeline
Graphaware.create_pipeline()
Graphaware.set_default_pipeline()
Graphaware.set_fallback_lang()
# create index for populating lecturewise_generated_keywords
i = 0
# select each lecture from lecturewise_list_of_text
for list_of_text in lecturewise_list_of_text:
# each lecture is a list of text: for ex, lec: ['text1', 'text2', 'text3', ...]
lecture_delim = ". "
lecture_corpus = lecture_delim.join(list_of_text)
# Create a node with property 'text' set to lecture_corpus
Graphaware.add_text_node(lecture_corpus)
# Annotate the text & generate the keywords
Graphaware.annotate()
Graphaware.generate_keywords()
keywords = Graphaware.return_keywords()
# Delete everything in the database
Graphaware.clean_database()
for keyword in keywords:
self.generated_keywords_for_each_lecture.append((keyword, round(len(keyword) / 100, 3)))
# sort generated_keywords_for_each_lecture according to scores of keyphrases
self.generated_keywords_for_each_lecture.sort(key=itemgetter(1), reverse=True)
# store all keywords from a lecture into an element of lecturewise_generated_keywords
self.lecturewise_generated_keywords[i] = self.generated_keywords_for_each_lecture
i = i + 1
self.generated_keywords_for_each_lecture = []
# Close the connection to the database
Graphaware.driver.close()
def score_keyphrases(self):
keyphrases = self.lecturewise_generated_keywords
print("keyphrases: {}".format(keyphrases))
return keyphrases
@staticmethod
def customStopWords_present():
"""
Check if pipeline 'customStopWords' already exists
:return:
"""
with Graphaware.driver.session() as session:
result = session.run("CALL ga.nlp.processor.getPipelines")
session.close()
is_customstopwords_present = False
pipeline_dict = result.data()
num_pipelines = len(pipeline_dict)
# print("num_pipelines: {}".format(num_pipelines))
for i in range(0, num_pipelines):
pipeline_i = pipeline_dict[i]
pipeline_name = pipeline_i['name']
if "customStopWords" == pipeline_name:
is_customstopwords_present = True
break
return is_customstopwords_present
@staticmethod
def create_pipeline():
"""
Create a pipeline
:return:
"""
is_customstopwords_present = Graphaware.customStopWords_present()
if is_customstopwords_present:
print("pipeline 'customStopWords' already exists")
else:
with Graphaware.driver.session() as session:
session.run(
"CALL ga.nlp.processor.addPipeline({textProcessor: 'com.graphaware.nlp.processor.stanford.StanfordTextProcessor', \
name: 'customStopWords', processingSteps: {tokenize: true, ner: true, dependency: true, coref: true, relations: true, cleanxml: true, truecase: true}, \
stopWords: '+, result, all, during', threadNumber: 20})")
session.close()
@staticmethod
def set_default_pipeline():
"""
Set a default pipeline
:return:
"""
with Graphaware.driver.session() as session:
session.run("CALL ga.nlp.processor.pipeline.default('customStopWords')")
session.close()
@staticmethod
def set_fallback_lang():
"""
Set fallback language to English
:return:
"""
with Graphaware.driver.session() as session:
session.run("CALL ga.nlp.config.set('SETTING_fallbackLanguage', 'en')")
session.close()
@staticmethod
def add_text_node(node_text):
"""
Create a node with label 'Text' and set property 'text' of node to node_text
:param node_text:
:return:
"""
with Graphaware.driver.session() as session:
session.run("CREATE (n:Text {text: $node_text})", node_text=node_text)
session.close()
@staticmethod
def annotate():
"""
Annotate the text.
The extraction is done via the annotate procedure which is the entry point to text information extraction
:return:
"""
with Graphaware.driver.session() as session:
session.run("""\
MATCH (n:Text)
CALL ga.nlp.annotate({text: n.text, id: id(n)})
YIELD result
MERGE (n)-[:HAS_ANNOTATED_TEXT]->(result)
RETURN result
""")
session.close()
@staticmethod
def generate_keywords():
"""
Generate keywords
:return:
"""
with Graphaware.driver.session() as session:
session.run("""\
MATCH (a:AnnotatedText)
CALL ga.nlp.ml.textRank({annotatedText: a, stopwords: '+,other,email', useDependencies: true})
YIELD result RETURN result
""")
session.close()
@staticmethod
def return_keywords():
with Graphaware.driver.session() as session:
result = session.run("""\
MATCH (n:Keyword)
RETURN n.originalTagValue
""")
session.close()
keywords = result.data()
keywordlist = []
for keyword in keywords:
keywordlist.append(keyword['n.originalTagValue'])
return keywordlist
# return df(result.data())
@staticmethod
def inspect_multiword_keyphrases():
"""
Inspect multi-word keywords
:return:
"""
with Graphaware.driver.session() as session:
result = session.run("""\
MATCH (k:Keyword)-[:DESCRIBES]->(a:AnnotatedText)
WHERE size(split(k.value, " "))>1
RETURN k.value AS Keyphrase, count(*) AS n_lessons
ORDER BY n_lessons DESC
""")
session.close()
print("multi word keyphrases: {}".format(result.data()))
@staticmethod
def clean_database():
"""
Delete everything in the database
:return:
"""
with Graphaware.driver.session() as session:
session.run("""\
MATCH (n)
DETACH DELETE n
""")
session.close()
这是来自 neo4j/v1/direct.py 的代码:
class DirectDriver(Driver):
""" A :class:`.DirectDriver` is created from a ``bolt`` URI and addresses
a single database instance. This provides basic connectivity to any
database service topology.
"""
uri_scheme = "bolt"
def __new__(cls, uri, **config):
cls._check_uri(uri)
if SocketAddress.parse_routing_context(uri):
raise ValueError("Parameters are not supported with scheme 'bolt'. Given URI: '%s'." % uri)
instance = object.__new__(cls)
# We keep the address containing the host name or IP address exactly
# as-is from the original URI. This means that every new connection
# will carry out DNS resolution, leading to the possibility that
# the connection pool may contain multiple IP address keys, one for
# an old address and one for a new address.
instance.address = SocketAddress.from_uri(uri, DEFAULT_PORT)
instance.security_plan = security_plan = SecurityPlan.build(**config)
instance.encrypted = security_plan.encrypted
def connector(address, error_handler):
return connect(address, security_plan.ssl_context, error_handler, **config)
pool = DirectConnectionPool(connector, instance.address, **config)
print("pool: {}".format(pool))
pool.release(pool.acquire())
print("pool after release: {}".format(pool))
instance._pool = pool
instance._max_retry_time = config.get("max_retry_time", default_config["max_retry_time"])
print("max_retry_time: {}".format(instance._max_retry_time))
return instance
def session(self, access_mode=None, **parameters):
if "max_retry_time" not in parameters:
parameters["max_retry_time"] = self._max_retry_time
return Session(self._pool.acquire, access_mode, **parameters)
似乎 self._pool 在第二次未初始化。有人可以帮我找出原因吗?