Pickling error when running dask on Flair NER. It works fine when I don't use a distributed client, but then it doesn't seem to do any better on execution time.
My code is:
from dask.distributed import Client, progress
import dask
import time
import random
import pandas as pd
import numpy as np
import dask.bag as db
import dask.dataframe as ddf
from flair.data import Sentence
from flair.models import SequenceTagger
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups(subset="train")
# the tagger is loaded once at module level ("ner" here; the exact model shouldn't matter)
tagger = SequenceTagger.load("ner")

def getFlairEntities(text):
    # wrap the raw text in a flair Sentence and run the tagger on it
    sentence = Sentence(text)
    tagger.predict(sentence)
    entities = []
    for entity in sentence.get_spans('ner'):
        temp = {}
        temp['start_pos'] = entity.start_pos
        temp['end_pos'] = entity.end_pos
        temp['entity'] = entity.text
        temp['entity_type'] = entity.tag
        temp['score'] = round(entity.score, 4)
        entities.append(temp)
    return entities
def ner_flair(df):
    df["entities"] = df.text.map(getFlairEntities)
    return df
if __name__ == "__main__":
    client = Client(processes=True)

    df = pd.DataFrame()
    df = df.assign(text=data["data"]).assign(target=data["target"])
    dask_dataframe = ddf.from_pandas(df.head(20), npartitions=20)

    t0 = time.time()
    result = dask_dataframe.map_partitions(ner_flair)
    dask_entities = result.compute()
    t1 = time.time()
    print("Time to process with Dask {}".format(t1 - t0))
The error I get is:

PicklingError: Can't pickle <functools._lru_cache_wrapper object at 0x00000136957653A0>: it's not the same object as flair.embeddings.token.WordEmbeddings.get_cached_vec
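In case it helps narrow things down: my understanding is that dask has to pickle ner_flair together with whatever it references, and since it uses the module-level tagger, it ends up trying to pickle the loaded model (whose lru_cache-wrapped method apparently can't be pickled). Below is a minimal sketch of the kind of workaround I've been experimenting with, where the tagger is loaded lazily inside each worker process so only the text data gets serialized. The get_tagger helper and the "ner" model name are my own assumptions, not part of flair or dask.

from flair.data import Sentence
from flair.models import SequenceTagger

_tagger = None  # one copy per worker process, loaded lazily

def get_tagger():
    # load the model on first use inside the worker, so the SequenceTagger
    # object itself is never pickled and shipped by dask
    global _tagger
    if _tagger is None:
        _tagger = SequenceTagger.load("ner")  # model name assumed
    return _tagger

def getFlairEntities(text):
    sentence = Sentence(text)
    get_tagger().predict(sentence)
    entities = []
    for entity in sentence.get_spans('ner'):
        entities.append({
            'start_pos': entity.start_pos,
            'end_pos': entity.end_pos,
            'entity': entity.text,
            'entity_type': entity.tag,
            'score': round(entity.score, 4),
        })
    return entities

def ner_flair(df):
    df["entities"] = df.text.map(getFlairEntities)
    return df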