我一直大量使用 Python 的 tika 库从 PDF 文件中提取文本。突然之间,以下代码以及类似的代码都无法正常运行了:
from tika import parser
document = parser.from_file("prova.pdf")['content']
或者
from tika import parser
parser.from_file("C:/Users/Daniele/Desktop/progetto_tesi_magistrale/prova.pdf")['content']
每次运行我都会收到如下错误:
2021-02-23 10:57:36,244 [MainThread ] [INFO ] Retrieving C:\Program Files\tika-server-1.24.1.jar to C:\Users\Daniele\AppData\Local\Temp\tika-server.jar.
---------------------------------------------------------------------------
URLError Traceback (most recent call last)
~\anaconda3\lib\site-packages\tika\tika.py in getRemoteJar(urlOrPath, destPath)
797 try:
--> 798 urlretrieve(urlOrPath, destPath)
799 except IOError:
~\anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525 response = self._open(req, data)
526
~\anaconda3\lib\urllib\request.py in _open(self, req, data)
546
--> 547 return self._call_chain(self.handle_open, 'unknown',
548 'unknown_open', req)
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
~\anaconda3\lib\urllib\request.py in unknown_open(self, req)
1420 type = req.type
-> 1421 raise URLError('unknown url type: %s' % type)
1422
URLError: <urlopen error unknown url type: c>
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
<ipython-input-4-5aa5aa48deec> in <module>
1 from tika import parser
2
----> 3 document = parser.from_file("prova.pdf")['content']
4 #import tika
5 #from tika import parser
~\anaconda3\lib\site-packages\tika\parser.py in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions)
38 '''
39 if not xmlContent:
---> 40 output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
41 else:
42 output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
~\anaconda3\lib\site-packages\tika\tika.py in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)
334 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
335 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
--> 336 status, response = callServer('put', serverEndpoint, service, f,
337 headers, verbose, tikaServerJar, config_path=config_path,
338 rawResponse=rawResponse, requestOptions=requestOptions)
~\anaconda3\lib\site-packages\tika\tika.py in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)
529 global TikaClientOnly
530 if not TikaClientOnly:
--> 531 serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
532
533 serviceUrl = serverEndpoint + service
~\anaconda3\lib\site-packages\tika\tika.py in checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
590 if not alreadyRunning:
591 if not os.path.isfile(jarPath) and urlp.scheme != '':
--> 592 getRemoteJar(tikaServerJar, jarPath)
593
594 if not checkJarSig(tikaServerJar, jarPath):
~\anaconda3\lib\site-packages\tika\tika.py in getRemoteJar(urlOrPath, destPath)
806 if os.path.exists(destPath) and os.path.isfile(destPath):
807 os.remove(destPath)
--> 808 urlretrieve(urlOrPath, destPath)
809
810 return (destPath, 'remote')
~\anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
523
524 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525 response = self._open(req, data)
526
527 # post-process response
~\anaconda3\lib\urllib\request.py in _open(self, req, data)
545 return result
546
--> 547 return self._call_chain(self.handle_open, 'unknown',
548 'unknown_open', req)
549
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\anaconda3\lib\urllib\request.py in unknown_open(self, req)
1419 def unknown_open(self, req):
1420 type = req.type
-> 1421 raise URLError('unknown url type: %s' % type)
1422
1423 def parse_keqv_list(l):
URLError: <urlopen error unknown url type: c>
我尝试过卸载 tika-python、Tika Server、Java、Python……基本上所有相关的东西。奇怪的是,我的第二台电脑也突然出现了同样的问题。有什么建议吗?非常感谢。