0

我编写了一个小程序来监视一个文件夹:一旦有 .pdf 文件被放入该文件夹,程序就会在这个 .pdf 中搜索关键字,并输出一个新的 .txt 文件(列出包含关键字的页码)和一个新的 pdf 文件,该 pdf 文件只包含含有关键字的页面。

它适用于大多数 .pdf,但有些文件会表现出奇怪的行为:似乎有时它只搜索了第一页,而没有搜索其余页面。如果需要,我可以提供其中一个 pdf 的链接。

这是我的代码:

import fitz, glob, os, time
from watchdog.observers.polling import PollingObserver
from watchdog.events import PatternMatchingEventHandler
# Work relative to the watched folder.
os.chdir("C:/test/")
# Keywords searched for in every incoming PDF.
s1 = ["Siphone"]

if __name__ == "__main__":
    # React to PDFs only, and skip the "*done.pdf" files that on_created
    # writes itself so its own output does not trigger another run.
    patterns = ["*.pdf"]
    ignore_patterns = ["*done.pdf"]
    ignore_directories = True
    case_sensitive = True
    # Pass the options by keyword: this does not depend on the parameter
    # order and also works with watchdog releases whose constructor only
    # accepts keyword arguments.
    my_event_handler = PatternMatchingEventHandler(
        patterns=patterns,
        ignore_patterns=ignore_patterns,
        ignore_directories=ignore_directories,
        case_sensitive=case_sensitive,
    )

def on_created(event):
    """Search a newly created PDF for the keywords in ``s1`` and write the results.

    Two files are produced next to the source PDF:

    * ``<src>.txt`` -- one line per keyword in the form ``keyword:1,4,``
      listing the 1-based numbers of the pages that contain the keyword.
    * ``<src>_done.pdf`` -- a copy of only the matching pages with every
      keyword occurrence highlighted.  This file is skipped when no page
      matched, because PyMuPDF raises ``ValueError: cannot save with zero
      pages`` for an empty document.

    Args:
        event: The file system event; only ``event.src_path`` is read.
    """
    print("on_created", event.src_path)
    # NOTE(review): presumably gives the producer time to finish writing the
    # file before it is opened -- confirm that 2 seconds is enough.
    time.sleep(2)

    src_path = event.src_path
    txt = "%s.txt" % src_path
    output = "%s_done.pdf" % src_path

    pdf_document = fitz.open(src_path)
    try:
        # 0-based indices of every page containing at least one keyword.
        # A set removes duplicates when several keywords hit the same page.
        hit_pages = set()
        # The context manager guarantees the report is flushed and closed.
        with open(txt, "w") as f:
            for words in s1:
                f.write("%s:" % words)
                for current_page, page in enumerate(pdf_document):
                    if page.searchFor(words):
                        hit_pages.add(current_page)
                        # Page numbers in the report are 1-based.
                        f.write("%i," % (current_page + 1))
                f.write("\n")

        test_list = sorted(hit_pages)
        print(test_list)
        if not test_list:
            # Nothing to extract; saving an empty document would raise and
            # terminate the observer thread that called this handler.
            print("no keyword found, skipping", output)
            return

        doc = fitz.open()
        try:
            for p in test_list:
                doc.insertPDF(pdf_document, from_page=p, to_page=p)
            for page in doc:
                for keyword in s1:
                    for inst in page.searchFor(keyword):
                        page.addHighlightAnnot(inst)
            doc.save(output)
        finally:
            doc.close()
    finally:
        pdf_document.close()

# Route "file created" events from the handler to on_created.
my_event_handler.on_created = on_created

path = "C:/test/"
go_recursively = True
my_observer = PollingObserver()
my_observer.schedule(my_event_handler, path, recursive=go_recursively)
my_observer.start()
try:
    # Keep the main thread alive; events are dispatched on the observer's
    # own thread.
    while True:
        time.sleep(5)
except KeyboardInterrupt:
    # Ctrl-C leaves the loop so the program can actually terminate; the
    # shutdown itself happens in the finally clause below.
    pass
finally:
    my_observer.stop()
    my_observer.join()

一些pdf上出现以下错误(我假设pymupdf无法正确读取文件并且只搜索第0页):

Exception in thread Thread-1:
Traceback (most recent call last):
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\threading.py", line 954, in _bootstrap_inner
   self.run()
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\observers\api.py", line 199, in run
   self.dispatch_events(self.event_queue, self.timeout)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\observers\api.py", line 372, in dispatch_events
   handler.dispatch(event)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\events.py", line 382, in dispatch
   super().dispatch(event)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\events.py", line 261, in dispatch
   {
 File "C:\all\pdf\final_pdf_suche.py", line 51, in on_created
   doc.save(output)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\fitz\fitz.py", line 4206, in save
   raise ValueError("cannot save with zero pages")
ValueError: cannot save with zero pages

该关键字在 pdf 中多次出现,但程序却找不到它。

4

0 回答 0