打開軟件后界面如下:

點擊打開文件按鈕打開之前的PDF文件后效果如下:

框選區域后,標題欄會自動顯示當前框選的區域提取到的文字,還可以用左右按鈕切換頁面:

實際我們需要提取文字的區域可能不止這一個,所以程序支持多區域框選:

完成區域框選后就可以點擊保存文件,將PDF每頁提取到的文本保存到一個csv文件中,當前選區的保存結果如下:

可以看到已經按框選順序依次保存了每一個區域的字符串。
如果選擇區域時發現提取結果不準確,可以撤銷后重新選擇:

保存圖片則會將PDF的每頁的整體保存為一張圖片,未選擇區域時,以頁碼為文件名保存圖片:

選擇區域時,會自動以最后一個區域提取到的文本作為當前頁的文件名:

當然這個項目由于本人是第一次使用wxpython,功能非常簡約,現在將完整代碼開源出來,期待各位大佬的改進。
源碼和已編譯工具下載地址:
https://codechina.csdn.net/as604049322/python_gui
完整代碼:
"""
小小明的代碼
CSDN主頁:https://blog.csdn.net/as604049322
"""
# Module metadata (author name and creation date), kept as plain strings.
__author__ = '小小明'
__time__ = '2021/11/24'
import csv
import os

import fitz
import wx
class MyCanvas(wx.Panel):
    """Panel that displays a page bitmap and records rectangles dragged on it.

    The rectangles are kept in ``self.rects`` in selection order, each as
    ``[x, y, width, height]`` in panel coordinates.
    """

    def __init__(self, parent):
        """Create the toolbar buttons and progress gauge and bind the handlers.

        Args:
            parent: The containing window. It is stored as ``self.parent``;
                other methods call ``SetTitle``, ``SetSize`` and ``Center``
                on it.
        """
        wx.Panel.__init__(self, parent)
        self.parent = parent
        self.rects = []

        # A single handler receives press, release and motion and dispatches
        # on the event type itself.
        for mouse_event in (wx.EVT_LEFT_DOWN, wx.EVT_LEFT_UP, wx.EVT_MOTION):
            self.Bind(mouse_event, self.OnLeftButtonEvent)
        self.Bind(wx.EVT_PAINT, self.DoDrawing)

        # (label, position, handler, extra wx.Button keyword arguments).
        # NOTE(review): OnButton, save_img and back_select are bound here but
        # their definitions are not visible in this part of the file.
        # The fourth label had a stray "(qū)" annotation spliced into it;
        # the intended text is restored here.
        toolbar = (
            ("打開文件", (0, 0), self.OnButton, {}),
            ("保存文件", (75, 0), self.save_file, {}),
            ("保存圖片", (150, 0), self.save_img, {}),
            ("撤銷選區", (225, 0), self.back_select, {}),
            ("《", (300, 0), self.previous, {"size": (25, 25)}),
            ("》", (325, 0), self.next, {"size": (25, 25)}),
        )
        for label, position, handler, extra in toolbar:
            button = wx.Button(self, -1, label, position, **extra)
            self.Bind(wx.EVT_BUTTON, handler, button)

        # Vertical gauge with a range of 100, placed just below the buttons.
        self.g1 = wx.Gauge(self, -1, 100, (0, 30), (-1, 100), wx.GA_VERTICAL)
????def?previous(self,?evt):
????????if?not?hasattr(self,?"pdfDoc"):
????????????return
????????if?self.i?>?0:
????????????self.i?-=?1
????????????self.change_pdf_page(self.i,?False)
????????????self.DoDrawing(-1)
????????????if?self.rects:
????????????????self.parent.SetTitle(self.path?+?"|"?+?self.extract_pdf_text())
    # NOTE(review): this span is damaged and cannot run as written.
    # On the fourth line below, the text between "self.i" and '3d}"' was lost.
    # That gap swallowed the rest of next() and the beginning of whichever
    # method the remaining lines belong to (presumably save_img, which
    # __init__ binds to a button but which is defined nowhere else in the
    # visible file - TODO confirm).
    # The surviving tail uses `pix`, `path`, `name`, `dlg` and `i`, none of
    # which is assigned in the text that remains, so the intended logic cannot
    # be recovered from this file alone. Restore both methods from the
    # upstream source before relying on this code.
????def?next(self,?evt):
????????if?not?hasattr(self,?"pdfDoc"):
????????????return
????????if?self.i?3d}"
????????????????pix.save(f"{path}/{name}.png")
????????????????self.g1.SetValue((i?+?1)?*?100?//?self.pdfDoc.pageCount)
????????dlg.Destroy()
????????os.system(f"explorer?{path}")
????def?save_file(self,?evt):
????????if?not?hasattr(self,?"pdfDoc"):
????????????return
????????path?=?self.save_FileDialog()
????????if?path?is?None:
????????????return
????????data?=?[]
????????for?i?in?range(self.pdfDoc.pageCount):
????????????page?=?self.pdfDoc[i]
????????????row?=?[self.extract_pdf_text(page,?rect)
???????????????????for?i,?rect?in?enumerate(self.rects)]
????????????data.append(row)
????????with?open(path,?"w")?as?f:
????????????writer?=?csv.writer(f,?lineterminator="\n")
????????????row?=?[f"區(qū)域{i}"?for?i?in?range(1,?len(row)?+?1)]
????????????writer.writerow(row)
????????????for?row?in?data:
????????????????writer.writerow(row)
????????os.system(f"cmd?/c?start?{path}")
????def?extract_pdf_text(self,?page=None,?rect=None):
????????if?page?is?None:
????????????page?=?self.pdfDoc[self.i]
????????if?rect?is?None:
????????????rect?=?self.rects[-1]
????????a,?b,?c,?d?=?rect
????????clip?=?fitz.Rect(a,?b,?a?+?c,?b?+?d)
????????text?=?page.get_text(clip=clip).strip()
????????return?text
????def?change_img(self,?img_path,?move=True):
????????self.bmp?=?wx.Bitmap(img_path)
????????self.SetSize(self.bmp.GetSize())
????????self.parent.SetSize(self.parent.GetBestSize())
????????if?move:
????????????self.parent.Center()
????def?DoDrawing(self,?evt):
????????if?not?hasattr(self,?"bmp"):
????????????return
????????dc?=?wx.ClientDC(self)
????????dc.DrawBitmap(self.bmp,?0,?0,?True)
????????dc.SetPen(wx.Pen('blue'))
????????dc.SetBrush(wx.Brush('white',?wx.BRUSHSTYLE_TRANSPARENT))
????????dc.DrawRectangleList(self.rects)
????def?OnLeftButtonEvent(self,?event):
????????if?event.LeftDown():
????????????self.x,?self.y?=?event.GetPosition()
????????????self.rects.append([self.x,?self.y,?0,?0])
????????elif?event.Dragging():
????????????x,?y?=?event.GetPosition()
????????????self.rects[-1][2]?=?x?-?self.x
????????????self.rects[-1][3]?=?y?-?self.y
????????????self.DoDrawing(-1)
????????elif?event.LeftUp():
????????????print(self.rects)
????????????if?self.rects[-1][2]?5?or?self.rects[-1][3]?5:
????????????????self.rects.pop()
????????????else:
????????????????self.parent.SetTitle(self.path?+?"|"?+?self.extract_pdf_text())
# Build the main window with the canvas inside it, then run the event loop.
app = wx.App()
frm = wx.Frame(None)
pnl = MyCanvas(frm)
frm.Center()
frm.Show()
frm.SetTitle("PDF文本提取器")
app.MainLoop()