import io
import os
import queue
import threading
import time
import tkinter as tk
from tkinter import filedialog, messagebox, ttk

import fitz  # PyMuPDF
import pikepdf
from PIL import Image
class PDFImageExtractorApp:
    """Tkinter GUI that extracts the images embedded in a PDF file.

    The user picks a PDF (and optionally types its password); the images
    are written to a folder next to the PDF. The extraction itself runs
    in a background thread so the window stays responsive.

    Threading model: Tk widgets are only ever touched from the main
    thread. The worker thread reports through ``log_message`` and
    ``update_progress``, which merely enqueue events; the main thread
    applies them in ``_drain_ui_queue``.
    """

    # Delay, in milliseconds, between two polls of the worker thread.
    _POLL_INTERVAL_MS = 100

    def __init__(self, root):
        """Build the window contents inside the given Tk root window."""
        self.root = root
        self.root.title("PDF图片提取工具")
        self.root.geometry("600x400")
        self.root.resizable(True, True)
        self.pdf_path = ""
        self.output_dir = ""
        self.password = ""
        self.is_running = False
        self.thread = None
        # Events produced by any thread, consumed by the main thread only.
        self._ui_queue = queue.Queue()
        # Outcome of the most recent extraction run; written by the worker,
        # read by the main thread once the worker has finished.
        self._last_run_ok = False

        # Main container.
        main_frame = ttk.Frame(root, padding="20")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # File selection row.
        file_frame = ttk.Frame(main_frame)
        file_frame.pack(fill=tk.X, pady=10)
        # Without a column weight the entry's W+E stickiness has no effect.
        file_frame.columnconfigure(1, weight=1)
        ttk.Label(file_frame, text="PDF文件:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=5)
        self.file_entry = ttk.Entry(file_frame, width=50)
        self.file_entry.grid(row=0, column=1, padx=5, pady=5, sticky=tk.W + tk.E)
        ttk.Button(file_frame, text="浏览...", command=self.browse_file).grid(row=0, column=2, padx=5, pady=5)

        # Password row.
        password_frame = ttk.Frame(main_frame)
        password_frame.pack(fill=tk.X, pady=5)
        ttk.Label(password_frame, text="PDF密码:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=5)
        self.password_entry = ttk.Entry(password_frame, width=30, show="*")
        self.password_entry.grid(row=0, column=1, padx=5, pady=5, sticky=tk.W)
        ttk.Label(password_frame, text="(可选,如PDF有密码)").grid(row=0, column=2, sticky=tk.W, padx=5, pady=5)

        # Progress bar.
        self.progress_var = tk.DoubleVar()
        self.progress_frame = ttk.Frame(main_frame)
        self.progress_frame.pack(fill=tk.X, pady=10)
        self.progress_bar = ttk.Progressbar(self.progress_frame, variable=self.progress_var, length=100)
        self.progress_bar.pack(fill=tk.X, padx=5, pady=5)

        # Read-only status log with a scrollbar.
        self.status_frame = ttk.Frame(main_frame)
        self.status_frame.pack(fill=tk.BOTH, expand=True)
        self.status_text = tk.Text(self.status_frame, height=10, width=70, wrap=tk.WORD)
        self.status_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5)
        scrollbar = ttk.Scrollbar(self.status_frame, command=self.status_text.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.status_text.config(yscrollcommand=scrollbar.set)
        self.status_text.config(state=tk.DISABLED)

        # Action buttons.
        button_frame = ttk.Frame(main_frame)
        button_frame.pack(fill=tk.X, pady=10)
        self.extract_btn = ttk.Button(button_frame, text="提取图片", command=self.start_extraction)
        self.extract_btn.pack(side=tk.RIGHT, padx=5)

    @staticmethod
    def _default_output_dir(pdf_path):
        """Return the default output folder for ``pdf_path``.

        The folder sits next to the PDF and is named after it without the
        extension. If the file has no extension that name would equal the
        file itself, so ``_images`` is appended to keep the path usable as
        a directory.
        """
        parent = os.path.dirname(pdf_path)
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        candidate = os.path.join(parent, stem)
        if candidate == pdf_path:
            candidate += "_images"
        return candidate

    def browse_file(self):
        """Open a file dialog and remember the chosen PDF and output folder."""
        file_path = filedialog.askopenfilename(
            title="选择PDF文件",
            filetypes=[("PDF文件", "*.pdf"), ("所有文件", "*.*")],
        )
        if not file_path:
            return
        self.pdf_path = file_path
        self.file_entry.delete(0, tk.END)
        self.file_entry.insert(0, file_path)
        # The output folder defaults to a folder named after the PDF.
        self.output_dir = self._default_output_dir(file_path)
        self.log_message(f"已选择PDF文件: {file_path}")
        self.log_message(f"图片将保存到: {self.output_dir}")
        # We are on the main thread here, so show the messages right away.
        self._drain_ui_queue()

    def log_message(self, message):
        """Queue ``message`` for display in the status box.

        Safe to call from any thread: nothing but the queue is touched.
        The text appears the next time the main thread drains the queue.
        """
        self._ui_queue.put(("log", message))

    def update_progress(self, value):
        """Queue a new progress-bar value (0-100). Safe from any thread."""
        self._ui_queue.put(("progress", value))

    def _drain_ui_queue(self):
        """Apply all queued log/progress events. Main thread only."""
        pending_lines = []
        latest_progress = None
        while True:
            try:
                kind, payload = self._ui_queue.get_nowait()
            except queue.Empty:
                break
            if kind == "log":
                pending_lines.append(payload)
            else:
                # Only the most recent progress value matters.
                latest_progress = payload
        if pending_lines:
            self.status_text.config(state=tk.NORMAL)
            self.status_text.insert(tk.END, "\n".join(pending_lines) + "\n")
            self.status_text.see(tk.END)
            self.status_text.config(state=tk.DISABLED)
        if latest_progress is not None:
            self.progress_var.set(latest_progress)

    def start_extraction(self):
        """Validate the input and start the extraction worker thread."""
        if self.is_running:
            messagebox.showinfo("提示", "正在处理中,请稍候...")
            return
        # Honour a path typed or pasted into the entry, not only one chosen
        # through the browse dialog.
        typed_path = self.file_entry.get().strip()
        if typed_path != self.pdf_path:
            self.pdf_path = typed_path
            self.output_dir = self._default_output_dir(typed_path) if typed_path else ""
        if not self.pdf_path:
            messagebox.showerror("错误", "请先选择PDF文件")
            return
        self.password = self.password_entry.get()
        # Disable the button for the duration of the run.
        self.extract_btn.config(state=tk.DISABLED)
        self.is_running = True
        self._last_run_ok = False
        # Flush anything still queued, then clear the status box.
        self._drain_ui_queue()
        self.status_text.config(state=tk.NORMAL)
        self.status_text.delete("1.0", tk.END)
        self.status_text.config(state=tk.DISABLED)
        # Run the extraction in the background.
        self.thread = threading.Thread(target=self.process_pdf)
        self.thread.daemon = True
        self.thread.start()
        # Poll the worker from the main thread.
        self.root.after(self._POLL_INTERVAL_MS, self.check_thread)

    def check_thread(self):
        """Poll the worker: show its queued output and report when it ends."""
        # Read liveness BEFORE draining: if the worker is already finished,
        # everything it produced is in the queue and gets displayed below.
        still_running = self.thread is not None and self.thread.is_alive()
        self._drain_ui_queue()
        if still_running:
            self.root.after(self._POLL_INTERVAL_MS, self.check_thread)
            return
        self.extract_btn.config(state=tk.NORMAL)
        self.is_running = False
        if self._last_run_ok:
            messagebox.showinfo("完成", "图片提取完成!")
        else:
            messagebox.showerror("错误", "图片提取失败,请查看日志信息")

    def check_pdf_encryption(self, pdf_path):
        """Return True if opening ``pdf_path`` without a password is refused.

        Exceptions other than ``pikepdf.PasswordError`` propagate to the
        caller.
        """
        try:
            with pikepdf.Pdf.open(pdf_path):
                return False  # opened without a password
        except pikepdf.PasswordError:
            return True  # a password is required

    def decrypt_pdf(self, pdf_path, password, output_path):
        """Save a decrypted copy of ``pdf_path`` to ``output_path``.

        Returns True on success and False (after logging) when ``password``
        is rejected. Other exceptions propagate to the caller.
        """
        try:
            with pikepdf.Pdf.open(pdf_path, password=password) as pdf:
                pdf.save(output_path)
            return True
        except pikepdf.PasswordError:
            self.log_message("密码错误,无法解密PDF")
            return False

    def extract_images_from_pdf(self, pdf_path, output_dir):
        """Write every image referenced by the pages of ``pdf_path``.

        Files are named ``page<P>_img<I>.<ext>`` inside ``output_dir``
        (created if missing). The bytes returned by PyMuPDF are written
        unchanged, so no re-encoding or quality loss occurs.

        Returns the number of image files written.
        """
        os.makedirs(output_dir, exist_ok=True)
        image_count = 0
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            for page_index in range(total_pages):
                page_number = page_index + 1
                image_list = doc[page_index].get_images(full=True)
                # Report progress.
                self.update_progress(page_number / total_pages * 100)
                self.log_message(f"正在处理第 {page_number}/{total_pages} 页...")
                for image_number, img in enumerate(image_list, start=1):
                    # The first item of each entry is the image's xref.
                    base_image = doc.extract_image(img[0])
                    image_ext = base_image["ext"]
                    image_filename = os.path.join(
                        output_dir, f"page{page_number}_img{image_number}.{image_ext}"
                    )
                    # Store the original bytes as-is.
                    with open(image_filename, "wb") as image_file:
                        image_file.write(base_image["image"])
                    image_count += 1
                    self.log_message(f"已保存: {os.path.basename(image_filename)}")
        finally:
            # Release the file handle; otherwise a temporary decrypted PDF
            # cannot be deleted afterwards on Windows.
            doc.close()
        self.log_message(f"成功提取了 {image_count} 张原始图片")
        return image_count

    def _remove_temp_file(self, path):
        """Delete ``path`` if it exists; log (do not raise) on failure."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError as exc:
            self.log_message(f"无法删除临时文件: {exc}")

    def process_pdf(self):
        """Worker entry point: decrypt if needed, then extract the images.

        Runs in the background thread. It never raises: failures are
        logged, and ``self._last_run_ok`` is True only after a fully
        successful run.
        """
        self._last_run_ok = False
        try:
            os.makedirs(self.output_dir, exist_ok=True)
            self.update_progress(0)
            if self.check_pdf_encryption(self.pdf_path):
                self.log_message("PDF文件已加密,尝试解密...")
                if not self.password:
                    self.log_message("错误: PDF文件已加密,但未提供密码")
                    return
                decrypted_pdf_path = os.path.join(
                    self.output_dir, "decrypted_" + os.path.basename(self.pdf_path)
                )
                try:
                    if not self.decrypt_pdf(self.pdf_path, self.password, decrypted_pdf_path):
                        self.log_message("PDF解密失败,无法提取图片")
                        return
                    self.log_message("PDF解密成功,开始提取图片...")
                    self.extract_images_from_pdf(decrypted_pdf_path, self.output_dir)
                finally:
                    # Never leave the decrypted copy on disk.
                    self._remove_temp_file(decrypted_pdf_path)
            else:
                self.log_message("PDF文件未加密,直接提取图片...")
                self.extract_images_from_pdf(self.pdf_path, self.output_dir)
            self.update_progress(100)
            self.log_message(f"处理完成! 图片保存在: {self.output_dir}")
            self._last_run_ok = True
        except Exception as e:
            # Top-level boundary of the worker thread: report, don't crash.
            self.log_message(f"处理出错: {str(e)}")
            self.update_progress(0)
if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the
    # application to it, and run the Tk event loop until the window closes.
    main_window = tk.Tk()
    extractor_app = PDFImageExtractorApp(main_window)
    main_window.mainloop()