# 这个脚本从 PDF 中提取文字,再用提取到的文字把 PDF 文件重命名为中文名,方便以后搜索和查看。
# [Python] 纯文本查看 复制代码
import os
import re
import random
from pdfplumber import open as pdf_open
from tkinter import Tk, filedialog, Text, Button, Scrollbar, END
import threading
import time # 提示缺模块,pip安装即可,python3.7以上皆可正常用
def extract_title_efficiently(pdf_path):
    """Return the title announced by a ``标题:`` marker on the first page.

    The text of the first page is searched for the first occurrence of
    ``标题:`` followed by at least one character; the rest of that line is
    the title.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The captured title with surrounding whitespace removed, or None
        when the marker is absent or when any exception is raised while
        opening or reading the PDF.
    """
    try:
        with pdf_open(pdf_path) as document:
            first_page_text = document.pages[0].extract_text()
            found = re.search(r'标题:(.+)', first_page_text)
            return found.group(1).strip() if found else None
    except Exception:
        # Best effort: every failure is reported to the caller as "no title".
        return None
def extract_chinese_text_from_first_four_lines(pdf_path):
    """Build a fallback title from the first four lines of the first page.

    The first page's text is stripped and split on newlines; the first four
    lines are joined with spaces, and every character that is neither in
    the range U+4E00-U+9FFF nor a digit is removed.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The filtered string (possibly empty), or None when any exception
        is raised while opening or reading the PDF.
    """
    try:
        with pdf_open(pdf_path) as document:
            page_text = document.pages[0].extract_text().strip()
        leading_lines = page_text.split('\n')[:4]
        joined = ' '.join(leading_lines).strip()
        return re.sub(r'[^\u4e00-\u9fff\d]+', '', joined)
    except Exception:
        # Best effort: every failure is reported to the caller as "no text".
        return None
def generate_valid_filename(text, max_length=50):
    """Turn arbitrary text into a string that is safe to use in a file name.

    The text is first truncated to ``max_length`` characters; then every run
    of characters that may not appear in a file name is collapsed into a
    single ``-``. The replaced set is ``< > : " / \\ | ? *`` plus the ASCII
    control characters (0x00-0x1F and 0x7F). ``<`` and ``>`` are included
    because Windows rejects them just like the other reserved characters.

    Args:
        text: Source text (for example a title extracted from a PDF).
        max_length: Maximum number of characters of ``text`` to keep.

    Returns:
        The truncated, sanitized string.
    """
    truncated = text[:max_length]
    return re.sub(r'[<>:"/\\|?*\x00-\x1F\x7F]+', '-', truncated)
def generate_unique_filename(folder_path, filename, new_filename):
    """Return a file name of the form ``<old stem>-<new name><ext>`` that is unused.

    The preferred name is the original stem, a dash, ``new_filename`` and the
    original extension. If a file with that name already exists in
    ``folder_path``, a numeric suffix (``-1``, ``-2``, ...) is appended to
    the stem until a free name is found.

    Args:
        folder_path: Directory in which the name must be unique.
        filename: Current file name; supplies the stem and the extension.
        new_filename: Text to append to the stem (without extension).

    Returns:
        The unique file name, including the extension but not the directory.
    """
    base, extension = os.path.splitext(filename)
    stem = f"{base}-{new_filename}"
    candidate = stem + extension
    counter = 1
    # On a collision vary only the numeric suffix, so the name stays short
    # and predictable instead of growing on every attempt.
    while os.path.exists(os.path.join(folder_path, candidate)):
        candidate = f"{stem}-{counter}{extension}"
        counter += 1
    return candidate
def load_skipped_files(skipped_file_path):
    """Read the names of previously skipped files.

    Args:
        skipped_file_path: Path of a UTF-8 text file holding one file name
            per line.

    Returns:
        A set with every line of the file, stripped of surrounding
        whitespace; an empty set when the file does not exist.
    """
    if not os.path.exists(skipped_file_path):
        return set()
    with open(skipped_file_path, 'r', encoding='utf-8') as handle:
        return set(map(str.strip, handle))
def record_skipped_file(skipped_file_path, filename):
    """Append one file name to the skipped-files list.

    Args:
        skipped_file_path: Path of the UTF-8 text file to append to; it is
            created when missing.
        filename: Name to record; written on its own line.
    """
    entry = filename + '\n'
    with open(skipped_file_path, 'a', encoding='utf-8') as handle:
        handle.write(entry)
def rename_pdf_with_title_or_extracted_text(folder_path, text_box, log_file_path, skipped_file_path):
    """Rename every PDF below ``folder_path`` and report progress.

    Args:
        folder_path: Directory to process; an error message is shown and
            nothing else happens when it is not a directory.
        text_box: Widget that receives progress messages through
            ``update_text_box``.
        log_file_path: File to which ``walk_folder`` appends rename records.
        skipped_file_path: File listing names to skip; it is loaded before
            the walk and passed on to ``walk_folder``.
    """
    if os.path.isdir(folder_path):
        previously_skipped = load_skipped_files(skipped_file_path)
        update_text_box(text_box, f"正在处理文件夹:{folder_path}")
        walk_folder(folder_path, previously_skipped, text_box, log_file_path, skipped_file_path)
        update_text_box(text_box, "所有文件改名已完成。")
    else:
        update_text_box(text_box, "错误:提供的路径不是一个目录。")
def walk_folder(folder_path, skipped_files, text_box, log_file_path, skipped_file_path):
    """Walk ``folder_path`` recursively and rename each eligible PDF.

    A file is eligible when its name ends with ``.pdf`` (case-insensitive)
    and the name is not in ``skipped_files``. For each eligible file the
    title is taken from ``extract_title_efficiently`` and, when that yields
    nothing, from ``extract_chinese_text_from_first_four_lines``. A file
    with a title is renamed in place and the rename is appended to the log;
    a file without one, or one whose processing raises, is recorded in the
    skipped-files list.

    Args:
        folder_path: Root directory of the walk.
        skipped_files: Collection of bare file names to leave untouched.
        text_box: Widget that receives progress messages.
        log_file_path: UTF-8 text file to which rename records are appended.
        skipped_file_path: UTF-8 text file to which skipped names are appended.
    """
    for current_dir, _subdirs, files in os.walk(folder_path):
        for filename in sorted(files):
            is_pdf = filename.lower().endswith('.pdf')
            if not is_pdf or filename in skipped_files:
                continue

            pdf_path = os.path.join(current_dir, filename)
            started_at = time.time()  # start of processing for this file
            try:
                # Prefer the explicit title marker; otherwise fall back to
                # the text of the first four lines.
                title = (
                    extract_title_efficiently(pdf_path)
                    or extract_chinese_text_from_first_four_lines(pdf_path)
                )
                if title:
                    new_filename = generate_unique_filename(
                        current_dir, filename, generate_valid_filename(title)
                    )
                    os.rename(pdf_path, os.path.join(current_dir, new_filename))
                    renamed_message = f"文件 '{filename}' 已重命名为 '{new_filename}'"
                    with open(log_file_path, 'a', encoding='utf-8') as log_file:
                        log_file.write(renamed_message + '\n')
                    update_text_box(text_box, renamed_message)
                else:
                    record_skipped_file(skipped_file_path, filename)
                    update_text_box(text_box, f"跳过文件:'{filename}'(未找到标题或中文文本)")
            except Exception:
                record_skipped_file(skipped_file_path, filename)
                update_text_box(text_box, f"跳过文件:'{filename}'(处理时发生错误)")

            # NOTE(review): this check runs after the work is already done,
            # whatever its outcome, so a slow file may be recorded a second
            # time or reported as skipped although it was renamed above.
            if time.time() - started_at > 10:
                record_skipped_file(skipped_file_path, filename)
                update_text_box(text_box, f"跳过文件:'{filename}'(处理时间超过10s)")
def update_text_box(text_box, message):
    """Append ``message`` as a new line to ``text_box`` and scroll to it.

    The widget is switched to the ``normal`` state for the insertion and
    back to ``disabled`` afterwards.

    Args:
        text_box: Text widget to append to.
        message: Text to show; a newline is added after it.
    """
    text_box.config(state='normal')
    line = message + '\n'
    text_box.insert(END, line)
    text_box.yview(END)  # keep the newest line visible
    text_box.config(state='disabled')
def do_renaming_thread(folder_path, text_box, log_file_path, skipped_file_path):
    """Thread target: forward all arguments to the renaming routine.

    Args:
        folder_path: Directory to process.
        text_box: Widget that receives progress messages.
        log_file_path: File to which rename records are appended.
        skipped_file_path: File listing skipped file names.
    """
    rename_pdf_with_title_or_extracted_text(
        folder_path=folder_path,
        text_box=text_box,
        log_file_path=log_file_path,
        skipped_file_path=skipped_file_path,
    )
# Create the main window.
root = Tk()
root.title("PDF Renamer by barnett2016")
# Create the text box used to display progress messages.
text_box = Text(root, height=10)
text_box.pack()
# Create the vertical scrollbar; it drives the text box's vertical view.
scrollbar = Scrollbar(root, orient='vertical', command=text_box.yview)
scrollbar.pack(side='right', fill='y')
# Link the text box back to the scrollbar so the slider follows scrolling.
text_box.config(yscrollcommand=scrollbar.set)
# Holds the folder path chosen by the user: set by browse_folder(), read by
# do_renaming(); None until a folder has been chosen.
selected_folder = None
def browse_folder():
    """Ask the user for a folder and remember it in ``selected_folder``.

    When the dialog returns an empty value (for example because it was
    cancelled), the previously selected folder is kept rather than being
    overwritten, so a stray cancel does not discard an earlier choice.
    """
    global selected_folder
    chosen = filedialog.askdirectory()
    if not chosen:
        # Nothing chosen: leave the earlier selection untouched.
        return
    selected_folder = chosen
    update_text_box(text_box, f"请选择的文件夹:{selected_folder}")
def do_renaming():
    """Start the renaming job for the selected folder in a worker thread.

    Shows an error message when no folder has been selected. Otherwise the
    rename log (``改名记录.txt``) and the skipped-files list
    (``skipped.txt``) are placed inside the selected folder and the work is
    handed to ``do_renaming_thread`` on a new thread.
    """
    if not selected_folder:
        update_text_box(text_box, "错误:请先选择文件夹。")
        return

    log_file_path = os.path.join(selected_folder, "改名记录.txt")
    skipped_file_path = os.path.join(selected_folder, "skipped.txt")
    worker = threading.Thread(
        target=do_renaming_thread,
        args=(selected_folder, text_box, log_file_path, skipped_file_path),
    )
    worker.start()
# Create the button that opens the folder-selection dialog.
browse_button = Button(root, text="打开PDF所在文件夹", command=browse_folder)
browse_button.pack()
# Create the button that starts the renaming job.
rename_button = Button(root, text="开始重命名PDF文件", command=do_renaming)
rename_button.pack()
# Start the GUI main loop.
root.mainloop()