二、软件功能:
1. 根据 template.docx 中的占位符定位 Word 表格数据所在的坐标(表序、行、列),再批量提取 Files 文件夹下其它同一版式 Word 文件中对应位置的数据,写入 Excel 表。
2. 可以把多个 Word 文件的数据批量提取并存入 Excel 表,每个文件占一行;界面会显示执行状态。
3. 提取对象主要是Word表格中用占位符标记的数据。
三、使用方法
1. 模板中用 {{标记名}} 标注要提取的单元格,点击“导入模板”按钮,右侧文本框会显示各标记的坐标与标记名。
2. 点击“提取文件标记信息”按钮,软件按从上到下、从左到右的顺序提取数据,并遍历 Files 下所有 .docx 文件(.doc 文件需先转换为 .docx),在相同的(表序, 行, 列)处取值,写入 汇总.xlsx。
四、注意事项
1. 首先要设定Word文件表格模板,要提取的数据要放入双花括号中。
2. 每个Word文件的表格列数、行数一致、版式相同,以便能够提取数据。
3. 更换Word文件时,注意修改模板文件。
五、软件截图

六、代码展示
[Python] 纯文本查看 复制代码
# -*- coding: utf-8 -*-
"""
基于模板标记的 Word 表格批量提取工具 (tkinter)
- 模板中用 {{标记名}} 标注要提取的单元格
- 解析模板所有表格,按 从上到下、从左到右 顺序记录 (表序, 行, 列, 标记名)
- 遍历 ./Files 下所有 .docx ,doc要转为docx, 在相同(表序, 行, 列) 处取值写入 ./汇总.xlsx
- UI 左:导入模板 / 提取文件标记信息 / 退出程序;右:Text 显示坐标与标记
"""
import os
import re
import traceback
import tkinter as tk
from tkinter import ttk, messagebox, filedialog
# 依赖
# Probe the third-party dependencies up front so that a missing package yields
# one actionable message listing everything to install, instead of a bare
# ImportError traceback for only the first missing package.
missing = []
try:
    from docx import Document
except ImportError:
    missing.append("python-docx")
try:
    from openpyxl import Workbook
except ImportError:
    missing.append("openpyxl")
if missing:
    # Exit at import time: nothing below can work without these packages.
    raise SystemExit("缺少依赖:{}\n请先执行:pip install {}".format(", ".join(missing), " ".join(missing)))
# Both the input folder and the output workbook are resolved against the
# current working directory at import time.
_WORK_DIR = os.getcwd()
FILES_DIR = os.path.join(_WORK_DIR, "Files")
OUTPUT_XLSX = os.path.join(_WORK_DIR, "汇总.xlsx")
# Matches a {{mark name}} placeholder; group 1 is the non-greedy text between
# the double braces.
MARK_PATTERN = re.compile(r"\{\{(.+?)\}\}")
# ----------------- Word 表解析:合并单元格锚点模型 -----------------
def _get_tcPr_prop(cell, name):
    """Return attribute ``name`` of the cell's ``tcPr`` element, or None.

    Args:
        cell: Object exposing ``_tc.tcPr`` (presumably a python-docx table
            cell; this reaches into its private XML element).
        name: Attribute to read from ``tcPr`` (callers in this file pass
            "gridSpan" and "vMerge").

    Returns:
        The attribute value, or None when ``tcPr`` is None or when any step of
        the lookup raises.
    """
    try:
        tcPr = cell._tc.tcPr
        if tcPr is None:
            return None
        return getattr(tcPr, name)
    except Exception:
        # Deliberately broad: this reads private internals, and any failure
        # is treated as "property not set" rather than aborting the parse.
        return None
def _to_int(val, default=1):
    """Convert ``val`` to int via its stripped string form.

    Args:
        val: Any value; it is passed through ``str()`` and stripped of
            surrounding whitespace before conversion.
        default: Value returned when the text is not a valid integer literal.

    Returns:
        The parsed integer, or ``default`` when parsing fails (for example
        ``None``, ``""`` or ``"2.5"``).
    """
    try:
        return int(str(val).strip())
    except (TypeError, ValueError):
        return default
class GridCell:
    """One slot of the rectangular grid built from a Word table.

    Every slot covered by a merged region refers to the region's top-left
    slot (the anchor); only the anchor slot itself is marked visible and
    carries the text and the span sizes.
    """

    __slots__ = ("anchor", "visible", "text", "rowspan", "colspan")

    def __init__(self, anchor=None, visible=False, text="", rowspan=1, colspan=1):
        # (row, col) of the anchor slot this slot belongs to, or None.
        self.anchor = anchor
        # True only for the anchor slot of a region.
        self.visible = visible
        # Cell text; filled on the anchor slot, "" on covered slots.
        self.text = text
        # Span sizes; meaningful on the anchor slot, 1 elsewhere.
        self.rowspan = rowspan
        self.colspan = colspan

    def __repr__(self):
        return (
            f"{type(self).__name__}(anchor={self.anchor!r}, "
            f"visible={self.visible!r}, text={self.text!r}, "
            f"rowspan={self.rowspan!r}, colspan={self.colspan!r})"
        )
def build_table_grid(table):
"""将 python-docx 的表解析为仅锚点可见的网格,处理 gridSpan / vMerge(continue/restart)"""
# 列数(考虑横向合并)
row_col_counts = []
for row in table.rows:
cnt = 0
for cell in row.cells:
gs = _get_tcPr_prop(cell, "gridSpan")
val = getattr(gs, "val", None) if (gs is not None) else None
colspan = _to_int(val, 1) if (val is not None) else 1
cnt += max(1, colspan)
row_col_counts.append(cnt)
n_rows = len(table.rows)
n_cols = max(row_col_counts) if row_col_counts else 0
grid = [[None for _ in range(n_cols)] for _ in range(n_rows)]
anchors = {}
def occupy(ar, ac, rs, cs, text):
for rr in range(ar, ar + rs):
for cc in range(ac, ac + cs):
grid[rr][cc] = GridCell(anchor=(ar, ac),
visible=(rr == ar and cc == ac),
text=(text if rr == ar and cc == ac else ""),
rowspan=(rs if rr == ar and cc == ac else 1),
colspan=(cs if rr == ar and cc == ac else 1))
for r, row in enumerate(table.rows):
c = 0
for cell in row.cells:
# 跳到空位
while c = n_cols:
break
gs = _get_tcPr_prop(cell, "gridSpan")
colspan = _to_int(getattr(gs, "val", 1), 1) if (gs is not None) else 1
colspan = max(1, colspan)
vm = _get_tcPr_prop(cell, "vMerge")
vm_val = None
if vm is not None:
try:
vm_val = (str(vm.val).strip().lower() if vm.val is not None else None)
except Exception:
vm_val = None
# 兼容 与 w:val="continue"
is_continue = (vm is not None) and (vm_val is None or vm_val == "continue")
text = cell.text.replace("\n", " ").strip()
if not is_continue:
# 普通/起点:创建锚点
occupy(r, c, 1, colspan, text)
anchors[(r, c)] = {"rowspan": 1, "colspan": colspan}
else:
# 继续:向上找锚点并扩展
rr = r - 1
anchor_found = None
while rr >= 0 and anchor_found is None:
gc = grid[rr][c]
if gc is not None:
anchor_found = gc.anchor
break
rr -= 1
if anchor_found is None:
occupy(r, c, 1, colspan, text)
anchors[(r, c)] = {"rowspan": 1, "colspan": colspan}
else:
ar, ac = anchor_found
meta = anchors.get((ar, ac))
if meta:
meta["rowspan"] += 1
for cc in range(ac, ac + meta["colspan"]):
grid[r][cc] = GridCell(anchor=(ar, ac), visible=False)
else:
occupy(r, c, 1, colspan, text)
anchors[(r, c)] = {"rowspan": 1, "colspan": colspan}
c += colspan
while c {{ {m['name']} }}")
self.text.insert("end", "\n".join(lines))
def on_extract_all(self):
if not self.marks:
messagebox.showinfo("提示", "请先导入包含 {{标记}} 的模板。")
return
if not os.path.isdir(FILES_DIR):
messagebox.showwarning("提示", f"未找到目录:{FILES_DIR}")
return
docx_paths = [os.path.join(FILES_DIR, f) for f in os.listdir(FILES_DIR) if f.lower().endswith(".docx")]
if not docx_paths:
messagebox.showinfo("提示", "Files 目录中没有 .docx 文件。")
return
self.set_status("正在遍历 Files 提取数据...")
# 准备列头
#headers = ["文件名"] + [m["name"] for m in self.marks]
rows_out = []
for p in docx_paths:
try:
doc = Document(p)
except Exception as e:
fname = os.path.splitext(os.path.basename(p))[0]
rows_out.append([fname] + [""]*len(self.marks))
continue
values = []
for m in self.marks:
ti, r, c = m["table"], m["row"], m["col"]
if ti >= len(doc.tables):
values.append("")
continue
grid, n_rows, n_cols = build_table_grid(doc.tables[ti])
if not (0 >> 在这里加过滤行
rows_out = [row for row in rows_out if any(cell for cell in row[1:])]
wb = Workbook()
ws = wb.active
ws.title = "提取结果"
#ws.append(headers)
for row in rows_out:
ws.append(row)
wb.save(OUTPUT_XLSX)
self.set_status(f"已写入:{OUTPUT_XLSX}")
messagebox.showinfo("完成", f"提取完成,已写入:\n{OUTPUT_XLSX}")
except Exception:
messagebox.showerror("写入失败", "保存 Excel 时出错,请查看控制台日志。")
traceback.print_exc()
self.set_status("写入失败")
if __name__ == "__main__":
    # Script entry point. App is defined elsewhere in this file (not visible
    # in this excerpt); it is instantiated and its main loop is run.
    try:
        app = App()
        app.mainloop()
    except SystemExit as e:
        # Print the exit message instead of letting the interpreter handle it.
        print(str(e))
    except Exception:
        # Last-resort boundary: dump the traceback to the console.
        traceback.print_exc()
七、源码、样例下载
提取word表格数据写入到Excel pythonfun作品.zip
(75.52 KB, 下载次数: 191)
2025-8-10 12:19 上传
点击文件名下载附件