简介:本文介绍如何使用Python实现发票批量识别与Excel自动录入,通过OCR技术提取关键信息并生成结构化数据,详细讲解技术选型、代码实现及优化策略,帮助企业提升财务处理效率。
在财务工作中,发票处理是每月必经的重复性劳动。传统流程需要人工核对发票号码、金额、日期等20余项信息,再逐条录入Excel表格,平均每张发票处理耗时3-5分钟。对于月均处理200张发票的企业,每月需投入约10人小时,且存在人为录入错误风险。
Python自动化解决方案通过OCR(光学字符识别)技术实现发票信息智能提取,结合Excel自动化操作,可将单张发票处理时间压缩至10秒内,准确率提升至98%以上。这种技术革新不仅释放了人力资源,更构建了数字化的财务处理闭环。
推荐组合:PaddleOCR(主体识别)+ EasyOCR(补充识别)
import cv2import numpy as npdef preprocess_invoice(image_path):# 读取图像并转为灰度图img = cv2.imread(image_path)gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)# 二值化处理_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)# 边缘检测与透视矫正edges = cv2.Canny(binary, 50, 150)contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)# 筛选最大轮廓(假设为发票区域)largest_contour = max(contours, key=cv2.contourArea)rect = cv2.minAreaRect(largest_contour)box = cv2.boxPoints(rect)box = np.int0(box)# 透视变换width, height = 800, 600 # 输出尺寸dst = np.array([[0, 0], [width-1, 0], [width-1, height-1], [0, height-1]], dtype="float32")M = cv2.getPerspectiveTransform(box.astype("float32"), dst)warped = cv2.warpPerspective(img, M, (width, height))return warped
from paddleocr import PaddleOCRdef extract_invoice_data(image):ocr = PaddleOCR(use_angle_cls=True, lang="ch")result = ocr.ocr(image, cls=True)invoice_data = {"发票号码": None,"开票日期": None,"金额": None,"购买方名称": None,"销售方名称": None}for line in result:for word_info in line:text = word_info[1][0]# 发票号码识别规则if "发票号码" in text or len(text) == 10 and text.isdigit():invoice_data["发票号码"] = text# 日期识别规则elif len(text) == 8 and text[4] == text[6] == "-":invoice_data["开票日期"] = text# 金额识别规则elif "¥" in text or "元" in text:amount = text.replace("¥", "").replace("元", "")if amount.replace(".", "").isdigit():invoice_data["金额"] = amountreturn invoice_data
import pandas as pdfrom openpyxl import load_workbookdef write_to_excel(data_list, output_path):# 如果文件不存在则创建try:wb = load_workbook(output_path)ws = wb.activeexcept FileNotFoundError:wb = Workbook()ws = wb.active# 写入表头ws.append(["发票号码", "开票日期", "金额", "购买方名称", "销售方名称"])# 写入数据for data in data_list:ws.append([data["发票号码"],data["开票日期"],data["金额"],data["购买方名称"],data["销售方名称"]])wb.save(output_path)
import osfrom datetime import datetimedef process_invoices(input_folder, output_excel):all_data = []for filename in os.listdir(input_folder):if filename.lower().endswith(('.png', '.jpg', '.jpeg')):# 1. 图像预处理image_path = os.path.join(input_folder, filename)processed_img = preprocess_invoice(image_path)# 2. OCR识别invoice_data = extract_invoice_data(processed_img)# 3. 数据增强(可添加业务逻辑)invoice_data["处理时间"] = datetime.now().strftime("%Y-%m-%d %H:%M")invoice_data["原始文件名"] = filenameall_data.append(invoice_data)# 4. 写入Excelwrite_to_excel(all_data, output_excel)print(f"处理完成,共处理{len(all_data)}张发票,结果已保存至{output_excel}")# 使用示例process_invoices("input_invoices", "output_invoices.xlsx")
concurrent.futures实现并行识别def parallel_process(input_folder, output_excel, max_workers=4):
all_data = []
def process_single(filename):# 单个文件处理逻辑(同上)passfilenames = [f for f in os.listdir(input_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]with ThreadPoolExecutor(max_workers=max_workers) as executor:results = list(executor.map(process_single, filenames))# 合并结果并写入Excel# ...
```
PyInstaller打包为独立EXEPyQt5或Tkinter)win32com调用SAP/用友等ERP接口某中型制造企业实施该方案后:
Python实现的发票自动化处理方案,通过OCR与Excel自动化的深度整合,构建了从图像识别到数据落地的完整闭环。该方案具有实施成本低(仅需基础开发环境)、扩展性强(可对接各类业务系统)、维护简单(代码模块化设计)等显著优势,是现代企业财务数字化转型的理想选择。随着计算机视觉技术的持续进步,此类自动化方案将在更多办公场景中展现巨大价值。