简介:本文详细介绍了如何使用Python实现图片表格识别并转为Excel表格,包括OCR技术选型、表格结构解析、Excel文件生成等关键步骤,适合开发者及企业用户参考。
在数字化办公场景中,将图片中的表格数据快速转换为可编辑的Excel文件是提高工作效率的重要需求。本文将系统阐述如何使用Python实现图片表格识别并导出为Excel表格,涵盖技术选型、实现步骤及优化建议。
OCR(光学字符识别)是图片表格识别的核心技术。主流Python库包括:
建议选择PaddleOCR作为核心识别引擎,其特点包括:
pip install paddleocr openpyxl pillow numpy
from paddleocr import PaddleOCR, draw_ocrimport cv2import numpy as npdef recognize_table(image_path):# 初始化PaddleOCRocr = PaddleOCR(use_angle_cls=True, lang="ch")# 读取图片img = cv2.imread(image_path)# 执行OCR识别result = ocr.ocr(img, cls=True)# 提取识别结果table_data = []for line in result:for word_info in line:word_text = word_info[1][0]word_pos = word_info[0] # 包含四个点的坐标table_data.append((word_text, word_pos))return table_data
对于复杂表格,需额外处理表格线检测和单元格合并:
def parse_table_structure(image_path):# 使用PaddleOCR的表格识别功能ocr = PaddleOCR(use_angle_cls=True, lang="ch", table_engine="True")img = cv2.imread(image_path)result = ocr.ocr(img, cls=True)# 解析表格结构table_results = result[1] # 表格识别结果cells = []for table in table_results:for cell in table['data']:text = cell['text'][0]bbox = cell['bbox']cells.append({'text': text,'bbox': bbox,'row': cell['row'], # 行号'col': cell['col'] # 列号})return cells
from openpyxl import Workbookdef export_to_excel(table_data, output_path):wb = Workbook()ws = wb.active# 假设table_data是按行组织的二维列表for row_idx, row_data in enumerate(table_data, start=1):for col_idx, cell_data in enumerate(row_data, start=1):ws.cell(row=row_idx, column=col_idx, value=cell_data)wb.save(output_path)
对于解析后的表格结构:
def export_structured_excel(cells, output_path):# 确定最大行列数max_row = max(cell['row'] for cell in cells) + 1max_col = max(cell['col'] for cell in cells) + 1wb = Workbook()ws = wb.active# 创建二维数组存储数据excel_data = [[None for _ in range(max_col)] for _ in range(max_row)]for cell in cells:excel_data[cell['row']][cell['col']] = cell['text']# 写入Excelfor row_idx, row in enumerate(excel_data):for col_idx, value in enumerate(row):if value is not None:ws.cell(row=row_idx+1, column=col_idx+1, value=value)wb.save(output_path)
from paddleocr import PaddleOCRfrom openpyxl import Workbookimport cv2def image_table_to_excel(image_path, excel_path):# 1. 表格识别ocr = PaddleOCR(use_angle_cls=True, lang="ch", table_engine=True)img = cv2.imread(image_path)result = ocr.ocr(img, cls=True)# 2. 解析表格结构table_data = []if len(result) > 1 and isinstance(result[1], list):for table in result[1]:for cell in table['data']:row, col = cell['row'], cell['col']text = cell['text'][0]# 确保二维数组足够大while len(table_data) <= row:table_data.append([])while len(table_data[row]) <= col:table_data[row].append("")table_data[row][col] = textelse:# 简单文本识别 fallbackfor line in result[0]:for word in line:text = word[1][0]# 简单按行添加table_data.append([text])# 3. 生成Excelwb = Workbook()ws = wb.activefor row_idx, row in enumerate(table_data, start=1):for col_idx, cell in enumerate(row, start=1):if cell:ws.cell(row=row_idx, column=col_idx, value=cell)wb.save(excel_path)print(f"Excel文件已生成: {excel_path}")# 使用示例image_table_to_excel("table.png", "output.xlsx")
def preprocess_image(image_path):img = cv2.imread(image_path)# 转换为灰度图gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)# 二值化处理_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)# 降噪denoised = cv2.fastNlMeansDenoising(binary, None, 10, 7, 21)return denoised
def postprocess_text(text):# 去除多余空格和换行符cleaned = " ".join(text.split())# 中文全角转半角import rerstring = cleaned.replace(" ", "").replace(" ", "")return rstring
本文详细介绍了使用Python实现图片表格识别并转为Excel表格的完整方案。通过PaddleOCR等先进工具,开发者可以高效完成这一任务。未来发展方向包括:
掌握这一技术将显著提升数据处理效率,为企业数字化转型提供有力支持。