简介:本文详细介绍如何用Python读取含中文、日文、泰文等亚洲语言的PDF文件,提取文本并转为字符串,重点解决OCR识别与编码处理难题,提供完整代码示例与优化建议。
在全球化信息处理场景中,企业与开发者常面临多语言PDF文本提取需求。例如,日企需将产品说明书PDF转为可编辑文本,中文文献机构需数字化古籍PDF,泰语旅游资料需翻译为多语言版本。传统PDF解析工具对亚洲语言支持不足,主要存在两大痛点:
本文将系统讲解从PDF读取到字符串存储的全流程,重点解决OCR识别与编码处理两大核心问题。
当PDF包含可复制的文本层时,优先使用PyPDF2或pdfminer.six库:
# Extract text with PyPDF2 (suited to PDFs that carry a real text layer).
from PyPDF2 import PdfReader


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The per-page texts concatenated in page order, with no separator
        between pages.
    """
    reader = PdfReader(pdf_path)
    # A page that yields nothing is treated as "" so the join only ever
    # sees strings; join also avoids repeated string re-allocation.
    return "".join(page.extract_text() or "" for page in reader.pages)


# Usage example
japanese_text = extract_text_from_pdf("japanese_doc.pdf")
print(japanese_text[:200])  # print the first 200 characters as a sanity check
局限性:对扫描件或图片型PDF无效,中日泰文混排时可能出现编码错误。
对于图片型PDF,需结合OCR引擎:
# Install dependencies: pip install pytesseract pdf2image
import pytesseract
from pdf2image import convert_from_path


def ocr_pdf_to_text(pdf_path, lang='jpn+chi_sim+tha'):
    """OCR every page of a PDF and return the recognised text.

    Args:
        pdf_path: Path of the PDF file to rasterise and recognise.
        lang: Tesseract language codes joined with "+". The default
            combines Japanese, Simplified Chinese and Thai.

    Returns:
        One string in which each page's text is preceded by a
        "=== Page N ===" marker line (N starts at 1).
    """
    # Render the PDF into a list of page images.
    images = convert_from_path(pdf_path)
    # Configure the Tesseract path (adjust for your system)
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    page_texts = []
    for page_no, image in enumerate(images, start=1):
        # Mixed-language recognition.
        text = pytesseract.image_to_string(
            image,
            lang=lang,
            config='--psm 6',  # psm 6: assume a single uniform block of text
        )
        page_texts.append(f"\n=== Page {page_no} ===\n" + text)
    return "".join(page_texts)


# Usage example
mixed_text = ocr_pdf_to_text("multilingual.pdf")
print(mixed_text[:500])
关键参数说明:
lang参数:jpn(日文)、chi_sim(简体中文)、tha(泰文)的组合。psm模式:6表示假设为统一文本块,适合排版规整的文档。对于专业场景,可考虑以下商业API:
# Pseudocode example (Baidu OCR requires applying for an API key)
import requests


def baidu_ocr_pdf(pdf_path, api_key, secret_key):
    """Placeholder for a Baidu OCR based PDF recogniser (not implemented).

    Only the intended steps are outlined below; the function performs no
    work and returns None.
    """
    # 1. Upload the PDF and obtain an access_token
    # 2. Call the PDF recognition endpoint
    # 3. Parse the returned JSON
    return None
亚洲语言处理常见编码问题:
最佳实践:
# Always state the encoding explicitly when reading or writing files.
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(mixed_text)


# Safely truncate a string (never cut a multi-byte character in half).
def safe_substr(text, max_len):
    """Return the longest prefix of text whose UTF-8 size fits max_len bytes.

    Args:
        text: The string to truncate.
        max_len: Maximum size of the result in UTF-8 encoded bytes.

    Returns:
        text itself when it already fits, otherwise the longest prefix
        made of whole characters that encodes to at most max_len bytes.
        An empty string is returned when nothing fits.
    """
    if max_len <= 0:
        # Nothing non-empty can fit; also keeps a negative slice index
        # from trimming bytes off the end instead.
        return ""
    encoded = text.encode('utf-8')
    if len(encoded) <= max_len:
        return text
    # Cut at the byte limit once, then drop the trailing partial
    # character (if any) instead of re-encoding shrinking prefixes.
    return encoded[:max_len].decode('utf-8', errors='ignore')
对于多语言文本,推荐采用JSON格式存储:
import json

# Example document: metadata about the source file plus per-page text.
data = {
    "metadata": {
        "source_file": "multilingual.pdf",
        "languages": ["jpn", "chi_sim", "tha"],
        "page_count": 5,
    },
    "content": {
        "page_1": "こんにちは...你好...สวัสดี",
        "page_2": "続きのテキスト...",
    },
}

# ensure_ascii=False keeps the Asian characters readable in the file
# instead of emitting \uXXXX escapes.
serialized = json.dumps(data, ensure_ascii=False, indent=2)
with open("output.json", "w", encoding="utf-8") as f:
    f.write(serialized)
对于超过100页的PDF:
# Chunked processing example
def process_large_pdf(pdf_path, chunk_size=10):
    """Extract the text of a PDF in groups of consecutive pages.

    Relies on PdfReader (from PyPDF2) already being imported.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Number of pages per chunk; must be at least 1.

    Returns:
        A list of strings, one per chunk, each holding the concatenated
        text of that chunk's pages.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently yield no chunks at all and zero
        # would fail inside range() with an unrelated-looking message.
        raise ValueError("chunk_size must be a positive integer")
    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)
    all_text = []
    for start in range(0, page_count, chunk_size):
        stop = min(start + chunk_size, page_count)
        chunk_text = "".join(
            reader.pages[page_num].extract_text() or ""
            for page_num in range(start, stop)
        )
        all_text.append(chunk_text)
        # Asynchronous processing or storage logic can be added here.
    return all_text
实施三步验证法:
def validate_text(text, expected_lang):
    """Check whether text contains characters of the expected language.

    The check is script based: it looks for at least one character from
    the code-point ranges associated with the language. It is a cheap
    plausibility test, not language identification; in particular CJK
    ideographs are shared between Chinese and Japanese.

    Args:
        text: The extracted text to inspect.
        expected_lang: One of 'jpn', 'chi_sim' or 'tha'.

    Returns:
        True if at least one character of the expected script is present,
        False otherwise (including for an unknown language code or empty
        text).
    """
    # Inclusive code-point ranges per language. Whole script ranges are
    # used because a handful of sample characters misses ordinary text.
    script_ranges = {
        'jpn': ((0x3041, 0x3096), (0x30A1, 0x30FA)),  # hiragana, katakana
        'chi_sim': ((0x4E00, 0x9FFF),),               # CJK unified ideographs
        'tha': ((0x0E01, 0x0E3A), (0x0E40, 0x0E5B)),  # Thai letters and marks
    }
    ranges = script_ranges.get(expected_lang)
    if ranges is None or not text:
        return False
    return any(
        low <= ord(char) <= high
        for char in text
        for low, high in ranges
    )
综合方案实现:
import json
import logging
import os

import pytesseract
from pdf2image import convert_from_path

logger = logging.getLogger(__name__)


class MultilingualPDFExtractor:
    """Extract text from a PDF, using the text layer first and OCR second."""

    def __init__(self, tesseract_path=None):
        """Optionally point pytesseract at a specific Tesseract executable.

        Args:
            tesseract_path: Path of the Tesseract binary; when falsy the
                pytesseract default is left untouched.
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def extract(self, pdf_path, output_format='txt', lang='jpn+chi_sim+tha'):
        """Extract the text of a PDF and save it next to the source file.

        Args:
            pdf_path: Path of the PDF file to process.
            output_format: 'json' for a JSON report; anything else writes
                plain text.
            lang: Tesseract language codes joined with "+", used only on
                the OCR path.

        Returns:
            The path of the file that was written.
        """
        # Try native text extraction first.
        text = self._extract_native_text(pdf_path)
        if text.strip():
            # Saving happens outside the guarded extraction, so a write
            # failure surfaces instead of silently triggering OCR.
            return self._save_result(text, pdf_path, output_format)

        # Fall back to OCR.
        images = convert_from_path(pdf_path)
        page_texts = []
        for page_no, img in enumerate(images, start=1):
            page_text = pytesseract.image_to_string(
                img,
                lang=lang,
                config='--psm 6',
            )
            page_texts.append(f"\n=== Page {page_no} ===\n" + page_text)
        return self._save_result("".join(page_texts), pdf_path, output_format)

    def _extract_native_text(self, pdf_path):
        """Return the PDF's text layer, or "" if it cannot be read."""
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(pdf_path)
            return "\n".join(
                page.extract_text() or "" for page in reader.pages
            )
        except Exception:
            # Best effort: any failure here (including PyPDF2 being
            # absent) means "use OCR", but interpreter-exit signals such
            # as KeyboardInterrupt are no longer swallowed.
            logger.warning(
                "Native text extraction failed for %s; falling back to OCR",
                pdf_path,
                exc_info=True,
            )
            return ""

    def _save_result(self, text, source_file, output_format):
        """Write text as JSON or plain text beside source_file.

        Returns:
            The path of the written file: the source path with its
            extension replaced by .json or .txt.
        """
        base_name = os.path.splitext(source_file)[0]
        if output_format == 'json':
            result = {
                "source": source_file,
                "content": text,
                "stats": {
                    "char_count": len(text),
                    "byte_size": len(text.encode('utf-8')),
                },
            }
            output_path = f"{base_name}.json"
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            return output_path

        # txt
        output_path = f"{base_name}.txt"
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return output_path


# Usage example
extractor = MultilingualPDFExtractor()
result_path = extractor.extract("multilingual_doc.pdf", output_format='json')
print(f"处理完成,结果保存至: {result_path}")
解决方案:
--psm 0模式(仅进行方向与文字系统检测,即OSD,不输出识别文本)
from PIL import Image
import numpy as np


def rotate_vertical_text(image_path, output_path):
    """Rotate an image by a quarter turn and save the result.

    Args:
        image_path: Path of the image to read.
        output_path: Path the rotated image is written to.

    Returns:
        output_path, unchanged.
    """
    # The context manager closes the underlying file once the pixel data
    # has been copied into the array.
    with Image.open(image_path) as img:
        arr = np.array(img)
    # np.rot90 turns counter-clockwise, so k=3 (270 degrees) equals a
    # 90 degree clockwise turn; adjust k to match the actual layout.
    rotated = np.rot90(arr, k=3)
    rotated_img = Image.fromarray(rotated)
    rotated_img.save(output_path)
    return output_path
泰文特点:
优化建议:
tha.traineddata)
def post_process_thai(text):
    """Insert a space where non-Thai text runs directly into Thai text.

    This is a simple example; real Thai word segmentation needs far more
    elaborate logic.

    Args:
        text: The text to post-process.

    Returns:
        text with a single space added before each Thai character that
        directly follows a non-Thai, non-whitespace character.
    """
    def is_thai(char):
        # The whole Thai block is tested; a hand-typed character list
        # easily misses vowel signs and tone marks, which would split
        # words in the middle.
        return '\u0e00' <= char <= '\u0e7f'

    processed = []
    previous = ""
    for char in text:
        needs_space = (
            is_thai(char)
            and previous != ""
            and not is_thai(previous)
            and not previous.isspace()  # never double an existing gap
        )
        if needs_space:
            processed.append(" ")
        processed.append(char)
        previous = char
    return "".join(processed)
本文提出的解决方案实现了:
未来发展方向:
通过合理组合现有工具与自定义处理逻辑,开发者可以构建高效可靠的多语言PDF文本提取系统,满足全球化信息处理需求。