提交 46bba812 作者: 贺阳

优化分批处理,减少内存

上级 5ab84684
......@@ -367,11 +367,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
def _merge_pdf_files(self, processed_files):
"""
合并所有涂抹后的PDF文件为一个PDF并保存到pdf_file字段
使用临时文件方式减少内存占用
:param processed_files: 处理后的文件数组
"""
import fitz # PyMuPDF
from datetime import datetime
import tempfile
import os
import gc
temp_file_path = None
try:
# 过滤有效的PDF文件
valid_files = []
......@@ -406,41 +411,74 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 多个PDF文件需要合并
_logger.info(f"开始合并 {len(valid_files)} 个PDF文件")
# 创建新的PDF文档用于合并
# 使用临时文件方式合并,避免内存占用过大
temp_file_path = tempfile.mktemp(suffix='.pdf')
merged_pdf = fitz.open()
bl_numbers = []
# 遍历所有处理后的PDF文件
for file_info in valid_files:
bl = file_info['bl']
file_data = file_info['file_data']
bl_numbers.append(bl.bl_no)
# 遍历所有处理后的PDF文件,分批处理以减少内存占用
batch_size = 5 # 每批处理5个PDF
for batch_start in range(0, len(valid_files), batch_size):
batch_files = valid_files[batch_start:batch_start + batch_size]
_logger.info(f"处理第 {batch_start // batch_size + 1} 批,共 {len(batch_files)} 个PDF")
try:
# 将base64数据转换为二进制
pdf_binary = base64.b64decode(file_data)
# 打开PDF文档
source_pdf = fitz.open(stream=pdf_binary, filetype="pdf")
for file_info in batch_files:
bl = file_info['bl']
file_data = file_info['file_data']
bl_numbers.append(bl.bl_no)
# 将源PDF的所有页面插入到合并的PDF中
merged_pdf.insert_pdf(source_pdf)
source_pdf.close()
_logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档")
except Exception as e:
_logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
continue
source_pdf = None
pdf_binary = None
try:
# 将base64数据转换为二进制
pdf_binary = base64.b64decode(file_data)
# 打开PDF文档
source_pdf = fitz.open(stream=pdf_binary, filetype="pdf")
# 将源PDF的所有页面插入到合并的PDF中
merged_pdf.insert_pdf(source_pdf)
_logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档({len(source_pdf)} 页)")
except Exception as e:
_logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
continue
finally:
# 立即释放资源
if source_pdf:
source_pdf.close()
source_pdf = None
pdf_binary = None
gc.collect() # 强制垃圾回收
# 每批处理完后,保存到临时文件并释放内存
if batch_start + batch_size < len(valid_files):
# 保存当前合并结果到临时文件
merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
merged_pdf.close()
# 重新打开临时文件继续合并
merged_pdf = fitz.open(temp_file_path)
gc.collect()
# 如果有页面,保存合并后的PDF
if len(merged_pdf) > 0:
# 保存到内存
output_buffer = io.BytesIO()
merged_pdf.save(output_buffer, garbage=4, deflate=True, clean=True)
# 使用临时文件保存,减少内存占用
if not temp_file_path:
temp_file_path = tempfile.mktemp(suffix='.pdf')
merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
merged_pdf.close()
# 从临时文件读取并转换为base64
with open(temp_file_path, 'rb') as f:
pdf_data = f.read()
# 转换为base64
merged_pdf_base64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
merged_pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
# 清理临时数据
del pdf_data
gc.collect()
# 生成文件名(包含提单号和日期)
bl_numbers_str = '_'.join(bl_numbers[:5]) # 最多显示5个提单号
......@@ -455,12 +493,24 @@ class BatchGetPodInfoWizard(models.TransientModel):
'pdf_filename': pdf_filename
})
# 清理base64数据
del merged_pdf_base64
gc.collect()
_logger.info(f"成功合并 {len(bl_numbers)} 个PDF文件,文件名: {pdf_filename}")
else:
_logger.warning("没有有效的PDF文件可以合并")
except Exception as e:
_logger.error(f"合并PDF文件失败: {str(e)}")
finally:
# 清理临时文件
if temp_file_path and os.path.exists(temp_file_path):
try:
os.remove(temp_file_path)
_logger.info(f"已删除临时文件: {temp_file_path}")
except Exception as e:
_logger.warning(f"删除临时文件失败: {str(e)}")
def _match_bl_by_file_name(self, pdf_file_arr):
"""
......@@ -836,6 +886,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
def _process_pdf_with_ai_image_edit(self, pdf_data, bl_no):
"""
使用AI图片编辑处理PDF:PDF转图片 -> AI抹除文字 -> 图片转回PDF(按照image-to-coordinate.py的逻辑)
优化内存占用:对于多页PDF使用临时文件方式分批处理
:param pdf_data: PDF二进制数据
:param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据
......@@ -843,6 +894,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
import fitz # PyMuPDF
import base64
import mimetypes
import gc
import os
import tempfile
from PIL import Image
import time
......@@ -854,12 +908,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 打开PDF文档
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
processed_images = [] # 存储处理后的PIL图片对象
total_pages = len(pdf_document)
total_ai_time = 0.0 # 累计AI总耗时
_logger.info(f"PDF总页数: {total_pages}")
# 对于多页PDF,使用临时文件方式减少内存占用
use_temp_file = total_pages > 5 # 超过5页使用临时文件
temp_file_path = None
if use_temp_file:
import tempfile
temp_file_path = tempfile.mktemp(suffix='.pdf')
_logger.info(f"使用临时文件方式处理,减少内存占用: {temp_file_path}")
processed_images = [] # 存储处理后的PIL图片对象(分批处理)
batch_size = 5 # 每批处理5页图片
# 遍历每一页(按照image-to-coordinate.py的逻辑)
for page_num in range(total_pages):
page_start_time = time.time()
......@@ -867,73 +931,171 @@ class BatchGetPodInfoWizard(models.TransientModel):
_logger.info(f"正在处理第{page_num + 1}页")
# 将页面转换为图像(按照image-to-coordinate.py的pdf_to_images函数,使用dpi=150)
dpi = 150
# 对于内存优化,使用稍低的分辨率(120 DPI)以避免内存问题
dpi = 120
mat = fitz.Matrix(dpi / 72, dpi / 72)
pix = page.get_pixmap(matrix=mat)
# 将pixmap转换为PIL Image对象
img_data = pix.tobytes("png")
img = Image.open(io.BytesIO(img_data))
# 获取图片尺寸(按照image-to-coordinate.py的逻辑)
img_w, img_h = img.size
_logger.info(f"第{page_num + 1}页页面尺寸: {img_w}x{img_h} 像素")
# 将图片编码为base64(按照image-to-coordinate.py的encode_file函数逻辑)
img_bytes_io = io.BytesIO()
img.save(img_bytes_io, format='PNG')
img_bytes_io.seek(0)
encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8')
mime_type = 'image/png'
img_base64 = f"data:{mime_type};base64,{encoded_string}"
# 使用AI编辑图片,移除指定文字(带重试机制)
edited_img_base64 = None
ai_processing_time = 0.0
max_retries = 2 # 最多尝试2次(首次+1次重试)
pix = None
img = None
img_bytes_io = None
for attempt in range(1, max_retries + 1):
ai_start_time = time.time()
try:
# 调用AI服务(使用base64编码的图片数据,不带data:前缀)
edited_img_base64_raw = ai_service.edit_image_remove_text(
encoded_string, # 传入不带data:前缀的base64字符串
text_to_remove="AGN UCLINK LOGISITICS LTD"
)
ai_end_time = time.time()
attempt_time = ai_end_time - ai_start_time
ai_processing_time += attempt_time # 累计AI耗时
total_ai_time += attempt_time # 累计总AI耗时
if edited_img_base64_raw:
edited_img_base64 = edited_img_base64_raw
_logger.info(f"第{page_num + 1}页AI处理成功(第{attempt}次尝试),耗时: {attempt_time:.2f}秒")
break
else:
try:
pix = page.get_pixmap(matrix=mat)
# 将pixmap转换为PIL Image对象
img_data = pix.tobytes("png")
del pix # 立即释放pixmap以节省内存
pix = None
gc.collect() # 强制垃圾回收
img = Image.open(io.BytesIO(img_data))
# 获取图片尺寸(按照image-to-coordinate.py的逻辑)
img_w, img_h = img.size
_logger.info(f"第{page_num + 1}页页面尺寸: {img_w}x{img_h} 像素")
# 将图片编码为base64(按照image-to-coordinate.py的encode_file函数逻辑)
img_bytes_io = io.BytesIO()
img.save(img_bytes_io, format='PNG')
img_bytes_io.seek(0)
encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8')
img_bytes_io.close() # 立即关闭BytesIO
img_bytes_io = None
del img_data # 释放图片数据
gc.collect() # 强制垃圾回收
# 使用AI编辑图片,移除指定文字(带重试机制)
edited_img_base64 = None
ai_processing_time = 0.0
max_retries = 2 # 最多尝试2次(首次+1次重试)
for attempt in range(1, max_retries + 1):
ai_start_time = time.time()
try:
# 调用AI服务(使用base64编码的图片数据,不带data:前缀)
edited_img_base64_raw = ai_service.edit_image_remove_text(
encoded_string, # 传入不带data:前缀的base64字符串
text_to_remove="AGN UCLINK LOGISITICS LTD"
)
ai_end_time = time.time()
attempt_time = ai_end_time - ai_start_time
ai_processing_time += attempt_time # 累计AI耗时
total_ai_time += attempt_time # 累计总AI耗时
if edited_img_base64_raw:
edited_img_base64 = edited_img_base64_raw
_logger.info(f"第{page_num + 1}页AI处理成功(第{attempt}次尝试),耗时: {attempt_time:.2f}秒")
break
else:
if attempt < max_retries:
_logger.warning(f"第{page_num + 1}页AI处理失败(第{attempt}次尝试),将重试,耗时: {attempt_time:.2f}秒")
else:
_logger.warning(f"第{page_num + 1}页AI处理失败(第{attempt}次尝试,已用尽重试),耗时: {attempt_time:.2f}秒")
except Exception as e:
ai_end_time = time.time()
attempt_time = ai_end_time - ai_start_time
ai_processing_time += attempt_time
total_ai_time += attempt_time
_logger.error(f"第{page_num + 1}页AI处理异常(第{attempt}次尝试): {str(e)},耗时: {attempt_time:.2f}秒")
if attempt < max_retries:
_logger.warning(f"第{page_num + 1}页AI处理失败(第{attempt}次尝试),将重试,耗时: {attempt_time:.2f}秒")
_logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
edited_img_base64 = None
# 释放encoded_string以节省内存
del encoded_string
gc.collect()
if edited_img_base64:
# 解码base64图片数据并转换为PIL Image对象(按照image-to-coordinate.py的逻辑)
edited_img_data = base64.b64decode(edited_img_base64)
edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB')
del edited_img_data # 立即释放原始数据
del edited_img_base64 # 释放base64字符串
processed_images.append(edited_img)
_logger.info(f"第{page_num + 1}页AI处理最终成功,总耗时: {ai_processing_time:.2f}秒")
else:
_logger.warning(f"第{page_num + 1}页AI处理最终失败(已重试),使用原始页面,总耗时: {ai_processing_time:.2f}秒")
# 如果AI处理失败,使用原始图片
processed_images.append(img.convert('RGB'))
# 释放原始图片对象
if img:
img.close()
del img
img = None
gc.collect() # 强制垃圾回收
# 分批处理:每处理batch_size页,就转换为PDF并保存到临时文件
if use_temp_file and len(processed_images) >= batch_size:
_logger.info(f"达到批次大小 {batch_size},开始保存到临时文件")
try:
# 将已处理的图片转换为PDF
batch_buffer = io.BytesIO()
first_batch = processed_images[0]
rest_batch = processed_images[1:]
first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
batch_buffer.seek(0)
pdf_bytes = batch_buffer.getvalue()
batch_buffer.close()
# 释放已处理的图片
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = []
gc.collect()
if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
# 追加到现有PDF:先读取现有内容,合并后保存到新文件,再替换
with open(temp_file_path, 'rb') as f:
existing_bytes = f.read()
existing_pdf = fitz.open(stream=existing_bytes, filetype="pdf")
new_pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
existing_pdf.insert_pdf(new_pdf)
new_pdf.close()
# 保存到新临时文件,避免"save to original must be incremental"错误
new_temp_path = tempfile.mktemp(suffix='.pdf')
existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
existing_pdf.close()
# 替换旧文件
os.remove(temp_file_path)
os.rename(new_temp_path, temp_file_path)
# 释放资源
del existing_bytes
del pdf_bytes
gc.collect()
else:
_logger.warning(f"第{page_num + 1}页AI处理失败(第{attempt}次尝试,已用尽重试),耗时: {attempt_time:.2f}秒")
except Exception as e:
ai_end_time = time.time()
attempt_time = ai_end_time - ai_start_time
ai_processing_time += attempt_time
total_ai_time += attempt_time
_logger.error(f"第{page_num + 1}页AI处理异常(第{attempt}次尝试): {str(e)},耗时: {attempt_time:.2f}秒")
if attempt < max_retries:
_logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
edited_img_base64 = None
if edited_img_base64:
# 解码base64图片数据并转换为PIL Image对象(按照image-to-coordinate.py的逻辑)
edited_img_data = base64.b64decode(edited_img_base64)
edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB')
processed_images.append(edited_img)
_logger.info(f"第{page_num + 1}页AI处理最终成功,总耗时: {ai_processing_time:.2f}秒")
else:
_logger.warning(f"第{page_num + 1}页AI处理最终失败(已重试),使用原始页面,总耗时: {ai_processing_time:.2f}秒")
# 如果AI处理失败,使用原始图片
processed_images.append(img.convert('RGB'))
# 创建新的PDF
with open(temp_file_path, 'wb') as f:
f.write(pdf_bytes)
del pdf_bytes
gc.collect()
except Exception as e:
_logger.error(f"分批保存PDF失败: {str(e)}")
# 失败时继续处理,最后统一处理
# 但需要释放已处理的图片,避免内存占用
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = []
gc.collect()
except Exception as e:
_logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
# 确保资源被释放
if pix:
del pix
if img:
img.close()
del img
if img_bytes_io:
img_bytes_io.close()
gc.collect()
# 如果处理失败,跳过这一页或使用原始页面
continue
page_end_time = time.time()
page_processing_time = page_end_time - page_start_time
......@@ -943,21 +1105,126 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 将处理后的图片转换为PDF(按照image-to-coordinate.py的images_to_pdf函数逻辑)
pdf_creation_start = time.time()
if not processed_images:
_logger.error("没有需要写入PDF的图片")
return None
result_data = None
import os
# 使用PIL的save方法将图片保存为PDF(按照image-to-coordinate.py的逻辑)
output_buffer = io.BytesIO()
first = processed_images[0]
rest = processed_images[1:] # 按照image-to-coordinate.py的逻辑,直接使用切片
# 按照image-to-coordinate.py的images_to_pdf函数:first.save(output_pdf, save_all=True, append_images=rest)
# 即使rest是空列表,也直接传入(PIL会正确处理)
first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
output_buffer.seek(0)
try:
if use_temp_file and temp_file_path:
# 如果还有剩余的图片,追加到临时文件
if processed_images:
_logger.info(f"处理剩余的 {len(processed_images)} 页图片")
try:
# 将剩余图片转换为PDF
batch_buffer = io.BytesIO()
first_batch = processed_images[0]
rest_batch = processed_images[1:]
first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
batch_buffer.seek(0)
temp_pdf_bytes = batch_buffer.getvalue()
batch_buffer.close()
# 释放图片
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = None
gc.collect()
# 追加到临时文件
if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
# 如果临时文件已存在,先读取内容
with open(temp_file_path, 'rb') as f:
existing_pdf_bytes = f.read()
# 合并PDF:打开现有PDF和新PDF,然后合并
existing_pdf = fitz.open(stream=existing_pdf_bytes, filetype="pdf")
new_pdf = fitz.open(stream=temp_pdf_bytes, filetype="pdf")
existing_pdf.insert_pdf(new_pdf)
new_pdf.close()
# 保存到新的临时文件,避免"save to original must be incremental"错误
new_temp_path = tempfile.mktemp(suffix='.pdf')
existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
existing_pdf.close()
# 删除旧临时文件,重命名新文件
os.remove(temp_file_path)
os.rename(new_temp_path, temp_file_path)
# 释放资源
del existing_pdf_bytes
del temp_pdf_bytes
gc.collect()
else:
# 如果临时文件不存在或为空,直接写入
with open(temp_file_path, 'wb') as f:
f.write(temp_pdf_bytes)
del temp_pdf_bytes
gc.collect()
except Exception as e:
_logger.error(f"追加剩余图片失败: {str(e)}")
# 注意:processed_images 在这里已经被释放了,需要重新获取
# 如果还有剩余图片,需要重新处理(这种情况不应该发生,因为前面已经释放了)
_logger.warning("追加剩余图片失败,剩余图片已在之前释放")
# 从临时文件读取最终结果
if os.path.exists(temp_file_path):
with open(temp_file_path, 'rb') as f:
result_data = f.read()
# 删除临时文件
try:
os.remove(temp_file_path)
_logger.info(f"已删除临时文件: {temp_file_path}")
except Exception as e:
_logger.warning(f"删除临时文件失败: {str(e)}")
else:
_logger.error("临时文件不存在,无法读取结果")
return None
elif processed_images:
# 使用内存方式处理(5页以内)
output_buffer = io.BytesIO()
first = processed_images[0]
rest = processed_images[1:] # 按照image-to-coordinate.py的逻辑,直接使用切片
# 按照image-to-coordinate.py的images_to_pdf函数:first.save(output_pdf, save_all=True, append_images=rest)
# 即使rest是空列表,也直接传入(PIL会正确处理)
first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
output_buffer.seek(0)
result_data = output_buffer.getvalue()
output_buffer.close()
# 释放所有图片对象
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = None
del first
del rest
else:
_logger.error("没有需要写入PDF的图片")
return None
gc.collect() # 强制垃圾回收
except Exception as e:
_logger.error(f"PDF创建失败: {str(e)}")
# 确保资源被释放
if processed_images:
for img_obj in processed_images:
if img_obj:
img_obj.close()
if temp_file_path and os.path.exists(temp_file_path):
try:
os.remove(temp_file_path)
except:
pass
gc.collect()
return None
pdf_creation_end = time.time()
result_data = output_buffer.getvalue()
total_time = time.time() - start_time
pdf_creation_time = pdf_creation_end - pdf_creation_start
......@@ -1664,6 +1931,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
import numpy as np
from PIL import Image
import re
import gc
# 定义目标文字(与_find_target_texts一致)
TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
......@@ -1671,6 +1939,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4',
'Page 5 of 5']
pdf_document = None
try:
# 设置Tesseract路径
self._setup_tesseract_path()
......@@ -1689,110 +1958,168 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 遍历每一页
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
pix = None
pil_img = None
img = None
nparr = None
img_data = None
# 首先尝试从PDF文本层提取(如果是文本型PDF)
page_text_pdf = page.get_text().upper()
# 将页面转换为图像进行OCR识别
mat = fitz.Matrix(3.0, 3.0) # 进一步提高分辨率,从2.0提升到3.0
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
# 转换为PIL图像
if cv2_available:
nparr = np.frombuffer(img_data, np.uint8)
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
else:
pil_img = Image.open(io.BytesIO(img_data))
if pil_img.mode != 'RGB':
pil_img = pil_img.convert('RGB')
# OCR识别
try:
config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
ocr_text = pytesseract.image_to_string(pil_img, config=config, lang='eng').upper()
except Exception as e:
_logger.warning(f"OCR识别失败,第{page_num + 1}页,使用PDF文本: {str(e)}")
ocr_text = page_text_pdf
# 合并PDF文本和OCR文本进行检查
combined_text = (page_text_pdf + ' ' + ocr_text).upper()
# 使用与_find_target_texts完全相同的逻辑:先进行OCR单词识别
try:
# 获取OCR识别的单词列表
words = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, lang='eng')
# 首先尝试从PDF文本层提取(如果是文本型PDF)
page_text_pdf = page.get_text().upper()
# 过滤出有效的单词
valid_words = []
for i in range(len(words['text'])):
word_text = words['text'][i].strip()
if word_text and int(words['conf'][i]) > 30: # 置信度大于30
valid_words.append({
'text': word_text,
'confidence': int(words['conf'][i]),
'bbox': {
'x0': words['left'][i],
'y0': words['top'][i],
'x1': words['left'][i] + words['width'][i],
'y1': words['top'][i] + words['height'][i]
}
})
# 将页面转换为图像进行OCR识别(降低分辨率以节省内存)
# 使用 2.0 倍分辨率(约 144 DPI)而不是 3.0 倍(约 216 DPI)
mat = fitz.Matrix(2.0, 2.0)
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
del pix # 立即释放pixmap
pix = None
gc.collect() # 强制垃圾回收
# 使用与_find_target_texts相同的匹配逻辑
page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
if page_found_texts:
for found_text in page_found_texts:
found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
except Exception as e:
_logger.warning(f"OCR单词识别失败,第{page_num + 1}页,使用文本匹配: {str(e)}")
# 如果OCR单词识别失败,回退到文本匹配
for target_text in TARGET_TEXTS:
target_upper = target_text.upper()
# 转换为PIL图像
if cv2_available:
nparr = np.frombuffer(img_data, np.uint8)
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
del nparr # 释放numpy数组
del img # 释放OpenCV图像
nparr = None
img = None
gc.collect()
else:
pil_img = Image.open(io.BytesIO(img_data))
if pil_img.mode != 'RGB':
pil_img = pil_img.convert('RGB')
# 释放img_data
del img_data
img_data = None
gc.collect()
# OCR识别
try:
config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
ocr_text = pytesseract.image_to_string(pil_img, config=config, lang='eng').upper()
except Exception as e:
_logger.warning(f"OCR识别失败,第{page_num + 1}页,使用PDF文本: {str(e)}")
ocr_text = page_text_pdf
# 合并PDF文本和OCR文本进行检查
combined_text = (page_text_pdf + ' ' + ocr_text).upper()
# 使用与_find_target_texts完全相同的逻辑:先进行OCR单词识别
try:
# 获取OCR识别的单词列表
words = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, lang='eng')
# 检查是否包含目标文字
is_match = False
if target_text == 'AGN':
# AGN使用精确匹配
if re.search(r'\bAGN\b', combined_text):
is_match = True
elif target_text == 'ACN':
# ACN使用精确匹配
if re.search(r'\bACN\b', combined_text):
is_match = True
elif target_text == 'LTD':
# LTD使用精确匹配,但要排除其他包含LTD的文字
if re.search(r'\bLTD\b', combined_text) and 'UCLINK' in combined_text:
is_match = True
else:
# 其他文字使用包含匹配
if target_upper in combined_text:
# 排除AIR、EQK、ARN等(需要这些词都不存在)
if 'AIR EQK' not in combined_text and 'ARN' not in combined_text:
is_match = True
# 过滤出有效的单词
valid_words = []
for i in range(len(words['text'])):
word_text = words['text'][i].strip()
if word_text and int(words['conf'][i]) > 30: # 置信度大于30
valid_words.append({
'text': word_text,
'confidence': int(words['conf'][i]),
'bbox': {
'x0': words['left'][i],
'y0': words['top'][i],
'x1': words['left'][i] + words['width'][i],
'y1': words['top'][i] + words['height'][i]
}
})
# 如果匹配,检查是否在排除列表中
if is_match:
is_excluded = False
for exclude_text in EXCLUDE_TEXTS:
exclude_upper = exclude_text.upper()
if exclude_upper in combined_text and target_upper in combined_text:
# 检查是否是页码
if re.search(r'PAGE\s+\d+\s+OF\s+\d+', combined_text) or re.search(r'\d+\s*/\s*\d+', combined_text):
is_excluded = True
break
# 检查是否是AIR EQK等排除项
if 'AIR EQK' in combined_text or 'ARN' in combined_text:
is_excluded = True
break
if not is_excluded:
found_texts.append(f"第{page_num + 1}页: {target_text}")
# 释放words字典以节省内存
del words
gc.collect()
# 使用与_find_target_texts相同的匹配逻辑
page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
del valid_words # 释放valid_words列表
gc.collect()
if page_found_texts:
for found_text in page_found_texts:
found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
break # 找到就跳出,避免重复
except Exception as e:
_logger.warning(f"OCR单词识别失败,第{page_num + 1}页,使用文本匹配: {str(e)}")
# 如果OCR单词识别失败,回退到文本匹配
for target_text in TARGET_TEXTS:
target_upper = target_text.upper()
# 检查是否包含目标文字
is_match = False
if target_text == 'AGN':
# AGN使用精确匹配
if re.search(r'\bAGN\b', combined_text):
is_match = True
elif target_text == 'ACN':
# ACN使用精确匹配
if re.search(r'\bACN\b', combined_text):
is_match = True
elif target_text == 'LTD':
# LTD使用精确匹配,但要排除其他包含LTD的文字
if re.search(r'\bLTD\b', combined_text) and 'UCLINK' in combined_text:
is_match = True
else:
# 其他文字使用包含匹配
if target_upper in combined_text:
# 排除AIR、EQK、ARN等(需要这些词都不存在)
if 'AIR EQK' not in combined_text and 'ARN' not in combined_text:
is_match = True
# 如果匹配,检查是否在排除列表中
if is_match:
is_excluded = False
for exclude_text in EXCLUDE_TEXTS:
exclude_upper = exclude_text.upper()
if exclude_upper in combined_text and target_upper in combined_text:
# 检查是否是页码
if re.search(r'PAGE\s+\d+\s+OF\s+\d+', combined_text) or re.search(r'\d+\s*/\s*\d+', combined_text):
is_excluded = True
break
# 检查是否是AIR EQK等排除项
if 'AIR EQK' in combined_text or 'ARN' in combined_text:
is_excluded = True
break
if not is_excluded:
found_texts.append(f"第{page_num + 1}页: {target_text}")
break # 找到就跳出,避免重复
# 释放PIL图像和文本变量
if pil_img:
pil_img.close()
del pil_img
del page_text_pdf
del ocr_text
del combined_text
pil_img = None
gc.collect() # 强制垃圾回收
except Exception as e:
_logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
# 确保资源被释放
if pix:
del pix
if pil_img:
pil_img.close()
del pil_img
if img is not None:
del img
if nparr is not None:
del nparr
if img_data is not None:
del img_data
gc.collect()
continue
pdf_document.close()
if pdf_document:
pdf_document.close()
pdf_document = None
gc.collect()
if found_texts:
_logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}")
......@@ -1803,6 +2130,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
except Exception as e:
_logger.error(f"检查目标文字失败,提单号: {bl_no}, 错误: {str(e)}")
# 确保资源被释放
if pdf_document:
try:
pdf_document.close()
except:
pass
gc.collect()
# 检查失败时,假设不存在(避免误报)
return False, []
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论