提交 46bba812 作者: 贺阳

优化分批处理,减少内存

上级 5ab84684
...@@ -367,11 +367,16 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -367,11 +367,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
def _merge_pdf_files(self, processed_files): def _merge_pdf_files(self, processed_files):
""" """
合并所有涂抹后的PDF文件为一个PDF并保存到pdf_file字段 合并所有涂抹后的PDF文件为一个PDF并保存到pdf_file字段
使用临时文件方式减少内存占用
:param processed_files: 处理后的文件数组 :param processed_files: 处理后的文件数组
""" """
import fitz # PyMuPDF import fitz # PyMuPDF
from datetime import datetime from datetime import datetime
import tempfile
import os
import gc
temp_file_path = None
try: try:
# 过滤有效的PDF文件 # 过滤有效的PDF文件
valid_files = [] valid_files = []
...@@ -406,16 +411,24 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -406,16 +411,24 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 多个PDF文件需要合并 # 多个PDF文件需要合并
_logger.info(f"开始合并 {len(valid_files)} 个PDF文件") _logger.info(f"开始合并 {len(valid_files)} 个PDF文件")
# 创建新的PDF文档用于合并 # 使用临时文件方式合并,避免内存占用过大
temp_file_path = tempfile.mktemp(suffix='.pdf')
merged_pdf = fitz.open() merged_pdf = fitz.open()
bl_numbers = [] bl_numbers = []
# 遍历所有处理后的PDF文件 # 遍历所有处理后的PDF文件,分批处理以减少内存占用
for file_info in valid_files: batch_size = 5 # 每批处理5个PDF
for batch_start in range(0, len(valid_files), batch_size):
batch_files = valid_files[batch_start:batch_start + batch_size]
_logger.info(f"处理第 {batch_start // batch_size + 1} 批,共 {len(batch_files)} 个PDF")
for file_info in batch_files:
bl = file_info['bl'] bl = file_info['bl']
file_data = file_info['file_data'] file_data = file_info['file_data']
bl_numbers.append(bl.bl_no) bl_numbers.append(bl.bl_no)
source_pdf = None
pdf_binary = None
try: try:
# 将base64数据转换为二进制 # 将base64数据转换为二进制
pdf_binary = base64.b64decode(file_data) pdf_binary = base64.b64decode(file_data)
...@@ -426,21 +439,46 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -426,21 +439,46 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 将源PDF的所有页面插入到合并的PDF中 # 将源PDF的所有页面插入到合并的PDF中
merged_pdf.insert_pdf(source_pdf) merged_pdf.insert_pdf(source_pdf)
source_pdf.close() _logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档({len(source_pdf)} 页)")
_logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档")
except Exception as e: except Exception as e:
_logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}") _logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
continue continue
finally:
# 立即释放资源
if source_pdf:
source_pdf.close()
source_pdf = None
pdf_binary = None
gc.collect() # 强制垃圾回收
# 每批处理完后,保存到临时文件并释放内存
if batch_start + batch_size < len(valid_files):
# 保存当前合并结果到临时文件
merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
merged_pdf.close()
# 重新打开临时文件继续合并
merged_pdf = fitz.open(temp_file_path)
gc.collect()
# 如果有页面,保存合并后的PDF # 如果有页面,保存合并后的PDF
if len(merged_pdf) > 0: if len(merged_pdf) > 0:
# 保存到内存 # 使用临时文件保存,减少内存占用
output_buffer = io.BytesIO() if not temp_file_path:
merged_pdf.save(output_buffer, garbage=4, deflate=True, clean=True) temp_file_path = tempfile.mktemp(suffix='.pdf')
merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
merged_pdf.close() merged_pdf.close()
# 从临时文件读取并转换为base64
with open(temp_file_path, 'rb') as f:
pdf_data = f.read()
# 转换为base64 # 转换为base64
merged_pdf_base64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8') merged_pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
# 清理临时数据
del pdf_data
gc.collect()
# 生成文件名(包含提单号和日期) # 生成文件名(包含提单号和日期)
bl_numbers_str = '_'.join(bl_numbers[:5]) # 最多显示5个提单号 bl_numbers_str = '_'.join(bl_numbers[:5]) # 最多显示5个提单号
...@@ -455,12 +493,24 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -455,12 +493,24 @@ class BatchGetPodInfoWizard(models.TransientModel):
'pdf_filename': pdf_filename 'pdf_filename': pdf_filename
}) })
# 清理base64数据
del merged_pdf_base64
gc.collect()
_logger.info(f"成功合并 {len(bl_numbers)} 个PDF文件,文件名: {pdf_filename}") _logger.info(f"成功合并 {len(bl_numbers)} 个PDF文件,文件名: {pdf_filename}")
else: else:
_logger.warning("没有有效的PDF文件可以合并") _logger.warning("没有有效的PDF文件可以合并")
except Exception as e: except Exception as e:
_logger.error(f"合并PDF文件失败: {str(e)}") _logger.error(f"合并PDF文件失败: {str(e)}")
finally:
# 清理临时文件
if temp_file_path and os.path.exists(temp_file_path):
try:
os.remove(temp_file_path)
_logger.info(f"已删除临时文件: {temp_file_path}")
except Exception as e:
_logger.warning(f"删除临时文件失败: {str(e)}")
def _match_bl_by_file_name(self, pdf_file_arr): def _match_bl_by_file_name(self, pdf_file_arr):
""" """
...@@ -836,6 +886,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -836,6 +886,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
def _process_pdf_with_ai_image_edit(self, pdf_data, bl_no): def _process_pdf_with_ai_image_edit(self, pdf_data, bl_no):
""" """
使用AI图片编辑处理PDF:PDF转图片 -> AI抹除文字 -> 图片转回PDF(按照image-to-coordinate.py的逻辑) 使用AI图片编辑处理PDF:PDF转图片 -> AI抹除文字 -> 图片转回PDF(按照image-to-coordinate.py的逻辑)
优化内存占用:对于多页PDF使用临时文件方式分批处理
:param pdf_data: PDF二进制数据 :param pdf_data: PDF二进制数据
:param bl_no: 提单号(用于日志) :param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据 :return: 处理后的PDF二进制数据
...@@ -843,6 +894,9 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -843,6 +894,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
import fitz # PyMuPDF import fitz # PyMuPDF
import base64 import base64
import mimetypes import mimetypes
import gc
import os
import tempfile
from PIL import Image from PIL import Image
import time import time
...@@ -854,12 +908,22 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -854,12 +908,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 打开PDF文档 # 打开PDF文档
pdf_document = fitz.open(stream=pdf_data, filetype="pdf") pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
processed_images = [] # 存储处理后的PIL图片对象
total_pages = len(pdf_document) total_pages = len(pdf_document)
total_ai_time = 0.0 # 累计AI总耗时 total_ai_time = 0.0 # 累计AI总耗时
_logger.info(f"PDF总页数: {total_pages}") _logger.info(f"PDF总页数: {total_pages}")
# 对于多页PDF,使用临时文件方式减少内存占用
use_temp_file = total_pages > 5 # 超过5页使用临时文件
temp_file_path = None
if use_temp_file:
import tempfile
temp_file_path = tempfile.mktemp(suffix='.pdf')
_logger.info(f"使用临时文件方式处理,减少内存占用: {temp_file_path}")
processed_images = [] # 存储处理后的PIL图片对象(分批处理)
batch_size = 5 # 每批处理5页图片
# 遍历每一页(按照image-to-coordinate.py的逻辑) # 遍历每一页(按照image-to-coordinate.py的逻辑)
for page_num in range(total_pages): for page_num in range(total_pages):
page_start_time = time.time() page_start_time = time.time()
...@@ -867,12 +931,22 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -867,12 +931,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
_logger.info(f"正在处理第{page_num + 1}页") _logger.info(f"正在处理第{page_num + 1}页")
# 将页面转换为图像(按照image-to-coordinate.py的pdf_to_images函数,使用dpi=150) # 将页面转换为图像(按照image-to-coordinate.py的pdf_to_images函数,使用dpi=150)
dpi = 150 # 对于内存优化,使用稍低的分辨率(120 DPI)以避免内存问题
dpi = 120
mat = fitz.Matrix(dpi / 72, dpi / 72) mat = fitz.Matrix(dpi / 72, dpi / 72)
pix = None
img = None
img_bytes_io = None
try:
pix = page.get_pixmap(matrix=mat) pix = page.get_pixmap(matrix=mat)
# 将pixmap转换为PIL Image对象 # 将pixmap转换为PIL Image对象
img_data = pix.tobytes("png") img_data = pix.tobytes("png")
del pix # 立即释放pixmap以节省内存
pix = None
gc.collect() # 强制垃圾回收
img = Image.open(io.BytesIO(img_data)) img = Image.open(io.BytesIO(img_data))
# 获取图片尺寸(按照image-to-coordinate.py的逻辑) # 获取图片尺寸(按照image-to-coordinate.py的逻辑)
...@@ -884,8 +958,10 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -884,8 +958,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
img.save(img_bytes_io, format='PNG') img.save(img_bytes_io, format='PNG')
img_bytes_io.seek(0) img_bytes_io.seek(0)
encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8') encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8')
mime_type = 'image/png' img_bytes_io.close() # 立即关闭BytesIO
img_base64 = f"data:{mime_type};base64,{encoded_string}" img_bytes_io = None
del img_data # 释放图片数据
gc.collect() # 强制垃圾回收
# 使用AI编辑图片,移除指定文字(带重试机制) # 使用AI编辑图片,移除指定文字(带重试机制)
edited_img_base64 = None edited_img_base64 = None
...@@ -924,10 +1000,16 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -924,10 +1000,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
_logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试") _logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
edited_img_base64 = None edited_img_base64 = None
# 释放encoded_string以节省内存
del encoded_string
gc.collect()
if edited_img_base64: if edited_img_base64:
# 解码base64图片数据并转换为PIL Image对象(按照image-to-coordinate.py的逻辑) # 解码base64图片数据并转换为PIL Image对象(按照image-to-coordinate.py的逻辑)
edited_img_data = base64.b64decode(edited_img_base64) edited_img_data = base64.b64decode(edited_img_base64)
edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB') edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB')
del edited_img_data # 立即释放原始数据
del edited_img_base64 # 释放base64字符串
processed_images.append(edited_img) processed_images.append(edited_img)
_logger.info(f"第{page_num + 1}页AI处理最终成功,总耗时: {ai_processing_time:.2f}秒") _logger.info(f"第{page_num + 1}页AI处理最终成功,总耗时: {ai_processing_time:.2f}秒")
else: else:
...@@ -935,6 +1017,86 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -935,6 +1017,86 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 如果AI处理失败,使用原始图片 # 如果AI处理失败,使用原始图片
processed_images.append(img.convert('RGB')) processed_images.append(img.convert('RGB'))
# 释放原始图片对象
if img:
img.close()
del img
img = None
gc.collect() # 强制垃圾回收
# 分批处理:每处理batch_size页,就转换为PDF并保存到临时文件
if use_temp_file and len(processed_images) >= batch_size:
_logger.info(f"达到批次大小 {batch_size},开始保存到临时文件")
try:
# 将已处理的图片转换为PDF
batch_buffer = io.BytesIO()
first_batch = processed_images[0]
rest_batch = processed_images[1:]
first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
batch_buffer.seek(0)
pdf_bytes = batch_buffer.getvalue()
batch_buffer.close()
# 释放已处理的图片
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = []
gc.collect()
if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
# 追加到现有PDF:先读取现有内容,合并后保存到新文件,再替换
with open(temp_file_path, 'rb') as f:
existing_bytes = f.read()
existing_pdf = fitz.open(stream=existing_bytes, filetype="pdf")
new_pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
existing_pdf.insert_pdf(new_pdf)
new_pdf.close()
# 保存到新临时文件,避免"save to original must be incremental"错误
new_temp_path = tempfile.mktemp(suffix='.pdf')
existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
existing_pdf.close()
# 替换旧文件
os.remove(temp_file_path)
os.rename(new_temp_path, temp_file_path)
# 释放资源
del existing_bytes
del pdf_bytes
gc.collect()
else:
# 创建新的PDF
with open(temp_file_path, 'wb') as f:
f.write(pdf_bytes)
del pdf_bytes
gc.collect()
except Exception as e:
_logger.error(f"分批保存PDF失败: {str(e)}")
# 失败时继续处理,最后统一处理
# 但需要释放已处理的图片,避免内存占用
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = []
gc.collect()
except Exception as e:
_logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
# 确保资源被释放
if pix:
del pix
if img:
img.close()
del img
if img_bytes_io:
img_bytes_io.close()
gc.collect()
# 如果处理失败,跳过这一页或使用原始页面
continue
page_end_time = time.time() page_end_time = time.time()
page_processing_time = page_end_time - page_start_time page_processing_time = page_end_time - page_start_time
_logger.info(f"第{page_num + 1}页总处理时间: {page_processing_time:.2f}秒") _logger.info(f"第{page_num + 1}页总处理时间: {page_processing_time:.2f}秒")
...@@ -943,11 +1105,85 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -943,11 +1105,85 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 将处理后的图片转换为PDF(按照image-to-coordinate.py的images_to_pdf函数逻辑) # 将处理后的图片转换为PDF(按照image-to-coordinate.py的images_to_pdf函数逻辑)
pdf_creation_start = time.time() pdf_creation_start = time.time()
if not processed_images: result_data = None
_logger.error("没有需要写入PDF的图片") import os
try:
if use_temp_file and temp_file_path:
# 如果还有剩余的图片,追加到临时文件
if processed_images:
_logger.info(f"处理剩余的 {len(processed_images)} 页图片")
try:
# 将剩余图片转换为PDF
batch_buffer = io.BytesIO()
first_batch = processed_images[0]
rest_batch = processed_images[1:]
first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
batch_buffer.seek(0)
temp_pdf_bytes = batch_buffer.getvalue()
batch_buffer.close()
# 释放图片
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = None
gc.collect()
# 追加到临时文件
if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
# 如果临时文件已存在,先读取内容
with open(temp_file_path, 'rb') as f:
existing_pdf_bytes = f.read()
# 合并PDF:打开现有PDF和新PDF,然后合并
existing_pdf = fitz.open(stream=existing_pdf_bytes, filetype="pdf")
new_pdf = fitz.open(stream=temp_pdf_bytes, filetype="pdf")
existing_pdf.insert_pdf(new_pdf)
new_pdf.close()
# 保存到新的临时文件,避免"save to original must be incremental"错误
new_temp_path = tempfile.mktemp(suffix='.pdf')
existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
existing_pdf.close()
# 删除旧临时文件,重命名新文件
os.remove(temp_file_path)
os.rename(new_temp_path, temp_file_path)
# 释放资源
del existing_pdf_bytes
del temp_pdf_bytes
gc.collect()
else:
# 如果临时文件不存在或为空,直接写入
with open(temp_file_path, 'wb') as f:
f.write(temp_pdf_bytes)
del temp_pdf_bytes
gc.collect()
except Exception as e:
_logger.error(f"追加剩余图片失败: {str(e)}")
# 注意:processed_images 在这里已经被释放了,需要重新获取
# 如果还有剩余图片,需要重新处理(这种情况不应该发生,因为前面已经释放了)
_logger.warning("追加剩余图片失败,剩余图片已在之前释放")
# 从临时文件读取最终结果
if os.path.exists(temp_file_path):
with open(temp_file_path, 'rb') as f:
result_data = f.read()
# 删除临时文件
try:
os.remove(temp_file_path)
_logger.info(f"已删除临时文件: {temp_file_path}")
except Exception as e:
_logger.warning(f"删除临时文件失败: {str(e)}")
else:
_logger.error("临时文件不存在,无法读取结果")
return None return None
# 使用PIL的save方法将图片保存为PDF(按照image-to-coordinate.py的逻辑) elif processed_images:
# 使用内存方式处理(5页以内)
output_buffer = io.BytesIO() output_buffer = io.BytesIO()
first = processed_images[0] first = processed_images[0]
rest = processed_images[1:] # 按照image-to-coordinate.py的逻辑,直接使用切片 rest = processed_images[1:] # 按照image-to-coordinate.py的逻辑,直接使用切片
...@@ -955,9 +1191,40 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -955,9 +1191,40 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 即使rest是空列表,也直接传入(PIL会正确处理) # 即使rest是空列表,也直接传入(PIL会正确处理)
first.save(output_buffer, format='PDF', save_all=True, append_images=rest) first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
output_buffer.seek(0) output_buffer.seek(0)
pdf_creation_end = time.time()
result_data = output_buffer.getvalue() result_data = output_buffer.getvalue()
output_buffer.close()
# 释放所有图片对象
for img_obj in processed_images:
if img_obj:
img_obj.close()
processed_images = None
del first
del rest
else:
_logger.error("没有需要写入PDF的图片")
return None
gc.collect() # 强制垃圾回收
except Exception as e:
_logger.error(f"PDF创建失败: {str(e)}")
# 确保资源被释放
if processed_images:
for img_obj in processed_images:
if img_obj:
img_obj.close()
if temp_file_path and os.path.exists(temp_file_path):
try:
os.remove(temp_file_path)
except:
pass
gc.collect()
return None
pdf_creation_end = time.time()
total_time = time.time() - start_time total_time = time.time() - start_time
pdf_creation_time = pdf_creation_end - pdf_creation_start pdf_creation_time = pdf_creation_end - pdf_creation_start
...@@ -1664,6 +1931,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -1664,6 +1931,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
import numpy as np import numpy as np
from PIL import Image from PIL import Image
import re import re
import gc
# 定义目标文字(与_find_target_texts一致) # 定义目标文字(与_find_target_texts一致)
TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD', TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
...@@ -1671,6 +1939,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -1671,6 +1939,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4',
'Page 5 of 5'] 'Page 5 of 5']
pdf_document = None
try: try:
# 设置Tesseract路径 # 设置Tesseract路径
self._setup_tesseract_path() self._setup_tesseract_path()
...@@ -1689,25 +1958,45 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -1689,25 +1958,45 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 遍历每一页 # 遍历每一页
for page_num in range(len(pdf_document)): for page_num in range(len(pdf_document)):
page = pdf_document[page_num] page = pdf_document[page_num]
pix = None
pil_img = None
img = None
nparr = None
img_data = None
try:
# 首先尝试从PDF文本层提取(如果是文本型PDF) # 首先尝试从PDF文本层提取(如果是文本型PDF)
page_text_pdf = page.get_text().upper() page_text_pdf = page.get_text().upper()
# 将页面转换为图像进行OCR识别 # 将页面转换为图像进行OCR识别(降低分辨率以节省内存)
mat = fitz.Matrix(3.0, 3.0) # 进一步提高分辨率,从2.0提升到3.0 # 使用 2.0 倍分辨率(约 144 DPI)而不是 3.0 倍(约 216 DPI)
mat = fitz.Matrix(2.0, 2.0)
pix = page.get_pixmap(matrix=mat) pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png") img_data = pix.tobytes("png")
del pix # 立即释放pixmap
pix = None
gc.collect() # 强制垃圾回收
# 转换为PIL图像 # 转换为PIL图像
if cv2_available: if cv2_available:
nparr = np.frombuffer(img_data, np.uint8) nparr = np.frombuffer(img_data, np.uint8)
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
del nparr # 释放numpy数组
del img # 释放OpenCV图像
nparr = None
img = None
gc.collect()
else: else:
pil_img = Image.open(io.BytesIO(img_data)) pil_img = Image.open(io.BytesIO(img_data))
if pil_img.mode != 'RGB': if pil_img.mode != 'RGB':
pil_img = pil_img.convert('RGB') pil_img = pil_img.convert('RGB')
# 释放img_data
del img_data
img_data = None
gc.collect()
# OCR识别 # OCR识别
try: try:
config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1' config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
...@@ -1740,11 +2029,19 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -1740,11 +2029,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
} }
}) })
# 释放words字典以节省内存
del words
gc.collect()
# 使用与_find_target_texts相同的匹配逻辑 # 使用与_find_target_texts相同的匹配逻辑
page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600) page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
del valid_words # 释放valid_words列表
gc.collect()
if page_found_texts: if page_found_texts:
for found_text in page_found_texts: for found_text in page_found_texts:
found_texts.append(f"第{page_num + 1}页: {found_text['text']}") found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
break # 找到就跳出,避免重复
except Exception as e: except Exception as e:
_logger.warning(f"OCR单词识别失败,第{page_num + 1}页,使用文本匹配: {str(e)}") _logger.warning(f"OCR单词识别失败,第{page_num + 1}页,使用文本匹配: {str(e)}")
...@@ -1792,7 +2089,37 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -1792,7 +2089,37 @@ class BatchGetPodInfoWizard(models.TransientModel):
found_texts.append(f"第{page_num + 1}页: {target_text}") found_texts.append(f"第{page_num + 1}页: {target_text}")
break # 找到就跳出,避免重复 break # 找到就跳出,避免重复
# 释放PIL图像和文本变量
if pil_img:
pil_img.close()
del pil_img
del page_text_pdf
del ocr_text
del combined_text
pil_img = None
gc.collect() # 强制垃圾回收
except Exception as e:
_logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
# 确保资源被释放
if pix:
del pix
if pil_img:
pil_img.close()
del pil_img
if img is not None:
del img
if nparr is not None:
del nparr
if img_data is not None:
del img_data
gc.collect()
continue
if pdf_document:
pdf_document.close() pdf_document.close()
pdf_document = None
gc.collect()
if found_texts: if found_texts:
_logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}") _logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}")
...@@ -1803,6 +2130,13 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -1803,6 +2130,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
except Exception as e: except Exception as e:
_logger.error(f"检查目标文字失败,提单号: {bl_no}, 错误: {str(e)}") _logger.error(f"检查目标文字失败,提单号: {bl_no}, 错误: {str(e)}")
# 确保资源被释放
if pdf_document:
try:
pdf_document.close()
except:
pass
gc.collect()
# 检查失败时,假设不存在(避免误报) # 检查失败时,假设不存在(避免误报)
return False, [] return False, []
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论