优化分批处理，减少内存占用

46bba812 · 贺阳 · 5ab84684 · 46bba812
--- a/ccs_base/wizard/batch_get_pod_info_wizard.py
+++ b/ccs_base/wizard/batch_get_pod_info_wizard.py
@@ -367,11 +367,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
    def _merge_pdf_files(self, processed_files):
        """
        合并所有涂抹后的PDF文件为一个PDF并保存到pdf_file字段
+        使用临时文件方式减少内存占用
        :param processed_files: 处理后的文件数组
        """
        import fitz  # PyMuPDF
        from datetime import datetime
+        import tempfile
+        import os
+        import gc
        
+        temp_file_path = None
        try:
            # 过滤有效的PDF文件
            valid_files = []
@@ -406,41 +411,74 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 多个PDF文件需要合并
            _logger.info(f"开始合并 {len(valid_files)} 个PDF文件")
            
-            # 创建新的PDF文档用于合并
+            # 使用临时文件方式合并，避免内存占用过大
+            temp_file_path = tempfile.mktemp(suffix='.pdf')
            merged_pdf = fitz.open()
            bl_numbers = []
            
-            # 遍历所有处理后的PDF文件
-            for file_info in valid_files:
-                bl = file_info['bl']
-                file_data = file_info['file_data']
-                bl_numbers.append(bl.bl_no)
+            # 遍历所有处理后的PDF文件，分批处理以减少内存占用
+            batch_size = 5  # 每批处理5个PDF
+            for batch_start in range(0, len(valid_files), batch_size):
+                batch_files = valid_files[batch_start:batch_start + batch_size]
+                _logger.info(f"处理第 {batch_start // batch_size + 1} 批，共 {len(batch_files)} 个PDF")
                
-                try:
-                    # 将base64数据转换为二进制
-                    pdf_binary = base64.b64decode(file_data)
-                    
-                    # 打开PDF文档
-                    source_pdf = fitz.open(stream=pdf_binary, filetype="pdf")
+                for file_info in batch_files:
+                    bl = file_info['bl']
+                    file_data = file_info['file_data']
+                    bl_numbers.append(bl.bl_no)
                    
-                    # 将源PDF的所有页面插入到合并的PDF中
-                    merged_pdf.insert_pdf(source_pdf)
-                    
-                    source_pdf.close()
-                    _logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档")
-                except Exception as e:
-                    _logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
-                    continue
+                    source_pdf = None
+                    pdf_binary = None
+                    try:
+                        # 将base64数据转换为二进制
+                        pdf_binary = base64.b64decode(file_data)
+                        
+                        # 打开PDF文档
+                        source_pdf = fitz.open(stream=pdf_binary, filetype="pdf")
+                        
+                        # 将源PDF的所有页面插入到合并的PDF中
+                        merged_pdf.insert_pdf(source_pdf)
+                        
+                        _logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档（{len(source_pdf)} 页）")
+                        
+                    except Exception as e:
+                        _logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
+                        continue
+                    finally:
+                        # 立即释放资源
+                        if source_pdf:
+                            source_pdf.close()
+                        source_pdf = None
+                        pdf_binary = None
+                        gc.collect()  # 强制垃圾回收
+                
+                # 每批处理完后，保存到临时文件并释放内存
+                if batch_start + batch_size < len(valid_files):
+                    # 保存当前合并结果到临时文件
+                    merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
+                    merged_pdf.close()
+                    # 重新打开临时文件继续合并
+                    merged_pdf = fitz.open(temp_file_path)
+                    gc.collect()
            
            # 如果有页面，保存合并后的PDF
            if len(merged_pdf) > 0:
-                # 保存到内存
-                output_buffer = io.BytesIO()
-                merged_pdf.save(output_buffer, garbage=4, deflate=True, clean=True)
+                # 使用临时文件保存，减少内存占用
+                if not temp_file_path:
+                    temp_file_path = tempfile.mktemp(suffix='.pdf')
+                merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
                merged_pdf.close()
                
+                # 从临时文件读取并转换为base64
+                with open(temp_file_path, 'rb') as f:
+                    pdf_data = f.read()
+                
                # 转换为base64
-                merged_pdf_base64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
+                merged_pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
+                
+                # 清理临时数据
+                del pdf_data
+                gc.collect()
                
                # 生成文件名（包含提单号和日期）
                bl_numbers_str = '_'.join(bl_numbers[:5])  # 最多显示5个提单号
@@ -455,12 +493,24 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    'pdf_filename': pdf_filename
                })
                
+                # 清理base64数据
+                del merged_pdf_base64
+                gc.collect()
+                
                _logger.info(f"成功合并 {len(bl_numbers)} 个PDF文件，文件名: {pdf_filename}")
            else:
                _logger.warning("没有有效的PDF文件可以合并")
                
        except Exception as e:
            _logger.error(f"合并PDF文件失败: {str(e)}")
+        finally:
+            # 清理临时文件
+            if temp_file_path and os.path.exists(temp_file_path):
+                try:
+                    os.remove(temp_file_path)
+                    _logger.info(f"已删除临时文件: {temp_file_path}")
+                except Exception as e:
+                    _logger.warning(f"删除临时文件失败: {str(e)}")

    def _match_bl_by_file_name(self, pdf_file_arr):
        """
@@ -836,6 +886,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
    def _process_pdf_with_ai_image_edit(self, pdf_data, bl_no):
        """
        使用AI图片编辑处理PDF：PDF转图片 -> AI抹除文字 -> 图片转回PDF（按照image-to-coordinate.py的逻辑）
+        优化内存占用：对于多页PDF使用临时文件方式分批处理
        :param pdf_data: PDF二进制数据
        :param bl_no: 提单号（用于日志）
        :return: 处理后的PDF二进制数据
@@ -843,6 +894,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
        import fitz  # PyMuPDF
        import base64
        import mimetypes
+        import gc
+        import os
+        import tempfile
        from PIL import Image
        import time
        
@@ -854,12 +908,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
        
        # 打开PDF文档
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
-        processed_images = []  # 存储处理后的PIL图片对象
        total_pages = len(pdf_document)
        total_ai_time = 0.0  # 累计AI总耗时
        
        _logger.info(f"PDF总页数: {total_pages}")
        
+        # 对于多页PDF，使用临时文件方式减少内存占用
+        use_temp_file = total_pages > 5  # 超过5页使用临时文件
+        temp_file_path = None
+        if use_temp_file:
+            import tempfile
+            temp_file_path = tempfile.mktemp(suffix='.pdf')
+            _logger.info(f"使用临时文件方式处理，减少内存占用: {temp_file_path}")
+        
+        processed_images = []  # 存储处理后的PIL图片对象（分批处理）
+        batch_size = 5  # 每批处理5页图片
+        
        # 遍历每一页（按照image-to-coordinate.py的逻辑）
        for page_num in range(total_pages):
            page_start_time = time.time()
@@ -867,73 +931,171 @@ class BatchGetPodInfoWizard(models.TransientModel):
            _logger.info(f"正在处理第{page_num + 1}页")
            
            # 将页面转换为图像（按照image-to-coordinate.py的pdf_to_images函数，使用dpi=150）
-            dpi = 150
+            # 对于内存优化，使用稍低的分辨率（120 DPI）以避免内存问题
+            dpi = 120
            mat = fitz.Matrix(dpi / 72, dpi / 72)
-            pix = page.get_pixmap(matrix=mat)
-            
-            # 将pixmap转换为PIL Image对象
-            img_data = pix.tobytes("png")
-            img = Image.open(io.BytesIO(img_data))
-            
-            # 获取图片尺寸（按照image-to-coordinate.py的逻辑）
-            img_w, img_h = img.size
-            _logger.info(f"第{page_num + 1}页页面尺寸: {img_w}x{img_h} 像素")
-
-            # 将图片编码为base64（按照image-to-coordinate.py的encode_file函数逻辑）
-            img_bytes_io = io.BytesIO()
-            img.save(img_bytes_io, format='PNG')
-            img_bytes_io.seek(0)
-            encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8')
-            mime_type = 'image/png'
-            img_base64 = f"data:{mime_type};base64,{encoded_string}"
-            
-            # 使用AI编辑图片，移除指定文字（带重试机制）
-            edited_img_base64 = None
-            ai_processing_time = 0.0
-            max_retries = 2  # 最多尝试2次（首次+1次重试）
+            pix = None
+            img = None
+            img_bytes_io = None
            
-            for attempt in range(1, max_retries + 1):
-                ai_start_time = time.time()
-                try:
-                    # 调用AI服务（使用base64编码的图片数据，不带data:前缀）
-                    edited_img_base64_raw = ai_service.edit_image_remove_text(
-                        encoded_string,  # 传入不带data:前缀的base64字符串
-                        text_to_remove="AGN UCLINK LOGISITICS LTD"
-                    )
-                    ai_end_time = time.time()
-                    attempt_time = ai_end_time - ai_start_time
-                    ai_processing_time += attempt_time  # 累计AI耗时
-                    total_ai_time += attempt_time  # 累计总AI耗时
-                    
-                    if edited_img_base64_raw:
-                        edited_img_base64 = edited_img_base64_raw
-                        _logger.info(f"第{page_num + 1}页AI处理成功（第{attempt}次尝试），耗时: {attempt_time:.2f}秒")
-                        break
-                    else:
+            try:
+                pix = page.get_pixmap(matrix=mat)
+                
+                # 将pixmap转换为PIL Image对象
+                img_data = pix.tobytes("png")
+                del pix  # 立即释放pixmap以节省内存
+                pix = None
+                gc.collect()  # 强制垃圾回收
+                
+                img = Image.open(io.BytesIO(img_data))
+                
+                # 获取图片尺寸（按照image-to-coordinate.py的逻辑）
+                img_w, img_h = img.size
+                _logger.info(f"第{page_num + 1}页页面尺寸: {img_w}x{img_h} 像素")
+
+                # 将图片编码为base64（按照image-to-coordinate.py的encode_file函数逻辑）
+                img_bytes_io = io.BytesIO()
+                img.save(img_bytes_io, format='PNG')
+                img_bytes_io.seek(0)
+                encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8')
+                img_bytes_io.close()  # 立即关闭BytesIO
+                img_bytes_io = None
+                del img_data  # 释放图片数据
+                gc.collect()  # 强制垃圾回收
+                
+                # 使用AI编辑图片，移除指定文字（带重试机制）
+                edited_img_base64 = None
+                ai_processing_time = 0.0
+                max_retries = 2  # 最多尝试2次（首次+1次重试）
+                
+                for attempt in range(1, max_retries + 1):
+                    ai_start_time = time.time()
+                    try:
+                        # 调用AI服务（使用base64编码的图片数据，不带data:前缀）
+                        edited_img_base64_raw = ai_service.edit_image_remove_text(
+                            encoded_string,  # 传入不带data:前缀的base64字符串
+                            text_to_remove="AGN UCLINK LOGISITICS LTD"
+                        )
+                        ai_end_time = time.time()
+                        attempt_time = ai_end_time - ai_start_time
+                        ai_processing_time += attempt_time  # 累计AI耗时
+                        total_ai_time += attempt_time  # 累计总AI耗时
+                        
+                        if edited_img_base64_raw:
+                            edited_img_base64 = edited_img_base64_raw
+                            _logger.info(f"第{page_num + 1}页AI处理成功（第{attempt}次尝试），耗时: {attempt_time:.2f}秒")
+                            break
+                        else:
+                            if attempt < max_retries:
+                                _logger.warning(f"第{page_num + 1}页AI处理失败（第{attempt}次尝试），将重试，耗时: {attempt_time:.2f}秒")
+                            else:
+                                _logger.warning(f"第{page_num + 1}页AI处理失败（第{attempt}次尝试，已用尽重试），耗时: {attempt_time:.2f}秒")
+                    except Exception as e:
+                        ai_end_time = time.time()
+                        attempt_time = ai_end_time - ai_start_time
+                        ai_processing_time += attempt_time
+                        total_ai_time += attempt_time
+                        _logger.error(f"第{page_num + 1}页AI处理异常（第{attempt}次尝试）: {str(e)}，耗时: {attempt_time:.2f}秒")
                        if attempt < max_retries:
-                            _logger.warning(f"第{page_num + 1}页AI处理失败（第{attempt}次尝试），将重试，耗时: {attempt_time:.2f}秒")
+                            _logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
+                        edited_img_base64 = None
+                
+                # 释放encoded_string以节省内存
+                del encoded_string
+                gc.collect()
+                
+                if edited_img_base64:
+                    # 解码base64图片数据并转换为PIL Image对象（按照image-to-coordinate.py的逻辑）
+                    edited_img_data = base64.b64decode(edited_img_base64)
+                    edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB')
+                    del edited_img_data  # 立即释放原始数据
+                    del edited_img_base64  # 释放base64字符串
+                    processed_images.append(edited_img)
+                    _logger.info(f"第{page_num + 1}页AI处理最终成功，总耗时: {ai_processing_time:.2f}秒")
+                else:
+                    _logger.warning(f"第{page_num + 1}页AI处理最终失败（已重试），使用原始页面，总耗时: {ai_processing_time:.2f}秒")
+                    # 如果AI处理失败，使用原始图片
+                    processed_images.append(img.convert('RGB'))
+                
+                # 释放原始图片对象
+                if img:
+                    img.close()
+                    del img
+                img = None
+                gc.collect()  # 强制垃圾回收
+                
+                # 分批处理：每处理batch_size页，就转换为PDF并保存到临时文件
+                if use_temp_file and len(processed_images) >= batch_size:
+                    _logger.info(f"达到批次大小 {batch_size}，开始保存到临时文件")
+                    try:
+                        # 将已处理的图片转换为PDF
+                        batch_buffer = io.BytesIO()
+                        first_batch = processed_images[0]
+                        rest_batch = processed_images[1:]
+                        first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
+                        batch_buffer.seek(0)
+                        pdf_bytes = batch_buffer.getvalue()
+                        batch_buffer.close()
+                        
+                        # 释放已处理的图片
+                        for img_obj in processed_images:
+                            if img_obj:
+                                img_obj.close()
+                        processed_images = []
+                        gc.collect()
+                        
+                        if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
+                            # 追加到现有PDF：先读取现有内容，合并后保存到新文件，再替换
+                            with open(temp_file_path, 'rb') as f:
+                                existing_bytes = f.read()
+                            
+                            existing_pdf = fitz.open(stream=existing_bytes, filetype="pdf")
+                            new_pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
+                            existing_pdf.insert_pdf(new_pdf)
+                            new_pdf.close()
+                            
+                            # 保存到新临时文件，避免"save to original must be incremental"错误
+                            new_temp_path = tempfile.mktemp(suffix='.pdf')
+                            existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
+                            existing_pdf.close()
+                            
+                            # 替换旧文件
+                            os.remove(temp_file_path)
+                            os.rename(new_temp_path, temp_file_path)
+                            
+                            # 释放资源
+                            del existing_bytes
+                            del pdf_bytes
+                            gc.collect()
                        else:
-                            _logger.warning(f"第{page_num + 1}页AI处理失败（第{attempt}次尝试，已用尽重试），耗时: {attempt_time:.2f}秒")
-                except Exception as e:
-                    ai_end_time = time.time()
-                    attempt_time = ai_end_time - ai_start_time
-                    ai_processing_time += attempt_time
-                    total_ai_time += attempt_time
-                    _logger.error(f"第{page_num + 1}页AI处理异常（第{attempt}次尝试）: {str(e)}，耗时: {attempt_time:.2f}秒")
-                    if attempt < max_retries:
-                        _logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
-                    edited_img_base64 = None
-            
-            if edited_img_base64:
-                # 解码base64图片数据并转换为PIL Image对象（按照image-to-coordinate.py的逻辑）
-                edited_img_data = base64.b64decode(edited_img_base64)
-                edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB')
-                processed_images.append(edited_img)
-                _logger.info(f"第{page_num + 1}页AI处理最终成功，总耗时: {ai_processing_time:.2f}秒")
-            else:
-                _logger.warning(f"第{page_num + 1}页AI处理最终失败（已重试），使用原始页面，总耗时: {ai_processing_time:.2f}秒")
-                # 如果AI处理失败，使用原始图片
-                processed_images.append(img.convert('RGB'))
+                            # 创建新的PDF
+                            with open(temp_file_path, 'wb') as f:
+                                f.write(pdf_bytes)
+                            del pdf_bytes
+                            gc.collect()
+                    except Exception as e:
+                        _logger.error(f"分批保存PDF失败: {str(e)}")
+                        # 失败时继续处理，最后统一处理
+                        # 但需要释放已处理的图片，避免内存占用
+                        for img_obj in processed_images:
+                            if img_obj:
+                                img_obj.close()
+                        processed_images = []
+                        gc.collect()
+                
+            except Exception as e:
+                _logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
+                # 确保资源被释放
+                if pix:
+                    del pix
+                if img:
+                    img.close()
+                    del img
+                if img_bytes_io:
+                    img_bytes_io.close()
+                gc.collect()
+                # 如果处理失败，跳过这一页或使用原始页面
+                continue
            
            page_end_time = time.time()
            page_processing_time = page_end_time - page_start_time
@@ -943,21 +1105,126 @@ class BatchGetPodInfoWizard(models.TransientModel):
        
        # 将处理后的图片转换为PDF（按照image-to-coordinate.py的images_to_pdf函数逻辑）
        pdf_creation_start = time.time()
-        if not processed_images:
-            _logger.error("没有需要写入PDF的图片")
-            return None
+        result_data = None
+        import os
        
-        # 使用PIL的save方法将图片保存为PDF（按照image-to-coordinate.py的逻辑）
-        output_buffer = io.BytesIO()
-        first = processed_images[0]
-        rest = processed_images[1:]  # 按照image-to-coordinate.py的逻辑，直接使用切片
-        # 按照image-to-coordinate.py的images_to_pdf函数：first.save(output_pdf, save_all=True, append_images=rest)
-        # 即使rest是空列表，也直接传入（PIL会正确处理）
-        first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
-        output_buffer.seek(0)
+        try:
+            if use_temp_file and temp_file_path:
+                # 如果还有剩余的图片，追加到临时文件
+                if processed_images:
+                    _logger.info(f"处理剩余的 {len(processed_images)} 页图片")
+                    try:
+                        # 将剩余图片转换为PDF
+                        batch_buffer = io.BytesIO()
+                        first_batch = processed_images[0]
+                        rest_batch = processed_images[1:]
+                        first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
+                        batch_buffer.seek(0)
+                        temp_pdf_bytes = batch_buffer.getvalue()
+                        batch_buffer.close()
+                        
+                        # 释放图片
+                        for img_obj in processed_images:
+                            if img_obj:
+                                img_obj.close()
+                        processed_images = None
+                        gc.collect()
+                        
+                        # 追加到临时文件
+                        if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
+                            # 如果临时文件已存在，先读取内容
+                            with open(temp_file_path, 'rb') as f:
+                                existing_pdf_bytes = f.read()
+                            
+                            # 合并PDF：打开现有PDF和新PDF，然后合并
+                            existing_pdf = fitz.open(stream=existing_pdf_bytes, filetype="pdf")
+                            new_pdf = fitz.open(stream=temp_pdf_bytes, filetype="pdf")
+                            existing_pdf.insert_pdf(new_pdf)
+                            new_pdf.close()
+                            
+                            # 保存到新的临时文件，避免"save to original must be incremental"错误
+                            new_temp_path = tempfile.mktemp(suffix='.pdf')
+                            existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
+                            existing_pdf.close()
+                            
+                            # 删除旧临时文件，重命名新文件
+                            os.remove(temp_file_path)
+                            os.rename(new_temp_path, temp_file_path)
+                            
+                            # 释放资源
+                            del existing_pdf_bytes
+                            del temp_pdf_bytes
+                            gc.collect()
+                        else:
+                            # 如果临时文件不存在或为空，直接写入
+                            with open(temp_file_path, 'wb') as f:
+                                f.write(temp_pdf_bytes)
+                            del temp_pdf_bytes
+                            gc.collect()
+                    except Exception as e:
+                        _logger.error(f"追加剩余图片失败: {str(e)}")
+                        # 注意：processed_images 在这里已经被释放了，需要重新获取
+                        # 如果还有剩余图片，需要重新处理（这种情况不应该发生，因为前面已经释放了）
+                        _logger.warning("追加剩余图片失败，剩余图片已在之前释放")
+                
+                # 从临时文件读取最终结果
+                if os.path.exists(temp_file_path):
+                    with open(temp_file_path, 'rb') as f:
+                        result_data = f.read()
+                    
+                    # 删除临时文件
+                    try:
+                        os.remove(temp_file_path)
+                        _logger.info(f"已删除临时文件: {temp_file_path}")
+                    except Exception as e:
+                        _logger.warning(f"删除临时文件失败: {str(e)}")
+                else:
+                    _logger.error("临时文件不存在，无法读取结果")
+                    return None
+                    
+            elif processed_images:
+                # 使用内存方式处理（5页以内）
+                output_buffer = io.BytesIO()
+                first = processed_images[0]
+                rest = processed_images[1:]  # 按照image-to-coordinate.py的逻辑，直接使用切片
+                # 按照image-to-coordinate.py的images_to_pdf函数：first.save(output_pdf, save_all=True, append_images=rest)
+                # 即使rest是空列表，也直接传入（PIL会正确处理）
+                first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
+                output_buffer.seek(0)
+                
+                result_data = output_buffer.getvalue()
+                output_buffer.close()
+                
+                # 释放所有图片对象
+                for img_obj in processed_images:
+                    if img_obj:
+                        img_obj.close()
+                processed_images = None
+                del first
+                del rest
+            else:
+                _logger.error("没有需要写入PDF的图片")
+                return None
+            
+            gc.collect()  # 强制垃圾回收
+            
+        except Exception as e:
+            _logger.error(f"PDF创建失败: {str(e)}")
+            # 确保资源被释放
+            if processed_images:
+                for img_obj in processed_images:
+                    if img_obj:
+                        img_obj.close()
+            if temp_file_path and os.path.exists(temp_file_path):
+                try:
+                    os.remove(temp_file_path)
+                except:
+                    pass
+            gc.collect()
+            return None
+            
        pdf_creation_end = time.time()
        
-        result_data = output_buffer.getvalue()
        total_time = time.time() - start_time
        pdf_creation_time = pdf_creation_end - pdf_creation_start
        
@@ -1664,6 +1931,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        import numpy as np
        from PIL import Image
        import re
+        import gc
        
        # 定义目标文字（与_find_target_texts一致）
        TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
@@ -1671,6 +1939,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4',
                         'Page 5 of 5']
        
+        pdf_document = None
        try:
            # 设置Tesseract路径
            self._setup_tesseract_path()
@@ -1689,110 +1958,168 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 遍历每一页
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
+                pix = None
+                pil_img = None
+                img = None
+                nparr = None
+                img_data = None
                
-                # 首先尝试从PDF文本层提取（如果是文本型PDF）
-                page_text_pdf = page.get_text().upper()
-                
-                # 将页面转换为图像进行OCR识别
-                mat = fitz.Matrix(3.0, 3.0)  # 进一步提高分辨率，从2.0提升到3.0
-                pix = page.get_pixmap(matrix=mat)
-                img_data = pix.tobytes("png")
-                
-                # 转换为PIL图像
-                if cv2_available:
-                    nparr = np.frombuffer(img_data, np.uint8)
-                    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
-                    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
-                else:
-                    pil_img = Image.open(io.BytesIO(img_data))
-                    if pil_img.mode != 'RGB':
-                        pil_img = pil_img.convert('RGB')
-                
-                # OCR识别
-                try:
-                    config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
-                    ocr_text = pytesseract.image_to_string(pil_img, config=config, lang='eng').upper()
-                except Exception as e:
-                    _logger.warning(f"OCR识别失败，第{page_num + 1}页，使用PDF文本: {str(e)}")
-                    ocr_text = page_text_pdf
-                
-                # 合并PDF文本和OCR文本进行检查
-                combined_text = (page_text_pdf + ' ' + ocr_text).upper()
-                
-                # 使用与_find_target_texts完全相同的逻辑：先进行OCR单词识别
                try:
-                    # 获取OCR识别的单词列表
-                    words = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, lang='eng')
+                    # 首先尝试从PDF文本层提取（如果是文本型PDF）
+                    page_text_pdf = page.get_text().upper()
                    
-                    # 过滤出有效的单词
-                    valid_words = []
-                    for i in range(len(words['text'])):
-                        word_text = words['text'][i].strip()
-                        if word_text and int(words['conf'][i]) > 30:  # 置信度大于30
-                            valid_words.append({
-                                'text': word_text,
-                                'confidence': int(words['conf'][i]),
-                                'bbox': {
-                                    'x0': words['left'][i],
-                                    'y0': words['top'][i],
-                                    'x1': words['left'][i] + words['width'][i],
-                                    'y1': words['top'][i] + words['height'][i]
-                                }
-                            })
+                    # 将页面转换为图像进行OCR识别（降低分辨率以节省内存）
+                    # 使用 2.0 倍分辨率（约 144 DPI）而不是 3.0 倍（约 216 DPI）
+                    mat = fitz.Matrix(2.0, 2.0)
+                    pix = page.get_pixmap(matrix=mat)
+                    img_data = pix.tobytes("png")
+                    del pix  # 立即释放pixmap
+                    pix = None
+                    gc.collect()  # 强制垃圾回收
                    
-                    # 使用与_find_target_texts相同的匹配逻辑
-                    page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
-                    if page_found_texts:
-                        for found_text in page_found_texts:
-                            found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
-                            
-                except Exception as e:
-                    _logger.warning(f"OCR单词识别失败，第{page_num + 1}页，使用文本匹配: {str(e)}")
-                    # 如果OCR单词识别失败，回退到文本匹配
-                    for target_text in TARGET_TEXTS:
-                        target_upper = target_text.upper()
+                    # 转换为PIL图像
+                    if cv2_available:
+                        nparr = np.frombuffer(img_data, np.uint8)
+                        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+                        pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+                        del nparr  # 释放numpy数组
+                        del img  # 释放OpenCV图像
+                        nparr = None
+                        img = None
+                        gc.collect()
+                    else:
+                        pil_img = Image.open(io.BytesIO(img_data))
+                        if pil_img.mode != 'RGB':
+                            pil_img = pil_img.convert('RGB')
+                    
+                    # 释放img_data
+                    del img_data
+                    img_data = None
+                    gc.collect()
+                    
+                    # OCR识别
+                    try:
+                        config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
+                        ocr_text = pytesseract.image_to_string(pil_img, config=config, lang='eng').upper()
+                    except Exception as e:
+                        _logger.warning(f"OCR识别失败，第{page_num + 1}页，使用PDF文本: {str(e)}")
+                        ocr_text = page_text_pdf
+                    
+                    # 合并PDF文本和OCR文本进行检查
+                    combined_text = (page_text_pdf + ' ' + ocr_text).upper()
+                    
+                    # 使用与_find_target_texts完全相同的逻辑：先进行OCR单词识别
+                    try:
+                        # 获取OCR识别的单词列表
+                        words = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, lang='eng')
                        
-                        # 检查是否包含目标文字
-                        is_match = False
-                        if target_text == 'AGN':
-                            # AGN使用精确匹配
-                            if re.search(r'\bAGN\b', combined_text):
-                                is_match = True
-                        elif target_text == 'ACN':
-                            # ACN使用精确匹配
-                            if re.search(r'\bACN\b', combined_text):
-                                is_match = True
-                        elif target_text == 'LTD':
-                            # LTD使用精确匹配，但要排除其他包含LTD的文字
-                            if re.search(r'\bLTD\b', combined_text) and 'UCLINK' in combined_text:
-                                is_match = True
-                        else:
-                            # 其他文字使用包含匹配
-                            if target_upper in combined_text:
-                                # 排除AIR、EQK、ARN等（需要这些词都不存在）
-                                if 'AIR EQK' not in combined_text and 'ARN' not in combined_text:
-                                    is_match = True
+                        # 过滤出有效的单词
+                        valid_words = []
+                        for i in range(len(words['text'])):
+                            word_text = words['text'][i].strip()
+                            if word_text and int(words['conf'][i]) > 30:  # 置信度大于30
+                                valid_words.append({
+                                    'text': word_text,
+                                    'confidence': int(words['conf'][i]),
+                                    'bbox': {
+                                        'x0': words['left'][i],
+                                        'y0': words['top'][i],
+                                        'x1': words['left'][i] + words['width'][i],
+                                        'y1': words['top'][i] + words['height'][i]
+                                    }
+                                })
                        
-                        # 如果匹配，检查是否在排除列表中
-                        if is_match:
-                            is_excluded = False
-                            for exclude_text in EXCLUDE_TEXTS:
-                                exclude_upper = exclude_text.upper()
-                                if exclude_upper in combined_text and target_upper in combined_text:
-                                    # 检查是否是页码
-                                    if re.search(r'PAGE\s+\d+\s+OF\s+\d+', combined_text) or re.search(r'\d+\s*/\s*\d+', combined_text):
-                                        is_excluded = True
-                                        break
-                                    # 检查是否是AIR EQK等排除项
-                                    if 'AIR EQK' in combined_text or 'ARN' in combined_text:
-                                        is_excluded = True
-                                        break
-                            
-                            if not is_excluded:
-                                found_texts.append(f"第{page_num + 1}页: {target_text}")
+                        # 释放words字典以节省内存
+                        del words
+                        gc.collect()
+                        
+                        # 使用与_find_target_texts相同的匹配逻辑
+                        page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
+                        del valid_words  # 释放valid_words列表
+                        gc.collect()
+                        
+                        if page_found_texts:
+                            for found_text in page_found_texts:
+                                found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
                                break  # 找到就跳出，避免重复
+                                
+                    except Exception as e:
+                        _logger.warning(f"OCR单词识别失败，第{page_num + 1}页，使用文本匹配: {str(e)}")
+                        # 如果OCR单词识别失败，回退到文本匹配
+                        for target_text in TARGET_TEXTS:
+                            target_upper = target_text.upper()
+                            
+                            # 检查是否包含目标文字
+                            is_match = False
+                            if target_text == 'AGN':
+                                # AGN使用精确匹配
+                                if re.search(r'\bAGN\b', combined_text):
+                                    is_match = True
+                            elif target_text == 'ACN':
+                                # ACN使用精确匹配
+                                if re.search(r'\bACN\b', combined_text):
+                                    is_match = True
+                            elif target_text == 'LTD':
+                                # LTD使用精确匹配，但要排除其他包含LTD的文字
+                                if re.search(r'\bLTD\b', combined_text) and 'UCLINK' in combined_text:
+                                    is_match = True
+                            else:
+                                # 其他文字使用包含匹配
+                                if target_upper in combined_text:
+                                    # 排除AIR、EQK、ARN等（需要这些词都不存在）
+                                    if 'AIR EQK' not in combined_text and 'ARN' not in combined_text:
+                                        is_match = True
+                            
+                            # 如果匹配，检查是否在排除列表中
+                            if is_match:
+                                is_excluded = False
+                                for exclude_text in EXCLUDE_TEXTS:
+                                    exclude_upper = exclude_text.upper()
+                                    if exclude_upper in combined_text and target_upper in combined_text:
+                                        # 检查是否是页码
+                                        if re.search(r'PAGE\s+\d+\s+OF\s+\d+', combined_text) or re.search(r'\d+\s*/\s*\d+', combined_text):
+                                            is_excluded = True
+                                            break
+                                        # 检查是否是AIR EQK等排除项
+                                        if 'AIR EQK' in combined_text or 'ARN' in combined_text:
+                                            is_excluded = True
+                                            break
+                                
+                                if not is_excluded:
+                                    found_texts.append(f"第{page_num + 1}页: {target_text}")
+                                    break  # 找到就跳出，避免重复
+                    
+                    # 释放PIL图像和文本变量
+                    if pil_img:
+                        pil_img.close()
+                        del pil_img
+                    del page_text_pdf
+                    del ocr_text
+                    del combined_text
+                    pil_img = None
+                    gc.collect()  # 强制垃圾回收
+                    
+                except Exception as e:
+                    _logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
+                    # 确保资源被释放
+                    if pix:
+                        del pix
+                    if pil_img:
+                        pil_img.close()
+                        del pil_img
+                    if img is not None:
+                        del img
+                    if nparr is not None:
+                        del nparr
+                    if img_data is not None:
+                        del img_data
+                    gc.collect()
+                    continue
            
-            pdf_document.close()
+            if pdf_document:
+                pdf_document.close()
+                pdf_document = None
+                gc.collect()
            
            if found_texts:
                _logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}")
@@ -1803,6 +2130,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
                
        except Exception as e:
            _logger.error(f"检查目标文字失败，提单号: {bl_no}, 错误: {str(e)}")
+            # 确保资源被释放
+            if pdf_document:
+                try:
+                    pdf_document.close()
+                except:
+                    pass
+            gc.collect()
            # 检查失败时，假设不存在（避免误报）
            return False, []