优化PDF合并与AI处理流程：分批处理并使用临时文件，减少内存占用

46bba812 · 贺阳 · 5ab84684 · 46bba812
--- a/ccs_base/wizard/batch_get_pod_info_wizard.py
+++ b/ccs_base/wizard/batch_get_pod_info_wizard.py
@@ -367,11 +367,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
    def _merge_pdf_files(self, processed_files):
        """
        合并所有涂抹后的PDF文件为一个PDF并保存到pdf_file字段
+        使用临时文件方式减少内存占用
        :param processed_files: 处理后的文件数组
        """
        import fitz  # PyMuPDF
        from datetime import datetime
+        import tempfile
+        import os
+        import gc
        
+        temp_file_path = None
        try:
            # 过滤有效的PDF文件
            valid_files = []
@@ -406,16 +411,24 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 多个PDF文件需要合并
            _logger.info(f"开始合并 {len(valid_files)} 个PDF文件")
            
-            # 创建新的PDF文档用于合并
+            # 使用临时文件方式合并，避免内存占用过大
+            temp_file_path = tempfile.mktemp(suffix='.pdf')
            merged_pdf = fitz.open()
            bl_numbers = []
            
-            # 遍历所有处理后的PDF文件
-            for file_info in valid_files:
+            # 遍历所有处理后的PDF文件，分批处理以减少内存占用
+            batch_size = 5  # 每批处理5个PDF
+            for batch_start in range(0, len(valid_files), batch_size):
+                batch_files = valid_files[batch_start:batch_start + batch_size]
+                _logger.info(f"处理第 {batch_start // batch_size + 1} 批，共 {len(batch_files)} 个PDF")
+                
+                for file_info in batch_files:
                    bl = file_info['bl']
                    file_data = file_info['file_data']
                    bl_numbers.append(bl.bl_no)
                    
+                    source_pdf = None
+                    pdf_binary = None
                    try:
                        # 将base64数据转换为二进制
                        pdf_binary = base64.b64decode(file_data)
@@ -426,21 +439,46 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        # 将源PDF的所有页面插入到合并的PDF中
                        merged_pdf.insert_pdf(source_pdf)
                        
-                    source_pdf.close()
-                    _logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档")
+                        _logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档（{len(source_pdf)} 页）")
+                        
                    except Exception as e:
                        _logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
                        continue
+                    finally:
+                        # 立即释放资源
+                        if source_pdf:
+                            source_pdf.close()
+                        source_pdf = None
+                        pdf_binary = None
+                        gc.collect()  # 强制垃圾回收
+                
+                # 每批处理完后，保存到临时文件并释放内存
+                if batch_start + batch_size < len(valid_files):
+                    # 保存当前合并结果到临时文件
+                    merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
+                    merged_pdf.close()
+                    # 重新打开临时文件继续合并
+                    merged_pdf = fitz.open(temp_file_path)
+                    gc.collect()
            
            # 如果有页面，保存合并后的PDF
            if len(merged_pdf) > 0:
-                # 保存到内存
-                output_buffer = io.BytesIO()
-                merged_pdf.save(output_buffer, garbage=4, deflate=True, clean=True)
+                # 使用临时文件保存，减少内存占用
+                if not temp_file_path:
+                    temp_file_path = tempfile.mktemp(suffix='.pdf')
+                merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
                merged_pdf.close()
                
+                # 从临时文件读取并转换为base64
+                with open(temp_file_path, 'rb') as f:
+                    pdf_data = f.read()
+                
                # 转换为base64
-                merged_pdf_base64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
+                merged_pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
+                
+                # 清理临时数据
+                del pdf_data
+                gc.collect()
                
                # 生成文件名（包含提单号和日期）
                bl_numbers_str = '_'.join(bl_numbers[:5])  # 最多显示5个提单号
@@ -455,12 +493,24 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    'pdf_filename': pdf_filename
                })
                
+                # 清理base64数据
+                del merged_pdf_base64
+                gc.collect()
+                
                _logger.info(f"成功合并 {len(bl_numbers)} 个PDF文件，文件名: {pdf_filename}")
            else:
                _logger.warning("没有有效的PDF文件可以合并")
                
        except Exception as e:
            _logger.error(f"合并PDF文件失败: {str(e)}")
+        finally:
+            # 清理临时文件
+            if temp_file_path and os.path.exists(temp_file_path):
+                try:
+                    os.remove(temp_file_path)
+                    _logger.info(f"已删除临时文件: {temp_file_path}")
+                except Exception as e:
+                    _logger.warning(f"删除临时文件失败: {str(e)}")

    def _match_bl_by_file_name(self, pdf_file_arr):
        """
@@ -836,6 +886,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
    def _process_pdf_with_ai_image_edit(self, pdf_data, bl_no):
        """
        使用AI图片编辑处理PDF：PDF转图片 -> AI抹除文字 -> 图片转回PDF（按照image-to-coordinate.py的逻辑）
+        优化内存占用：对于多页PDF使用临时文件方式分批处理
        :param pdf_data: PDF二进制数据
        :param bl_no: 提单号（用于日志）
        :return: 处理后的PDF二进制数据
@@ -843,6 +894,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
        import fitz  # PyMuPDF
        import base64
        import mimetypes
+        import gc
+        import os
+        import tempfile
        from PIL import Image
        import time
        
@@ -854,12 +908,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
        
        # 打开PDF文档
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
-        processed_images = []  # 存储处理后的PIL图片对象
        total_pages = len(pdf_document)
        total_ai_time = 0.0  # 累计AI总耗时
        
        _logger.info(f"PDF总页数: {total_pages}")
        
+        # 对于多页PDF，使用临时文件方式减少内存占用
+        use_temp_file = total_pages > 5  # 超过5页使用临时文件
+        temp_file_path = None
+        if use_temp_file:
+            import tempfile
+            temp_file_path = tempfile.mktemp(suffix='.pdf')
+            _logger.info(f"使用临时文件方式处理，减少内存占用: {temp_file_path}")
+        
+        processed_images = []  # 存储处理后的PIL图片对象（分批处理）
+        batch_size = 5  # 每批处理5页图片
+        
        # 遍历每一页（按照image-to-coordinate.py的逻辑）
        for page_num in range(total_pages):
            page_start_time = time.time()
@@ -867,12 +931,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
            _logger.info(f"正在处理第{page_num + 1}页")
            
            # 将页面转换为图像（按照image-to-coordinate.py的pdf_to_images函数，使用dpi=150）
-            dpi = 150
+            # 对于内存优化，使用稍低的分辨率（120 DPI）以避免内存问题
+            dpi = 120
            mat = fitz.Matrix(dpi / 72, dpi / 72)
+            pix = None
+            img = None
+            img_bytes_io = None
+            
+            try:
                pix = page.get_pixmap(matrix=mat)
                
                # 将pixmap转换为PIL Image对象
                img_data = pix.tobytes("png")
+                del pix  # 立即释放pixmap以节省内存
+                pix = None
+                gc.collect()  # 强制垃圾回收
+                
                img = Image.open(io.BytesIO(img_data))
                
                # 获取图片尺寸（按照image-to-coordinate.py的逻辑）
@@ -884,8 +958,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
                img.save(img_bytes_io, format='PNG')
                img_bytes_io.seek(0)
                encoded_string = base64.b64encode(img_bytes_io.read()).decode('utf-8')
-            mime_type = 'image/png'
-            img_base64 = f"data:{mime_type};base64,{encoded_string}"
+                img_bytes_io.close()  # 立即关闭BytesIO
+                img_bytes_io = None
+                del img_data  # 释放图片数据
+                gc.collect()  # 强制垃圾回收
                
                # 使用AI编辑图片，移除指定文字（带重试机制）
                edited_img_base64 = None
@@ -924,10 +1000,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
                            _logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
                        edited_img_base64 = None
                
+                # 释放encoded_string以节省内存
+                del encoded_string
+                gc.collect()
+                
                if edited_img_base64:
                    # 解码base64图片数据并转换为PIL Image对象（按照image-to-coordinate.py的逻辑）
                    edited_img_data = base64.b64decode(edited_img_base64)
                    edited_img = Image.open(io.BytesIO(edited_img_data)).convert('RGB')
+                    del edited_img_data  # 立即释放原始数据
+                    del edited_img_base64  # 释放base64字符串
                    processed_images.append(edited_img)
                    _logger.info(f"第{page_num + 1}页AI处理最终成功，总耗时: {ai_processing_time:.2f}秒")
                else:
@@ -935,6 +1017,86 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    # 如果AI处理失败，使用原始图片
                    processed_images.append(img.convert('RGB'))
                
+                # 释放原始图片对象
+                if img:
+                    img.close()
+                    del img
+                img = None
+                gc.collect()  # 强制垃圾回收
+                
+                # 分批处理：每处理batch_size页，就转换为PDF并保存到临时文件
+                if use_temp_file and len(processed_images) >= batch_size:
+                    _logger.info(f"达到批次大小 {batch_size}，开始保存到临时文件")
+                    try:
+                        # 将已处理的图片转换为PDF
+                        batch_buffer = io.BytesIO()
+                        first_batch = processed_images[0]
+                        rest_batch = processed_images[1:]
+                        first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
+                        batch_buffer.seek(0)
+                        pdf_bytes = batch_buffer.getvalue()
+                        batch_buffer.close()
+                        
+                        # 释放已处理的图片
+                        for img_obj in processed_images:
+                            if img_obj:
+                                img_obj.close()
+                        processed_images = []
+                        gc.collect()
+                        
+                        if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
+                            # 追加到现有PDF：先读取现有内容，合并后保存到新文件，再替换
+                            with open(temp_file_path, 'rb') as f:
+                                existing_bytes = f.read()
+                            
+                            existing_pdf = fitz.open(stream=existing_bytes, filetype="pdf")
+                            new_pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
+                            existing_pdf.insert_pdf(new_pdf)
+                            new_pdf.close()
+                            
+                            # 保存到新临时文件，避免"save to original must be incremental"错误
+                            new_temp_path = tempfile.mktemp(suffix='.pdf')
+                            existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
+                            existing_pdf.close()
+                            
+                            # 替换旧文件
+                            os.remove(temp_file_path)
+                            os.rename(new_temp_path, temp_file_path)
+                            
+                            # 释放资源
+                            del existing_bytes
+                            del pdf_bytes
+                            gc.collect()
+                        else:
+                            # 创建新的PDF
+                            with open(temp_file_path, 'wb') as f:
+                                f.write(pdf_bytes)
+                            del pdf_bytes
+                            gc.collect()
+                    except Exception as e:
+                        _logger.error(f"分批保存PDF失败: {str(e)}")
+                        # 失败时继续处理，最后统一处理
+                        # 但需要释放已处理的图片，避免内存占用
+                        for img_obj in processed_images:
+                            if img_obj:
+                                img_obj.close()
+                        processed_images = []
+                        gc.collect()
+                
+            except Exception as e:
+                _logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
+                # 确保资源被释放
+                if pix:
+                    del pix
+                if img:
+                    img.close()
+                    del img
+                if img_bytes_io:
+                    img_bytes_io.close()
+                gc.collect()
+                # 如果处理失败，跳过这一页或使用原始页面
+                continue
+            
            page_end_time = time.time()
            page_processing_time = page_end_time - page_start_time
            _logger.info(f"第{page_num + 1}页总处理时间: {page_processing_time:.2f}秒")
@@ -943,11 +1105,85 @@ class BatchGetPodInfoWizard(models.TransientModel):
        
        # 将处理后的图片转换为PDF（按照image-to-coordinate.py的images_to_pdf函数逻辑）
        pdf_creation_start = time.time()
-        if not processed_images:
-            _logger.error("没有需要写入PDF的图片")
+        result_data = None
+        import os
+        
+        try:
+            if use_temp_file and temp_file_path:
+                # 如果还有剩余的图片，追加到临时文件
+                if processed_images:
+                    _logger.info(f"处理剩余的 {len(processed_images)} 页图片")
+                    try:
+                        # 将剩余图片转换为PDF
+                        batch_buffer = io.BytesIO()
+                        first_batch = processed_images[0]
+                        rest_batch = processed_images[1:]
+                        first_batch.save(batch_buffer, format='PDF', save_all=True, append_images=rest_batch)
+                        batch_buffer.seek(0)
+                        temp_pdf_bytes = batch_buffer.getvalue()
+                        batch_buffer.close()
+                        
+                        # 释放图片
+                        for img_obj in processed_images:
+                            if img_obj:
+                                img_obj.close()
+                        processed_images = None
+                        gc.collect()
+                        
+                        # 追加到临时文件
+                        if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
+                            # 如果临时文件已存在，先读取内容
+                            with open(temp_file_path, 'rb') as f:
+                                existing_pdf_bytes = f.read()
+                            
+                            # 合并PDF：打开现有PDF和新PDF，然后合并
+                            existing_pdf = fitz.open(stream=existing_pdf_bytes, filetype="pdf")
+                            new_pdf = fitz.open(stream=temp_pdf_bytes, filetype="pdf")
+                            existing_pdf.insert_pdf(new_pdf)
+                            new_pdf.close()
+                            
+                            # 保存到新的临时文件，避免"save to original must be incremental"错误
+                            new_temp_path = tempfile.mktemp(suffix='.pdf')
+                            existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
+                            existing_pdf.close()
+                            
+                            # 删除旧临时文件，重命名新文件
+                            os.remove(temp_file_path)
+                            os.rename(new_temp_path, temp_file_path)
+                            
+                            # 释放资源
+                            del existing_pdf_bytes
+                            del temp_pdf_bytes
+                            gc.collect()
+                        else:
+                            # 如果临时文件不存在或为空，直接写入
+                            with open(temp_file_path, 'wb') as f:
+                                f.write(temp_pdf_bytes)
+                            del temp_pdf_bytes
+                            gc.collect()
+                    except Exception as e:
+                        _logger.error(f"追加剩余图片失败: {str(e)}")
+                        # 注意：processed_images 在这里已经被释放了，需要重新获取
+                        # 如果还有剩余图片，需要重新处理（这种情况不应该发生，因为前面已经释放了）
+                        _logger.warning("追加剩余图片失败，剩余图片已在之前释放")
+                
+                # 从临时文件读取最终结果
+                if os.path.exists(temp_file_path):
+                    with open(temp_file_path, 'rb') as f:
+                        result_data = f.read()
+                    
+                    # 删除临时文件
+                    try:
+                        os.remove(temp_file_path)
+                        _logger.info(f"已删除临时文件: {temp_file_path}")
+                    except Exception as e:
+                        _logger.warning(f"删除临时文件失败: {str(e)}")
+                else:
+                    _logger.error("临时文件不存在，无法读取结果")
                    return None
                    
-        # 使用PIL的save方法将图片保存为PDF（按照image-to-coordinate.py的逻辑）
+            elif processed_images:
+                # 使用内存方式处理（5页以内）
                output_buffer = io.BytesIO()
                first = processed_images[0]
                rest = processed_images[1:]  # 按照image-to-coordinate.py的逻辑，直接使用切片
@@ -955,9 +1191,40 @@ class BatchGetPodInfoWizard(models.TransientModel):
                # 即使rest是空列表，也直接传入（PIL会正确处理）
                first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
                output_buffer.seek(0)
-        pdf_creation_end = time.time()
                
                result_data = output_buffer.getvalue()
+                output_buffer.close()
+                
+                # 释放所有图片对象
+                for img_obj in processed_images:
+                    if img_obj:
+                        img_obj.close()
+                processed_images = None
+                del first
+                del rest
+            else:
+                _logger.error("没有需要写入PDF的图片")
+                return None
+            
+            gc.collect()  # 强制垃圾回收
+            
+        except Exception as e:
+            _logger.error(f"PDF创建失败: {str(e)}")
+            # 确保资源被释放
+            if processed_images:
+                for img_obj in processed_images:
+                    if img_obj:
+                        img_obj.close()
+            if temp_file_path and os.path.exists(temp_file_path):
+                try:
+                    os.remove(temp_file_path)
+                except:
+                    pass
+            gc.collect()
+            return None
+            
+        pdf_creation_end = time.time()
+        
        total_time = time.time() - start_time
        pdf_creation_time = pdf_creation_end - pdf_creation_start
        
@@ -1664,6 +1931,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        import numpy as np
        from PIL import Image
        import re
+        import gc
        
        # 定义目标文字（与_find_target_texts一致）
        TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
@@ -1671,6 +1939,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4',
                         'Page 5 of 5']
        
+        pdf_document = None
        try:
            # 设置Tesseract路径
            self._setup_tesseract_path()
@@ -1689,25 +1958,45 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 遍历每一页
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
+                pix = None
+                pil_img = None
+                img = None
+                nparr = None
+                img_data = None
                
+                try:
                    # 首先尝试从PDF文本层提取（如果是文本型PDF）
                    page_text_pdf = page.get_text().upper()
                    
-                # 将页面转换为图像进行OCR识别
-                mat = fitz.Matrix(3.0, 3.0)  # 进一步提高分辨率，从2.0提升到3.0
+                    # 将页面转换为图像进行OCR识别（降低分辨率以节省内存）
+                    # 使用 2.0 倍分辨率（约 144 DPI）而不是 3.0 倍（约 216 DPI）
+                    mat = fitz.Matrix(2.0, 2.0)
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("png")
+                    del pix  # 立即释放pixmap
+                    pix = None
+                    gc.collect()  # 强制垃圾回收
                    
                    # 转换为PIL图像
                    if cv2_available:
                        nparr = np.frombuffer(img_data, np.uint8)
                        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
                        pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+                        del nparr  # 释放numpy数组
+                        del img  # 释放OpenCV图像
+                        nparr = None
+                        img = None
+                        gc.collect()
                    else:
                        pil_img = Image.open(io.BytesIO(img_data))
                        if pil_img.mode != 'RGB':
                            pil_img = pil_img.convert('RGB')
                    
+                    # 释放img_data
+                    del img_data
+                    img_data = None
+                    gc.collect()
+                    
                    # OCR识别
                    try:
                        config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
@@ -1740,11 +2029,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
                                    }
                                })
                        
+                        # 释放words字典以节省内存
+                        del words
+                        gc.collect()
+                        
                        # 使用与_find_target_texts相同的匹配逻辑
                        page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
+                        del valid_words  # 释放valid_words列表
+                        gc.collect()
+                        
                        if page_found_texts:
                            for found_text in page_found_texts:
                                found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
+                                break  # 找到就跳出，避免重复
                                
                    except Exception as e:
                        _logger.warning(f"OCR单词识别失败，第{page_num + 1}页，使用文本匹配: {str(e)}")
@@ -1792,7 +2089,37 @@ class BatchGetPodInfoWizard(models.TransientModel):
                                    found_texts.append(f"第{page_num + 1}页: {target_text}")
                                    break  # 找到就跳出，避免重复
                    
+                    # 释放PIL图像和文本变量
+                    if pil_img:
+                        pil_img.close()
+                        del pil_img
+                    del page_text_pdf
+                    del ocr_text
+                    del combined_text
+                    pil_img = None
+                    gc.collect()  # 强制垃圾回收
+                    
+                except Exception as e:
+                    _logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
+                    # 确保资源被释放
+                    if pix:
+                        del pix
+                    if pil_img:
+                        pil_img.close()
+                        del pil_img
+                    if img is not None:
+                        del img
+                    if nparr is not None:
+                        del nparr
+                    if img_data is not None:
+                        del img_data
+                    gc.collect()
+                    continue
+            
+            if pdf_document:
                pdf_document.close()
+                pdf_document = None
+                gc.collect()
            
            if found_texts:
                _logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}")
@@ -1803,6 +2130,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
                
        except Exception as e:
            _logger.error(f"检查目标文字失败，提单号: {bl_no}, 错误: {str(e)}")
+            # 确保资源被释放
+            if pdf_document:
+                try:
+                    pdf_document.close()
+                except:
+                    pass
+            gc.collect()
            # 检查失败时，假设不存在（避免误报）
            return False, []