测试环境的删除不彻底，增加调试

601b7eac · 贺阳 · 8ca88357 · 601b7eac · 601b7eac
--- a/ccs_base/wizard/batch_get_pod_info_wizard.py
+++ b/ccs_base/wizard/batch_get_pod_info_wizard.py
@@ -38,6 +38,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
        help='Whether to remove specified text from PDF files'  # 是否涂抹PDF中的指定文字
    )
+    # debug_mode = fields.Boolean(
+    #     string='Debug Mode',  # 调试模式
+    #     default=False,
+    #     help='Show red markers for deleted text positions'  # 显示删除文字位置的红色标记
+    # )
    def confirm(self):
        """
        Confirm operation  # 确认操作
@@ -59,7 +65,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
                ', '.join([bl.bl_no for bl in error_bl])))  # xx提单无法找到release note文件
        if self.remove_specified_text:
-            processed_files = self._remove_specified_text(processed_files)
+            # Debug markers are disabled here; pass debug_mode=True temporarily to inspect the deletion positions
+            processed_files = self._remove_specified_text(processed_files, debug_mode=False)
            # 用于测试的：保存处理后的PDF并返回下载链接
            # if processed_files and processed_files[0].get('file_data'):
@@ -141,7 +148,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        except Exception as e:
                            _logger.warning(f"API PDF文件验证失败，提单号: {bill_number}, 错误: {str(e)}")
                            continue
-                logging.info('len(pdf_file_arr): %s' % len(pdf_file_arr))
                return pdf_file_arr
            else:
                raise ValidationError(_('Failed to get PDF file from API: %s') % response.text)
@@ -194,8 +200,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    }
                    processed_files.append(processed_file)
                    break
-            logging.info("继续下一个提单")
-        _logger.info(f"匹配完成，成功匹配{len(processed_files)}个文件")
        return processed_files
    def _sync_last_mile_pod(self, processed_files):
@@ -216,10 +220,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
                clearance_file.action_sync()  # 同步尾程POD
                _logger.info(f"Successfully synced POD for BL {bl.bl_no}")
-    def _remove_specified_text(self, processed_files):
+    def _remove_specified_text(self, processed_files, debug_mode=False):
        """
        Remove specified text from PDF files using OCR recognition  # 使用OCR识别涂抹指定文字
        :param processed_files: 处理后的文件数组
+        :param debug_mode: 是否显示调试标记
        :return: 处理后的文件数组（包含处理后的PDF数据）
        """
        updated_files = []
@@ -239,7 +244,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
                # 使用OCR方法处理PDF
                processed_pdf = self._process_pdf_with_ocr(
                    pdf_data=pdf_binary,
-                    bl_no=bl.bl_no
+                    bl_no=bl.bl_no,
+                    debug_mode=debug_mode
                )
                if processed_pdf:
                    # 将处理后的PDF转换回base64
@@ -252,11 +258,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
        return updated_files
-    def _process_pdf_with_ocr(self, pdf_data, bl_no):
+    def _process_pdf_with_ocr(self, pdf_data, bl_no, debug_mode=False):
        """
        Process PDF with OCR recognition and text removal (完全按照HTML逻辑)  # 使用OCR识别处理PDF并删除文字
        :param pdf_data: PDF二进制数据
        :param bl_no: 提单号（用于日志）
+        :param debug_mode: 是否显示调试标记
        :return: 处理后的PDF二进制数据
        """
        import fitz  # PyMuPDF
@@ -286,7 +293,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        # 处理每一页（完全按照HTML逻辑）
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
-            # _logger.info(f"正在OCR识别第{page_num + 1}页")
+            _logger.info(f"正在OCR识别第{page_num + 1}页")
            # 将页面转换为图像（与HTML完全一致）
            mat = fitz.Matrix(2.0, 2.0)  # 提高分辨率
@@ -345,6 +352,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
            all_recognized_texts.extend(page_recognized_texts)
+            # 调试：输出所有识别到的文字
+            _logger.info(f"第{page_num + 1}页识别到的所有文字: {[word['text'] for word in page_recognized_texts]}")
            # 查找目标文字（完全按照HTML逻辑）
            page_texts = self._find_target_texts(
                page_recognized_texts,
@@ -356,6 +366,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
            )
            detected_texts.extend(page_texts)
+            # 调试：输出检测到的目标文字
+            if page_texts:
+                _logger.info(f"第{page_num + 1}页检测到的目标文字: {[text['text'] for text in page_texts]}")
+            else:
+                _logger.info(f"第{page_num + 1}页未检测到目标文字")
            # 根据OCR结果删除文字（完全按照HTML逻辑）
            if page_texts:
@@ -368,14 +384,32 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        'height': text_info['height']
                    }
-                    # 绘制白色矩形覆盖文字
+                    # Cover the target text with a white rectangle (debug mode draws a red marker first)
                    try:
+                        if debug_mode:
+                            # 调试模式：先绘制红色边框标记删除区域
+                            page.draw_rect(
+                                fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
+                                color=(1, 0, 0),  # 红色边框
+                                fill=(1, 0.8, 0.8)  # 浅红色填充
+                            )
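+                            # Then cover the text with a white rectangle. NOTE(review): it uses the same Rect as the red marker above, so it appears to paint over the marker - confirm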
                            page.draw_rect(
                                fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
                                color=(1, 1, 1),  # 白色
                                fill=(1, 1, 1)  # 填充白色
                            )
-                        # _logger.info(f"删除目标文字: {text_info['text']}")
+                        else:
+                            # 正常模式：直接绘制白色矩形覆盖文字
+                            page.draw_rect(
+                                fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
+                                color=(1, 1, 1),  # 白色
+                                fill=(1, 1, 1)  # 填充白色
+                            )
+                        _logger.info(
+                            f"删除目标文字: {text_info['text']} 位置: x={rect['x']:.1f}, y={rect['y']:.1f}, w={rect['width']:.1f}, h={rect['height']:.1f}")
                        total_rectangles += 1
                    except Exception as e:
                        _logger.error(f"删除失败: {str(e)}")
@@ -389,6 +423,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
            result_data = output_buffer.getvalue()
            output_buffer.close()
+            # 输出处理总结
+            _logger.info(
+                f"PDF处理完成 - 提单号: {bl_no}, 处理页数: {processed_pages}, 删除矩形数: {total_rectangles}, 检测到文字数: {len(detected_texts)}")
+            if detected_texts:
+                _logger.info(f"检测到的目标文字: {[text['text'] for text in detected_texts]}")
        except Exception as e:
            _logger.error(f"PDF保存失败: {str(e)}")
            pdf_document.close()
@@ -535,6 +575,32 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    converted_width = (word['bbox']['x1'] - word['bbox']['x0']) * scale_x
                    converted_height = (word['bbox']['y1'] - word['bbox']['y0']) * scale_y
+                    # 增加宽度和高度，确保完全覆盖文字
+                    # 针对长文字使用更大的边距
+                    if target_text in ['UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINKLOGISITICSLTD']:
+                        # 长文字使用更大的边距
+                        width_margin = max(20, converted_width * 0.3)  # 至少20像素或30%的宽度边距
+                        height_margin = max(5, converted_height * 0.3)  # 至少5像素或30%的高度边距
+                    else:
+                        # 短文字使用标准边距
+                        width_margin = max(10, converted_width * 0.2)  # 至少10像素或20%的宽度边距
+                        height_margin = max(3, converted_height * 0.2)  # 至少3像素或20%的高度边距
+                    # 记录原始尺寸
+                    original_x = converted_x
+                    original_y = converted_y
+                    original_width = converted_width
+                    original_height = converted_height
+                    converted_x = max(0, converted_x - width_margin / 2)
+                    converted_y = max(0, converted_y - height_margin / 2)
+                    converted_width = min(page_width - converted_x, converted_width + width_margin)
+                    converted_height = min(page_height - converted_y, converted_height + height_margin)
+                    # 调试：显示边距计算过程
+                    _logger.info(
+                        f"文字 '{target_text}' 边距计算: 原始尺寸({original_width:.1f}x{original_height:.1f}) -> 边距({width_margin:.1f}x{height_margin:.1f}) -> 最终尺寸({converted_width:.1f}x{converted_height:.1f})")
                    found_texts.append({
                        'text': target_text,
                        'full_text': word['text'],
@@ -612,9 +678,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 确保PDF数据有效
            if not pdf_binary.startswith(b'%PDF-'):
-                _logger.error(f"保存的PDF数据不是有效的PDF格式，文件头: {pdf_binary[:20]}")
-                _logger.error(f"文件头(hex): {pdf_binary[:20].hex()}")
-                _logger.error(f"文件大小: {len(pdf_binary)}字节")
                # 尝试修复：如果是base64字符串被错误处理
                if isinstance(file_data, str) and len(file_data) > 100:
@@ -653,10 +716,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
                'res_model': 'batch.get.pod.info.wizard',
                'res_id': self.id,
            })
-            _logger.info(
-                f"成功保存处理后的PDF附件，文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")
            # 返回下载动作
            return {
                'type': 'ir.actions.act_url',

--- a/requirements.txt
+++ b/requirements.txt
 pygtrans
 PyMuPDF
-opencv-python
+opencv-python; sys_platform == "win32"
+opencv-python-headless; sys_platform != "win32"
 numpy
 Pillow
 tesseract
 pytesseract
-# # 更新包列表
+# 系统依赖安装说明：
-# sudo apt update
-# # 安装Tesseract OCR
+# Windows系统：
-# sudo apt install tesseract-ocr
+# 1. 安装Tesseract OCR: 下载并安装 https://github.com/UB-Mannheim/tesseract/wiki
+# 2. 安装OpenCV: pip install opencv-python
-# # 安装英语语言包
+# Linux系统：
-# sudo apt install tesseract-ocr-eng
+# 1. 更新包列表: sudo apt update
+# 2. 安装Tesseract OCR: sudo apt install tesseract-ocr
+# 3. 安装英语语言包: sudo apt install tesseract-ocr-eng
+# 4. 安装OpenCV系统依赖: sudo apt install libopencv-dev python3-opencv
+# 5. 安装OpenCV Python包: pip install opencv-python-headless
+# macOS系统：
-# apt install libopencv-dev python3-opencv
+# 1. 安装Tesseract: brew install tesseract
+# 2. 安装OpenCV: pip install opencv-python-headless