不调接口的测试

ee951ff9 · 贺阳 · e45ced44 · ee951ff9 · ee951ff9
--- a/ccs_base/views/cc_bl_view.xml
+++ b/ccs_base/views/cc_bl_view.xml
@@ -471,7 +471,7 @@
        <field name="model_id" ref="model_cc_bl"/>
        <field name="binding_model_id" ref="model_cc_bl"/>
        <field name="state">code</field>
-        <field name="binding_view_types">list</field>
+        <field name="binding_view_types">list,form</field>
        <field name="groups_id" eval="[(4, ref('ccs_base.group_clearance_of_customs_user'))]"/>
        <field name="code">
            if records:

--- a/ccs_base/wizard/batch_get_pod_info_wizard.py
+++ b/ccs_base/wizard/batch_get_pod_info_wizard.py
 # -*- coding: utf-8 -*-
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).

+import io
 import logging
-
+import base64
 import requests
 from odoo import models, fields, _
 from odoo.exceptions import ValidationError
@@ -40,11 +41,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
        """
        Confirm operation  # 确认操作
        """
-        try:
        bl_objs = self.get_order()
-            # 调用接口获取提单pdf文件
-            pdf_file_arr = self._get_pdf_file_arr()

+        # 调用接口获取提单pdf文件
+        # pdf_file_arr = self._get_pdf_file_arr()
+        pdf_file_arr = self._get_pdf_file_arr_test()
+        if not pdf_file_arr:
+            raise ValidationError(_('No PDF files found'))#提示：没有获取到PDF文件
        # 处理PDF文件，匹配提单对象
        processed_files = self._match_bl_by_file_name(pdf_file_arr)
        # 把没有匹配到文件的进行提示
@@ -56,47 +59,197 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 英文提示
            raise ValidationError(_('%s bill of loading cannot find release note file') % (
                ', '.join([bl.bl_no for bl in error_bl])))  # xx提单无法找到release note文件
-            # 先涂抹指定文字
+        
        if self.remove_specified_text:
            processed_files = self._remove_specified_text(processed_files)
+            
+            # 用于测试的：保存处理后的PDF并返回下载链接
+            # if processed_files and processed_files[0].get('file_data'):
+            #     return self._save_and_return_download_link(processed_files[0])
+        
        # 再同步和回写
        if self.sync_last_mile_pod:
            self._sync_last_mile_pod(processed_files)

-            # 显示成功消息
-            return {
-                'type': 'ir.actions.client',
-                'tag': 'display_notification',
-                'params': {
-                    'title': _('Operation Completed'),  # 操作完成
-                    'message': _('Successfully processed %d PDF files for %d bill of loadings') % (len(processed_files), len(bl_objs)),  # 成功处理了%d个PDF文件，涉及%d个提单
-                    'type': 'success',
-                }
-            }
+    def _get_pdf_file_arr_test(self):
+        """
+        Get PDF file from test data  # 从测试数据获取PDF文件
+        """
+        pdf_file_arr = []
+        bl_objs = self.get_order()
        
+        for bl in bl_objs:
+            clearance_file = self.env['cc.clearance.file'].sudo().search_clearance_file(bl.id,
+                                                                                        '尾程交接POD(待大包数量和箱号)')    #查找清关文件
+            if clearance_file and clearance_file.file:
+                try:
+                    # 验证原始文件数据
+                    file_data = clearance_file.file
+                    if isinstance(file_data, bytes):
+                        # 验证PDF文件头
+                        if not file_data.startswith(b'%PDF-'):
+                            # 检查是否是base64编码的字符串
+                            try:
+                                decoded_data = base64.b64decode(file_data)
+                                if decoded_data.startswith(b'%PDF-'):
+                                    _logger.info(f"发现base64编码的PDF数据，提单号: {bl.bl_no}")
+                                    file_data = decoded_data
+                                else:
+                                    _logger.warning(f"base64解码后仍不是PDF格式，提单号: {bl.bl_no}")
+                                    continue
                            except Exception as e:
-            raise ValidationError(_('Operation failed: %s') % str(e))  # 操作失败
+                                _logger.warning(f"尝试base64解码失败，提单号: {bl.bl_no}, 错误: {str(e)}")
+                                continue
+                    elif isinstance(file_data, str):
+                        # 尝试base64解码
+                        try:
+                            decoded_data = base64.b64decode(file_data)
+                            if decoded_data.startswith(b'%PDF-'):
+                                _logger.info(f"字符串base64解码成功，是有效PDF，提单号: {bl.bl_no}")
+                                file_data = decoded_data
+                            else:
+                                _logger.warning(f"字符串base64解码后不是PDF格式，提单号: {bl.bl_no}")
+                                continue
+                        except Exception as e:
+                            _logger.warning(f"字符串base64解码失败，提单号: {bl.bl_no}, 错误: {str(e)}")
+                            continue
+                    else:
+                        _logger.warning(f"清关文件数据格式不正确，类型: {type(file_data)}，提单号: {bl.bl_no}")
+                        continue
+                    
+                    # 验证PDF可以打开
+                    try:
+                        import fitz
+                        test_doc = fitz.open(stream=file_data, filetype="pdf")
+                        page_count = len(test_doc)
+                        test_doc.close()
+                        _logger.info(f"清关文件PDF验证成功，页数: {page_count}，提单号: {bl.bl_no}")
+                    except Exception as e:
+                        _logger.warning(f"清关文件PDF无法打开，提单号: {bl.bl_no}, 错误: {str(e)}")
+                        continue
+                    
+                    # 转换为base64
+                    file_data_base64 = base64.b64encode(file_data).decode('utf-8')
+                    
+                    pdf_file_arr.append({
+                        'bl_no': self.env['common.common'].sudo().process_match_str(bl.bl_no),
+                        'file_name': clearance_file.attachment_name or clearance_file.file_name,
+                        'file_data': file_data_base64
+                    })
+                    _logger.info(f"成功添加PDF文件，提单号: {bl.bl_no}, 文件名: {clearance_file.attachment_name or clearance_file.file_name}")
+                        
+                except Exception as e:
+                    _logger.error(f"处理清关文件失败，提单号: {bl.bl_no}, 错误: {str(e)}")
+                    continue
+            else:
+                _logger.warning(f"未找到清关文件，提单号: {bl.bl_no}")
+        
+        _logger.info(f"从测试数据获取PDF文件，成功获取{len(pdf_file_arr)}个文件")
+        return pdf_file_arr

    # 写一个方法调接口获取提单pdf文件
    def _get_pdf_file_arr(self):
        """
-        Get PDF file  # 获取PDF文件
+        Get PDF file from API  # 从API获取PDF文件
        """
-        # 调用接口，接口返回数组[{'bl_no':'','file_name':'','file_data':''}]
-        # bl_no:提单号
-        # file_name:文件名
-        # file_data:文件数据
-        return [{
-            'bl_no': '436-10259804',
-            'file_name': '合并提单_436-10259804_20251008.pdf',
-            'file_data': 'base64_data'
-        }]
-        api_url = self.env['ir.config_parameter'].sudo().get_param('ccs_base.last_mile_pod_api_url')
-        response = requests.get(api_url + '/get_pdf_file')
+        # 获取当前选中的提单对象
+        bl_objs = self.get_order()
+        bill_numbers = [self.env['common.common'].sudo().process_match_str(bl.bl_no) for bl in bl_objs]
+        
+        # 调用API获取PDF文件
+        api_url = self.env['ir.config_parameter'].sudo().get_param('last_mile_pod_api_url','http://172.104.52.150:7002')
+        if not api_url:
+            raise ValidationError(_('API URL not configured'))
+        
+        # 构建请求数据
+        request_data = {
+            "bill_numbers": bill_numbers
+        }
+        
+        try:
+            response = requests.post(
+                f"{api_url}/api/release-notes/pdfs",
+                headers={'Content-Type': 'application/json'},
+                json=request_data
+            )
+            
            if response.status_code == 200:
-            return response.json()
+                result = response.json()
+                
+                # 检查API响应结构
+                if not result:
+                    _logger.error("API返回空响应")
+                    raise ValidationError(_('API returned empty response'))
+                
+                if not result.get('success'):
+                    error_msg = result.get('message', 'Unknown error')
+                    _logger.error(f"API返回失败状态: {error_msg}")
+                    raise ValidationError(_('API returned error: %s') % error_msg)
+                
+                # 处理结果数据
+                results = result.get('results', [])
+                if not results:
+                    _logger.warning("API调用成功，但没有PDF文件")
+                    raise ValidationError(_('No PDF files found in API response'))
+                # 构建PDF文件数组
+                pdf_file_arr = []
+                for result_item in results:
+                    if result_item.get('success'):
+                        # 验证必要字段
+                        bill_number = result_item.get('bill_number')
+                        filename = result_item.get('filename')
+                        base64_data = result_item.get('base64')
+                        
+                        if not all([bill_number, filename, base64_data]):
+                            _logger.warning(f"跳过无效的PDF文件项: {result_item}")
+                            continue
+                        
+                        # 验证PDF文件
+                        try:
+                            pdf_binary = base64.b64decode(base64_data)
+                            
+                            # 验证PDF文件头
+                            if not pdf_binary.startswith(b'%PDF-'):
+                                _logger.warning(f"API返回的文件不是有效的PDF格式，提单号: {bill_number}")
+                                continue
+                            
+                            # 验证PDF可以打开
+                            try:
+                                import fitz
+                                test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
+                                page_count = len(test_doc)
+                                test_doc.close()
+                                _logger.info(f"API PDF验证成功，页数: {page_count}，提单号: {bill_number}")
+                            except Exception as e:
+                                _logger.warning(f"API PDF文件无法打开，提单号: {bill_number}, 错误: {str(e)}")
+                                continue
+                            
+                            pdf_file_arr.append({
+                                'bl_no': bill_number,
+                                'file_name': filename,
+                                'file_data': base64_data
+                            })
+                            _logger.info(f"成功添加API PDF文件，提单号: {bill_number}, 文件名: {filename}")
+                            
+                        except Exception as e:
+                            _logger.warning(f"API PDF文件验证失败，提单号: {bill_number}, 错误: {str(e)}")
+                            continue
+                
+                if not pdf_file_arr:
+                    _logger.error("所有API PDF文件验证都失败")
+                    raise ValidationError(_('All API PDF files failed validation'))
+                
+                _logger.info(f"API调用成功，获取到{len(pdf_file_arr)}个有效PDF文件")
+                return pdf_file_arr
+                
            else:
-            raise ValidationError(_('Failed to get PDF file: %s') % response.text)
+                _logger.error(f"API调用失败，状态码: {response.status_code}")
+                _logger.error(f"响应内容: {response.text}")
+                raise ValidationError(_('Failed to get PDF file from API: %s') % response.text)
+                
+        except requests.exceptions.RequestException as e:
+            _logger.error(f"API请求异常: {str(e)}")
+            raise ValidationError(_('API request failed: %s') % str(e))

    def _write_pdf_file(self, processed_files):
        """
@@ -109,7 +262,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
            bl = file_info['bl']
            file_name = file_info['file_name']
            file_data = file_info['file_data']
-            try:
            # 查找或创建清关文件记录
            clearance_file = self.env['cc.clearance.file'].sudo().search_clearance_file(bl.id,
                                                                                        '尾程交接POD(待大包数量和箱号)')
@@ -128,13 +280,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    'attachment_name': file_name,
                    'file': file_data
                })
-            except Exception as e:
-                raise ValidationError(_('Failed to write PDF file %s: %s') % (file_name, str(e)))

    def _match_bl_by_file_name(self, pdf_file_arr):
        """
        Match BL by file name and return processed array  # 根据文件名匹配提单并返回处理后的数组
-        :param pdf_file_arr: PDF文件数组 [{'bl_no':'', 'file_name':'', 'file_data':''}]
+        :param pdf_file_arr: PDF文件数组 [{'bill_number':'', 'filename':'', 'file_data':''}]
        :return: 处理后的数组 [{'bl': bl_obj, 'file_name': 'xxx.pdf', 'file_data': 'xxx', 'matched': True/False}]
        """
        bl_obj = self.get_order()  # 获取当前选中的提单对象
@@ -142,13 +292,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
        for bl in bl_obj:
            select_bl_no = self.env['common.common'].sudo().process_match_str(bl.bl_no)
            for pdf_file in pdf_file_arr:
-                file_name = pdf_file.get('file_name', '')  # 获取文件名
-                file_data = pdf_file.get('file_data', '')  # 获取文件数据
-                bl_no = pdf_file.get('bl_no', '')  # 获取提单号
-                if not bl_no:
-                    # 从文件名获取提单号  合并提单_436-10259804_20251008.pdf
-                    split_bl_no = file_name.split('_')[1]
-                    bl_no = self.env['common.common'].sudo().process_match_str(split_bl_no)
+                # 尝试不同的字段名（API可能使用不同的字段名）
+                file_name = pdf_file.get('file_name' )  # 获取文件名
+                file_data = pdf_file.get('file_data')  # 获取文件数据
+                bl_no = pdf_file.get('bl_no')  # 获取提单号
                if bl_no and select_bl_no == bl_no:
                    # 构建处理后的文件信息
                    processed_file = {
@@ -156,10 +303,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        'file_name': file_name,
                        'file_data': file_data,
                        'bl_no': bl_no,
-                        'original_data': pdf_file  # 保留原始数据
                    }
                    processed_files.append(processed_file)
                    break
+        _logger.info(f"匹配完成，成功匹配{len(processed_files)}个文件,匹配结果: {processed_files}")
        return processed_files

    def _sync_last_mile_pod(self, processed_files):
@@ -169,14 +316,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
        """
        # 回写PDF文件到清关文件
        self._write_pdf_file(processed_files)
-
+        return False#测试  先不同步
        # 同步尾程POD信息
        for file_info in processed_files:
            if not file_info['bl']:
                continue

            bl = file_info['bl']
-            try:
            # 查找清关文件并执行同步
            clearance_files = self.env['cc.clearance.file'].sudo().search_clearance_file(bl.id,
                                                                                         '尾程交接POD(待大包数量和箱号)')
@@ -184,10 +330,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
                clearance_file.action_sync()  # 同步尾程POD
                _logger.info(f"Successfully synced POD for BL {bl.bl_no}")

-            except Exception as e:
-                _logger.error(f"Failed to sync POD for BL {bl.bl_no}: {str(e)}")
-                raise ValidationError(_('Failed to sync POD for BL %s: %s') % (bl.bl_no, str(e)))
-
    def _remove_specified_text(self, processed_files):
        """
        Remove specified text from PDF files using OCR recognition  # 使用OCR识别涂抹指定文字
@@ -205,12 +347,34 @@ class BatchGetPodInfoWizard(models.TransientModel):
            file_data = file_info['file_data']
            processed_file_data = file_data  # 默认使用原始数据

-            try:
            # 使用OCR识别和删除指定文字
-                if file_data and file_data != 'base64_data':  # 跳过测试数据
+            if file_data:
                # 将base64数据转换为二进制
                import base64
+                try:
                    pdf_binary = base64.b64decode(file_data)
+                    _logger.info(f"Base64解码成功，数据大小: {len(pdf_binary)}字节，提单号: {bl.bl_no}")
+                    
+                    # 验证PDF文件头
+                    if not pdf_binary.startswith(b'%PDF-'):
+                        _logger.error(f"解码后的数据不是有效的PDF文件，提单号: {bl.bl_no}")
+                        _logger.error(f"文件头: {pdf_binary[:20]}")
+                        raise ValidationError(_('Decoded data is not a valid PDF file for BL %s') % bl.bl_no)
+                    
+                    # 验证PDF可以打开
+                    try:
+                        import fitz
+                        test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
+                        page_count = len(test_doc)
+                        test_doc.close()
+                        _logger.info(f"PDF验证成功，页数: {page_count}，提单号: {bl.bl_no}")
+                    except Exception as e:
+                        _logger.error(f"PDF文件无法打开，提单号: {bl.bl_no}, 错误: {str(e)}")
+                        raise ValidationError(_('PDF file cannot be opened for BL %s: %s') % (bl.bl_no, str(e)))
+                        
+                except Exception as e:
+                    _logger.error(f"Base64解码失败，提单号: {bl.bl_no}, 错误: {str(e)}")
+                    raise ValidationError(_('Failed to decode base64 data for BL %s: %s') % (bl.bl_no, str(e)))

                # 使用OCR方法处理PDF
                processed_pdf = self._process_pdf_with_ocr(
@@ -219,12 +383,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
                )

                # 将处理后的PDF转换回base64
-                    processed_file_data = base64.b64encode(processed_pdf)
+                processed_file_data = base64.b64encode(processed_pdf).decode('utf-8')
                _logger.info(f"Successfully removed specified text from PDF for BL {bl.bl_no}")
-
-            except Exception as e:
-                _logger.error(f"Failed to remove text from PDF for BL {bl.bl_no}: {str(e)}")
-                raise ValidationError(_('Failed to remove text from PDF for BL %s: %s') % (bl.bl_no, str(e)))
+                _logger.info(f"处理后的PDF base64数据长度: {len(processed_file_data)}")
            
            # 更新文件信息，使用处理后的PDF数据
            updated_file_info = file_info.copy()
@@ -235,22 +396,27 @@ class BatchGetPodInfoWizard(models.TransientModel):

    def _process_pdf_with_ocr(self, pdf_data, bl_no):
        """
-        Process PDF with OCR recognition and text removal  # 使用OCR识别处理PDF并删除文字
+        Process PDF with OCR recognition and text removal (完全按照HTML逻辑)  # 使用OCR识别处理PDF并删除文字
        :param pdf_data: PDF二进制数据
        :param bl_no: 提单号（用于日志）
        :return: 处理后的PDF二进制数据
        """
+        import os
        import fitz  # PyMuPDF
        import cv2
        import numpy as np
        from PIL import Image
        import pytesseract
-        import base64
-        import io
        
-        # 定义目标文字和排除文字（与HTML文件保持一致）
-        TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD']
-        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
+        # 设置Tesseract路径
+        self._setup_tesseract_path()
+        
+        # 验证PDF数据
+        if not pdf_data or not pdf_data.startswith(b'%PDF-'):
+            _logger.error(f"PDF数据无效，提单号: {bl_no}")
+            raise ValidationError(_('Invalid PDF data for BL %s') % bl_no)
+        
+        _logger.info(f"开始OCR处理PDF，提单号: {bl_no}")
        
        # 打开PDF文档
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
@@ -259,15 +425,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
        detected_texts = []
        all_recognized_texts = []
        
-        _logger.info(f"开始OCR处理PDF，共{len(pdf_document)}页，提单号: {bl_no}")
-        
-        # 处理每一页
+        # 处理每一页（完全按照HTML逻辑）
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            _logger.info(f"正在OCR识别第{page_num + 1}页")
            
-            try:
-                # 将页面转换为图像（提高分辨率，与HTML文件保持一致）
+            # 将页面转换为图像（与HTML完全一致）
            mat = fitz.Matrix(2.0, 2.0)  # 提高分辨率
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")
@@ -279,15 +442,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 转换为PIL图像
            pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            
-                # 使用Tesseract进行OCR识别（优化配置，与HTML文件保持一致）
+            # OCR配置（与HTML完全一致）
+            config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
+            
+            # 使用Tesseract进行OCR识别
+            try:
                ocr_data = pytesseract.image_to_data(
                    pil_img, 
                    output_type=pytesseract.Output.DICT, 
                    lang='eng',
-                    config='--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- '
+                    config=config
                )
+            except Exception as e:
+                _logger.error(f"OCR识别失败: {str(e)}")
+                continue
            
-                # 处理OCR结果
+            # 处理OCR结果（与HTML完全一致）
            page_width = page.rect.width
            page_height = page.rect.height
            viewport_width = pil_img.width
@@ -312,24 +482,25 @@ class BatchGetPodInfoWizard(models.TransientModel):
            
            all_recognized_texts.extend(page_recognized_texts)
            
-                # 查找目标文字
+            # 查找目标文字（完全按照HTML逻辑）
            page_texts = self._find_target_texts(
                page_recognized_texts, 
                page_num, 
                viewport_width, 
                viewport_height, 
                page_width, 
-                    page_height,
-                    TARGET_TEXTS,
-                    EXCLUDE_TEXTS
+                page_height
            )
            detected_texts.extend(page_texts)
            
            _logger.info(f"第{page_num + 1}页OCR完成，找到{len(page_texts)}个目标文字")
            
-                # 在页面上绘制删除矩形
+            
+            # 根据OCR结果删除文字（完全按照HTML逻辑）
+            if page_texts:
+                
                for text_info in page_texts:
-                    # 超精确删除模式（与HTML文件保持一致）
+                    # 超精确删除模式（与HTML完全一致）
                    rect = {
                        'x': text_info['x'],
                        'y': text_info['y'],
@@ -337,61 +508,162 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        'height': text_info['height']
                    }
                    
+                    
                    # 绘制白色矩形覆盖文字
+                    try:
                        page.draw_rect(
                            fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
-                        color=(1, 1, 1),
-                        fill=(1, 1, 1)
+                            color=(1, 1, 1),  # 白色
+                            fill=(1, 1, 1)    # 填充白色
                        )
+                        _logger.info(f"删除目标文字: {text_info['text']}")
                        total_rectangles += 1
-                
-                processed_pages += 1
-                
                    except Exception as e:
-                _logger.warning(f"第{page_num + 1}页OCR失败: {str(e)}")
-                # 使用回退策略：预设坐标
-                self._apply_fallback_rectangles(page, page_num)
+                        _logger.error(f"删除失败: {str(e)}")
+            else:
+                _logger.warning(f"第{page_num + 1}页没有找到目标文字")
+            
            processed_pages += 1
        
        # 保存处理后的PDF
+        try:
            output_buffer = io.BytesIO()
-        pdf_document.save(output_buffer)
+            pdf_document.save(output_buffer, garbage=4, deflate=True, clean=True)
            pdf_document.close()
            
            result_data = output_buffer.getvalue()
            output_buffer.close()
            
+            _logger.info(f"PDF保存成功，数据大小: {len(result_data)}字节")
+            
+        except Exception as e:
+            _logger.error(f"PDF保存失败: {str(e)}")
+            pdf_document.close()
+            raise ValidationError(_('Failed to save PDF: %s') % str(e))
+        
        _logger.info(f"PDF OCR处理完成，共处理{processed_pages}页，删除{total_rectangles}个文字区域，提单号: {bl_no}")
        
+        
        return result_data

-    def _find_target_texts(self, words, page_num, viewport_width, viewport_height, page_width, page_height, target_texts, exclude_texts):
+    def _setup_tesseract_path(self):
        """
-        Find target texts using OCR results  # 使用OCR结果查找目标文字
+        Setup Tesseract path for different systems  # 为不同系统设置Tesseract路径
        """
+        # try:
+        import pytesseract
+        import os
+        import shutil
+        
+        if os.name == 'nt':  # Windows
+            # Windows常见路径
+            possible_paths = [
+                r'C:\Program Files\Tesseract-OCR\tesseract.exe',
+                r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
+                r'C:\Users\%USERNAME%\AppData\Local\Tesseract-OCR\tesseract.exe'
+            ]
+            for path in possible_paths:
+                if os.path.exists(path):
+                    pytesseract.pytesseract.tesseract_cmd = path
+                    _logger.info(f"设置Tesseract路径: {path}")
+                    break
+            else:
+                _logger.warning("未找到Tesseract安装路径")
+        else:  # Linux/Mac
+            # 检查Tesseract是否在PATH中
+            tesseract_path = shutil.which('tesseract')
+            if tesseract_path:
+                pytesseract.pytesseract.tesseract_cmd = tesseract_path
+                _logger.info(f"找到Tesseract路径: {tesseract_path}")
+            else:
+                # 尝试常见路径
+                possible_paths = [
+                    '/usr/bin/tesseract',
+                    '/usr/local/bin/tesseract',
+                    '/opt/homebrew/bin/tesseract',  # macOS M1
+                    '/usr/local/Cellar/tesseract/*/bin/tesseract'  # macOS Homebrew
+                ]
+                
+                for path in possible_paths:
+                    if os.path.exists(path):
+                        pytesseract.pytesseract.tesseract_cmd = path
+                        _logger.info(f"设置Tesseract路径: {path}")
+                        break
+                else:
+                    _logger.warning("未找到Tesseract，请确保已安装tesseract-ocr")
+            
+            # 检查语言数据文件
+            self._check_tessdata_files()
+                
+        # except Exception as e:
+        #     _logger.warning(f"设置Tesseract路径失败: {str(e)}")
+
+    def _check_tessdata_files(self):
+        """
+        Check if tessdata files exist  # 检查tessdata文件是否存在
+        """
+        import pytesseract
+        import os
+        
+        # 获取Tesseract数据路径
+        tesseract_cmd = pytesseract.pytesseract.tesseract_cmd
+        tessdata_dir = os.path.dirname(tesseract_cmd) + '/tessdata'
+        
+        # 如果tessdata目录不存在，尝试其他常见位置
+        if not os.path.exists(tessdata_dir):
+            possible_tessdata_dirs = [
+                '/usr/share/tesseract-ocr/tessdata',
+                '/usr/local/share/tesseract-ocr/tessdata',
+                '/opt/homebrew/share/tessdata',  # macOS M1
+                '/usr/local/Cellar/tesseract/*/share/tessdata'  # macOS Homebrew
+            ]
+            
+            for tessdata_path in possible_tessdata_dirs:
+                if os.path.exists(tessdata_path):
+                    tessdata_dir = tessdata_path
+                    break
+        
+        # 检查英语语言数据文件
+        eng_data = os.path.join(tessdata_dir, 'eng.traineddata')
+        if os.path.exists(eng_data):
+            _logger.info(f"找到英语语言数据文件: {eng_data}")
+        else:
+            _logger.warning(f"未找到英语语言数据文件: {eng_data}")
+            _logger.warning("请安装英语语言包: sudo apt-get install tesseract-ocr-eng")
+
+
+    def _find_target_texts(self, words, page_num, viewport_width, viewport_height, page_width, page_height):
+        """
+        Find target texts using OCR results (完全按照HTML逻辑)  # 使用OCR结果查找目标文字
+        """
+        # 定义目标文字和排除文字（与HTML文件完全一致）
+        TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD','UCLINKLOGISITICSLTD']
+        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
+        
        found_texts = []
        
        for word in words:
            text = word['text'].strip().upper()
            
-            # 首先检查是否在排除列表中
+            # 首先检查是否在排除列表中（与HTML完全一致）
            is_excluded = False
-            for exclude_text in exclude_texts:
+            for exclude_text in EXCLUDE_TEXTS:
                exclude_upper = exclude_text.upper()
                if exclude_upper in text or text in exclude_upper:
                    is_excluded = True
                    break
            
-            # 检查页码模式（Page X of Y）
+            # 检查页码模式（Page X of Y）（与HTML完全一致）
            import re
            if not is_excluded and (re.match(r'^PAGE\s+\d+\s+OF\s+\d+$', text) or re.match(r'^\d+\s*/\s*\d+$', text)):
                is_excluded = True
            
            if is_excluded:
+                _logger.info(f"排除文字: {word['text']}")
                continue
            
-            # 检查目标文字匹配
-            for target_text in target_texts:
+            # 检查目标文字匹配（与HTML完全一致）
+            for target_text in TARGET_TEXTS:
                target_upper = target_text.upper()
                is_match = False
                
@@ -401,39 +673,30 @@ class BatchGetPodInfoWizard(models.TransientModel):
                elif target_text == 'LTD':
                    # LTD使用精确匹配
                    is_match = text == 'LTD'
-                elif target_text == 'UCLINK LOGISITICS LTD':
-                    # 完整短语匹配
-                    is_match = ('UCLINK' in text and 'LOGISITICS' in text and 'LTD' in text) or \
-                              'UCLINK LOGISITICS LTD' in text or \
-                              text == 'UCLINK LOGISITICS LTD'
-                elif target_text == 'UCLINK LOGISITICS':
-                    # 部分短语匹配
-                    is_match = ('UCLINK' in text and 'LOGISITICS' in text) or \
-                              text == 'UCLINK LOGISITICS'
-                elif target_text == 'UCLINK':
-                    # 单独UCLINK匹配
-                    is_match = text == 'UCLINK' or text.startswith('UCLINK ')
-                elif target_text in ['LOGISITICS', 'LOGISTICS']:
-                    # LOGISITICS/LOGISTICS匹配
-                    is_match = text in ['LOGISITICS', 'LOGISTICS'] or \
-                              text.startswith('LOGISITICS') or text.startswith('LOGISTICS')
                else:
-                    # 其他文字使用包含匹配，但更严格
+                    # 其他文字使用包含匹配，但更严格（与HTML完全一致）
                    is_match = target_upper in text and \
                             'AIR' not in text and \
                             'EQK' not in text and \
                             'ARN' not in text
                
+                # 如果精确匹配失败，尝试模糊匹配（与HTML完全一致）
+                if not is_match and target_text != 'AGN' and target_text != 'LTD':
+                    is_match = self._fuzzy_match(text, target_upper)
+                
                if is_match:
-                    # 坐标转换（与HTML文件保持一致）
+                    # 坐标转换（适配PyMuPDF坐标系统）
                    scale_x = page_width / viewport_width
                    scale_y = page_height / viewport_height
                    
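+                    # PyMuPDF page coordinates, like the OCR image, have their origin at the top-left corner with y growing downward (only raw PDF space is bottom-left)
+                    # so no Y flip is needed: the OCR y0 is simply scaled from viewport size to page size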
                    converted_x = word['bbox']['x0'] * scale_x
-                    converted_y = (viewport_height - word['bbox']['y1']) * scale_y
+                    converted_y = (word['bbox']['y0'] * scale_y)  # 直接使用OCR的Y坐标
                    converted_width = (word['bbox']['x1'] - word['bbox']['x0']) * scale_x
                    converted_height = (word['bbox']['y1'] - word['bbox']['y0']) * scale_y
                    
+                    
                    found_texts.append({
                        'text': target_text,
                        'full_text': word['text'],
@@ -449,26 +712,134 @@ class BatchGetPodInfoWizard(models.TransientModel):
        
        return found_texts

-    def _apply_fallback_rectangles(self, page, page_num):
+    def _fuzzy_match(self, str1, str2):
        """
-        Apply fallback rectangles when OCR fails  # OCR失败时应用回退矩形
+        Fuzzy match function (与HTML完全一致)  # 模糊匹配函数
        """
-        page_width = page.rect.width
-        page_height = page.rect.height
+        import re
+        s1 = re.sub(r'[^A-Z]', '', str1)
+        s2 = re.sub(r'[^A-Z]', '', str2)
        
-        # 超精确的预设坐标覆盖（与HTML文件保持一致）
-        rectangles = [
-            {'x': 50, 'y': page_height - 200, 'width': 60, 'height': 10},  # AGN
-            {'x': 50, 'y': page_height - 220, 'width': 100, 'height': 10},  # UCLINK LOGISITICS
-            {'x': 155, 'y': page_height - 220, 'width': 30, 'height': 10}   # LTD
-        ]
+        if len(s1) == 0 or len(s2) == 0:
+            return False
+        
+        # 计算编辑距离
+        distance = self._levenshtein_distance(s1, s2)
+        max_len = max(len(s1), len(s2))
+        
+        # 如果编辑距离小于等于最大长度的1/3，认为匹配
+        return distance <= max_len / 3
+
+    def _levenshtein_distance(self, s1, s2):
+        """
+        Calculate Levenshtein distance (与HTML完全一致)  # 计算编辑距离
+        """
+        if len(s1) < len(s2):
+            return self._levenshtein_distance(s2, s1)
+        
+        if len(s2) == 0:
+            return len(s1)
+        
+        previous_row = list(range(len(s2) + 1))
+        for i, c1 in enumerate(s1):
+            current_row = [i + 1]
+            for j, c2 in enumerate(s2):
+                insertions = previous_row[j + 1] + 1
+                deletions = current_row[j] + 1
+                substitutions = previous_row[j] + (c1 != c2)
+                current_row.append(min(insertions, deletions, substitutions))
+            previous_row = current_row
+        
+        return previous_row[-1]
+
+
+
+
+
+    def _save_and_return_download_link(self, file_info):
+        """
+        Save processed PDF as attachment and return download action  # 保存处理后的PDF作为附件并返回下载动作
+        :param file_info: 处理后的文件信息
+        :return: Odoo action to download the file
+        """
+        import base64
+        
+        try:
+            # 获取处理后的PDF数据
+            file_data = file_info.get('file_data', '')
+            file_name = file_info.get('file_name', 'processed.pdf')
+            
+            if not file_data:
+                raise ValidationError(_('No processed file data available'))
+            
+            # 解码base64数据
+            if isinstance(file_data, str):
+                _logger.info(f"输入是字符串类型，长度: {len(file_data)}")
+                _logger.info(f"输入前50字符: {file_data[:50]}")
+                pdf_binary = base64.b64decode(file_data)
+            else:
+                _logger.info(f"输入是bytes类型，长度: {len(file_data)}")
+                _logger.info(f"输入前20字节: {file_data[:20]}")
+                pdf_binary = file_data
+            
+            # 验证PDF数据完整性
+            _logger.info(f"PDF二进制数据大小: {len(pdf_binary)}字节")
+            _logger.info(f"PDF文件头: {pdf_binary[:20]}")
+            _logger.info(f"PDF文件头(hex): {pdf_binary[:20].hex()}")
+            
+            # 确保PDF数据有效
+            if not pdf_binary.startswith(b'%PDF-'):
+                _logger.error(f"保存的PDF数据不是有效的PDF格式，文件头: {pdf_binary[:20]}")
+                _logger.error(f"文件头(hex): {pdf_binary[:20].hex()}")
+                _logger.error(f"文件大小: {len(pdf_binary)}字节")
+                
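+                # NOTE(review): this "repair" repeats the same base64.b64decode(file_data) call already made above, so it cannot yield a different result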
+                if isinstance(file_data, str) and len(file_data) > 100:
+                    _logger.info("尝试重新解码base64数据...")
+                    try:
+                        # 重新尝试base64解码
+                        pdf_binary_fixed = base64.b64decode(file_data)
+                        if pdf_binary_fixed.startswith(b'%PDF-'):
+                            _logger.info("✅ 重新解码成功，PDF数据有效")
+                            pdf_binary = pdf_binary_fixed
+                        else:
+                            _logger.error("❌ 重新解码后仍然不是有效的PDF")
+                            raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
+                    except Exception as e:
+                        _logger.error(f"重新解码失败: {str(e)}")
+                        raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
+                else:
+                    raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
            
-        for rect in rectangles:
+            # 验证PDF可以打开
+            try:
                import fitz
-            page.draw_rect(
-                fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
-                color=(1, 1, 1),
-                fill=(1, 1, 1)
-            )
+                test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
+                _logger.info(f"PDF验证成功，页数: {len(test_doc)}")
+                test_doc.close()
+            except Exception as e:
+                _logger.error(f"PDF验证失败: {str(e)}")
+                raise ValidationError(_('Invalid PDF data for saving: cannot open PDF - %s') % str(e))
+            
+            # 创建附件记录
+            attachment = self.env['ir.attachment'].create({
+                'name': f'processed_{file_name}',
+                'type': 'binary',
+                'datas': base64.b64encode(pdf_binary),
+                'mimetype': 'application/pdf',
+                'res_model': 'batch.get.pod.info.wizard',
+                'res_id': self.id,
+            })
            
-        _logger.info(f"第{page_num + 1}页使用回退策略，应用了{len(rectangles)}个预设矩形")
+            _logger.info(f"成功保存处理后的PDF附件，文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")
+            
+            # 返回下载动作
+            return {
+                'type': 'ir.actions.act_url',
+                'url': f'/web/content/{attachment.id}?download=true',
+                'target': 'new',
+            }
+            
+        except Exception as e:
+            _logger.error(f"保存PDF附件失败: {str(e)}")
+            raise ValidationError(_('Failed to save PDF attachment: %s') % str(e))