1.优化

2972a296 · 刘擎阳 · d82f2dbc · 2972a296 · 2972a296 · 2972a296
--- a/ccs_base/models/order_state_change_rule.py
+++ b/ccs_base/models/order_state_change_rule.py
@@ -9,6 +9,11 @@ import tempfile
 from datetime import datetime, timedelta
 from io import BytesIO

+import os
+# Local PDF helpers: POD directory indexing and AWB page merging
+from ..pdf_tools.pod_indexer import index_pod_directory
+from ..pdf_tools.awb_page_merger import merge_awb_pages
+
 import pdfplumber
 import xlrd
 from aip.ocr import AipOcr
@@ -238,7 +243,7 @@ class OrderStateChangeRule(models.Model):
        pattern = re.compile("\\d{3}-\\d{8}\s*")
        data_re = re.compile(pattern)
        data_arr = data_re.findall(email_body)
-        data_arr = [i.replace('\r\n', '') for i in data_arr]
+        data_arr = [i.replace('\r\n', '').replace('\xa0', '') for i in data_arr]
        return data_arr

    def fetch_final_mail_dlv(self, **kwargs):
@@ -252,7 +257,7 @@ class OrderStateChangeRule(models.Model):
        attachment_tuple_arr = attachment_arr if attachment_arr else []
        # order_obj_arr = []
        try:
-            text_arr = [i.replace('-', '').replace(' ', '') for i in text_arr]
+            text_arr = [i.replace('-', '').replace(' ', '').replace('\xa0', '') for i in text_arr]
            ids = []
            if text_arr:
                sql = "select id from cc_bl where UPPER(REPLACE(REPLACE(REPLACE(bl_no, ' ', ''), '-', ''), '/', '')) in %s"
@@ -260,24 +265,88 @@ class OrderStateChangeRule(models.Model):
                result = self._cr.fetchall()
                ids = [i[0] for i in result]
            bl_objs = self.env['cc.bl'].sudo().search([('id', 'in', ids)]) if ids else False
-            if bl_objs and attachment_tuple_arr:
-                file_objs = self.env['cc.clearance.file'].sudo().search(
-                    [('file_name', '=', '尾程交接POD(待大包数量和箱号)'),
-                     ('bl_id', 'in', bl_objs.ids)])
-                file_objs.unlink()
-                for attachment_tuple in attachment_tuple_arr:
-                    attachment_name, attachment_data = attachment_tuple
-                    self.upload_pod_attachment(bl_objs, attachment_name, attachment_data)
+            not_bl_pdf_arr = []
+            if bl_objs:
+                # 提单对象  bl_no提单号
+                # attachment_tuple_arr [('11.pdf', 'pdf数据')]
+                # 1. 开启临时文件夹 (with 块结束时，所有临时文件会自动销毁)
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    # 构建临时目录结构
+                    pod_dir = os.path.join(temp_dir, "POD")
+                    pages_dir = os.path.join(pod_dir, "pages")
+                    output_dir = os.path.join(temp_dir, "Output")
+                    os.makedirs(pod_dir)
+                    os.makedirs(output_dir)
+                    # 2. 将内存中的 PDF 数据写入临时目录
+                    for file_name, pdf_data in attachment_tuple_arr:
+                        pdf_path = os.path.join(pod_dir, file_name)
+                        with open(pdf_path, 'wb') as f:
+                            # 注意：Odoo 里的附件通常是 base64 编码的。
+                            # 如果你的 'pdf数据' 是 base64 字符串/bytes，请用 base64.b64decode(pdf_data)
+                            # 如果已经是纯二进制流(rb读取的)，直接写入即可：f.write(pdf_data)
+                            # f.write(base64.b64decode(pdf_data))
+                            f.write(pdf_data)
+                            # 3. 对这些 PDF 进行集中拆分和识别（只执行一次，非常关键）
+                    ctx_index = {
+                        "dir_path": pod_dir,
+                        "output_index_csv": os.path.join(pod_dir, "pod_index.csv"),
+                        "output_summary_csv": os.path.join(pod_dir, "summary.csv"),
+                        "save_pages": True,
+                        "page_output_dir": pages_dir,
+                        "pipeline_split_first": True
+                    }
+                    # 这一步会消耗一点时间，它会生成单页 PDF 和索引 CSV
+                    index_pod_directory(ctx_index)
+                    # 4. 遍历你的提单对象，按需提取 PDF
+                    for bl_obj in bl_objs:
+                        target_awb = bl_obj.bl_no  # 获取提单号，例如 '436-10353136'
+                        if not target_awb:
+                            continue
+                        # 调用拼合工具
+                        ctx_merge = {
+                            "awb": target_awb,
+                            "index_file": ctx_index["output_index_csv"],
+                            "pages_dir": pages_dir,
+                            "output_dir": output_dir
+                        }
+                        result = merge_awb_pages(ctx_merge)
+                        # 5. 检查是否成功生成了对应的单号 PDF
+                        if result.get("output") and os.path.exists(result["output"]):
+                            # 将生成的 PDF 重新读回内存
+                            with open(result["output"], 'rb') as f:
+                                extracted_pdf_bytes = f.read()
+                            # 重新转为 base64，准备存入 Odoo
+                            # extracted_pdf_b64 = base64.b64encode(extracted_pdf_bytes)
+                            # print(extracted_pdf_bytes)
+                            self.upload_pod_attachment(bl_obj, f'{bl_obj.bl_no}.pdf', extracted_pdf_bytes)
+                            # bl_pdf_arr.append((bl_obj, f'{bl_obj.bl_no}.pdf', extracted_pdf_bytes))
+                        else:
+                            # 没找到这个单号对应的页面
+                            not_bl_pdf_arr.append(bl_obj.bl_no)
+                            # 这里可以记个日志，或者给 bl_obj 打个“未找到凭证”的标签
+            # 屏蔽 2026-03-26以下
+            # if bl_objs and attachment_tuple_arr:
+            #     file_objs = self.env['cc.clearance.file'].sudo().search(
+            #         [('file_name', '=', '尾程交接POD(待大包数量和箱号)'),
+            #          ('bl_id', 'in', bl_objs.ids)])
+            #     file_objs.unlink()
+            #     for attachment_tuple in attachment_tuple_arr:
+            #         attachment_name, attachment_data = attachment_tuple
+            #         self.upload_pod_attachment(bl_objs, attachment_name, attachment_data)
+            # 屏蔽 2026-03-26 以上
                # redis_conn = self.env['common.common'].sudo().get_redis()
                # if redis_conn == 'no':
                #     raise ValidationError('未连接redis')
                # else:
                #     redis_conn.lpush('mail_push_package_list', json.dumps({'id': bl_obj.id, 'utc_time': utc_time.strftime("%Y-%m-%d %H:%M:%S")}))
-            if not bl_objs:
+            if not bl_objs or not_bl_pdf_arr:
                mail_time = (datetime.utcnow() + timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
                content = f"""<p>您好：
                邮箱在{mail_time}(+8)时间接收到主题为POD的邮件，但未识别到对应的提单，请检查
-                避免推送超时！</p>"""
+                避免推送超时！</p>
+                """
+                if not_bl_pdf_arr:
+                    content += f"\n    以下提单未提取到PDF文件 {'/'.join(not_bl_pdf_arr)}"
                # 给客户配置的每个邮箱都发送邮件
                patrol_sender_email = self.env["ir.config_parameter"].sudo().get_param('patrol_sender_email') or ''
                patrol_receiver_emails = self.env["ir.config_parameter"].sudo().get_param(

--- a/ccs_base/pdf_tools/__init__.py
+++ b/ccs_base/pdf_tools/__init__.py
--- a/ccs_base/pdf_tools/awb_page_merger.py
+++ b/ccs_base/pdf_tools/awb_page_merger.py
--- a/ccs_base/pdf_tools/baidu_ocr_config.json
+++ b/ccs_base/pdf_tools/baidu_ocr_config.json
+{
+    "baidu_ocr_app_id": "118782515",
+    "baidu_ocr_api_key": "gWnGCmjJYzaYwhph8sJEdiRJ",
+    "baidu_ocr_secret_key": "mjgUUgbxXK8UHcRi5MTlPrb4BWM8NrOu",
+    "ocr_enabled": true,
+    "ocr_timeout": 30,
+    "max_retries": 3
+}
\ No newline at end of file
--- a/ccs_base/pdf_tools/baidu_ocr_config.py
+++ b/ccs_base/pdf_tools/baidu_ocr_config.py
+#!/usr/bin/env python3
+"""
+百度OCR配置文件
+用于管理百度OCR API的相关配置参数
+"""
+
+import json
+import logging
+import os
+from typing import Dict, Optional
+
+class BaiduOCRConfig:
+    """Load, query and persist the Baidu OCR settings kept in a JSON file.
+
+    A value is resolved in this order: the environment variable named after
+    the upper-cased key, then the JSON file, then the built-in defaults.
+    Credentials have no built-in default: they must come from the JSON file
+    or the environment, so that secrets are not duplicated in source code.
+    """
+
+    # Spellings accepted as "true" when a boolean setting comes from the
+    # environment; anything else is treated as false.
+    _TRUTHY = frozenset({"1", "true", "yes", "on"})
+
+    _log = logging.getLogger(__name__)
+
+    def __init__(self, config_file: Optional[str] = None):
+        """Read the configuration from ``config_file``.
+
+        Args:
+            config_file: Path of the JSON settings file. Defaults to
+                ``baidu_ocr_config.json`` in this module's directory.
+        """
+        if config_file is None:
+            config_file = os.path.join(os.path.dirname(__file__), 'baidu_ocr_config.json')
+
+        self.config_file = config_file
+        self._config = self._load_config()
+
+    def _load_config(self) -> Dict:
+        """Return the defaults overlaid with the contents of the JSON file.
+
+        When the file does not exist, a template holding the defaults is
+        written so that it can be filled in. An unreadable or malformed file
+        is logged and the defaults are used instead.
+
+        Returns:
+            The effective configuration dictionary.
+        """
+        config = {
+            # Credentials are intentionally blank here; see the class docstring.
+            "baidu_ocr_app_id": "",
+            "baidu_ocr_api_key": "",
+            "baidu_ocr_secret_key": "",
+            "ocr_enabled": True,
+            "ocr_timeout": 30,
+            "max_retries": 3,
+        }
+
+        if not os.path.exists(self.config_file):
+            self._save_config(config)
+            return config
+
+        try:
+            with open(self.config_file, 'r', encoding='utf-8') as f:
+                file_config = json.load(f)
+        except (ValueError, OSError) as e:
+            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
+            self._log.warning("警告：无法读取配置文件 %s: %s", self.config_file, e)
+            self._log.warning("使用默认配置")
+            return config
+
+        if isinstance(file_config, dict):
+            config.update(file_config)
+        else:
+            # A top-level list/string/number cannot be merged into the defaults.
+            self._log.warning("警告：配置文件 %s 的内容不是JSON对象", self.config_file)
+            self._log.warning("使用默认配置")
+        return config
+
+    def _save_config(self, config: Dict) -> None:
+        """Write ``config`` to the settings file; failures are only logged.
+
+        Args:
+            config: Configuration dictionary to serialize as JSON.
+        """
+        try:
+            with open(self.config_file, 'w', encoding='utf-8') as f:
+                json.dump(config, f, indent=4, ensure_ascii=False)
+        except OSError as e:
+            self._log.warning("警告：无法保存配置文件 %s: %s", self.config_file, e)
+
+    def _coerce_env_value(self, key: str, raw: str, reference):
+        """Convert an environment string to the type of ``reference``.
+
+        Environment variables are always strings, so without this step
+        ``OCR_ENABLED=false`` would be truthy and ``OCR_TIMEOUT=45`` would be
+        returned as ``"45"``.
+        """
+        # bool must be tested first because bool is a subclass of int.
+        if isinstance(reference, bool):
+            return raw.strip().lower() in self._TRUTHY
+        if isinstance(reference, int):
+            try:
+                return int(raw.strip())
+            except ValueError:
+                self._log.warning(
+                    "警告：环境变量 %s 的值 %r 不是整数，已忽略", key.upper(), raw
+                )
+                return reference
+        return raw
+
+    def get(self, key: str, default=None):
+        """Return the value of ``key``.
+
+        A non-empty environment variable named ``key.upper()`` takes
+        precedence over the loaded configuration. It is converted to the type
+        of the configured value (bool or int) when there is one, and returned
+        as a string otherwise.
+
+        Args:
+            key: Configuration key.
+            default: Value returned when the key is not configured.
+
+        Returns:
+            The resolved configuration value.
+        """
+        configured = self._config.get(key, default)
+        env_value = os.getenv(key.upper())
+        if not env_value:
+            return configured
+        return self._coerce_env_value(key, env_value, configured)
+
+    def set(self, key: str, value) -> None:
+        """Store ``value`` under ``key`` and persist the configuration.
+
+        Args:
+            key: Configuration key.
+            value: New value.
+        """
+        self._config[key] = value
+        self._save_config(self._config)
+
+    def get_app_id(self) -> str:
+        """Return the Baidu OCR App ID ('' when unset)."""
+        return self.get('baidu_ocr_app_id', '')
+
+    def get_api_key(self) -> str:
+        """Return the Baidu OCR API key ('' when unset)."""
+        return self.get('baidu_ocr_api_key', '')
+
+    def get_secret_key(self) -> str:
+        """Return the Baidu OCR secret key ('' when unset)."""
+        return self.get('baidu_ocr_secret_key', '')
+
+    def is_ocr_enabled(self) -> bool:
+        """Return whether OCR is switched on."""
+        return self.get('ocr_enabled', True)
+
+    def get_timeout(self) -> int:
+        """Return the configured OCR request timeout (``ocr_timeout``)."""
+        return self.get('ocr_timeout', 30)
+
+    def get_max_retries(self) -> int:
+        """Return the configured maximum number of retries."""
+        return self.get('max_retries', 3)
+
+    def is_configured(self) -> bool:
+        """Return True when app id, API key and secret key are all non-empty."""
+        return bool(
+            self.get_app_id() and self.get_api_key() and self.get_secret_key()
+        )
+
+    def get_all_config(self) -> Dict:
+        """Return a shallow copy of the loaded configuration."""
+        return self._config.copy()
+
+    def update_config(self, config_dict: Dict) -> None:
+        """Merge ``config_dict`` into the configuration and persist it.
+
+        Args:
+            config_dict: Keys and values to overwrite.
+        """
+        self._config.update(config_dict)
+        self._save_config(self._config)
+
+
+# Module-level shared configuration instance, created when the module is imported.
+baidu_ocr_config = BaiduOCRConfig()
+
+
+def get_baidu_ocr_config() -> BaiduOCRConfig:
+    """
+    Return the module-level Baidu OCR configuration instance.
+
+    Returns:
+        The shared BaiduOCRConfig instance.
+    """
+    return baidu_ocr_config
+
+
+def check_baidu_ocr_config() -> bool:
+    """
+    Report whether Baidu OCR can be used with the current settings.
+
+    Returns:
+        True when all credentials are set, OCR is enabled and an ``AipOcr``
+        client can be constructed from them; False otherwise. Failures are
+        logged as warnings and never raised.
+    """
+    logger = logging.getLogger(__name__)
+    try:
+        config = get_baidu_ocr_config()
+
+        # Both the credentials and the on/off switch must allow OCR.
+        if not (config.is_configured() and config.is_ocr_enabled()):
+            return False
+
+        try:
+            from aip import AipOcr
+        except ImportError as e:
+            logger.warning("警告: 百度OCR SDK (baidu-aip) 导入失败: %s", e)
+            logger.warning("请运行: pip install baidu-aip")
+            # Inside Docker the package has to be baked into the image.
+            if os.path.exists('/.dockerenv'):
+                logger.warning(
+                    "提示: 检测到Docker环境，请尝试重新构建镜像: "
+                    "docker build --no-cache -t david-customs-data ."
+                )
+            return False
+
+        try:
+            # Only construction is checked; the instance itself is not needed.
+            AipOcr(
+                config.get_app_id(),
+                config.get_api_key(),
+                config.get_secret_key()
+            )
+        except Exception as e:
+            logger.warning("警告: 百度OCR客户端初始化失败: %s", e)
+            return False
+
+        return True
+
+    except Exception as e:
+        logger.warning("警告: 百度OCR配置检查失败: %s", e)
+        return False
+
+
+def get_config_status() -> Dict:
+    """
+    Collect a detailed status report of the Baidu OCR configuration.
+
+    Returns:
+        A dictionary with one ``*_configured`` flag per credential, the
+        ``ocr_enabled`` / ``timeout`` / ``max_retries`` settings, the
+        ``fully_configured`` flag, a masked ``*_preview`` for every credential
+        that is set, and ``sdk_available``.
+    """
+    config = get_baidu_ocr_config()
+
+    def _preview(value) -> str:
+        # Reveal at most a quarter of the value, capped at 8 characters, so a
+        # short credential is never shown (almost) in full.
+        text = str(value)
+        visible = min(8, len(text) // 4)
+        return text[:visible] + '...'
+
+    credentials = (
+        ('app_id', config.get_app_id()),
+        ('api_key', config.get_api_key()),
+        ('secret_key', config.get_secret_key()),
+    )
+
+    status = {f'{name}_configured': bool(value) for name, value in credentials}
+    status['ocr_enabled'] = config.is_ocr_enabled()
+    status['timeout'] = config.get_timeout()
+    status['max_retries'] = config.get_max_retries()
+    status['fully_configured'] = config.is_configured()
+
+    # Masked previews, only for the credentials that are actually set.
+    for name, value in credentials:
+        if value:
+            status[f'{name}_preview'] = _preview(value)
+
+    # SDK availability is judged by whether the import succeeds.
+    try:
+        from aip import AipOcr  # noqa: F401
+        status['sdk_available'] = True
+    except ImportError:
+        status['sdk_available'] = False
+
+    return status
+
+
+if __name__ == "__main__":
+    # Manual smoke test. Credentials are reported only as configured / not
+    # configured so that running the module never echoes secrets to the console.
+    config = get_baidu_ocr_config()
+
+    print("百度OCR配置测试:")
+    for label, value in (
+        ("App ID", config.get_app_id()),
+        ("API Key", config.get_api_key()),
+        ("Secret Key", config.get_secret_key()),
+    ):
+        print(f"{label}: {'已配置' if value else '未配置'}")
+    print(f"OCR启用: {config.is_ocr_enabled()}")
+    print(f"超时时间: {config.get_timeout()}秒")
+    print(f"最大重试: {config.get_max_retries()}次")
+    print(f"配置完整: {config.is_configured()}")
\ No newline at end of file
--- a/ccs_base/pdf_tools/baidu_ocr_extractor.py
+++ b/ccs_base/pdf_tools/baidu_ocr_extractor.py
--- a/ccs_base/pdf_tools/pod_indexer.py
+++ b/ccs_base/pdf_tools/pod_indexer.py
--- a/ccs_connect_tiktok/models/tt_api.py
+++ b/ccs_connect_tiktok/models/tt_api.py
@@ -105,6 +105,7 @@ class TT(models.Model):
        }
        request_url = tt_url + url
        logging.info('request_url: %s' % request_url)
+        if 'clearance_file_feedback' not in request_url:
            logging.info('request_data: %s' % parameter)
        response = requests.post(request_url, headers=headers, data=parameter)
        logging.info('response: %s' % response.text)

--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ numpy
 Pillow
 tesseract
 pytesseract
+baidu-aip

 # 系统依赖安装说明：