提单上操作获取尾程POD，弹出向导上新增字段：同步推送匹配节点，默认打开

根据获取的POD文件，查找对应的清关节点（即勾选了“尾程POD节点匹配”的节点），提取文件中红色框内的时间（DATE/TIME OF RELEASE），作为该节点的操作时间。若一个PDF中提取到多个时间，则取最早的一条作为节点操作时间。

提单上操作获取尾程POD，弹出向导上新增字段：同步推送匹配节点，默认打开
4b2b1e33 · 贺阳 · 6934ad2e · 4b2b1e33 · 4b2b1e33
--- a/ccs_base/wizard/batch_get_pod_info_wizard.py
+++ b/ccs_base/wizard/batch_get_pod_info_wizard.py
@@ -4,6 +4,7 @@
 import base64
 import io
 import logging
+import time

 import requests
 from odoo import models, fields, _
@@ -18,7 +19,7 @@ class BatchGetPodInfoWizard(models.TransientModel):

    def get_order(self):
        """
-        得到单据
+        得到单据    
        :return:
        """
        order_id = self._context.get('active_id')
@@ -38,6 +39,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
        help='Whether to remove specified text from PDF files'  # 是否涂抹PDF中的指定文字
    )

+    sync_match_node = fields.Boolean(
+        string='Sync Push Match Node',  # 同步推送匹配节点
+        default=True,
+        help='Whether to sync and push matched node information'  # 是否同步推送匹配节点信息
+    )
+
    # debug_mode = fields.Boolean(
    #     string='Debug Mode',  # 调试模式
    #     default=False,
@@ -48,6 +55,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
        """
        Confirm operation  # 确认操作
        """
+        #计算整个过程的耗时
+        start_time = time.time()
+        _logger.info(f"开始执行批量获取POD信息操作")
        bl_objs = self.get_order()
        # 调用接口获取提单pdf文件
        pdf_file_arr = self._get_pdf_file_arr()
@@ -78,11 +88,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
        # 再同步和回写
        if self.sync_last_mile_pod and processed_files:
            self._sync_last_mile_pod(processed_files)
-
-    # 写一个方法掉接口获取提单pdf文件
+        
+        # 同步推送匹配节点
+        if self.sync_match_node and processed_files:
+            self._sync_match_node(processed_files)
+        end_time = time.time()
+        _logger.info(f"批量获取POD信息操作完成，耗时: {end_time - start_time}秒")
+
+    # 写一个方法调接口获取提单pdf文件
    def _get_pdf_file_arr(self):
        """
-        Get PDF file from API  # 从API获取PDF文件
+        从API获取PDF文件
        """
        # 获取当前选中的提单对象
        bl_objs = self.get_order()
@@ -241,6 +257,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
            if file_data:
                # 将base64数据转换为二进制
                pdf_binary = base64.b64decode(file_data)
+                
+                # 先提取文本用于后续同步节点功能
+                if 'ocr_texts' not in file_info:
+                    file_info['ocr_texts'] = self._extract_text_from_pdf_with_ocr(pdf_binary, bl.bl_no)
+                
                # 使用OCR方法处理PDF
                processed_pdf = self._process_pdf_with_ocr(
                    pdf_data=pdf_binary,
@@ -258,6 +279,65 @@ class BatchGetPodInfoWizard(models.TransientModel):

        return updated_files

+    def _extract_text_from_pdf_with_ocr(self, pdf_binary, bl_no):
+        """
+        OCR every page of a PDF with Tesseract and return the text per page (shared helper).
+
+        :param pdf_binary: raw PDF bytes
+        :param bl_no: bill of lading number (accepted for logging; not used in this body)
+        :return: dict keyed by 0-based page index; value is the page's OCR text, "" if OCR failed
+        """
+        import fitz  # PyMuPDF
+        import pytesseract
+        import numpy as np
+        from PIL import Image
+
+        # OpenCV is optional; fall back to PIL decoding when it is missing
+        try:
+            import cv2
+            cv2_available = True
+        except ImportError:
+            cv2_available = False
+
+        # Configure the Tesseract binary location
+        self._setup_tesseract_path()
+
+        # OCR settings are identical for every page, so build them once.
+        # NOTE(review): the whitelist has no ":" or "/" - confirm that dropping them from OCR output is intended
+        config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
+        page_texts = {}
+        pdf_document = fitz.open(stream=pdf_binary, filetype="pdf")
+        try:
+            for page_num in range(len(pdf_document)):
+                page = pdf_document[page_num]
+
+                # Render at 2x zoom for better OCR resolution
+                mat = fitz.Matrix(2.0, 2.0)
+                pix = page.get_pixmap(matrix=mat)
+                img_data = pix.tobytes("png")
+
+                # Decode the PNG bytes into an RGB PIL image
+                if cv2_available:
+                    nparr = np.frombuffer(img_data, np.uint8)
+                    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+                    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+                else:
+                    pil_img = Image.open(io.BytesIO(img_data))
+                    if pil_img.mode != 'RGB':
+                        pil_img = pil_img.convert('RGB')
+
+                # A page whose OCR fails is recorded as "" so the remaining pages still run
+                try:
+                    text_content = pytesseract.image_to_string(pil_img, config=config, lang='eng')
+                    page_texts[page_num] = text_content
+                except Exception as e:
+                    _logger.error(f"OCR识别失败，第 {page_num + 1} 页: {str(e)}")
+                    page_texts[page_num] = ""
+        finally:
+            # Always release the document, even when rendering or decoding raises
+            pdf_document.close()
+        return page_texts
+
    def _process_pdf_with_ocr(self, pdf_data, bl_no, debug_mode=False):
        """
        Process PDF with OCR recognition and text removal (完全按照HTML逻辑)  # 使用OCR识别处理PDF并删除文字
@@ -726,3 +806,231 @@ class BatchGetPodInfoWizard(models.TransientModel):
        except Exception as e:
            _logger.error(f"保存PDF附件失败: {str(e)}")
            raise ValidationError(_('Failed to save PDF attachment: %s') % str(e))
+
+    def _sync_match_node(self, processed_files):
+        """
+        For each processed POD file, extract the earliest time from the PDF; currently it is only logged.
+        :param processed_files: iterable of file-info dicts; reads the 'bl', 'file_data' and 'ocr_texts' keys
+        """
+        # Find the package-type clearance node flagged as the POD-matching node (is_pod_node)
+        pod_node = self.env['cc.node'].search([
+            ('is_pod_node', '=', True),
+            ('node_type', '=', 'package')
+        ], limit=1)
+        for file_info in processed_files:
+            if not file_info.get('bl'):
+                continue
+            bl = file_info['bl']
+            if not pod_node:
+                _logger.info(f"未找到尾程POD节点匹配的节点，提单号: {bl.bl_no}")
+                continue
+            
+            # The PDF payload is required to extract the time
+            file_data = file_info.get('file_data')
+            if not file_data:
+                _logger.warning(f"提单 {bl.bl_no} 没有文件数据")
+                continue
+            
+            try:
+                # Reuse OCR text cached on file_info when present (None makes the extractor run OCR itself)
+                ocr_texts = file_info.get('ocr_texts')
+                
+                # Parse candidate times out of the PDF text
+                extracted_times = self._extract_time_from_pdf(file_data, bl.bl_no, ocr_texts=ocr_texts)
+                
+                if extracted_times:
+                    # The earliest extracted time is taken as the node operation time
+                    earliest_time = min(extracted_times)
+                    _logger.info(f"提取到最早时间: {earliest_time}，将作为节点操作时间")
+                    
+                    # TODO(review): the node write-back is not implemented yet; this branch only logs.
+                    # The concrete operation (field update or method call) depends on business rules
+                    # that are not visible here, e.g. storing the time on the node:
+                    # pod_node.operation_time = earliest_time
+                    # or calling a dedicated method that records the node operation.
+                    
+                    _logger.info(f"为提单 {bl.bl_no} 同步节点操作时间: {earliest_time}")
+                else:
+                    _logger.info(f"未从POD文件中提取到时间信息，提单号: {bl.bl_no}")
+                    
+            except Exception as e:
+                _logger.error(f"同步匹配节点失败，提单号: {bl.bl_no}, 错误: {str(e)}")
+
+    def _extract_time_from_pdf(self, file_data, bl_no, ocr_texts=None):
+        """
+        Extract candidate times from a POD PDF's OCR text and return the earliest one.
+        Targets values shaped like "08:35 17-OCT-2025", plus a few generic date-time formats.
+        :param file_data: base64-encoded PDF; only decoded and OCR'd when ocr_texts is None
+        :param bl_no: bill of lading number, passed to the OCR helper and used in the error log
+        :param ocr_texts: optional dict of page index -> OCR text, reused to avoid a second OCR pass
+        :return: [earliest datetime] when at least one time was parsed, otherwise an empty list
+        """
+        import re
+        from datetime import datetime
+
+        extracted_times = []
+
+        try:
+            # Run OCR only when the caller did not supply text
+            if ocr_texts is None:
+                pdf_binary = base64.b64decode(file_data)
+                ocr_texts = self._extract_text_from_pdf_with_ocr(pdf_binary, bl_no)
+
+            # Scan each page's text for times
+            for text_content in ocr_texts.values():
+                # Times parsed from this page
+                page_times = []
+
+                # Prefer the time that follows the "DATE/TIME OF RELEASE" label.
+                # Examples: "21:47 20-OCT-2025", "08:35 17-OCT-2025", "13:52 17-OCT-2025"
+                # OCR may drop spaces and separators, e.g. "DATETIMEOFRELEASE160220-SEP-2025"
+
+                # Try the label-anchored patterns first (the value inside the red box).
+                # Two shapes are handled:
+                # 1. with spaces: "DATE/TIME OF RELEASE  16:02  20-SEP-2025"
+                # 2. without spaces: "DATETIMEOFRELEASE160220-SEP-2025"
+                release_time_patterns = [
+                    # Spaced form
+                    r'DATE[/\s]*TIME[/\s]*OF[/\s]*RELEASE[^\n]*?(\d{2}:\d{2})\s+(\d{2}-[A-Z]{3}-\d{4})',
+                    # Unspaced form: the time digits directly follow RELEASE
+                    r'DATETIMEOFRELEASE(\d{2})(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
+                    # Tolerant form: only the word RELEASE is required before the time
+                    r'[Rr][Ee][Ll][Ee][Aa][Ss][Ee].*?(\d{2}:\d{2})\s+(\d{2}-[A-Z]{3}-\d{4})',
+                    # Looser unspaced form
+                    r'[Dd][Aa][Tt][Ee].*?[Tt][Ii][Mm][Ee].*?[Oo][Ff].*?[Rr][Ee][Ll][Ee][Aa][Ss][Ee](\d{2})(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
+                    # ":" lost or misread by OCR, e.g. "DATETIMEOFRELEASE163420-SEP-2025"
+                    r'DATETIMEOFRELEASE(\d)(\d)(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
+                    # Whitespace between the label and the digits, e.g. "DATETIMEOFRELEASE        163420-SEP-2025"
+                    r'DATETIMEOFRELEASE\s+(\d)(\d)(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
+                ]
+
+                for pattern in release_time_patterns:
+                    release_time_match = re.search(pattern, text_content, re.IGNORECASE)
+                    if release_time_match:
+                        try:
+                            group_count = len(release_time_match.groups())
+                            if group_count == 2:
+                                # Spaced form: ('16:02', '20-SEP-2025')
+                                time_part = release_time_match.group(1)
+                                date_part = release_time_match.group(2)
+
+                                # Map month abbreviations to title case before strptime
+                                month_map = {
+                                    'JAN': 'Jan', 'FEB': 'Feb', 'MAR': 'Mar', 'APR': 'Apr',
+                                    'MAY': 'May', 'JUN': 'Jun', 'JUL': 'Jul', 'AUG': 'Aug',
+                                    'SEP': 'Sep', 'OCT': 'Oct', 'NOV': 'Nov', 'DEC': 'Dec'
+                                }
+
+                                # Rewrite the month inside date_part to its title-case form
+                                for key, value in month_map.items():
+                                    date_part = re.sub(r'-{}\b'.format(key), f'-{value}', date_part, flags=re.IGNORECASE)
+
+                                match_str = f"{time_part} {date_part}"
+                                time_obj = datetime.strptime(match_str, '%H:%M %d-%b-%Y')
+                            elif group_count == 5:
+                                # Unspaced form: ('16', '02', '20', 'SEP', '2025')
+                                hour = release_time_match.group(1)
+                                minute = release_time_match.group(2)
+                                day = release_time_match.group(3)
+                                month = release_time_match.group(4)
+                                year = release_time_match.group(5)
+
+                                # Normalise the month abbreviation
+                                month_map = {
+                                    'JAN': 'Jan', 'FEB': 'Feb', 'MAR': 'Mar', 'APR': 'Apr',
+                                    'MAY': 'May', 'JUN': 'Jun', 'JUL': 'Jul', 'AUG': 'Aug',
+                                    'SEP': 'Sep', 'OCT': 'Oct', 'NOV': 'Nov', 'DEC': 'Dec'
+                                }
+                                month_normalized = month_map.get(month.upper(), month.capitalize())
+
+                                # Build the datetime directly instead of going through strptime
+                                month_num = {'JAN':1,'FEB':2,'MAR':3,'APR':4,'MAY':5,'JUN':6,
+                                            'JUL':7,'AUG':8,'SEP':9,'OCT':10,'NOV':11,'DEC':12}[month.upper()]
+                                time_obj = datetime(int(year), month_num, int(day), int(hour), int(minute))
+                                match_str = f"{hour}:{minute} {day.zfill(2)}-{month_normalized}-{year}"
+                            elif group_count == 6:
+                                # Split-hour form: ('1', '6', '34', '20', 'SEP', '2025')
+                                hour_tens = release_time_match.group(1)
+                                hour_ones = release_time_match.group(2)
+                                minute_str = release_time_match.group(3)
+                                day = release_time_match.group(4)
+                                month = release_time_match.group(5)
+                                year = release_time_match.group(6)
+
+                                # Recombine the hour digits; the minute is used as captured
+                                hour = hour_tens + hour_ones
+                                minute = minute_str
+
+                                # Build the datetime directly
+                                month_num = {'JAN':1,'FEB':2,'MAR':3,'APR':4,'MAY':5,'JUN':6,
+                                            'JUL':7,'AUG':8,'SEP':9,'OCT':10,'NOV':11,'DEC':12}[month.upper()]
+                                time_obj = datetime(int(year), month_num, int(day), int(hour), int(minute))
+                                match_str = f"{hour}:{minute} {day.zfill(2)}-{month}-{year}"
+                            else:
+                                _logger.warning(f"意外的分组数量: {group_count}, 分组: {release_time_match.groups()}")
+                                continue
+
+                            page_times.append(time_obj)
+                            break
+                        except Exception as e:
+                            _logger.warning(f"解析DATE/TIME OF RELEASE时间失败: {release_time_match.groups()}, 错误: {str(e)}")
+                    else:
+                        continue
+
+                # Then scan the page for other time formats
+                time_patterns = [
+                    # RELEASE NOTE format: HH:MM DD-MON-YYYY
+                    r'\d{2}:\d{2}\s+\d{2}-[A-Z]{3}-\d{4}',  # 21:47 20-OCT-2025
+                    # Standard formats
+                    r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}',  # 2023-12-31 12:30:00
+                    r'\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}',  # 31/12/2023 12:30
+                    r'\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}:\d{2}',  # 2023/12/31 12:30:00
+                ]
+
+                for pattern in time_patterns:
+                    matches = re.findall(pattern, text_content)
+                    for match in matches:
+                        try:
+                            # Pick the parser that fits the matched text
+                            if re.match(r'\d{2}:\d{2}\s+\d{2}-[A-Z]{3}-\d{4}', match):
+                                # RELEASE NOTE format: 08:35 17-OCT-2025
+                                time_obj = datetime.strptime(match, '%H:%M %d-%b-%Y')
+                                page_times.append(time_obj)
+                            elif re.match(r'\d{4}-\d{2}-\d{2}\s', match):
+                                # ISO-like format: 2023-12-31 12:30:00
+                                time_obj = datetime.strptime(match, '%Y-%m-%d %H:%M:%S')
+                                page_times.append(time_obj)
+                            elif '/' in match and ':' in match:
+                                # Slash-separated: day-first without seconds, else year-first with seconds
+                                if match.count('/') == 2:
+                                    parts = match.split()
+                                    date_part = parts[0]
+                                    time_part = parts[1] if len(parts) > 1 else '00:00'
+                                    try:
+                                        time_obj = datetime.strptime(date_part + ' ' + time_part, '%d/%m/%Y %H:%M')
+                                    except ValueError:
+                                        time_obj = datetime.strptime(date_part + ' ' + time_part, '%Y/%m/%d %H:%M:%S')
+                                    page_times.append(time_obj)
+                                else:
+                                    continue
+                            else:
+                                continue
+
+                        except Exception as e:
+                            _logger.warning(f"解析时间失败: {match}, 错误: {str(e)}")
+                            continue
+
+                # Keep only this page's earliest time
+                if page_times:
+                    earliest_page_time = min(page_times)
+                    extracted_times.append(earliest_page_time)
+
+            # Collapse to the single earliest time across all pages
+            if extracted_times:
+                earliest_time = min(extracted_times)
+                return [earliest_time]
+
+        except Exception as e:
+            _logger.error(f"提取PDF时间信息失败，提单号: {bl_no}, 错误: {str(e)}")
+
+        return extracted_times
--- a/ccs_base/wizard/batch_get_pod_info_wizard_views.xml
+++ b/ccs_base/wizard/batch_get_pod_info_wizard_views.xml
@@ -16,6 +16,9 @@
                            <group>
                                <field name="remove_specified_text" widget="boolean_toggle"/>
                            </group>
+                            <group>
+                                <field name="sync_match_node" widget="boolean_toggle"/>
+                            </group>
                        </group>
                        
                        <div class="alert alert-info" role="alert">
@@ -23,6 +26,7 @@
                            <ul>
                                <li><strong>Sync Last Mile POD:</strong> Synchronize POD (Proof of Delivery) attachment information with TK system, including big package quantities and container numbers</li>  <!-- 同步尾程POD：向TK同步尾程交接POD(待大包数量和箱号)的附件信息 -->
                                <li><strong>Remove Specified Text:</strong> Remove specified text (AGN, UCLINK LOGISITICS LTD) from PDF files</li>  <!-- 涂抹指定文字：对PDF文件中的指定文字进行涂抹处理 -->
+                                <li><strong>Sync Push Match Node:</strong> Synchronize and push matched node information based on POD file, extract time from red boxes as node operation time</li>  <!-- 同步推送匹配节点：根据POD文件获取对应的清关节点，提取红色框时间作为节点操作时间 -->
                            </ul>
                        </div>
                        <footer>