提交 4b2b1e33 authored 作者: 贺阳's avatar 贺阳

提单上操作获取尾程POD,弹出向导上新增字段:同步推送匹配节点,默认打开

根据获取的POD文件,获取对应的清关节点,提取文件中红色框时间,作为设置的尾程POD节点匹配勾选的节点操作时间。若一个pdf里提取到多个时间,则取时间最早的一条作为节点操作时间。
上级 6934ad2e
......@@ -4,6 +4,7 @@
import base64
import io
import logging
import time
import requests
from odoo import models, fields, _
......@@ -18,7 +19,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
def get_order(self):
"""
得到单据
得到单据
:return:
"""
order_id = self._context.get('active_id')
......@@ -38,6 +39,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
help='Whether to remove specified text from PDF files' # 是否涂抹PDF中的指定文字
)
# Wizard toggle, enabled by default. The confirm flow calls
# self._sync_match_node(processed_files) only when this flag is set and at
# least one file was processed.
sync_match_node = fields.Boolean(
string='Sync Push Match Node', # field label shown on the wizard form
default=True,
help='Whether to sync and push matched node information' # tooltip text
)
# debug_mode = fields.Boolean(
# string='Debug Mode', # 调试模式
# default=False,
......@@ -48,6 +55,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
Confirm operation # 确认操作
"""
#计算整个过程的耗时
start_time = time.time()
_logger.info(f"开始执行批量获取POD信息操作")
bl_objs = self.get_order()
# 调用接口获取提单pdf文件
pdf_file_arr = self._get_pdf_file_arr()
......@@ -78,11 +88,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 再同步和回写
if self.sync_last_mile_pod and processed_files:
self._sync_last_mile_pod(processed_files)
# 写一个方法掉接口获取提单pdf文件
# 同步推送匹配节点
if self.sync_match_node and processed_files:
self._sync_match_node(processed_files)
end_time = time.time()
_logger.info(f"批量获取POD信息操作完成,耗时: {end_time - start_time}秒")
# 写一个方法调接口获取提单pdf文件
def _get_pdf_file_arr(self):
"""
Get PDF file from API # 从API获取PDF文件
从API获取PDF文件
"""
# 获取当前选中的提单对象
bl_objs = self.get_order()
......@@ -241,6 +257,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
if file_data:
# 将base64数据转换为二进制
pdf_binary = base64.b64decode(file_data)
# 先提取文本用于后续同步节点功能
if 'ocr_texts' not in file_info:
file_info['ocr_texts'] = self._extract_text_from_pdf_with_ocr(pdf_binary, bl.bl_no)
# 使用OCR方法处理PDF
processed_pdf = self._process_pdf_with_ocr(
pdf_data=pdf_binary,
......@@ -258,6 +279,65 @@ class BatchGetPodInfoWizard(models.TransientModel):
return updated_files
def _extract_text_from_pdf_with_ocr(self, pdf_binary, bl_no):
    """
    Render every page of a PDF to an image and OCR it with Tesseract.

    :param pdf_binary: raw PDF bytes
    :param bl_no: bill of lading number; accepted for call-site symmetry but
        not used inside this method at present
    :return: dict mapping the 0-based page index to the recognised text of
        that page (an empty string when OCR raised for that page)
    """
    import fitz  # PyMuPDF
    import pytesseract
    import numpy as np
    from PIL import Image

    # OpenCV is optional; without it the PNG bytes are decoded by PIL instead.
    try:
        import cv2
        cv2_available = True
    except ImportError:
        cv2_available = False

    # Point pytesseract at the Tesseract binary (helper defined elsewhere in the class).
    self._setup_tesseract_path()

    # Loop-invariant settings, built once instead of once per page.
    config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'
    zoom = fitz.Matrix(2.0, 2.0)  # render at 2x to give the OCR engine more pixels

    page_texts = {}
    pdf_document = fitz.open(stream=pdf_binary, filetype="pdf")
    try:
        for page_num, page in enumerate(pdf_document):
            # Rasterise the page and serialise it as PNG bytes.
            img_data = page.get_pixmap(matrix=zoom).tobytes("png")

            # Build an RGB PIL image from the PNG bytes.
            if cv2_available:
                nparr = np.frombuffer(img_data, np.uint8)
                img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
                pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            else:
                pil_img = Image.open(io.BytesIO(img_data))
                if pil_img.mode != 'RGB':
                    pil_img = pil_img.convert('RGB')

            # A failure on one page must not discard the text of the others.
            try:
                page_texts[page_num] = pytesseract.image_to_string(pil_img, config=config, lang='eng')
            except Exception as e:
                _logger.error(f"OCR识别失败,第 {page_num + 1} 页: {str(e)}")
                page_texts[page_num] = ""
    finally:
        # Always release the document, even when rendering or decoding raises.
        pdf_document.close()

    return page_texts
def _process_pdf_with_ocr(self, pdf_data, bl_no, debug_mode=False):
"""
Process PDF with OCR recognition and text removal (完全按照HTML逻辑) # 使用OCR识别处理PDF并删除文字
......@@ -726,3 +806,231 @@ class BatchGetPodInfoWizard(models.TransientModel):
except Exception as e:
_logger.error(f"保存PDF附件失败: {str(e)}")
raise ValidationError(_('Failed to save PDF attachment: %s') % str(e))
def _sync_match_node(self, processed_files):
    """
    Derive a node operation time for each processed POD file and log it.

    Looks up one 'cc.node' record flagged as POD node of type 'package',
    then, per file entry, extracts timestamps from the PDF and picks the
    earliest one. NOTE(review): the result is only logged here; nothing is
    written to the node or the bill of lading yet.

    :param processed_files: list of dicts; the keys read here are 'bl',
        'file_data' and, optionally, 'ocr_texts'
    """
    node_domain = [
        ('is_pod_node', '=', True),
        ('node_type', '=', 'package'),
    ]
    pod_node = self.env['cc.node'].search(node_domain, limit=1)

    for entry in processed_files:
        bl = entry.get('bl')
        if not bl:
            continue

        if not pod_node:
            _logger.info(f"未找到尾程POD节点匹配的节点,提单号: {bl.bl_no}")
            continue

        payload = entry.get('file_data')
        if not payload:
            _logger.warning(f"提单 {bl.bl_no} 没有文件数据")
            continue

        try:
            # Reuse OCR text cached on the entry, if any, to avoid a second OCR pass.
            cached_texts = entry.get('ocr_texts')
            candidate_times = self._extract_time_from_pdf(payload, bl.bl_no, ocr_texts=cached_texts)
            if not candidate_times:
                _logger.info(f"未从POD文件中提取到时间信息,提单号: {bl.bl_no}")
                continue

            # The earliest extracted time is taken as the node operation time.
            earliest_time = min(candidate_times)
            _logger.info(f"提取到最早时间: {earliest_time},将作为节点操作时间")
            # TODO: persist earliest_time on the node / bill of lading; the
            # write-back step has not been implemented.
            _logger.info(f"为提单 {bl.bl_no} 同步节点操作时间: {earliest_time}")
        except Exception as e:
            _logger.error(f"同步匹配节点失败,提单号: {bl.bl_no}, 错误: {str(e)}")
def _extract_time_from_pdf(self, file_data, bl_no, ocr_texts=None):
"""
Extract time information from PDF file (从PDF文件中提取红色框中的时间)
根据图片中的格式,提取类似 "08:35 17-OCT-2025" 的时间格式
:param file_data: PDF文件的base64数据
:param bl_no: 提单号(用于日志)
:param ocr_texts: 如果已识别过的OCR文本,则直接使用,避免重复识别
:return: 提取到的时间列表
"""
import re
from datetime import datetime
extracted_times = []
try:
# 如果没有提供OCR文本,则调用OCR识别
if ocr_texts is None:
pdf_binary = base64.b64decode(file_data)
ocr_texts = self._extract_text_from_pdf_with_ocr(pdf_binary, bl_no)
# 使用已识别的文本内容查找时间
for page_num, text_content in ocr_texts.items():
# 初始化本页时间列表
page_times = []
# 根据图片中的格式,优先提取 "DATE/TIME OF RELEASE" 后面的时间
# 格式示例: "21:47 20-OCT-2025", "08:35 17-OCT-2025", "13:52 17-OCT-2025"
# OCR识别时可能没有空格,如 "DATETIMEOFRELEASE160220-SEP-2025"
# 首先尝试找到 "DATE/TIME OF RELEASE" 后面的时间(红色框中的时间)
# 匹配两种情况:
# 1. 有空格: "DATE/TIME OF RELEASE 16:02 20-SEP-2025"
# 2. 无空格: "DATETIMEOFRELEASE160220-SEP-2025"
release_time_patterns = [
# 有空格的情况
r'DATE[/\s]*TIME[/\s]*OF[/\s]*RELEASE[^\n]*?(\d{2}:\d{2})\s+(\d{2}-[A-Z]{3}-\d{4})',
# 无空格的情况,紧跟在RELEASE后面就是时间数字
r'DATETIMEOFRELEASE(\d{2})(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
# 增加容错,RELEASE可能被识别为其他形式
r'[Rr][Ee][Ll][Ee][Aa][Ss][Ee].*?(\d{2}:\d{2})\s+(\d{2}-[A-Z]{3}-\d{4})',
# 更宽松的无空格匹配
r'[Dd][Aa][Tt][Ee].*?[Tt][Ii][Mm][Ee].*?[Oo][Ff].*?[Rr][Ee][Ll][Ee][Aa][Ss][Ee](\d{2})(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
# OCR可能将:识别为数字,如"DATETIMEOFRELEASE163420-SEP-2025"
r'DATETIMEOFRELEASE(\d)(\d)(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
# OCR可能将:识别为多个空格,如"DATETIMEOFRELEASE 163420-SEP-2025"
r'DATETIMEOFRELEASE\s+(\d)(\d)(\d{2})(\d{2})-([A-Z]{3})-(\d{4})',
]
for idx, pattern in enumerate(release_time_patterns):
release_time_match = re.search(pattern, text_content, re.IGNORECASE)
if release_time_match:
try:
group_count = len(release_time_match.groups())
if group_count == 2:
# 有空格的情况: ('16:02', '20-SEP-2025')
time_part = release_time_match.group(1)
date_part = release_time_match.group(2)
# 转换月份缩写为大写(Python strptime需要)
month_map = {
'JAN': 'Jan', 'FEB': 'Feb', 'MAR': 'Mar', 'APR': 'Apr',
'MAY': 'May', 'JUN': 'Jun', 'JUL': 'Jul', 'AUG': 'Aug',
'SEP': 'Sep', 'OCT': 'Oct', 'NOV': 'Nov', 'DEC': 'Dec'
}
# 如果日期部分中的月份是大写,转换为首字母大写
for key, value in month_map.items():
date_part = re.sub(r'-{}\b'.format(key), f'-{value}', date_part, flags=re.IGNORECASE)
match_str = f"{time_part} {date_part}"
time_obj = datetime.strptime(match_str, '%H:%M %d-%b-%Y')
elif group_count == 5:
# 无空格的情况: ('16', '02', '20', 'SEP', '2025')
hour = release_time_match.group(1)
minute = release_time_match.group(2)
day = release_time_match.group(3)
month = release_time_match.group(4)
year = release_time_match.group(5)
# 转换月份缩写
month_map = {
'JAN': 'Jan', 'FEB': 'Feb', 'MAR': 'Mar', 'APR': 'Apr',
'MAY': 'May', 'JUN': 'Jun', 'JUL': 'Jul', 'AUG': 'Aug',
'SEP': 'Sep', 'OCT': 'Oct', 'NOV': 'Nov', 'DEC': 'Dec'
}
month_normalized = month_map.get(month.upper(), month.capitalize())
# 直接手动创建datetime对象,避免strptime的格式问题
month_num = {'JAN':1,'FEB':2,'MAR':3,'APR':4,'MAY':5,'JUN':6,
'JUL':7,'AUG':8,'SEP':9,'OCT':10,'NOV':11,'DEC':12}[month.upper()]
time_obj = datetime(int(year), month_num, int(day), int(hour), int(minute))
match_str = f"{hour}:{minute} {day.zfill(2)}-{month_normalized}-{year}"
elif group_count == 6:
# OCR将:识别为数字的情况: ('1', '6', '34', '20', 'SEP', '2025')
hour_tens = release_time_match.group(1)
hour_ones = release_time_match.group(2)
minute_str = release_time_match.group(3)
day = release_time_match.group(4)
month = release_time_match.group(5)
year = release_time_match.group(6)
# 组合小时和分钟
hour = hour_tens + hour_ones
minute = minute_str
# 直接手动创建datetime对象
month_num = {'JAN':1,'FEB':2,'MAR':3,'APR':4,'MAY':5,'JUN':6,
'JUL':7,'AUG':8,'SEP':9,'OCT':10,'NOV':11,'DEC':12}[month.upper()]
time_obj = datetime(int(year), month_num, int(day), int(hour), int(minute))
match_str = f"{hour}:{minute} {day.zfill(2)}-{month}-{year}"
else:
_logger.warning(f"意外的分组数量: {group_count}, 分组: {release_time_match.groups()}")
continue
page_times.append(time_obj)
break
except Exception as e:
_logger.warning(f"解析DATE/TIME OF RELEASE时间失败: {release_time_match.groups()}, 错误: {str(e)}")
else:
continue
# 然后查找其他时间格式
time_patterns = [
# RELEASE NOTE格式: HH:MM DD-MON-YYYY
r'\d{2}:\d{2}\s+\d{2}-[A-Z]{3}-\d{4}', # 21:47 20-OCT-2025
# 标准格式
r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}', # 2023-12-31 12:30:00
r'\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}', # 31/12/2023 12:30
r'\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}:\d{2}', # 2023/12/31 12:30:00
]
for pattern in time_patterns:
matches = re.findall(pattern, text_content)
for match in matches:
try:
# 尝试解析时间
if re.match(r'\d{2}:\d{2}\s+\d{2}-[A-Z]{3}-\d{4}', match):
# RELEASE NOTE格式: 08:35 17-OCT-2025
time_obj = datetime.strptime(match, '%H:%M %d-%b-%Y')
page_times.append(time_obj)
elif '-' in match and not match.startswith(r'\d{2}:'):
# 2023-12-31 12:30:00 格式
time_obj = datetime.strptime(match, '%Y-%m-%d %H:%M:%S')
page_times.append(time_obj)
elif '/' in match and ':' in match:
# 根据分隔符判断格式
if match.count('/') == 2:
parts = match.split()
date_part = parts[0]
time_part = parts[1] if len(parts) > 1 else '00:00'
try:
time_obj = datetime.strptime(date_part + ' ' + time_part, '%d/%m/%Y %H:%M')
except:
time_obj = datetime.strptime(date_part + ' ' + time_part, '%Y/%m/%d %H:%M:%S')
page_times.append(time_obj)
else:
continue
else:
continue
except Exception as e:
_logger.warning(f"解析时间失败: {match}, 错误: {str(e)}")
continue
# 如果本页有提取到时间,记录最早的时间
if page_times:
earliest_page_time = min(page_times)
extracted_times.append(earliest_page_time)
# 如果有提取到时间,返回最早的时间
if extracted_times:
earliest_time = min(extracted_times)
return [earliest_time]
except Exception as e:
_logger.error(f"提取PDF时间信息失败,提单号: {bl_no}, 错误: {str(e)}")
return extracted_times
......@@ -16,6 +16,9 @@
<group>
<field name="remove_specified_text" widget="boolean_toggle"/>
</group>
<group>
<field name="sync_match_node" widget="boolean_toggle"/>
</group>
</group>
<div class="alert alert-info" role="alert">
......@@ -23,6 +26,7 @@
<ul>
<li><strong>Sync Last Mile POD:</strong> Synchronize POD (Proof of Delivery) attachment information with TK system, including big package quantities and container numbers</li> <!-- 同步尾程POD:向TK同步尾程交接POD(待大包数量和箱号)的附件信息 -->
<li><strong>Remove Specified Text:</strong> Remove specified text (AGN, UCLINK LOGISITICS LTD) from PDF files</li> <!-- 涂抹指定文字:对PDF文件中的指定文字进行涂抹处理 -->
<li><strong>Sync Push Match Node:</strong> Synchronize and push matched node information based on POD file, extract time from red boxes as node operation time</li> <!-- 同步推送匹配节点:根据POD文件获取对应的清关节点,提取红色框时间作为节点操作时间 -->
</ul>
</div>
<footer>
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论