提交 8ca88357 作者: 贺阳

没有勾选同步或没有勾选涂抹文字也需要创建附件信息

上级 b127cb4b
# -*- coding: utf-8 -*-
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import base64
import io
import logging
import base64
import requests
from odoo import models, fields, _
from odoo.exceptions import ValidationError
......@@ -56,16 +57,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 英文提示
raise ValidationError(_('%s bill of loading cannot find release note file') % (
', '.join([bl.bl_no for bl in error_bl]))) # xx提单无法找到release note文件
if self.remove_specified_text:
processed_files = self._remove_specified_text(processed_files)
# 用于测试的:保存处理后的PDF并返回下载链接
# if processed_files and processed_files[0].get('file_data'):
# return self._save_and_return_download_link(processed_files[0])
# 回写到附件信息
if processed_files:
# 回写PDF文件到清关文件
self._write_pdf_file(processed_files)
# 再同步和回写
if self.sync_last_mile_pod:
if self.sync_last_mile_pod and processed_files:
self._sync_last_mile_pod(processed_files)
# 写一个方法调接口获取提单pdf文件
......@@ -76,12 +80,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 获取当前选中的提单对象
bl_objs = self.get_order()
bill_numbers = [self.env['common.common'].sudo().process_match_str(bl.bl_no) for bl in bl_objs]
# 调用API获取PDF文件
api_url = self.env['ir.config_parameter'].sudo().get_param('last_mile_pod_api_url','http://172.104.52.150:7002')
api_url = self.env['ir.config_parameter'].sudo().get_param('last_mile_pod_api_url',
'http://172.104.52.150:7002')
if not api_url:
raise ValidationError(_('API URL not configured'))
# 构建请求数据
request_data = {
"bill_numbers": bill_numbers
......@@ -92,22 +97,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
headers={'Content-Type': 'application/json'},
json=request_data
)
if response.status_code == 200:
result = response.json()
# 检查API响应结构
if not result:
raise ValidationError(_('API returned empty response'))
if not result.get('success'):
error_msg = result.get('message', 'Unknown error')
raise ValidationError(_('API returned error: %s') % error_msg)
# 处理结果数据
results = result.get('results', [])
if not results:
raise ValidationError(_('No PDF files found in API response'))#提示:API调用成功,但没有PDF文件
raise ValidationError(_('No PDF files found in API response')) # 提示:API调用成功,但没有PDF文件
# 构建PDF文件数组
pdf_file_arr = []
for result_item in results:
......@@ -116,11 +121,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
bill_number = result_item.get('bill_number')
filename = result_item.get('filename')
base64_data = result_item.get('base64')
if not all([bill_number, filename, base64_data]):
_logger.warning(f"跳过无效的PDF文件项: {result_item}")
continue
# 验证PDF文件
try:
pdf_binary = base64.b64decode(base64_data)
......@@ -140,7 +145,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
return pdf_file_arr
else:
raise ValidationError(_('Failed to get PDF file from API: %s') % response.text)
except requests.exceptions.RequestException as e:
raise ValidationError(_('API request failed: %s') % str(e))
......@@ -176,7 +181,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
select_bl_no = self.env['common.common'].sudo().process_match_str(bl.bl_no)
for pdf_file in pdf_file_arr:
# 尝试不同的字段名(API可能使用不同的字段名)
file_name = pdf_file.get('file_name' ) # 获取文件名
file_name = pdf_file.get('file_name') # 获取文件名
file_data = pdf_file.get('file_data') # 获取文件数据
bl_no = pdf_file.get('bl_no') # 获取提单号
if bl_no and select_bl_no == bl_no:
......@@ -198,8 +203,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
Sync last mile POD information # 同步尾程POD信息
:param processed_files: 处理后的文件数组
"""
# 回写PDF文件到清关文件
self._write_pdf_file(processed_files)
# return False#测试 先不同步
# 同步尾程POD信息
for file_info in processed_files:
......@@ -220,7 +223,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
:return: 处理后的文件数组(包含处理后的PDF数据)
"""
updated_files = []
for file_info in processed_files:
if not file_info['bl']:
updated_files.append(file_info)
......@@ -241,12 +244,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
if processed_pdf:
# 将处理后的PDF转换回base64
processed_file_data = base64.b64encode(processed_pdf).decode('utf-8')
# 更新文件信息,使用处理后的PDF数据
updated_file_info = file_info.copy()
updated_file_info['file_data'] = processed_file_data
updated_files.append(updated_file_info)
return updated_files
def _process_pdf_with_ocr(self, pdf_data, bl_no):
......@@ -256,12 +259,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
:param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据
"""
import os
import fitz # PyMuPDF
import numpy as np
from PIL import Image
import pytesseract
# 尝试导入OpenCV,如果失败则使用PIL替代
try:
import cv2
......@@ -271,7 +273,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
cv2_available = False
_logger.warning(f"OpenCV不可用,使用PIL替代: {str(e)}")
_logger.warning("建议安装OpenCV: pip install opencv-python-headless")
# 设置Tesseract路径
self._setup_tesseract_path()
# 打开PDF文档
......@@ -280,17 +282,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
processed_pages = 0
detected_texts = []
all_recognized_texts = []
result_data=False
result_data = False
# 处理每一页(完全按照HTML逻辑)
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
# _logger.info(f"正在OCR识别第{page_num + 1}页")
# 将页面转换为图像(与HTML完全一致)
mat = fitz.Matrix(2.0, 2.0) # 提高分辨率
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
# 转换为PIL图像(兼容OpenCV和PIL)
if cv2_available:
# 使用OpenCV处理
......@@ -302,28 +304,28 @@ class BatchGetPodInfoWizard(models.TransientModel):
pil_img = Image.open(io.BytesIO(img_data))
if pil_img.mode != 'RGB':
pil_img = pil_img.convert('RGB')
# OCR配置(与HTML完全一致)
config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
# 使用Tesseract进行OCR识别
try:
ocr_data = pytesseract.image_to_data(
pil_img,
output_type=pytesseract.Output.DICT,
pil_img,
output_type=pytesseract.Output.DICT,
lang='eng',
config=config
)
except Exception as e:
_logger.error(f"OCR识别失败: {str(e)}")
continue
# 处理OCR结果(与HTML完全一致)
page_width = page.rect.width
page_height = page.rect.height
viewport_width = pil_img.width
viewport_height = pil_img.height
# 存储所有识别到的文字
page_recognized_texts = []
for i in range(len(ocr_data['text'])):
......@@ -340,23 +342,23 @@ class BatchGetPodInfoWizard(models.TransientModel):
},
'page': page_num
})
all_recognized_texts.extend(page_recognized_texts)
# 查找目标文字(完全按照HTML逻辑)
page_texts = self._find_target_texts(
page_recognized_texts,
page_num,
viewport_width,
viewport_height,
page_width,
page_recognized_texts,
page_num,
viewport_width,
viewport_height,
page_width,
page_height
)
detected_texts.extend(page_texts)
# 根据OCR结果删除文字(完全按照HTML逻辑)
if page_texts:
for text_info in page_texts:
# 超精确删除模式(与HTML完全一致)
rect = {
......@@ -365,33 +367,32 @@ class BatchGetPodInfoWizard(models.TransientModel):
'width': text_info['width'],
'height': text_info['height']
}
# 绘制白色矩形覆盖文字
try:
page.draw_rect(
fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
color=(1, 1, 1), # 白色
fill=(1, 1, 1) # 填充白色
fill=(1, 1, 1) # 填充白色
)
# _logger.info(f"删除目标文字: {text_info['text']}")
total_rectangles += 1
except Exception as e:
_logger.error(f"删除失败: {str(e)}")
processed_pages += 1
# 保存处理后的PDF
try:
output_buffer = io.BytesIO()
pdf_document.save(output_buffer, garbage=4, deflate=True, clean=True)
pdf_document.close()
result_data = output_buffer.getvalue()
output_buffer.close()
except Exception as e:
_logger.error(f"PDF保存失败: {str(e)}")
pdf_document.close()
return result_data
def _setup_tesseract_path(self):
......@@ -402,7 +403,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
import pytesseract
import os
import shutil
if os.name == 'nt': # Windows
# Windows常见路径
possible_paths = [
......@@ -427,26 +428,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
'/opt/homebrew/bin/tesseract', # macOS M1
'/usr/local/Cellar/tesseract/*/bin/tesseract' # macOS Homebrew
]
for path in possible_paths:
if os.path.exists(path):
pytesseract.pytesseract.tesseract_cmd = path
break
# 检查语言数据文件
self._check_tessdata_files()
def _check_tessdata_files(self):
"""
Check if tessdata files exist # 检查tessdata文件是否存在
"""
import pytesseract
import os
# 获取Tesseract数据路径
tesseract_cmd = pytesseract.pytesseract.tesseract_cmd
tessdata_dir = os.path.dirname(tesseract_cmd) + '/tessdata'
# 如果tessdata目录不存在,尝试其他常见位置
if not os.path.exists(tessdata_dir):
possible_tessdata_dirs = [
......@@ -455,12 +456,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
'/opt/homebrew/share/tessdata', # macOS M1
'/usr/local/Cellar/tesseract/*/share/tessdata' # macOS Homebrew
]
for tessdata_path in possible_tessdata_dirs:
if os.path.exists(tessdata_path):
tessdata_dir = tessdata_path
break
# 检查英语语言数据文件
eng_data = os.path.join(tessdata_dir, 'eng.traineddata')
if os.path.exists(eng_data):
......@@ -473,14 +474,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
"""
# 定义目标文字和排除文字(与HTML文件完全一致)
TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD','UCLINKLOGISITICSLTD']
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
'UCLINKLOGISITICSLTD']
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4',
'Page 5 of 5']
found_texts = []
for word in words:
text = word['text'].strip().upper()
# 首先检查是否在排除列表中(与HTML完全一致)
is_excluded = False
for exclude_text in EXCLUDE_TEXTS:
......@@ -488,21 +491,21 @@ class BatchGetPodInfoWizard(models.TransientModel):
if exclude_upper in text or text in exclude_upper:
is_excluded = True
break
# 检查页码模式(Page X of Y)(与HTML完全一致)
import re
if not is_excluded and (re.match(r'^PAGE\s+\d+\s+OF\s+\d+$', text) or re.match(r'^\d+\s*/\s*\d+$', text)):
is_excluded = True
if is_excluded:
# _logger.info(f"排除文字: {word['text']}")
continue
# 检查目标文字匹配(与HTML完全一致)
for target_text in TARGET_TEXTS:
target_upper = target_text.upper()
is_match = False
if target_text == 'AGN':
# AGN使用精确匹配
is_match = text == 'AGN'
......@@ -512,27 +515,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
else:
# 其他文字使用包含匹配,但更严格(与HTML完全一致)
is_match = target_upper in text and \
'AIR' not in text and \
'EQK' not in text and \
'ARN' not in text
'AIR' not in text and \
'EQK' not in text and \
'ARN' not in text
# 如果精确匹配失败,尝试模糊匹配(与HTML完全一致)
if not is_match and target_text != 'AGN' and target_text != 'LTD':
is_match = self._fuzzy_match(text, target_upper)
if is_match:
# 坐标转换(适配PyMuPDF坐标系统)
scale_x = page_width / viewport_width
scale_y = page_height / viewport_height
# PyMuPDF使用左下角为原点,OCR使用左上角为原点
# 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置
converted_x = word['bbox']['x0'] * scale_x
converted_y = (word['bbox']['y0'] * scale_y) # 直接使用OCR的Y坐标
converted_width = (word['bbox']['x1'] - word['bbox']['x0']) * scale_x
converted_height = (word['bbox']['y1'] - word['bbox']['y0']) * scale_y
found_texts.append({
'text': target_text,
'full_text': word['text'],
......@@ -545,7 +547,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
'type': 'agn' if target_text == 'AGN' else 'uclink'
})
break
return found_texts
def _fuzzy_match(self, str1, str2):
......@@ -555,14 +557,14 @@ class BatchGetPodInfoWizard(models.TransientModel):
import re
s1 = re.sub(r'[^A-Z]', '', str1)
s2 = re.sub(r'[^A-Z]', '', str2)
if len(s1) == 0 or len(s2) == 0:
return False
# 计算编辑距离
distance = self._levenshtein_distance(s1, s2)
max_len = max(len(s1), len(s2))
# 如果编辑距离小于等于最大长度的1/3,认为匹配
return distance <= max_len / 3
......@@ -572,10 +574,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
if len(s1) < len(s2):
return self._levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = list(range(len(s2) + 1))
for i, c1 in enumerate(s1):
current_row = [i + 1]
......@@ -585,7 +587,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def _save_and_return_download_link(self, file_info):
......@@ -598,22 +600,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 获取处理后的PDF数据
file_data = file_info.get('file_data', '')
file_name = file_info.get('file_name', 'processed.pdf')
if not file_data:
raise ValidationError(_('No processed file data available'))#提示:没有处理后的文件数据
raise ValidationError(_('No processed file data available')) # 提示:没有处理后的文件数据
# 解码base64数据
if isinstance(file_data, str):
pdf_binary = base64.b64decode(file_data)
else:
pdf_binary = file_data
# 确保PDF数据有效
if not pdf_binary.startswith(b'%PDF-'):
_logger.error(f"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}")
_logger.error(f"文件头(hex): {pdf_binary[:20].hex()}")
_logger.error(f"文件大小: {len(pdf_binary)}字节")
# 尝试修复:如果是base64字符串被错误处理
if isinstance(file_data, str) and len(file_data) > 100:
_logger.info("尝试重新解码base64数据...")
......@@ -631,7 +633,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
else:
raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
# 验证PDF可以打开
try:
import fitz
......@@ -641,7 +643,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
except Exception as e:
_logger.error(f"PDF验证失败: {str(e)}")
raise ValidationError(_('Invalid PDF data for saving: cannot open PDF - %s') % str(e))
# 创建附件记录
attachment = self.env['ir.attachment'].create({
'name': f'processed_{file_name}',
......@@ -651,16 +653,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
'res_model': 'batch.get.pod.info.wizard',
'res_id': self.id,
})
_logger.info(f"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")
_logger.info(
f"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")
# 返回下载动作
return {
'type': 'ir.actions.act_url',
'url': f'/web/content/{attachment.id}?download=true',
'target': 'new',
}
except Exception as e:
_logger.error(f"保存PDF附件失败: {str(e)}")
raise ValidationError(_('Failed to save PDF attachment: %s') % str(e))
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论