提交 8ca88357 作者: 贺阳

没有勾选同步或没有勾选涂抹文字也需要创建附件信息

上级 b127cb4b
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import base64
import io import io
import logging import logging
import base64
import requests import requests
from odoo import models, fields, _ from odoo import models, fields, _
from odoo.exceptions import ValidationError from odoo.exceptions import ValidationError
...@@ -56,16 +57,19 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -56,16 +57,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 英文提示 # 英文提示
raise ValidationError(_('%s bill of loading cannot find release note file') % ( raise ValidationError(_('%s bill of loading cannot find release note file') % (
', '.join([bl.bl_no for bl in error_bl]))) # xx提单无法找到release note文件 ', '.join([bl.bl_no for bl in error_bl]))) # xx提单无法找到release note文件
if self.remove_specified_text: if self.remove_specified_text:
processed_files = self._remove_specified_text(processed_files) processed_files = self._remove_specified_text(processed_files)
# 用于测试的:保存处理后的PDF并返回下载链接 # 用于测试的:保存处理后的PDF并返回下载链接
# if processed_files and processed_files[0].get('file_data'): # if processed_files and processed_files[0].get('file_data'):
# return self._save_and_return_download_link(processed_files[0]) # return self._save_and_return_download_link(processed_files[0])
# 回写到附件信息
if processed_files:
# 回写PDF文件到清关文件
self._write_pdf_file(processed_files)
# 再同步和回写 # 再同步和回写
if self.sync_last_mile_pod: if self.sync_last_mile_pod and processed_files:
self._sync_last_mile_pod(processed_files) self._sync_last_mile_pod(processed_files)
# 写一个方法调接口获取提单pdf文件 # 写一个方法调接口获取提单pdf文件
...@@ -76,12 +80,13 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -76,12 +80,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 获取当前选中的提单对象 # 获取当前选中的提单对象
bl_objs = self.get_order() bl_objs = self.get_order()
bill_numbers = [self.env['common.common'].sudo().process_match_str(bl.bl_no) for bl in bl_objs] bill_numbers = [self.env['common.common'].sudo().process_match_str(bl.bl_no) for bl in bl_objs]
# 调用API获取PDF文件 # 调用API获取PDF文件
api_url = self.env['ir.config_parameter'].sudo().get_param('last_mile_pod_api_url','http://172.104.52.150:7002') api_url = self.env['ir.config_parameter'].sudo().get_param('last_mile_pod_api_url',
'http://172.104.52.150:7002')
if not api_url: if not api_url:
raise ValidationError(_('API URL not configured')) raise ValidationError(_('API URL not configured'))
# 构建请求数据 # 构建请求数据
request_data = { request_data = {
"bill_numbers": bill_numbers "bill_numbers": bill_numbers
...@@ -92,22 +97,22 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -92,22 +97,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
headers={'Content-Type': 'application/json'}, headers={'Content-Type': 'application/json'},
json=request_data json=request_data
) )
if response.status_code == 200: if response.status_code == 200:
result = response.json() result = response.json()
# 检查API响应结构 # 检查API响应结构
if not result: if not result:
raise ValidationError(_('API returned empty response')) raise ValidationError(_('API returned empty response'))
if not result.get('success'): if not result.get('success'):
error_msg = result.get('message', 'Unknown error') error_msg = result.get('message', 'Unknown error')
raise ValidationError(_('API returned error: %s') % error_msg) raise ValidationError(_('API returned error: %s') % error_msg)
# 处理结果数据 # 处理结果数据
results = result.get('results', []) results = result.get('results', [])
if not results: if not results:
raise ValidationError(_('No PDF files found in API response'))#提示:API调用成功,但没有PDF文件 raise ValidationError(_('No PDF files found in API response')) # 提示:API调用成功,但没有PDF文件
# 构建PDF文件数组 # 构建PDF文件数组
pdf_file_arr = [] pdf_file_arr = []
for result_item in results: for result_item in results:
...@@ -116,11 +121,11 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -116,11 +121,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
bill_number = result_item.get('bill_number') bill_number = result_item.get('bill_number')
filename = result_item.get('filename') filename = result_item.get('filename')
base64_data = result_item.get('base64') base64_data = result_item.get('base64')
if not all([bill_number, filename, base64_data]): if not all([bill_number, filename, base64_data]):
_logger.warning(f"跳过无效的PDF文件项: {result_item}") _logger.warning(f"跳过无效的PDF文件项: {result_item}")
continue continue
# 验证PDF文件 # 验证PDF文件
try: try:
pdf_binary = base64.b64decode(base64_data) pdf_binary = base64.b64decode(base64_data)
...@@ -140,7 +145,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -140,7 +145,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
return pdf_file_arr return pdf_file_arr
else: else:
raise ValidationError(_('Failed to get PDF file from API: %s') % response.text) raise ValidationError(_('Failed to get PDF file from API: %s') % response.text)
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
raise ValidationError(_('API request failed: %s') % str(e)) raise ValidationError(_('API request failed: %s') % str(e))
...@@ -176,7 +181,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -176,7 +181,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
select_bl_no = self.env['common.common'].sudo().process_match_str(bl.bl_no) select_bl_no = self.env['common.common'].sudo().process_match_str(bl.bl_no)
for pdf_file in pdf_file_arr: for pdf_file in pdf_file_arr:
# 尝试不同的字段名(API可能使用不同的字段名) # 尝试不同的字段名(API可能使用不同的字段名)
file_name = pdf_file.get('file_name' ) # 获取文件名 file_name = pdf_file.get('file_name') # 获取文件名
file_data = pdf_file.get('file_data') # 获取文件数据 file_data = pdf_file.get('file_data') # 获取文件数据
bl_no = pdf_file.get('bl_no') # 获取提单号 bl_no = pdf_file.get('bl_no') # 获取提单号
if bl_no and select_bl_no == bl_no: if bl_no and select_bl_no == bl_no:
...@@ -198,8 +203,6 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -198,8 +203,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
Sync last mile POD information # 同步尾程POD信息 Sync last mile POD information # 同步尾程POD信息
:param processed_files: 处理后的文件数组 :param processed_files: 处理后的文件数组
""" """
# 回写PDF文件到清关文件
self._write_pdf_file(processed_files)
# return False#测试 先不同步 # return False#测试 先不同步
# 同步尾程POD信息 # 同步尾程POD信息
for file_info in processed_files: for file_info in processed_files:
...@@ -220,7 +223,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -220,7 +223,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
:return: 处理后的文件数组(包含处理后的PDF数据) :return: 处理后的文件数组(包含处理后的PDF数据)
""" """
updated_files = [] updated_files = []
for file_info in processed_files: for file_info in processed_files:
if not file_info['bl']: if not file_info['bl']:
updated_files.append(file_info) updated_files.append(file_info)
...@@ -241,12 +244,12 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -241,12 +244,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
if processed_pdf: if processed_pdf:
# 将处理后的PDF转换回base64 # 将处理后的PDF转换回base64
processed_file_data = base64.b64encode(processed_pdf).decode('utf-8') processed_file_data = base64.b64encode(processed_pdf).decode('utf-8')
# 更新文件信息,使用处理后的PDF数据 # 更新文件信息,使用处理后的PDF数据
updated_file_info = file_info.copy() updated_file_info = file_info.copy()
updated_file_info['file_data'] = processed_file_data updated_file_info['file_data'] = processed_file_data
updated_files.append(updated_file_info) updated_files.append(updated_file_info)
return updated_files return updated_files
def _process_pdf_with_ocr(self, pdf_data, bl_no): def _process_pdf_with_ocr(self, pdf_data, bl_no):
...@@ -256,12 +259,11 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -256,12 +259,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
:param bl_no: 提单号(用于日志) :param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据 :return: 处理后的PDF二进制数据
""" """
import os
import fitz # PyMuPDF import fitz # PyMuPDF
import numpy as np import numpy as np
from PIL import Image from PIL import Image
import pytesseract import pytesseract
# 尝试导入OpenCV,如果失败则使用PIL替代 # 尝试导入OpenCV,如果失败则使用PIL替代
try: try:
import cv2 import cv2
...@@ -271,7 +273,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -271,7 +273,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
cv2_available = False cv2_available = False
_logger.warning(f"OpenCV不可用,使用PIL替代: {str(e)}") _logger.warning(f"OpenCV不可用,使用PIL替代: {str(e)}")
_logger.warning("建议安装OpenCV: pip install opencv-python-headless") _logger.warning("建议安装OpenCV: pip install opencv-python-headless")
# 设置Tesseract路径 # 设置Tesseract路径
self._setup_tesseract_path() self._setup_tesseract_path()
# 打开PDF文档 # 打开PDF文档
...@@ -280,17 +282,17 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -280,17 +282,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
processed_pages = 0 processed_pages = 0
detected_texts = [] detected_texts = []
all_recognized_texts = [] all_recognized_texts = []
result_data=False result_data = False
# 处理每一页(完全按照HTML逻辑) # 处理每一页(完全按照HTML逻辑)
for page_num in range(len(pdf_document)): for page_num in range(len(pdf_document)):
page = pdf_document[page_num] page = pdf_document[page_num]
# _logger.info(f"正在OCR识别第{page_num + 1}页") # _logger.info(f"正在OCR识别第{page_num + 1}页")
# 将页面转换为图像(与HTML完全一致) # 将页面转换为图像(与HTML完全一致)
mat = fitz.Matrix(2.0, 2.0) # 提高分辨率 mat = fitz.Matrix(2.0, 2.0) # 提高分辨率
pix = page.get_pixmap(matrix=mat) pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png") img_data = pix.tobytes("png")
# 转换为PIL图像(兼容OpenCV和PIL) # 转换为PIL图像(兼容OpenCV和PIL)
if cv2_available: if cv2_available:
# 使用OpenCV处理 # 使用OpenCV处理
...@@ -302,28 +304,28 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -302,28 +304,28 @@ class BatchGetPodInfoWizard(models.TransientModel):
pil_img = Image.open(io.BytesIO(img_data)) pil_img = Image.open(io.BytesIO(img_data))
if pil_img.mode != 'RGB': if pil_img.mode != 'RGB':
pil_img = pil_img.convert('RGB') pil_img = pil_img.convert('RGB')
# OCR配置(与HTML完全一致) # OCR配置(与HTML完全一致)
config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0' config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
# 使用Tesseract进行OCR识别 # 使用Tesseract进行OCR识别
try: try:
ocr_data = pytesseract.image_to_data( ocr_data = pytesseract.image_to_data(
pil_img, pil_img,
output_type=pytesseract.Output.DICT, output_type=pytesseract.Output.DICT,
lang='eng', lang='eng',
config=config config=config
) )
except Exception as e: except Exception as e:
_logger.error(f"OCR识别失败: {str(e)}") _logger.error(f"OCR识别失败: {str(e)}")
continue continue
# 处理OCR结果(与HTML完全一致) # 处理OCR结果(与HTML完全一致)
page_width = page.rect.width page_width = page.rect.width
page_height = page.rect.height page_height = page.rect.height
viewport_width = pil_img.width viewport_width = pil_img.width
viewport_height = pil_img.height viewport_height = pil_img.height
# 存储所有识别到的文字 # 存储所有识别到的文字
page_recognized_texts = [] page_recognized_texts = []
for i in range(len(ocr_data['text'])): for i in range(len(ocr_data['text'])):
...@@ -340,23 +342,23 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -340,23 +342,23 @@ class BatchGetPodInfoWizard(models.TransientModel):
}, },
'page': page_num 'page': page_num
}) })
all_recognized_texts.extend(page_recognized_texts) all_recognized_texts.extend(page_recognized_texts)
# 查找目标文字(完全按照HTML逻辑) # 查找目标文字(完全按照HTML逻辑)
page_texts = self._find_target_texts( page_texts = self._find_target_texts(
page_recognized_texts, page_recognized_texts,
page_num, page_num,
viewport_width, viewport_width,
viewport_height, viewport_height,
page_width, page_width,
page_height page_height
) )
detected_texts.extend(page_texts) detected_texts.extend(page_texts)
# 根据OCR结果删除文字(完全按照HTML逻辑) # 根据OCR结果删除文字(完全按照HTML逻辑)
if page_texts: if page_texts:
for text_info in page_texts: for text_info in page_texts:
# 超精确删除模式(与HTML完全一致) # 超精确删除模式(与HTML完全一致)
rect = { rect = {
...@@ -365,33 +367,32 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -365,33 +367,32 @@ class BatchGetPodInfoWizard(models.TransientModel):
'width': text_info['width'], 'width': text_info['width'],
'height': text_info['height'] 'height': text_info['height']
} }
# 绘制白色矩形覆盖文字 # 绘制白色矩形覆盖文字
try: try:
page.draw_rect( page.draw_rect(
fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']), fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
color=(1, 1, 1), # 白色 color=(1, 1, 1), # 白色
fill=(1, 1, 1) # 填充白色 fill=(1, 1, 1) # 填充白色
) )
# _logger.info(f"删除目标文字: {text_info['text']}") # _logger.info(f"删除目标文字: {text_info['text']}")
total_rectangles += 1 total_rectangles += 1
except Exception as e: except Exception as e:
_logger.error(f"删除失败: {str(e)}") _logger.error(f"删除失败: {str(e)}")
processed_pages += 1 processed_pages += 1
# 保存处理后的PDF # 保存处理后的PDF
try: try:
output_buffer = io.BytesIO() output_buffer = io.BytesIO()
pdf_document.save(output_buffer, garbage=4, deflate=True, clean=True) pdf_document.save(output_buffer, garbage=4, deflate=True, clean=True)
pdf_document.close() pdf_document.close()
result_data = output_buffer.getvalue() result_data = output_buffer.getvalue()
output_buffer.close() output_buffer.close()
except Exception as e: except Exception as e:
_logger.error(f"PDF保存失败: {str(e)}") _logger.error(f"PDF保存失败: {str(e)}")
pdf_document.close() pdf_document.close()
return result_data return result_data
def _setup_tesseract_path(self): def _setup_tesseract_path(self):
...@@ -402,7 +403,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -402,7 +403,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
import pytesseract import pytesseract
import os import os
import shutil import shutil
if os.name == 'nt': # Windows if os.name == 'nt': # Windows
# Windows常见路径 # Windows常见路径
possible_paths = [ possible_paths = [
...@@ -427,26 +428,26 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -427,26 +428,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
'/opt/homebrew/bin/tesseract', # macOS M1 '/opt/homebrew/bin/tesseract', # macOS M1
'/usr/local/Cellar/tesseract/*/bin/tesseract' # macOS Homebrew '/usr/local/Cellar/tesseract/*/bin/tesseract' # macOS Homebrew
] ]
for path in possible_paths: for path in possible_paths:
if os.path.exists(path): if os.path.exists(path):
pytesseract.pytesseract.tesseract_cmd = path pytesseract.pytesseract.tesseract_cmd = path
break break
# 检查语言数据文件 # 检查语言数据文件
self._check_tessdata_files() self._check_tessdata_files()
def _check_tessdata_files(self): def _check_tessdata_files(self):
""" """
Check if tessdata files exist # 检查tessdata文件是否存在 Check if tessdata files exist # 检查tessdata文件是否存在
""" """
import pytesseract import pytesseract
import os import os
# 获取Tesseract数据路径 # 获取Tesseract数据路径
tesseract_cmd = pytesseract.pytesseract.tesseract_cmd tesseract_cmd = pytesseract.pytesseract.tesseract_cmd
tessdata_dir = os.path.dirname(tesseract_cmd) + '/tessdata' tessdata_dir = os.path.dirname(tesseract_cmd) + '/tessdata'
# 如果tessdata目录不存在,尝试其他常见位置 # 如果tessdata目录不存在,尝试其他常见位置
if not os.path.exists(tessdata_dir): if not os.path.exists(tessdata_dir):
possible_tessdata_dirs = [ possible_tessdata_dirs = [
...@@ -455,12 +456,12 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -455,12 +456,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
'/opt/homebrew/share/tessdata', # macOS M1 '/opt/homebrew/share/tessdata', # macOS M1
'/usr/local/Cellar/tesseract/*/share/tessdata' # macOS Homebrew '/usr/local/Cellar/tesseract/*/share/tessdata' # macOS Homebrew
] ]
for tessdata_path in possible_tessdata_dirs: for tessdata_path in possible_tessdata_dirs:
if os.path.exists(tessdata_path): if os.path.exists(tessdata_path):
tessdata_dir = tessdata_path tessdata_dir = tessdata_path
break break
# 检查英语语言数据文件 # 检查英语语言数据文件
eng_data = os.path.join(tessdata_dir, 'eng.traineddata') eng_data = os.path.join(tessdata_dir, 'eng.traineddata')
if os.path.exists(eng_data): if os.path.exists(eng_data):
...@@ -473,14 +474,16 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -473,14 +474,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字 Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
""" """
# 定义目标文字和排除文字(与HTML文件完全一致) # 定义目标文字和排除文字(与HTML文件完全一致)
TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD','UCLINKLOGISITICSLTD'] TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5'] 'UCLINKLOGISITICSLTD']
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4',
'Page 5 of 5']
found_texts = [] found_texts = []
for word in words: for word in words:
text = word['text'].strip().upper() text = word['text'].strip().upper()
# 首先检查是否在排除列表中(与HTML完全一致) # 首先检查是否在排除列表中(与HTML完全一致)
is_excluded = False is_excluded = False
for exclude_text in EXCLUDE_TEXTS: for exclude_text in EXCLUDE_TEXTS:
...@@ -488,21 +491,21 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -488,21 +491,21 @@ class BatchGetPodInfoWizard(models.TransientModel):
if exclude_upper in text or text in exclude_upper: if exclude_upper in text or text in exclude_upper:
is_excluded = True is_excluded = True
break break
# 检查页码模式(Page X of Y)(与HTML完全一致) # 检查页码模式(Page X of Y)(与HTML完全一致)
import re import re
if not is_excluded and (re.match(r'^PAGE\s+\d+\s+OF\s+\d+$', text) or re.match(r'^\d+\s*/\s*\d+$', text)): if not is_excluded and (re.match(r'^PAGE\s+\d+\s+OF\s+\d+$', text) or re.match(r'^\d+\s*/\s*\d+$', text)):
is_excluded = True is_excluded = True
if is_excluded: if is_excluded:
# _logger.info(f"排除文字: {word['text']}") # _logger.info(f"排除文字: {word['text']}")
continue continue
# 检查目标文字匹配(与HTML完全一致) # 检查目标文字匹配(与HTML完全一致)
for target_text in TARGET_TEXTS: for target_text in TARGET_TEXTS:
target_upper = target_text.upper() target_upper = target_text.upper()
is_match = False is_match = False
if target_text == 'AGN': if target_text == 'AGN':
# AGN使用精确匹配 # AGN使用精确匹配
is_match = text == 'AGN' is_match = text == 'AGN'
...@@ -512,27 +515,26 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -512,27 +515,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
else: else:
# 其他文字使用包含匹配,但更严格(与HTML完全一致) # 其他文字使用包含匹配,但更严格(与HTML完全一致)
is_match = target_upper in text and \ is_match = target_upper in text and \
'AIR' not in text and \ 'AIR' not in text and \
'EQK' not in text and \ 'EQK' not in text and \
'ARN' not in text 'ARN' not in text
# 如果精确匹配失败,尝试模糊匹配(与HTML完全一致) # 如果精确匹配失败,尝试模糊匹配(与HTML完全一致)
if not is_match and target_text != 'AGN' and target_text != 'LTD': if not is_match and target_text != 'AGN' and target_text != 'LTD':
is_match = self._fuzzy_match(text, target_upper) is_match = self._fuzzy_match(text, target_upper)
if is_match: if is_match:
# 坐标转换(适配PyMuPDF坐标系统) # 坐标转换(适配PyMuPDF坐标系统)
scale_x = page_width / viewport_width scale_x = page_width / viewport_width
scale_y = page_height / viewport_height scale_y = page_height / viewport_height
# PyMuPDF使用左下角为原点,OCR使用左上角为原点 # PyMuPDF使用左下角为原点,OCR使用左上角为原点
# 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置 # 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置
converted_x = word['bbox']['x0'] * scale_x converted_x = word['bbox']['x0'] * scale_x
converted_y = (word['bbox']['y0'] * scale_y) # 直接使用OCR的Y坐标 converted_y = (word['bbox']['y0'] * scale_y) # 直接使用OCR的Y坐标
converted_width = (word['bbox']['x1'] - word['bbox']['x0']) * scale_x converted_width = (word['bbox']['x1'] - word['bbox']['x0']) * scale_x
converted_height = (word['bbox']['y1'] - word['bbox']['y0']) * scale_y converted_height = (word['bbox']['y1'] - word['bbox']['y0']) * scale_y
found_texts.append({ found_texts.append({
'text': target_text, 'text': target_text,
'full_text': word['text'], 'full_text': word['text'],
...@@ -545,7 +547,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -545,7 +547,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
'type': 'agn' if target_text == 'AGN' else 'uclink' 'type': 'agn' if target_text == 'AGN' else 'uclink'
}) })
break break
return found_texts return found_texts
def _fuzzy_match(self, str1, str2): def _fuzzy_match(self, str1, str2):
...@@ -555,14 +557,14 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -555,14 +557,14 @@ class BatchGetPodInfoWizard(models.TransientModel):
import re import re
s1 = re.sub(r'[^A-Z]', '', str1) s1 = re.sub(r'[^A-Z]', '', str1)
s2 = re.sub(r'[^A-Z]', '', str2) s2 = re.sub(r'[^A-Z]', '', str2)
if len(s1) == 0 or len(s2) == 0: if len(s1) == 0 or len(s2) == 0:
return False return False
# 计算编辑距离 # 计算编辑距离
distance = self._levenshtein_distance(s1, s2) distance = self._levenshtein_distance(s1, s2)
max_len = max(len(s1), len(s2)) max_len = max(len(s1), len(s2))
# 如果编辑距离小于等于最大长度的1/3,认为匹配 # 如果编辑距离小于等于最大长度的1/3,认为匹配
return distance <= max_len / 3 return distance <= max_len / 3
...@@ -572,10 +574,10 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -572,10 +574,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
""" """
if len(s1) < len(s2): if len(s1) < len(s2):
return self._levenshtein_distance(s2, s1) return self._levenshtein_distance(s2, s1)
if len(s2) == 0: if len(s2) == 0:
return len(s1) return len(s1)
previous_row = list(range(len(s2) + 1)) previous_row = list(range(len(s2) + 1))
for i, c1 in enumerate(s1): for i, c1 in enumerate(s1):
current_row = [i + 1] current_row = [i + 1]
...@@ -585,7 +587,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -585,7 +587,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
substitutions = previous_row[j] + (c1 != c2) substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions)) current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row previous_row = current_row
return previous_row[-1] return previous_row[-1]
def _save_and_return_download_link(self, file_info): def _save_and_return_download_link(self, file_info):
...@@ -598,22 +600,22 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -598,22 +600,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 获取处理后的PDF数据 # 获取处理后的PDF数据
file_data = file_info.get('file_data', '') file_data = file_info.get('file_data', '')
file_name = file_info.get('file_name', 'processed.pdf') file_name = file_info.get('file_name', 'processed.pdf')
if not file_data: if not file_data:
raise ValidationError(_('No processed file data available'))#提示:没有处理后的文件数据 raise ValidationError(_('No processed file data available')) # 提示:没有处理后的文件数据
# 解码base64数据 # 解码base64数据
if isinstance(file_data, str): if isinstance(file_data, str):
pdf_binary = base64.b64decode(file_data) pdf_binary = base64.b64decode(file_data)
else: else:
pdf_binary = file_data pdf_binary = file_data
# 确保PDF数据有效 # 确保PDF数据有效
if not pdf_binary.startswith(b'%PDF-'): if not pdf_binary.startswith(b'%PDF-'):
_logger.error(f"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}") _logger.error(f"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}")
_logger.error(f"文件头(hex): {pdf_binary[:20].hex()}") _logger.error(f"文件头(hex): {pdf_binary[:20].hex()}")
_logger.error(f"文件大小: {len(pdf_binary)}字节") _logger.error(f"文件大小: {len(pdf_binary)}字节")
# 尝试修复:如果是base64字符串被错误处理 # 尝试修复:如果是base64字符串被错误处理
if isinstance(file_data, str) and len(file_data) > 100: if isinstance(file_data, str) and len(file_data) > 100:
_logger.info("尝试重新解码base64数据...") _logger.info("尝试重新解码base64数据...")
...@@ -631,7 +633,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -631,7 +633,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format')) raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
else: else:
raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format')) raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))
# 验证PDF可以打开 # 验证PDF可以打开
try: try:
import fitz import fitz
...@@ -641,7 +643,7 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -641,7 +643,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
except Exception as e: except Exception as e:
_logger.error(f"PDF验证失败: {str(e)}") _logger.error(f"PDF验证失败: {str(e)}")
raise ValidationError(_('Invalid PDF data for saving: cannot open PDF - %s') % str(e)) raise ValidationError(_('Invalid PDF data for saving: cannot open PDF - %s') % str(e))
# 创建附件记录 # 创建附件记录
attachment = self.env['ir.attachment'].create({ attachment = self.env['ir.attachment'].create({
'name': f'processed_{file_name}', 'name': f'processed_{file_name}',
...@@ -651,16 +653,17 @@ class BatchGetPodInfoWizard(models.TransientModel): ...@@ -651,16 +653,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
'res_model': 'batch.get.pod.info.wizard', 'res_model': 'batch.get.pod.info.wizard',
'res_id': self.id, 'res_id': self.id,
}) })
_logger.info(f"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}") _logger.info(
f"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")
# 返回下载动作 # 返回下载动作
return { return {
'type': 'ir.actions.act_url', 'type': 'ir.actions.act_url',
'url': f'/web/content/{attachment.id}?download=true', 'url': f'/web/content/{attachment.id}?download=true',
'target': 'new', 'target': 'new',
} }
except Exception as e: except Exception as e:
_logger.error(f"保存PDF附件失败: {str(e)}") _logger.error(f"保存PDF附件失败: {str(e)}")
raise ValidationError(_('Failed to save PDF attachment: %s') % str(e)) raise ValidationError(_('Failed to save PDF attachment: %s') % str(e))
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论