提交 ee951ff9 authored 作者: 贺阳's avatar 贺阳

不调接口的测试

上级 e45ced44
......@@ -471,7 +471,7 @@
<field name="model_id" ref="model_cc_bl"/>
<field name="binding_model_id" ref="model_cc_bl"/>
<field name="state">code</field>
<field name="binding_view_types">list</field>
<field name="binding_view_types">list,form</field>
<field name="groups_id" eval="[(4, ref('ccs_base.group_clearance_of_customs_user'))]"/>
<field name="code">
if records:
......
# -*- coding: utf-8 -*-
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import io
import logging
import base64
import requests
from odoo import models, fields, _
from odoo.exceptions import ValidationError
......@@ -40,11 +41,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
Confirm operation # 确认操作
"""
try:
bl_objs = self.get_order()
# 调用接口获取提单pdf文件
pdf_file_arr = self._get_pdf_file_arr()
# 调用接口获取提单pdf文件
# pdf_file_arr = self._get_pdf_file_arr()
pdf_file_arr = self._get_pdf_file_arr_test()
if not pdf_file_arr:
raise ValidationError(_('No PDF files found'))#提示:没有获取到PDF文件
# 处理PDF文件,匹配提单对象
processed_files = self._match_bl_by_file_name(pdf_file_arr)
# 把没有匹配到文件的进行提示
......@@ -56,47 +59,197 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 英文提示
raise ValidationError(_('%s bill of loading cannot find release note file') % (
', '.join([bl.bl_no for bl in error_bl]))) # xx提单无法找到release note文件
# 先涂抹指定文字
if self.remove_specified_text:
processed_files = self._remove_specified_text(processed_files)
# 用于测试的:保存处理后的PDF并返回下载链接
# if processed_files and processed_files[0].get('file_data'):
# return self._save_and_return_download_link(processed_files[0])
# 再同步和回写
if self.sync_last_mile_pod:
self._sync_last_mile_pod(processed_files)
# 显示成功消息
return {
'type': 'ir.actions.client',
'tag': 'display_notification',
'params': {
'title': _('Operation Completed'), # 操作完成
'message': _('Successfully processed %d PDF files for %d bill of loadings') % (len(processed_files), len(bl_objs)), # 成功处理了%d个PDF文件,涉及%d个提单
'type': 'success',
}
}
def _get_pdf_file_arr_test(self):
    """
    Build the PDF file array from stored clearance files (test path, no API call).

    For every selected bill of lading the clearance file of the type used
    below is looked up. Its payload is accepted either as raw PDF bytes or
    as base64 (bytes or str), must start with the ``%PDF-`` header, must be
    openable by PyMuPDF, and is then re-encoded as base64 text.

    A record that fails any of these checks is logged and skipped; this
    method does not raise for bad records.

    :return: list of dicts ``{'bl_no', 'file_name', 'file_data'}`` where
        ``file_data`` is a base64 ``str``
    """
    pdf_file_arr = []
    for bl in self.get_order():
        clearance_file = self.env['cc.clearance.file'].sudo().search_clearance_file(
            bl.id, '尾程交接POD(待大包数量和箱号)')
        if not (clearance_file and clearance_file.file):
            _logger.warning(f"未找到清关文件,提单号: {bl.bl_no}")
            continue
        try:
            file_data = clearance_file.file
            if isinstance(file_data, bytes):
                if not file_data.startswith(b'%PDF-'):
                    # Not raw PDF bytes: the field may hold base64 text stored as bytes.
                    try:
                        decoded_data = base64.b64decode(file_data)
                    except Exception as e:
                        # Best effort, same as the str branch: skip this record
                        # instead of aborting the whole batch.
                        _logger.warning(f"尝试base64解码失败,提单号: {bl.bl_no}, 错误: {str(e)}")
                        continue
                    if not decoded_data.startswith(b'%PDF-'):
                        _logger.warning(f"base64解码后仍不是PDF格式,提单号: {bl.bl_no}")
                        continue
                    _logger.info(f"发现base64编码的PDF数据,提单号: {bl.bl_no}")
                    file_data = decoded_data
            elif isinstance(file_data, str):
                try:
                    decoded_data = base64.b64decode(file_data)
                except Exception as e:
                    _logger.warning(f"字符串base64解码失败,提单号: {bl.bl_no}, 错误: {str(e)}")
                    continue
                if not decoded_data.startswith(b'%PDF-'):
                    _logger.warning(f"字符串base64解码后不是PDF格式,提单号: {bl.bl_no}")
                    continue
                _logger.info(f"字符串base64解码成功,是有效PDF,提单号: {bl.bl_no}")
                file_data = decoded_data
            else:
                _logger.warning(f"清关文件数据格式不正确,类型: {type(file_data)},提单号: {bl.bl_no}")
                continue

            # Sanity check: the bytes must be openable as a PDF document.
            try:
                import fitz
                test_doc = fitz.open(stream=file_data, filetype="pdf")
                try:
                    page_count = len(test_doc)
                finally:
                    # Always release the document, even if counting pages fails.
                    test_doc.close()
                _logger.info(f"清关文件PDF验证成功,页数: {page_count},提单号: {bl.bl_no}")
            except Exception as e:
                _logger.warning(f"清关文件PDF无法打开,提单号: {bl.bl_no}, 错误: {str(e)}")
                continue

            # Downstream steps expect base64 text, not raw bytes.
            file_data_base64 = base64.b64encode(file_data).decode('utf-8')
            pdf_file_arr.append({
                'bl_no': self.env['common.common'].sudo().process_match_str(bl.bl_no),
                'file_name': clearance_file.attachment_name or clearance_file.file_name,
                'file_data': file_data_base64
            })
            _logger.info(f"成功添加PDF文件,提单号: {bl.bl_no}, 文件名: {clearance_file.attachment_name or clearance_file.file_name}")
        except Exception as e:
            _logger.error(f"处理清关文件失败,提单号: {bl.bl_no}, 错误: {str(e)}")
            continue
    _logger.info(f"从测试数据获取PDF文件,成功获取{len(pdf_file_arr)}个文件")
    return pdf_file_arr
# 写一个方法调接口获取提单pdf文件
def _get_pdf_file_arr(self):
    """
    Get PDF files from the release-note API.

    Posts the normalised bill numbers of the selected bills of lading to
    ``<api_url>/api/release-notes/pdfs`` and keeps only the returned items
    that are flagged successful, carry all required fields, start with the
    ``%PDF-`` header and can be opened by PyMuPDF.

    :return: list of dicts ``{'bl_no', 'file_name', 'file_data'}``;
        ``file_data`` is the base64 text as returned by the API
    :raises ValidationError: when the URL is not configured, the request
        fails or times out, the response is not HTTP 200 / not valid JSON /
        reports failure, or no returned PDF passes validation
    """
    bl_objs = self.get_order()
    bill_numbers = [self.env['common.common'].sudo().process_match_str(bl.bl_no) for bl in bl_objs]

    # NOTE(review): the fallback URL is a hard-coded IP address; confirm it is
    # meant to ship as the default rather than be required configuration.
    api_url = self.env['ir.config_parameter'].sudo().get_param(
        'last_mile_pod_api_url', 'http://172.104.52.150:7002')
    if not api_url:
        raise ValidationError(_('API URL not configured'))

    request_data = {
        "bill_numbers": bill_numbers
    }
    try:
        response = requests.post(
            # Tolerate a configured URL that ends with a slash.
            f"{api_url.rstrip('/')}/api/release-notes/pdfs",
            headers={'Content-Type': 'application/json'},
            json=request_data,
            # requests waits forever by default; bound the wait so a stalled
            # API cannot block the worker indefinitely.
            timeout=60,
        )
    except requests.exceptions.RequestException as e:
        _logger.error(f"API请求异常: {str(e)}")
        raise ValidationError(_('API request failed: %s') % str(e)) from e

    if response.status_code != 200:
        _logger.error(f"API调用失败,状态码: {response.status_code}")
        _logger.error(f"响应内容: {response.text}")
        raise ValidationError(_('Failed to get PDF file from API: %s') % response.text)

    try:
        result = response.json()
    except ValueError as e:
        # A non-JSON body raises a ValueError subclass in every requests version.
        _logger.error(f"API请求异常: {str(e)}")
        raise ValidationError(_('API request failed: %s') % str(e)) from e

    if not result:
        _logger.error("API返回空响应")
        raise ValidationError(_('API returned empty response'))
    if not result.get('success'):
        error_msg = result.get('message', 'Unknown error')
        _logger.error(f"API返回失败状态: {error_msg}")
        raise ValidationError(_('API returned error: %s') % error_msg)

    results = result.get('results', [])
    if not results:
        _logger.warning("API调用成功,但没有PDF文件")
        raise ValidationError(_('No PDF files found in API response'))

    pdf_file_arr = []
    for result_item in results:
        if not result_item.get('success'):
            continue
        bill_number = result_item.get('bill_number')
        filename = result_item.get('filename')
        base64_data = result_item.get('base64')
        if not all([bill_number, filename, base64_data]):
            _logger.warning(f"跳过无效的PDF文件项: {result_item}")
            continue
        try:
            pdf_binary = base64.b64decode(base64_data)
            if not pdf_binary.startswith(b'%PDF-'):
                _logger.warning(f"API返回的文件不是有效的PDF格式,提单号: {bill_number}")
                continue
            # Sanity check: the bytes must be openable as a PDF document.
            try:
                import fitz
                test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
                try:
                    page_count = len(test_doc)
                finally:
                    test_doc.close()
                _logger.info(f"API PDF验证成功,页数: {page_count},提单号: {bill_number}")
            except Exception as e:
                _logger.warning(f"API PDF文件无法打开,提单号: {bill_number}, 错误: {str(e)}")
                continue
            # Keys are renamed to the shape the matching step reads.
            pdf_file_arr.append({
                'bl_no': bill_number,
                'file_name': filename,
                'file_data': base64_data
            })
            _logger.info(f"成功添加API PDF文件,提单号: {bill_number}, 文件名: {filename}")
        except Exception as e:
            _logger.warning(f"API PDF文件验证失败,提单号: {bill_number}, 错误: {str(e)}")
            continue

    if not pdf_file_arr:
        _logger.error("所有API PDF文件验证都失败")
        raise ValidationError(_('All API PDF files failed validation'))
    _logger.info(f"API调用成功,获取到{len(pdf_file_arr)}个有效PDF文件")
    return pdf_file_arr
def _write_pdf_file(self, processed_files):
"""
......@@ -109,7 +262,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
bl = file_info['bl']
file_name = file_info['file_name']
file_data = file_info['file_data']
try:
# 查找或创建清关文件记录
clearance_file = self.env['cc.clearance.file'].sudo().search_clearance_file(bl.id,
'尾程交接POD(待大包数量和箱号)')
......@@ -128,13 +280,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
'attachment_name': file_name,
'file': file_data
})
except Exception as e:
raise ValidationError(_('Failed to write PDF file %s: %s') % (file_name, str(e)))
def _match_bl_by_file_name(self, pdf_file_arr):
"""
Match BL by file name and return processed array # 根据文件名匹配提单并返回处理后的数组
:param pdf_file_arr: PDF文件数组 [{'bl_no':'', 'file_name':'', 'file_data':''}]
:param pdf_file_arr: PDF文件数组 [{'bill_number':'', 'filename':'', 'file_data':''}]
:return: 处理后的数组 [{'bl': bl_obj, 'file_name': 'xxx.pdf', 'file_data': 'xxx', 'matched': True/False}]
"""
bl_obj = self.get_order() # 获取当前选中的提单对象
......@@ -142,13 +292,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
for bl in bl_obj:
select_bl_no = self.env['common.common'].sudo().process_match_str(bl.bl_no)
for pdf_file in pdf_file_arr:
file_name = pdf_file.get('file_name', '') # 获取文件名
file_data = pdf_file.get('file_data', '') # 获取文件数据
bl_no = pdf_file.get('bl_no', '') # 获取提单号
if not bl_no:
# 从文件名获取提单号 合并提单_436-10259804_20251008.pdf
split_bl_no = file_name.split('_')[1]
bl_no = self.env['common.common'].sudo().process_match_str(split_bl_no)
# 尝试不同的字段名(API可能使用不同的字段名)
file_name = pdf_file.get('file_name' ) # 获取文件名
file_data = pdf_file.get('file_data') # 获取文件数据
bl_no = pdf_file.get('bl_no') # 获取提单号
if bl_no and select_bl_no == bl_no:
# 构建处理后的文件信息
processed_file = {
......@@ -156,10 +303,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
'file_name': file_name,
'file_data': file_data,
'bl_no': bl_no,
'original_data': pdf_file # 保留原始数据
}
processed_files.append(processed_file)
break
_logger.info(f"匹配完成,成功匹配{len(processed_files)}个文件,匹配结果: {processed_files}")
return processed_files
def _sync_last_mile_pod(self, processed_files):
......@@ -169,14 +316,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
# 回写PDF文件到清关文件
self._write_pdf_file(processed_files)
return False#测试 先不同步
# 同步尾程POD信息
for file_info in processed_files:
if not file_info['bl']:
continue
bl = file_info['bl']
try:
# 查找清关文件并执行同步
clearance_files = self.env['cc.clearance.file'].sudo().search_clearance_file(bl.id,
'尾程交接POD(待大包数量和箱号)')
......@@ -184,10 +330,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
clearance_file.action_sync() # 同步尾程POD
_logger.info(f"Successfully synced POD for BL {bl.bl_no}")
except Exception as e:
_logger.error(f"Failed to sync POD for BL {bl.bl_no}: {str(e)}")
raise ValidationError(_('Failed to sync POD for BL %s: %s') % (bl.bl_no, str(e)))
def _remove_specified_text(self, processed_files):
"""
Remove specified text from PDF files using OCR recognition # 使用OCR识别涂抹指定文字
......@@ -205,12 +347,34 @@ class BatchGetPodInfoWizard(models.TransientModel):
file_data = file_info['file_data']
processed_file_data = file_data # 默认使用原始数据
try:
# 使用OCR识别和删除指定文字
if file_data and file_data != 'base64_data': # 跳过测试数据
if file_data:
# 将base64数据转换为二进制
import base64
try:
pdf_binary = base64.b64decode(file_data)
_logger.info(f"Base64解码成功,数据大小: {len(pdf_binary)}字节,提单号: {bl.bl_no}")
# 验证PDF文件头
if not pdf_binary.startswith(b'%PDF-'):
_logger.error(f"解码后的数据不是有效的PDF文件,提单号: {bl.bl_no}")
_logger.error(f"文件头: {pdf_binary[:20]}")
raise ValidationError(_('Decoded data is not a valid PDF file for BL %s') % bl.bl_no)
# 验证PDF可以打开
try:
import fitz
test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
page_count = len(test_doc)
test_doc.close()
_logger.info(f"PDF验证成功,页数: {page_count},提单号: {bl.bl_no}")
except Exception as e:
_logger.error(f"PDF文件无法打开,提单号: {bl.bl_no}, 错误: {str(e)}")
raise ValidationError(_('PDF file cannot be opened for BL %s: %s') % (bl.bl_no, str(e)))
except Exception as e:
_logger.error(f"Base64解码失败,提单号: {bl.bl_no}, 错误: {str(e)}")
raise ValidationError(_('Failed to decode base64 data for BL %s: %s') % (bl.bl_no, str(e)))
# 使用OCR方法处理PDF
processed_pdf = self._process_pdf_with_ocr(
......@@ -219,12 +383,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
)
# 将处理后的PDF转换回base64
processed_file_data = base64.b64encode(processed_pdf)
processed_file_data = base64.b64encode(processed_pdf).decode('utf-8')
_logger.info(f"Successfully removed specified text from PDF for BL {bl.bl_no}")
except Exception as e:
_logger.error(f"Failed to remove text from PDF for BL {bl.bl_no}: {str(e)}")
raise ValidationError(_('Failed to remove text from PDF for BL %s: %s') % (bl.bl_no, str(e)))
_logger.info(f"处理后的PDF base64数据长度: {len(processed_file_data)}")
# 更新文件信息,使用处理后的PDF数据
updated_file_info = file_info.copy()
......@@ -235,22 +396,27 @@ class BatchGetPodInfoWizard(models.TransientModel):
def _process_pdf_with_ocr(self, pdf_data, bl_no):
"""
Process PDF with OCR recognition and text removal # 使用OCR识别处理PDF并删除文字
Process PDF with OCR recognition and text removal (完全按照HTML逻辑) # 使用OCR识别处理PDF并删除文字
:param pdf_data: PDF二进制数据
:param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据
"""
import os
import fitz # PyMuPDF
import cv2
import numpy as np
from PIL import Image
import pytesseract
import base64
import io
# 定义目标文字和排除文字(与HTML文件保持一致)
TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD']
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
# 设置Tesseract路径
self._setup_tesseract_path()
# 验证PDF数据
if not pdf_data or not pdf_data.startswith(b'%PDF-'):
_logger.error(f"PDF数据无效,提单号: {bl_no}")
raise ValidationError(_('Invalid PDF data for BL %s') % bl_no)
_logger.info(f"开始OCR处理PDF,提单号: {bl_no}")
# 打开PDF文档
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
......@@ -259,15 +425,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
detected_texts = []
all_recognized_texts = []
_logger.info(f"开始OCR处理PDF,共{len(pdf_document)}页,提单号: {bl_no}")
# 处理每一页
# 处理每一页(完全按照HTML逻辑)
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
_logger.info(f"正在OCR识别第{page_num + 1}页")
try:
# 将页面转换为图像(提高分辨率,与HTML文件保持一致)
# 将页面转换为图像(与HTML完全一致)
mat = fitz.Matrix(2.0, 2.0) # 提高分辨率
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
......@@ -279,15 +442,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 转换为PIL图像
pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
# 使用Tesseract进行OCR识别(优化配置,与HTML文件保持一致)
# OCR配置(与HTML完全一致)
config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
# 使用Tesseract进行OCR识别
try:
ocr_data = pytesseract.image_to_data(
pil_img,
output_type=pytesseract.Output.DICT,
lang='eng',
config='--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- '
config=config
)
except Exception as e:
_logger.error(f"OCR识别失败: {str(e)}")
continue
# 处理OCR结果
# 处理OCR结果(与HTML完全一致)
page_width = page.rect.width
page_height = page.rect.height
viewport_width = pil_img.width
......@@ -312,24 +482,25 @@ class BatchGetPodInfoWizard(models.TransientModel):
all_recognized_texts.extend(page_recognized_texts)
# 查找目标文字
# 查找目标文字(完全按照HTML逻辑)
page_texts = self._find_target_texts(
page_recognized_texts,
page_num,
viewport_width,
viewport_height,
page_width,
page_height,
TARGET_TEXTS,
EXCLUDE_TEXTS
page_height
)
detected_texts.extend(page_texts)
_logger.info(f"第{page_num + 1}页OCR完成,找到{len(page_texts)}个目标文字")
# 在页面上绘制删除矩形
# 根据OCR结果删除文字(完全按照HTML逻辑)
if page_texts:
for text_info in page_texts:
# 超精确删除模式(与HTML文件保持一致)
# 超精确删除模式(与HTML完全一致)
rect = {
'x': text_info['x'],
'y': text_info['y'],
......@@ -337,61 +508,162 @@ class BatchGetPodInfoWizard(models.TransientModel):
'height': text_info['height']
}
# 绘制白色矩形覆盖文字
try:
page.draw_rect(
fitz.Rect(rect['x'], rect['y'], rect['x'] + rect['width'], rect['y'] + rect['height']),
color=(1, 1, 1),
fill=(1, 1, 1)
color=(1, 1, 1), # 白色
fill=(1, 1, 1) # 填充白色
)
_logger.info(f"删除目标文字: {text_info['text']}")
total_rectangles += 1
processed_pages += 1
except Exception as e:
_logger.warning(f"第{page_num + 1}页OCR失败: {str(e)}")
# 使用回退策略:预设坐标
self._apply_fallback_rectangles(page, page_num)
_logger.error(f"删除失败: {str(e)}")
else:
_logger.warning(f"第{page_num + 1}页没有找到目标文字")
processed_pages += 1
# 保存处理后的PDF
try:
output_buffer = io.BytesIO()
pdf_document.save(output_buffer)
pdf_document.save(output_buffer, garbage=4, deflate=True, clean=True)
pdf_document.close()
result_data = output_buffer.getvalue()
output_buffer.close()
_logger.info(f"PDF保存成功,数据大小: {len(result_data)}字节")
except Exception as e:
_logger.error(f"PDF保存失败: {str(e)}")
pdf_document.close()
raise ValidationError(_('Failed to save PDF: %s') % str(e))
_logger.info(f"PDF OCR处理完成,共处理{processed_pages}页,删除{total_rectangles}个文字区域,提单号: {bl_no}")
return result_data
def _find_target_texts(self, words, page_num, viewport_width, viewport_height, page_width, page_height, target_texts, exclude_texts):
def _setup_tesseract_path(self):
    """
    Point pytesseract at a Tesseract executable for the current system.

    On Windows the usual install locations are tried first, then ``PATH``.
    On other systems ``PATH`` is tried first, then a list of common install
    locations. Candidate paths may contain environment variables and
    shell-style wildcards; both are expanded before the existence check.

    Nothing is raised when no executable is found: a warning is logged and
    pytesseract keeps its current command. The tessdata check runs in
    either case.
    """
    import glob
    import os
    import shutil

    import pytesseract

    def _first_existing(patterns):
        # Return the first candidate that resolves to a real path, or None.
        for pattern in patterns:
            matches = sorted(
                candidate
                for candidate in glob.glob(os.path.expandvars(pattern))
                if os.path.exists(candidate)
            )
            if matches:
                # A wildcard can match several versioned directories; take the
                # last one in sort order.
                return matches[-1]
        return None

    if os.name == 'nt':  # Windows
        path = _first_existing([
            r'C:\Program Files\Tesseract-OCR\tesseract.exe',
            r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
            r'C:\Users\%USERNAME%\AppData\Local\Tesseract-OCR\tesseract.exe'
        ])
        if path:
            pytesseract.pytesseract.tesseract_cmd = path
            _logger.info(f"设置Tesseract路径: {path}")
        else:
            # Fall back to PATH before reporting a missing installation.
            tesseract_path = shutil.which('tesseract')
            if tesseract_path:
                pytesseract.pytesseract.tesseract_cmd = tesseract_path
                _logger.info(f"找到Tesseract路径: {tesseract_path}")
            else:
                _logger.warning("未找到Tesseract安装路径")
    else:  # Linux/Mac
        tesseract_path = shutil.which('tesseract')
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            _logger.info(f"找到Tesseract路径: {tesseract_path}")
        else:
            path = _first_existing([
                '/usr/bin/tesseract',
                '/usr/local/bin/tesseract',
                '/opt/homebrew/bin/tesseract',  # macOS M1
                '/usr/local/Cellar/tesseract/*/bin/tesseract'  # macOS Homebrew
            ])
            if path:
                pytesseract.pytesseract.tesseract_cmd = path
                _logger.info(f"设置Tesseract路径: {path}")
            else:
                _logger.warning("未找到Tesseract,请确保已安装tesseract-ocr")

    # Report whether the English language data is available.
    self._check_tessdata_files()
def _check_tessdata_files(self):
    """
    Log whether the English Tesseract language data (``eng.traineddata``) can be found.

    Candidate directories are: ``TESSDATA_PREFIX`` (as given and with a
    ``tessdata`` subdirectory), a ``tessdata`` directory next to the
    configured executable, and a list of common install locations.
    Wildcards in the candidates are expanded. The first directory that
    actually contains ``eng.traineddata`` is reported.

    This method only logs; it never raises and changes no configuration.
    """
    import glob
    import os

    import pytesseract

    tesseract_cmd = pytesseract.pytesseract.tesseract_cmd
    default_dir = os.path.dirname(tesseract_cmd) + '/tessdata'

    patterns = []
    env_prefix = os.environ.get('TESSDATA_PREFIX')
    if env_prefix:
        # The variable may name the tessdata directory itself or its parent,
        # so both spellings are checked; escape it so it is taken literally.
        patterns.append(glob.escape(env_prefix))
        patterns.append(glob.escape(os.path.join(env_prefix, 'tessdata')))
    patterns.append(glob.escape(default_dir))
    patterns.extend([
        '/usr/share/tesseract-ocr/tessdata',
        '/usr/share/tesseract-ocr/*/tessdata',  # Debian/Ubuntu versioned directory
        '/usr/local/share/tesseract-ocr/tessdata',
        '/opt/homebrew/share/tessdata',  # macOS M1
        '/usr/local/Cellar/tesseract/*/share/tessdata'  # macOS Homebrew
    ])

    existing_dirs = []
    for pattern in patterns:
        existing_dirs.extend(
            candidate for candidate in sorted(glob.glob(pattern))
            if os.path.exists(candidate)
        )

    for tessdata_dir in existing_dirs:
        eng_data = os.path.join(tessdata_dir, 'eng.traineddata')
        if os.path.exists(eng_data):
            _logger.info(f"找到英语语言数据文件: {eng_data}")
            return

    # Nothing found: report the most likely location to help installation.
    tessdata_dir = existing_dirs[0] if existing_dirs else default_dir
    eng_data = os.path.join(tessdata_dir, 'eng.traineddata')
    _logger.warning(f"未找到英语语言数据文件: {eng_data}")
    _logger.warning("请安装英语语言包: sudo apt-get install tesseract-ocr-eng")
def _find_target_texts(self, words, page_num, viewport_width, viewport_height, page_width, page_height):
"""
Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
"""
# 定义目标文字和排除文字(与HTML文件完全一致)
TARGET_TEXTS = ['AGN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD','UCLINKLOGISITICSLTD']
EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
found_texts = []
for word in words:
text = word['text'].strip().upper()
# 首先检查是否在排除列表中
# 首先检查是否在排除列表中(与HTML完全一致)
is_excluded = False
for exclude_text in exclude_texts:
for exclude_text in EXCLUDE_TEXTS:
exclude_upper = exclude_text.upper()
if exclude_upper in text or text in exclude_upper:
is_excluded = True
break
# 检查页码模式(Page X of Y)
# 检查页码模式(Page X of Y)(与HTML完全一致)
import re
if not is_excluded and (re.match(r'^PAGE\s+\d+\s+OF\s+\d+$', text) or re.match(r'^\d+\s*/\s*\d+$', text)):
is_excluded = True
if is_excluded:
_logger.info(f"排除文字: {word['text']}")
continue
# 检查目标文字匹配
for target_text in target_texts:
# 检查目标文字匹配(与HTML完全一致)
for target_text in TARGET_TEXTS:
target_upper = target_text.upper()
is_match = False
......@@ -401,39 +673,30 @@ class BatchGetPodInfoWizard(models.TransientModel):
elif target_text == 'LTD':
# LTD使用精确匹配
is_match = text == 'LTD'
elif target_text == 'UCLINK LOGISITICS LTD':
# 完整短语匹配
is_match = ('UCLINK' in text and 'LOGISITICS' in text and 'LTD' in text) or \
'UCLINK LOGISITICS LTD' in text or \
text == 'UCLINK LOGISITICS LTD'
elif target_text == 'UCLINK LOGISITICS':
# 部分短语匹配
is_match = ('UCLINK' in text and 'LOGISITICS' in text) or \
text == 'UCLINK LOGISITICS'
elif target_text == 'UCLINK':
# 单独UCLINK匹配
is_match = text == 'UCLINK' or text.startswith('UCLINK ')
elif target_text in ['LOGISITICS', 'LOGISTICS']:
# LOGISITICS/LOGISTICS匹配
is_match = text in ['LOGISITICS', 'LOGISTICS'] or \
text.startswith('LOGISITICS') or text.startswith('LOGISTICS')
else:
# 其他文字使用包含匹配,但更严格
# 其他文字使用包含匹配,但更严格(与HTML完全一致)
is_match = target_upper in text and \
'AIR' not in text and \
'EQK' not in text and \
'ARN' not in text
# 如果精确匹配失败,尝试模糊匹配(与HTML完全一致)
if not is_match and target_text != 'AGN' and target_text != 'LTD':
is_match = self._fuzzy_match(text, target_upper)
if is_match:
# 坐标转换(与HTML文件保持一致
# 坐标转换(适配PyMuPDF坐标系统
scale_x = page_width / viewport_width
scale_y = page_height / viewport_height
# PyMuPDF使用左下角为原点,OCR使用左上角为原点
# 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置
converted_x = word['bbox']['x0'] * scale_x
converted_y = (viewport_height - word['bbox']['y1']) * scale_y
converted_y = (word['bbox']['y0'] * scale_y) # 直接使用OCR的Y坐标
converted_width = (word['bbox']['x1'] - word['bbox']['x0']) * scale_x
converted_height = (word['bbox']['y1'] - word['bbox']['y0']) * scale_y
found_texts.append({
'text': target_text,
'full_text': word['text'],
......@@ -449,26 +712,134 @@ class BatchGetPodInfoWizard(models.TransientModel):
return found_texts
def _apply_fallback_rectangles(self, page, page_num):
def _fuzzy_match(self, str1, str2):
"""
Apply fallback rectangles when OCR fails # OCR失败时应用回退矩形
Fuzzy match function (与HTML完全一致) # 模糊匹配函数
"""
page_width = page.rect.width
page_height = page.rect.height
import re
s1 = re.sub(r'[^A-Z]', '', str1)
s2 = re.sub(r'[^A-Z]', '', str2)
# 超精确的预设坐标覆盖(与HTML文件保持一致)
rectangles = [
{'x': 50, 'y': page_height - 200, 'width': 60, 'height': 10}, # AGN
{'x': 50, 'y': page_height - 220, 'width': 100, 'height': 10}, # UCLINK LOGISITICS
{'x': 155, 'y': page_height - 220, 'width': 30, 'height': 10} # LTD
]
if len(s1) == 0 or len(s2) == 0:
return False
# 计算编辑距离
distance = self._levenshtein_distance(s1, s2)
max_len = max(len(s1), len(s2))
# 如果编辑距离小于等于最大长度的1/3,认为匹配
return distance <= max_len / 3
def _levenshtein_distance(self, s1, s2):
"""
Calculate Levenshtein distance (与HTML完全一致) # 计算编辑距离
"""
if len(s1) < len(s2):
return self._levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = list(range(len(s2) + 1))
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def _save_and_return_download_link(self, file_info):
    """
    Save a processed PDF as an attachment and return an action that downloads it.

    The payload in ``file_info['file_data']`` is base64-decoded when it is
    a ``str`` and used as-is otherwise. It must start with the ``%PDF-``
    header and be openable by PyMuPDF before the attachment is created.

    :param file_info: dict with ``file_data`` and, optionally, ``file_name``
        (defaults to ``processed.pdf``)
    :return: ``ir.actions.act_url`` action dict pointing at the new attachment
    :raises ValidationError: on any failure; the underlying message is
        wrapped as ``Failed to save PDF attachment: ...``
    """
    import base64
    try:
        file_data = file_info.get('file_data', '')
        file_name = file_info.get('file_name', 'processed.pdf')
        if not file_data:
            raise ValidationError(_('No processed file data available'))

        if isinstance(file_data, str):
            _logger.info(f"输入是字符串类型,长度: {len(file_data)}")
            _logger.info(f"输入前50字符: {file_data[:50]}")
            pdf_binary = base64.b64decode(file_data)
        else:
            _logger.info(f"输入是bytes类型,长度: {len(file_data)}")
            _logger.info(f"输入前20字节: {file_data[:20]}")
            pdf_binary = file_data

        _logger.info(f"PDF二进制数据大小: {len(pdf_binary)}字节")
        _logger.info(f"PDF文件头: {pdf_binary[:20]}")
        _logger.info(f"PDF文件头(hex): {pdf_binary[:20].hex()}")

        if not pdf_binary.startswith(b'%PDF-'):
            _logger.error(f"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}")
            _logger.error(f"文件头(hex): {pdf_binary[:20].hex()}")
            _logger.error(f"文件大小: {len(pdf_binary)}字节")
            # Decoding the same input again yields the same bytes, so there
            # is nothing to retry here.
            raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))

        # Sanity check: the bytes must be openable as a PDF document.
        try:
            import fitz
            test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
            try:
                _logger.info(f"PDF验证成功,页数: {len(test_doc)}")
            finally:
                test_doc.close()
        except Exception as e:
            _logger.error(f"PDF验证失败: {str(e)}")
            raise ValidationError(_('Invalid PDF data for saving: cannot open PDF - %s') % str(e))

        attachment = self.env['ir.attachment'].create({
            'name': f'processed_{file_name}',
            'type': 'binary',
            'datas': base64.b64encode(pdf_binary),
            'mimetype': 'application/pdf',
            'res_model': 'batch.get.pod.info.wizard',
            'res_id': self.id,
        })
        _logger.info(f"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")

        return {
            'type': 'ir.actions.act_url',
            'url': f'/web/content/{attachment.id}?download=true',
            'target': 'new',
        }
    except Exception as e:
        _logger.error(f"保存PDF附件失败: {str(e)}")
        raise ValidationError(_('Failed to save PDF attachment: %s') % str(e)) from e
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论