Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
H
hh_ccs
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
贺阳
hh_ccs
Commits
8ca88357
提交
8ca88357
authored
10月 20, 2025
作者:
贺阳
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
没有勾选同步或没有勾选涂抹文字也需要创建附件信息
上级
b127cb4b
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
95 行增加
和
92 行删除
+95
-92
batch_get_pod_info_wizard.py
ccs_base/wizard/batch_get_pod_info_wizard.py
+95
-92
没有找到文件。
ccs_base/wizard/batch_get_pod_info_wizard.py
浏览文件 @
8ca88357
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import
base64
import
io
import
io
import
logging
import
logging
import
base64
import
requests
import
requests
from
odoo
import
models
,
fields
,
_
from
odoo
import
models
,
fields
,
_
from
odoo.exceptions
import
ValidationError
from
odoo.exceptions
import
ValidationError
...
@@ -56,16 +57,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -56,16 +57,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 英文提示
# 英文提示
raise
ValidationError
(
_
(
'
%
s bill of loading cannot find release note file'
)
%
(
raise
ValidationError
(
_
(
'
%
s bill of loading cannot find release note file'
)
%
(
', '
.
join
([
bl
.
bl_no
for
bl
in
error_bl
])))
# xx提单无法找到release note文件
', '
.
join
([
bl
.
bl_no
for
bl
in
error_bl
])))
# xx提单无法找到release note文件
if
self
.
remove_specified_text
:
if
self
.
remove_specified_text
:
processed_files
=
self
.
_remove_specified_text
(
processed_files
)
processed_files
=
self
.
_remove_specified_text
(
processed_files
)
# 用于测试的:保存处理后的PDF并返回下载链接
# 用于测试的:保存处理后的PDF并返回下载链接
# if processed_files and processed_files[0].get('file_data'):
# if processed_files and processed_files[0].get('file_data'):
# return self._save_and_return_download_link(processed_files[0])
# return self._save_and_return_download_link(processed_files[0])
# 回写到附件信息
if
processed_files
:
# 回写PDF文件到清关文件
self
.
_write_pdf_file
(
processed_files
)
# 再同步和回写
# 再同步和回写
if
self
.
sync_last_mile_pod
:
if
self
.
sync_last_mile_pod
and
processed_files
:
self
.
_sync_last_mile_pod
(
processed_files
)
self
.
_sync_last_mile_pod
(
processed_files
)
# 写一个方法掉接口获取提单pdf文件
# 写一个方法掉接口获取提单pdf文件
...
@@ -76,12 +80,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -76,12 +80,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 获取当前选中的提单对象
# 获取当前选中的提单对象
bl_objs
=
self
.
get_order
()
bl_objs
=
self
.
get_order
()
bill_numbers
=
[
self
.
env
[
'common.common'
]
.
sudo
()
.
process_match_str
(
bl
.
bl_no
)
for
bl
in
bl_objs
]
bill_numbers
=
[
self
.
env
[
'common.common'
]
.
sudo
()
.
process_match_str
(
bl
.
bl_no
)
for
bl
in
bl_objs
]
# 调用API获取PDF文件
# 调用API获取PDF文件
api_url
=
self
.
env
[
'ir.config_parameter'
]
.
sudo
()
.
get_param
(
'last_mile_pod_api_url'
,
'http://172.104.52.150:7002'
)
api_url
=
self
.
env
[
'ir.config_parameter'
]
.
sudo
()
.
get_param
(
'last_mile_pod_api_url'
,
'http://172.104.52.150:7002'
)
if
not
api_url
:
if
not
api_url
:
raise
ValidationError
(
_
(
'API URL not configured'
))
raise
ValidationError
(
_
(
'API URL not configured'
))
# 构建请求数据
# 构建请求数据
request_data
=
{
request_data
=
{
"bill_numbers"
:
bill_numbers
"bill_numbers"
:
bill_numbers
...
@@ -92,22 +97,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -92,22 +97,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
headers
=
{
'Content-Type'
:
'application/json'
},
headers
=
{
'Content-Type'
:
'application/json'
},
json
=
request_data
json
=
request_data
)
)
if
response
.
status_code
==
200
:
if
response
.
status_code
==
200
:
result
=
response
.
json
()
result
=
response
.
json
()
# 检查API响应结构
# 检查API响应结构
if
not
result
:
if
not
result
:
raise
ValidationError
(
_
(
'API returned empty response'
))
raise
ValidationError
(
_
(
'API returned empty response'
))
if
not
result
.
get
(
'success'
):
if
not
result
.
get
(
'success'
):
error_msg
=
result
.
get
(
'message'
,
'Unknown error'
)
error_msg
=
result
.
get
(
'message'
,
'Unknown error'
)
raise
ValidationError
(
_
(
'API returned error:
%
s'
)
%
error_msg
)
raise
ValidationError
(
_
(
'API returned error:
%
s'
)
%
error_msg
)
# 处理结果数据
# 处理结果数据
results
=
result
.
get
(
'results'
,
[])
results
=
result
.
get
(
'results'
,
[])
if
not
results
:
if
not
results
:
raise
ValidationError
(
_
(
'No PDF files found in API response'
))
#
提示:API调用成功,但没有PDF文件
raise
ValidationError
(
_
(
'No PDF files found in API response'
))
#
提示:API调用成功,但没有PDF文件
# 构建PDF文件数组
# 构建PDF文件数组
pdf_file_arr
=
[]
pdf_file_arr
=
[]
for
result_item
in
results
:
for
result_item
in
results
:
...
@@ -116,11 +121,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -116,11 +121,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
bill_number
=
result_item
.
get
(
'bill_number'
)
bill_number
=
result_item
.
get
(
'bill_number'
)
filename
=
result_item
.
get
(
'filename'
)
filename
=
result_item
.
get
(
'filename'
)
base64_data
=
result_item
.
get
(
'base64'
)
base64_data
=
result_item
.
get
(
'base64'
)
if
not
all
([
bill_number
,
filename
,
base64_data
]):
if
not
all
([
bill_number
,
filename
,
base64_data
]):
_logger
.
warning
(
f
"跳过无效的PDF文件项: {result_item}"
)
_logger
.
warning
(
f
"跳过无效的PDF文件项: {result_item}"
)
continue
continue
# 验证PDF文件
# 验证PDF文件
try
:
try
:
pdf_binary
=
base64
.
b64decode
(
base64_data
)
pdf_binary
=
base64
.
b64decode
(
base64_data
)
...
@@ -140,7 +145,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -140,7 +145,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
return
pdf_file_arr
return
pdf_file_arr
else
:
else
:
raise
ValidationError
(
_
(
'Failed to get PDF file from API:
%
s'
)
%
response
.
text
)
raise
ValidationError
(
_
(
'Failed to get PDF file from API:
%
s'
)
%
response
.
text
)
except
requests
.
exceptions
.
RequestException
as
e
:
except
requests
.
exceptions
.
RequestException
as
e
:
raise
ValidationError
(
_
(
'API request failed:
%
s'
)
%
str
(
e
))
raise
ValidationError
(
_
(
'API request failed:
%
s'
)
%
str
(
e
))
...
@@ -176,7 +181,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -176,7 +181,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
select_bl_no
=
self
.
env
[
'common.common'
]
.
sudo
()
.
process_match_str
(
bl
.
bl_no
)
select_bl_no
=
self
.
env
[
'common.common'
]
.
sudo
()
.
process_match_str
(
bl
.
bl_no
)
for
pdf_file
in
pdf_file_arr
:
for
pdf_file
in
pdf_file_arr
:
# 尝试不同的字段名(API可能使用不同的字段名)
# 尝试不同的字段名(API可能使用不同的字段名)
file_name
=
pdf_file
.
get
(
'file_name'
)
# 获取文件名
file_name
=
pdf_file
.
get
(
'file_name'
)
# 获取文件名
file_data
=
pdf_file
.
get
(
'file_data'
)
# 获取文件数据
file_data
=
pdf_file
.
get
(
'file_data'
)
# 获取文件数据
bl_no
=
pdf_file
.
get
(
'bl_no'
)
# 获取提单号
bl_no
=
pdf_file
.
get
(
'bl_no'
)
# 获取提单号
if
bl_no
and
select_bl_no
==
bl_no
:
if
bl_no
and
select_bl_no
==
bl_no
:
...
@@ -198,8 +203,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -198,8 +203,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
Sync last mile POD information # 同步尾程POD信息
Sync last mile POD information # 同步尾程POD信息
:param processed_files: 处理后的文件数组
:param processed_files: 处理后的文件数组
"""
"""
# 回写PDF文件到清关文件
self
.
_write_pdf_file
(
processed_files
)
# return False#测试 先不同步
# return False#测试 先不同步
# 同步尾程POD信息
# 同步尾程POD信息
for
file_info
in
processed_files
:
for
file_info
in
processed_files
:
...
@@ -220,7 +223,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -220,7 +223,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
:return: 处理后的文件数组(包含处理后的PDF数据)
:return: 处理后的文件数组(包含处理后的PDF数据)
"""
"""
updated_files
=
[]
updated_files
=
[]
for
file_info
in
processed_files
:
for
file_info
in
processed_files
:
if
not
file_info
[
'bl'
]:
if
not
file_info
[
'bl'
]:
updated_files
.
append
(
file_info
)
updated_files
.
append
(
file_info
)
...
@@ -241,12 +244,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -241,12 +244,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
if
processed_pdf
:
if
processed_pdf
:
# 将处理后的PDF转换回base64
# 将处理后的PDF转换回base64
processed_file_data
=
base64
.
b64encode
(
processed_pdf
)
.
decode
(
'utf-8'
)
processed_file_data
=
base64
.
b64encode
(
processed_pdf
)
.
decode
(
'utf-8'
)
# 更新文件信息,使用处理后的PDF数据
# 更新文件信息,使用处理后的PDF数据
updated_file_info
=
file_info
.
copy
()
updated_file_info
=
file_info
.
copy
()
updated_file_info
[
'file_data'
]
=
processed_file_data
updated_file_info
[
'file_data'
]
=
processed_file_data
updated_files
.
append
(
updated_file_info
)
updated_files
.
append
(
updated_file_info
)
return
updated_files
return
updated_files
def
_process_pdf_with_ocr
(
self
,
pdf_data
,
bl_no
):
def
_process_pdf_with_ocr
(
self
,
pdf_data
,
bl_no
):
...
@@ -256,12 +259,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -256,12 +259,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
:param bl_no: 提单号(用于日志)
:param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据
:return: 处理后的PDF二进制数据
"""
"""
import
os
import
fitz
# PyMuPDF
import
fitz
# PyMuPDF
import
numpy
as
np
import
numpy
as
np
from
PIL
import
Image
from
PIL
import
Image
import
pytesseract
import
pytesseract
# 尝试导入OpenCV,如果失败则使用PIL替代
# 尝试导入OpenCV,如果失败则使用PIL替代
try
:
try
:
import
cv2
import
cv2
...
@@ -271,7 +273,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -271,7 +273,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
cv2_available
=
False
cv2_available
=
False
_logger
.
warning
(
f
"OpenCV不可用,使用PIL替代: {str(e)}"
)
_logger
.
warning
(
f
"OpenCV不可用,使用PIL替代: {str(e)}"
)
_logger
.
warning
(
"建议安装OpenCV: pip install opencv-python-headless"
)
_logger
.
warning
(
"建议安装OpenCV: pip install opencv-python-headless"
)
# 设置Tesseract路径
# 设置Tesseract路径
self
.
_setup_tesseract_path
()
self
.
_setup_tesseract_path
()
# 打开PDF文档
# 打开PDF文档
...
@@ -280,17 +282,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -280,17 +282,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
processed_pages
=
0
processed_pages
=
0
detected_texts
=
[]
detected_texts
=
[]
all_recognized_texts
=
[]
all_recognized_texts
=
[]
result_data
=
False
result_data
=
False
# 处理每一页(完全按照HTML逻辑)
# 处理每一页(完全按照HTML逻辑)
for
page_num
in
range
(
len
(
pdf_document
)):
for
page_num
in
range
(
len
(
pdf_document
)):
page
=
pdf_document
[
page_num
]
page
=
pdf_document
[
page_num
]
# _logger.info(f"正在OCR识别第{page_num + 1}页")
# _logger.info(f"正在OCR识别第{page_num + 1}页")
# 将页面转换为图像(与HTML完全一致)
# 将页面转换为图像(与HTML完全一致)
mat
=
fitz
.
Matrix
(
2.0
,
2.0
)
# 提高分辨率
mat
=
fitz
.
Matrix
(
2.0
,
2.0
)
# 提高分辨率
pix
=
page
.
get_pixmap
(
matrix
=
mat
)
pix
=
page
.
get_pixmap
(
matrix
=
mat
)
img_data
=
pix
.
tobytes
(
"png"
)
img_data
=
pix
.
tobytes
(
"png"
)
# 转换为PIL图像(兼容OpenCV和PIL)
# 转换为PIL图像(兼容OpenCV和PIL)
if
cv2_available
:
if
cv2_available
:
# 使用OpenCV处理
# 使用OpenCV处理
...
@@ -302,28 +304,28 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -302,28 +304,28 @@ class BatchGetPodInfoWizard(models.TransientModel):
pil_img
=
Image
.
open
(
io
.
BytesIO
(
img_data
))
pil_img
=
Image
.
open
(
io
.
BytesIO
(
img_data
))
if
pil_img
.
mode
!=
'RGB'
:
if
pil_img
.
mode
!=
'RGB'
:
pil_img
=
pil_img
.
convert
(
'RGB'
)
pil_img
=
pil_img
.
convert
(
'RGB'
)
# OCR配置(与HTML完全一致)
# OCR配置(与HTML完全一致)
config
=
'--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
config
=
'--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
# 使用Tesseract进行OCR识别
# 使用Tesseract进行OCR识别
try
:
try
:
ocr_data
=
pytesseract
.
image_to_data
(
ocr_data
=
pytesseract
.
image_to_data
(
pil_img
,
pil_img
,
output_type
=
pytesseract
.
Output
.
DICT
,
output_type
=
pytesseract
.
Output
.
DICT
,
lang
=
'eng'
,
lang
=
'eng'
,
config
=
config
config
=
config
)
)
except
Exception
as
e
:
except
Exception
as
e
:
_logger
.
error
(
f
"OCR识别失败: {str(e)}"
)
_logger
.
error
(
f
"OCR识别失败: {str(e)}"
)
continue
continue
# 处理OCR结果(与HTML完全一致)
# 处理OCR结果(与HTML完全一致)
page_width
=
page
.
rect
.
width
page_width
=
page
.
rect
.
width
page_height
=
page
.
rect
.
height
page_height
=
page
.
rect
.
height
viewport_width
=
pil_img
.
width
viewport_width
=
pil_img
.
width
viewport_height
=
pil_img
.
height
viewport_height
=
pil_img
.
height
# 存储所有识别到的文字
# 存储所有识别到的文字
page_recognized_texts
=
[]
page_recognized_texts
=
[]
for
i
in
range
(
len
(
ocr_data
[
'text'
])):
for
i
in
range
(
len
(
ocr_data
[
'text'
])):
...
@@ -340,23 +342,23 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -340,23 +342,23 @@ class BatchGetPodInfoWizard(models.TransientModel):
},
},
'page'
:
page_num
'page'
:
page_num
})
})
all_recognized_texts
.
extend
(
page_recognized_texts
)
all_recognized_texts
.
extend
(
page_recognized_texts
)
# 查找目标文字(完全按照HTML逻辑)
# 查找目标文字(完全按照HTML逻辑)
page_texts
=
self
.
_find_target_texts
(
page_texts
=
self
.
_find_target_texts
(
page_recognized_texts
,
page_recognized_texts
,
page_num
,
page_num
,
viewport_width
,
viewport_width
,
viewport_height
,
viewport_height
,
page_width
,
page_width
,
page_height
page_height
)
)
detected_texts
.
extend
(
page_texts
)
detected_texts
.
extend
(
page_texts
)
# 根据OCR结果删除文字(完全按照HTML逻辑)
# 根据OCR结果删除文字(完全按照HTML逻辑)
if
page_texts
:
if
page_texts
:
for
text_info
in
page_texts
:
for
text_info
in
page_texts
:
# 超精确删除模式(与HTML完全一致)
# 超精确删除模式(与HTML完全一致)
rect
=
{
rect
=
{
...
@@ -365,33 +367,32 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -365,33 +367,32 @@ class BatchGetPodInfoWizard(models.TransientModel):
'width'
:
text_info
[
'width'
],
'width'
:
text_info
[
'width'
],
'height'
:
text_info
[
'height'
]
'height'
:
text_info
[
'height'
]
}
}
# 绘制白色矩形覆盖文字
# 绘制白色矩形覆盖文字
try
:
try
:
page
.
draw_rect
(
page
.
draw_rect
(
fitz
.
Rect
(
rect
[
'x'
],
rect
[
'y'
],
rect
[
'x'
]
+
rect
[
'width'
],
rect
[
'y'
]
+
rect
[
'height'
]),
fitz
.
Rect
(
rect
[
'x'
],
rect
[
'y'
],
rect
[
'x'
]
+
rect
[
'width'
],
rect
[
'y'
]
+
rect
[
'height'
]),
color
=
(
1
,
1
,
1
),
# 白色
color
=
(
1
,
1
,
1
),
# 白色
fill
=
(
1
,
1
,
1
)
# 填充白色
fill
=
(
1
,
1
,
1
)
# 填充白色
)
)
# _logger.info(f"删除目标文字: {text_info['text']}")
# _logger.info(f"删除目标文字: {text_info['text']}")
total_rectangles
+=
1
total_rectangles
+=
1
except
Exception
as
e
:
except
Exception
as
e
:
_logger
.
error
(
f
"删除失败: {str(e)}"
)
_logger
.
error
(
f
"删除失败: {str(e)}"
)
processed_pages
+=
1
processed_pages
+=
1
# 保存处理后的PDF
# 保存处理后的PDF
try
:
try
:
output_buffer
=
io
.
BytesIO
()
output_buffer
=
io
.
BytesIO
()
pdf_document
.
save
(
output_buffer
,
garbage
=
4
,
deflate
=
True
,
clean
=
True
)
pdf_document
.
save
(
output_buffer
,
garbage
=
4
,
deflate
=
True
,
clean
=
True
)
pdf_document
.
close
()
pdf_document
.
close
()
result_data
=
output_buffer
.
getvalue
()
result_data
=
output_buffer
.
getvalue
()
output_buffer
.
close
()
output_buffer
.
close
()
except
Exception
as
e
:
except
Exception
as
e
:
_logger
.
error
(
f
"PDF保存失败: {str(e)}"
)
_logger
.
error
(
f
"PDF保存失败: {str(e)}"
)
pdf_document
.
close
()
pdf_document
.
close
()
return
result_data
return
result_data
def
_setup_tesseract_path
(
self
):
def
_setup_tesseract_path
(
self
):
...
@@ -402,7 +403,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -402,7 +403,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
import
pytesseract
import
pytesseract
import
os
import
os
import
shutil
import
shutil
if
os
.
name
==
'nt'
:
# Windows
if
os
.
name
==
'nt'
:
# Windows
# Windows常见路径
# Windows常见路径
possible_paths
=
[
possible_paths
=
[
...
@@ -427,26 +428,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -427,26 +428,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
'/opt/homebrew/bin/tesseract'
,
# macOS M1
'/opt/homebrew/bin/tesseract'
,
# macOS M1
'/usr/local/Cellar/tesseract/*/bin/tesseract'
# macOS Homebrew
'/usr/local/Cellar/tesseract/*/bin/tesseract'
# macOS Homebrew
]
]
for
path
in
possible_paths
:
for
path
in
possible_paths
:
if
os
.
path
.
exists
(
path
):
if
os
.
path
.
exists
(
path
):
pytesseract
.
pytesseract
.
tesseract_cmd
=
path
pytesseract
.
pytesseract
.
tesseract_cmd
=
path
break
break
# 检查语言数据文件
# 检查语言数据文件
self
.
_check_tessdata_files
()
self
.
_check_tessdata_files
()
def
_check_tessdata_files
(
self
):
def
_check_tessdata_files
(
self
):
"""
"""
Check if tessdata files exist # 检查tessdata文件是否存在
Check if tessdata files exist # 检查tessdata文件是否存在
"""
"""
import
pytesseract
import
pytesseract
import
os
import
os
# 获取Tesseract数据路径
# 获取Tesseract数据路径
tesseract_cmd
=
pytesseract
.
pytesseract
.
tesseract_cmd
tesseract_cmd
=
pytesseract
.
pytesseract
.
tesseract_cmd
tessdata_dir
=
os
.
path
.
dirname
(
tesseract_cmd
)
+
'/tessdata'
tessdata_dir
=
os
.
path
.
dirname
(
tesseract_cmd
)
+
'/tessdata'
# 如果tessdata目录不存在,尝试其他常见位置
# 如果tessdata目录不存在,尝试其他常见位置
if
not
os
.
path
.
exists
(
tessdata_dir
):
if
not
os
.
path
.
exists
(
tessdata_dir
):
possible_tessdata_dirs
=
[
possible_tessdata_dirs
=
[
...
@@ -455,12 +456,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -455,12 +456,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
'/opt/homebrew/share/tessdata'
,
# macOS M1
'/opt/homebrew/share/tessdata'
,
# macOS M1
'/usr/local/Cellar/tesseract/*/share/tessdata'
# macOS Homebrew
'/usr/local/Cellar/tesseract/*/share/tessdata'
# macOS Homebrew
]
]
for
tessdata_path
in
possible_tessdata_dirs
:
for
tessdata_path
in
possible_tessdata_dirs
:
if
os
.
path
.
exists
(
tessdata_path
):
if
os
.
path
.
exists
(
tessdata_path
):
tessdata_dir
=
tessdata_path
tessdata_dir
=
tessdata_path
break
break
# 检查英语语言数据文件
# 检查英语语言数据文件
eng_data
=
os
.
path
.
join
(
tessdata_dir
,
'eng.traineddata'
)
eng_data
=
os
.
path
.
join
(
tessdata_dir
,
'eng.traineddata'
)
if
os
.
path
.
exists
(
eng_data
):
if
os
.
path
.
exists
(
eng_data
):
...
@@ -473,14 +474,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -473,14 +474,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
"""
"""
# 定义目标文字和排除文字(与HTML文件完全一致)
# 定义目标文字和排除文字(与HTML文件完全一致)
TARGET_TEXTS
=
[
'AGN'
,
'UCLINK LOGISITICS LTD'
,
'UCLINK LOGISITICS'
,
'UCLINK'
,
'LOGISITICS'
,
'LOGISTICS'
,
'LTD'
,
'UCLINKLOGISITICSLTD'
]
TARGET_TEXTS
=
[
'AGN'
,
'UCLINK LOGISITICS LTD'
,
'UCLINK LOGISITICS'
,
'UCLINK'
,
'LOGISITICS'
,
'LOGISTICS'
,
'LTD'
,
EXCLUDE_TEXTS
=
[
'AIR EQK'
,
'ARN'
,
'EQK'
,
'AIR'
,
'Page 1 of 1'
,
'Page 2 of 2'
,
'Page 3 of 3'
,
'Page 4 of 4'
,
'Page 5 of 5'
]
'UCLINKLOGISITICSLTD'
]
EXCLUDE_TEXTS
=
[
'AIR EQK'
,
'ARN'
,
'EQK'
,
'AIR'
,
'Page 1 of 1'
,
'Page 2 of 2'
,
'Page 3 of 3'
,
'Page 4 of 4'
,
'Page 5 of 5'
]
found_texts
=
[]
found_texts
=
[]
for
word
in
words
:
for
word
in
words
:
text
=
word
[
'text'
]
.
strip
()
.
upper
()
text
=
word
[
'text'
]
.
strip
()
.
upper
()
# 首先检查是否在排除列表中(与HTML完全一致)
# 首先检查是否在排除列表中(与HTML完全一致)
is_excluded
=
False
is_excluded
=
False
for
exclude_text
in
EXCLUDE_TEXTS
:
for
exclude_text
in
EXCLUDE_TEXTS
:
...
@@ -488,21 +491,21 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -488,21 +491,21 @@ class BatchGetPodInfoWizard(models.TransientModel):
if
exclude_upper
in
text
or
text
in
exclude_upper
:
if
exclude_upper
in
text
or
text
in
exclude_upper
:
is_excluded
=
True
is_excluded
=
True
break
break
# 检查页码模式(Page X of Y)(与HTML完全一致)
# 检查页码模式(Page X of Y)(与HTML完全一致)
import
re
import
re
if
not
is_excluded
and
(
re
.
match
(
r'^PAGE\s+\d+\s+OF\s+\d+$'
,
text
)
or
re
.
match
(
r'^\d+\s*/\s*\d+$'
,
text
)):
if
not
is_excluded
and
(
re
.
match
(
r'^PAGE\s+\d+\s+OF\s+\d+$'
,
text
)
or
re
.
match
(
r'^\d+\s*/\s*\d+$'
,
text
)):
is_excluded
=
True
is_excluded
=
True
if
is_excluded
:
if
is_excluded
:
# _logger.info(f"排除文字: {word['text']}")
# _logger.info(f"排除文字: {word['text']}")
continue
continue
# 检查目标文字匹配(与HTML完全一致)
# 检查目标文字匹配(与HTML完全一致)
for
target_text
in
TARGET_TEXTS
:
for
target_text
in
TARGET_TEXTS
:
target_upper
=
target_text
.
upper
()
target_upper
=
target_text
.
upper
()
is_match
=
False
is_match
=
False
if
target_text
==
'AGN'
:
if
target_text
==
'AGN'
:
# AGN使用精确匹配
# AGN使用精确匹配
is_match
=
text
==
'AGN'
is_match
=
text
==
'AGN'
...
@@ -512,27 +515,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -512,27 +515,26 @@ class BatchGetPodInfoWizard(models.TransientModel):
else
:
else
:
# 其他文字使用包含匹配,但更严格(与HTML完全一致)
# 其他文字使用包含匹配,但更严格(与HTML完全一致)
is_match
=
target_upper
in
text
and
\
is_match
=
target_upper
in
text
and
\
'AIR'
not
in
text
and
\
'AIR'
not
in
text
and
\
'EQK'
not
in
text
and
\
'EQK'
not
in
text
and
\
'ARN'
not
in
text
'ARN'
not
in
text
# 如果精确匹配失败,尝试模糊匹配(与HTML完全一致)
# 如果精确匹配失败,尝试模糊匹配(与HTML完全一致)
if
not
is_match
and
target_text
!=
'AGN'
and
target_text
!=
'LTD'
:
if
not
is_match
and
target_text
!=
'AGN'
and
target_text
!=
'LTD'
:
is_match
=
self
.
_fuzzy_match
(
text
,
target_upper
)
is_match
=
self
.
_fuzzy_match
(
text
,
target_upper
)
if
is_match
:
if
is_match
:
# 坐标转换(适配PyMuPDF坐标系统)
# 坐标转换(适配PyMuPDF坐标系统)
scale_x
=
page_width
/
viewport_width
scale_x
=
page_width
/
viewport_width
scale_y
=
page_height
/
viewport_height
scale_y
=
page_height
/
viewport_height
# PyMuPDF使用左下角为原点,OCR使用左上角为原点
# PyMuPDF使用左下角为原点,OCR使用左上角为原点
# 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置
# 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置
converted_x
=
word
[
'bbox'
][
'x0'
]
*
scale_x
converted_x
=
word
[
'bbox'
][
'x0'
]
*
scale_x
converted_y
=
(
word
[
'bbox'
][
'y0'
]
*
scale_y
)
# 直接使用OCR的Y坐标
converted_y
=
(
word
[
'bbox'
][
'y0'
]
*
scale_y
)
# 直接使用OCR的Y坐标
converted_width
=
(
word
[
'bbox'
][
'x1'
]
-
word
[
'bbox'
][
'x0'
])
*
scale_x
converted_width
=
(
word
[
'bbox'
][
'x1'
]
-
word
[
'bbox'
][
'x0'
])
*
scale_x
converted_height
=
(
word
[
'bbox'
][
'y1'
]
-
word
[
'bbox'
][
'y0'
])
*
scale_y
converted_height
=
(
word
[
'bbox'
][
'y1'
]
-
word
[
'bbox'
][
'y0'
])
*
scale_y
found_texts
.
append
({
found_texts
.
append
({
'text'
:
target_text
,
'text'
:
target_text
,
'full_text'
:
word
[
'text'
],
'full_text'
:
word
[
'text'
],
...
@@ -545,7 +547,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -545,7 +547,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
'type'
:
'agn'
if
target_text
==
'AGN'
else
'uclink'
'type'
:
'agn'
if
target_text
==
'AGN'
else
'uclink'
})
})
break
break
return
found_texts
return
found_texts
def
_fuzzy_match
(
self
,
str1
,
str2
):
def
_fuzzy_match
(
self
,
str1
,
str2
):
...
@@ -555,14 +557,14 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -555,14 +557,14 @@ class BatchGetPodInfoWizard(models.TransientModel):
import
re
import
re
s1
=
re
.
sub
(
r'[^A-Z]'
,
''
,
str1
)
s1
=
re
.
sub
(
r'[^A-Z]'
,
''
,
str1
)
s2
=
re
.
sub
(
r'[^A-Z]'
,
''
,
str2
)
s2
=
re
.
sub
(
r'[^A-Z]'
,
''
,
str2
)
if
len
(
s1
)
==
0
or
len
(
s2
)
==
0
:
if
len
(
s1
)
==
0
or
len
(
s2
)
==
0
:
return
False
return
False
# 计算编辑距离
# 计算编辑距离
distance
=
self
.
_levenshtein_distance
(
s1
,
s2
)
distance
=
self
.
_levenshtein_distance
(
s1
,
s2
)
max_len
=
max
(
len
(
s1
),
len
(
s2
))
max_len
=
max
(
len
(
s1
),
len
(
s2
))
# 如果编辑距离小于等于最大长度的1/3,认为匹配
# 如果编辑距离小于等于最大长度的1/3,认为匹配
return
distance
<=
max_len
/
3
return
distance
<=
max_len
/
3
...
@@ -572,10 +574,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -572,10 +574,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
"""
if
len
(
s1
)
<
len
(
s2
):
if
len
(
s1
)
<
len
(
s2
):
return
self
.
_levenshtein_distance
(
s2
,
s1
)
return
self
.
_levenshtein_distance
(
s2
,
s1
)
if
len
(
s2
)
==
0
:
if
len
(
s2
)
==
0
:
return
len
(
s1
)
return
len
(
s1
)
previous_row
=
list
(
range
(
len
(
s2
)
+
1
))
previous_row
=
list
(
range
(
len
(
s2
)
+
1
))
for
i
,
c1
in
enumerate
(
s1
):
for
i
,
c1
in
enumerate
(
s1
):
current_row
=
[
i
+
1
]
current_row
=
[
i
+
1
]
...
@@ -585,7 +587,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -585,7 +587,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
substitutions
=
previous_row
[
j
]
+
(
c1
!=
c2
)
substitutions
=
previous_row
[
j
]
+
(
c1
!=
c2
)
current_row
.
append
(
min
(
insertions
,
deletions
,
substitutions
))
current_row
.
append
(
min
(
insertions
,
deletions
,
substitutions
))
previous_row
=
current_row
previous_row
=
current_row
return
previous_row
[
-
1
]
return
previous_row
[
-
1
]
def
_save_and_return_download_link
(
self
,
file_info
):
def
_save_and_return_download_link
(
self
,
file_info
):
...
@@ -598,22 +600,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -598,22 +600,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 获取处理后的PDF数据
# 获取处理后的PDF数据
file_data
=
file_info
.
get
(
'file_data'
,
''
)
file_data
=
file_info
.
get
(
'file_data'
,
''
)
file_name
=
file_info
.
get
(
'file_name'
,
'processed.pdf'
)
file_name
=
file_info
.
get
(
'file_name'
,
'processed.pdf'
)
if
not
file_data
:
if
not
file_data
:
raise
ValidationError
(
_
(
'No processed file data available'
))
#
提示:没有处理后的文件数据
raise
ValidationError
(
_
(
'No processed file data available'
))
#
提示:没有处理后的文件数据
# 解码base64数据
# 解码base64数据
if
isinstance
(
file_data
,
str
):
if
isinstance
(
file_data
,
str
):
pdf_binary
=
base64
.
b64decode
(
file_data
)
pdf_binary
=
base64
.
b64decode
(
file_data
)
else
:
else
:
pdf_binary
=
file_data
pdf_binary
=
file_data
# 确保PDF数据有效
# 确保PDF数据有效
if
not
pdf_binary
.
startswith
(
b
'
%
PDF-'
):
if
not
pdf_binary
.
startswith
(
b
'
%
PDF-'
):
_logger
.
error
(
f
"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}"
)
_logger
.
error
(
f
"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}"
)
_logger
.
error
(
f
"文件头(hex): {pdf_binary[:20].hex()}"
)
_logger
.
error
(
f
"文件头(hex): {pdf_binary[:20].hex()}"
)
_logger
.
error
(
f
"文件大小: {len(pdf_binary)}字节"
)
_logger
.
error
(
f
"文件大小: {len(pdf_binary)}字节"
)
# 尝试修复:如果是base64字符串被错误处理
# 尝试修复:如果是base64字符串被错误处理
if
isinstance
(
file_data
,
str
)
and
len
(
file_data
)
>
100
:
if
isinstance
(
file_data
,
str
)
and
len
(
file_data
)
>
100
:
_logger
.
info
(
"尝试重新解码base64数据..."
)
_logger
.
info
(
"尝试重新解码base64数据..."
)
...
@@ -631,7 +633,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -631,7 +633,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
raise
ValidationError
(
_
(
'Invalid PDF data for saving: not a valid PDF format'
))
raise
ValidationError
(
_
(
'Invalid PDF data for saving: not a valid PDF format'
))
else
:
else
:
raise
ValidationError
(
_
(
'Invalid PDF data for saving: not a valid PDF format'
))
raise
ValidationError
(
_
(
'Invalid PDF data for saving: not a valid PDF format'
))
# 验证PDF可以打开
# 验证PDF可以打开
try
:
try
:
import
fitz
import
fitz
...
@@ -641,7 +643,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -641,7 +643,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
except
Exception
as
e
:
except
Exception
as
e
:
_logger
.
error
(
f
"PDF验证失败: {str(e)}"
)
_logger
.
error
(
f
"PDF验证失败: {str(e)}"
)
raise
ValidationError
(
_
(
'Invalid PDF data for saving: cannot open PDF -
%
s'
)
%
str
(
e
))
raise
ValidationError
(
_
(
'Invalid PDF data for saving: cannot open PDF -
%
s'
)
%
str
(
e
))
# 创建附件记录
# 创建附件记录
attachment
=
self
.
env
[
'ir.attachment'
]
.
create
({
attachment
=
self
.
env
[
'ir.attachment'
]
.
create
({
'name'
:
f
'processed_{file_name}'
,
'name'
:
f
'processed_{file_name}'
,
...
@@ -651,16 +653,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
...
@@ -651,16 +653,17 @@ class BatchGetPodInfoWizard(models.TransientModel):
'res_model'
:
'batch.get.pod.info.wizard'
,
'res_model'
:
'batch.get.pod.info.wizard'
,
'res_id'
:
self
.
id
,
'res_id'
:
self
.
id
,
})
})
_logger
.
info
(
f
"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}"
)
_logger
.
info
(
f
"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}"
)
# 返回下载动作
# 返回下载动作
return
{
return
{
'type'
:
'ir.actions.act_url'
,
'type'
:
'ir.actions.act_url'
,
'url'
:
f
'/web/content/{attachment.id}?download=true'
,
'url'
:
f
'/web/content/{attachment.id}?download=true'
,
'target'
:
'new'
,
'target'
:
'new'
,
}
}
except
Exception
as
e
:
except
Exception
as
e
:
_logger
.
error
(
f
"保存PDF附件失败: {str(e)}"
)
_logger
.
error
(
f
"保存PDF附件失败: {str(e)}"
)
raise
ValidationError
(
_
(
'Failed to save PDF attachment:
%
s'
)
%
str
(
e
))
raise
ValidationError
(
_
(
'Failed to save PDF attachment:
%
s'
)
%
str
(
e
))
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论