Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
H
hh_ccs
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
贺阳
hh_ccs
Commits
ee951ff9
提交
ee951ff9
authored
10月 17, 2025
作者:
贺阳
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
不调接口的测试
上级
e45ced44
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
499 行增加
和
128 行删除
+499
-128
cc_bl_view.xml
ccs_base/views/cc_bl_view.xml
+1
-1
batch_get_pod_info_wizard.py
ccs_base/wizard/batch_get_pod_info_wizard.py
+498
-127
没有找到文件。
ccs_base/views/cc_bl_view.xml
浏览文件 @
ee951ff9
...
...
@@ -471,7 +471,7 @@
<field
name=
"model_id"
ref=
"model_cc_bl"
/>
<field
name=
"binding_model_id"
ref=
"model_cc_bl"
/>
<field
name=
"state"
>
code
</field>
<field
name=
"binding_view_types"
>
list
</field>
<field
name=
"binding_view_types"
>
list
,form
</field>
<field
name=
"groups_id"
eval=
"[(4, ref('ccs_base.group_clearance_of_customs_user'))]"
/>
<field
name=
"code"
>
if records:
...
...
ccs_base/wizard/batch_get_pod_info_wizard.py
浏览文件 @
ee951ff9
# -*- coding: utf-8 -*-
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import
io
import
logging
import
base64
import
requests
from
odoo
import
models
,
fields
,
_
from
odoo.exceptions
import
ValidationError
...
...
@@ -40,11 +41,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
Confirm operation # 确认操作
"""
try
:
bl_objs
=
self
.
get_order
()
# 调用接口获取提单pdf文件
pdf_file_arr
=
self
.
_get_pdf_file_arr
()
# 调用接口获取提单pdf文件
# pdf_file_arr = self._get_pdf_file_arr()
pdf_file_arr
=
self
.
_get_pdf_file_arr_test
()
if
not
pdf_file_arr
:
raise
ValidationError
(
_
(
'No PDF files found'
))
#提示:没有获取到PDF文件
# 处理PDF文件,匹配提单对象
processed_files
=
self
.
_match_bl_by_file_name
(
pdf_file_arr
)
# 把没有匹配到文件的进行提示
...
...
@@ -56,47 +59,197 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 英文提示
raise
ValidationError
(
_
(
'
%
s bill of loading cannot find release note file'
)
%
(
', '
.
join
([
bl
.
bl_no
for
bl
in
error_bl
])))
# xx提单无法找到release note文件
# 先涂抹指定文字
if
self
.
remove_specified_text
:
processed_files
=
self
.
_remove_specified_text
(
processed_files
)
# 用于测试的:保存处理后的PDF并返回下载链接
# if processed_files and processed_files[0].get('file_data'):
# return self._save_and_return_download_link(processed_files[0])
# 再同步和回写
if
self
.
sync_last_mile_pod
:
self
.
_sync_last_mile_pod
(
processed_files
)
# 显示成功消息
return
{
'type'
:
'ir.actions.client'
,
'tag'
:
'display_notification'
,
'params'
:
{
'title'
:
_
(
'Operation Completed'
),
# 操作完成
'message'
:
_
(
'Successfully processed
%
d PDF files for
%
d bill of loadings'
)
%
(
len
(
processed_files
),
len
(
bl_objs
)),
# 成功处理了%d个PDF文件,涉及%d个提单
'type'
:
'success'
,
}
}
def _get_pdf_file_arr_test(self):
    """
    Build the PDF file list from stored clearance files instead of the API (test helper).

    For every bill of lading returned by ``self.get_order()`` the clearance
    file of type '尾程交接POD(待大包数量和箱号)' is looked up. Its ``file``
    value is accepted when it is raw PDF bytes, or base64 (bytes or str) that
    decodes to PDF bytes, and when PyMuPDF can open it. Files that fail any
    of these checks are logged and skipped.

    :return: list of dicts ``{'bl_no': ..., 'file_name': ..., 'file_data': <base64 str>}``
    """
    pdf_file_arr = []
    bl_objs = self.get_order()
    for bl in bl_objs:
        # Look up the clearance file that holds the POD document of this bill of lading.
        clearance_file = self.env['cc.clearance.file'].sudo().search_clearance_file(
            bl.id, '尾程交接POD(待大包数量和箱号)')
        if not (clearance_file and clearance_file.file):
            _logger.warning(f"未找到清关文件,提单号: {bl.bl_no}")
            continue

        try:
            file_data = clearance_file.file
            if isinstance(file_data, bytes):
                # Bytes without a PDF header may still be a base64-encoded PDF.
                if not file_data.startswith(b'%PDF-'):
                    try:
                        decoded_data = base64.b64decode(file_data)
                    except Exception as e:
                        # Log and skip, exactly like the str branch below
                        # (previously a stray raise made this unreachable).
                        _logger.warning(f"尝试base64解码失败,提单号: {bl.bl_no}, 错误: {str(e)}")
                        continue
                    if not decoded_data.startswith(b'%PDF-'):
                        _logger.warning(f"base64解码后仍不是PDF格式,提单号: {bl.bl_no}")
                        continue
                    _logger.info(f"发现base64编码的PDF数据,提单号: {bl.bl_no}")
                    file_data = decoded_data
            elif isinstance(file_data, str):
                # A str value is always treated as base64.
                try:
                    decoded_data = base64.b64decode(file_data)
                except Exception as e:
                    _logger.warning(f"字符串base64解码失败,提单号: {bl.bl_no}, 错误: {str(e)}")
                    continue
                if not decoded_data.startswith(b'%PDF-'):
                    _logger.warning(f"字符串base64解码后不是PDF格式,提单号: {bl.bl_no}")
                    continue
                _logger.info(f"字符串base64解码成功,是有效PDF,提单号: {bl.bl_no}")
                file_data = decoded_data
            else:
                _logger.warning(f"清关文件数据格式不正确,类型: {type(file_data)},提单号: {bl.bl_no}")
                continue

            # Make sure the PDF can actually be opened before handing it on.
            try:
                import fitz
                test_doc = fitz.open(stream=file_data, filetype="pdf")
                page_count = len(test_doc)
                test_doc.close()
                _logger.info(f"清关文件PDF验证成功,页数: {page_count},提单号: {bl.bl_no}")
            except Exception as e:
                _logger.warning(f"清关文件PDF无法打开,提单号: {bl.bl_no}, 错误: {str(e)}")
                continue

            # Downstream steps expect the payload as a base64 text string.
            file_data_base64 = base64.b64encode(file_data).decode('utf-8')
            pdf_file_arr.append({
                'bl_no': self.env['common.common'].sudo().process_match_str(bl.bl_no),
                'file_name': clearance_file.attachment_name or clearance_file.file_name,
                'file_data': file_data_base64,
            })
            _logger.info(f"成功添加PDF文件,提单号: {bl.bl_no}, 文件名: {clearance_file.attachment_name or clearance_file.file_name}")
        except Exception as e:
            # Any other failure for this file is logged; the loop moves on.
            _logger.error(f"处理清关文件失败,提单号: {bl.bl_no}, 错误: {str(e)}")
            continue

    _logger.info(f"从测试数据获取PDF文件,成功获取{len(pdf_file_arr)}个文件")
    return pdf_file_arr
# 写一个方法掉接口获取提单pdf文件
def _get_pdf_file_arr(self):
    """
    Get PDF files from the release-note API.

    Posts the normalized bill numbers of the selected bills of lading to
    ``<api_url>/api/release-notes/pdfs`` and keeps every returned item that
    is flagged ``success``, carries ``bill_number``/``filename``/``base64``,
    and whose payload decodes to a PDF that PyMuPDF can open.

    :return: list of dicts ``{'bl_no': ..., 'file_name': ..., 'file_data': <base64 str>}``
    :raises ValidationError: when the API URL is empty, the request fails,
        the API answers with a non-200 status, an empty body, ``success``
        false or no results, or when no returned file passes validation.
    """
    # Normalized bill numbers of the currently selected bills of lading.
    bl_objs = self.get_order()
    bill_numbers = [self.env['common.common'].sudo().process_match_str(bl.bl_no) for bl in bl_objs]

    api_url = self.env['ir.config_parameter'].sudo().get_param(
        'last_mile_pod_api_url', 'http://172.104.52.150:7002')
    if not api_url:
        raise ValidationError(_('API URL not configured'))

    request_data = {"bill_numbers": bill_numbers}
    try:
        response = requests.post(
            f"{api_url}/api/release-notes/pdfs",
            headers={'Content-Type': 'application/json'},
            json=request_data,
            # Without a timeout requests waits indefinitely on a silent server;
            # a timeout surfaces as RequestException and is handled below.
            timeout=60,
        )
        if response.status_code != 200:
            _logger.error(f"API调用失败,状态码: {response.status_code}")
            _logger.error(f"响应内容: {response.text}")
            raise ValidationError(_('Failed to get PDF file from API: %s') % response.text)

        result = response.json()
        if not result:
            _logger.error("API返回空响应")
            raise ValidationError(_('API returned empty response'))
        if not result.get('success'):
            error_msg = result.get('message', 'Unknown error')
            _logger.error(f"API返回失败状态: {error_msg}")
            raise ValidationError(_('API returned error: %s') % error_msg)

        results = result.get('results', [])
        if not results:
            _logger.warning("API调用成功,但没有PDF文件")
            raise ValidationError(_('No PDF files found in API response'))

        pdf_file_arr = []
        for result_item in results:
            if not result_item.get('success'):
                continue

            bill_number = result_item.get('bill_number')
            filename = result_item.get('filename')
            base64_data = result_item.get('base64')
            if not all([bill_number, filename, base64_data]):
                _logger.warning(f"跳过无效的PDF文件项: {result_item}")
                continue

            try:
                pdf_binary = base64.b64decode(base64_data)
                if not pdf_binary.startswith(b'%PDF-'):
                    _logger.warning(f"API返回的文件不是有效的PDF格式,提单号: {bill_number}")
                    continue

                # Make sure the PDF can actually be opened.
                try:
                    import fitz
                    test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
                    page_count = len(test_doc)
                    test_doc.close()
                    _logger.info(f"API PDF验证成功,页数: {page_count},提单号: {bill_number}")
                except Exception as e:
                    _logger.warning(f"API PDF文件无法打开,提单号: {bill_number}, 错误: {str(e)}")
                    continue

                pdf_file_arr.append({
                    'bl_no': bill_number,
                    'file_name': filename,
                    'file_data': base64_data,
                })
                _logger.info(f"成功添加API PDF文件,提单号: {bill_number}, 文件名: {filename}")
            except Exception as e:
                # A broken item must not abort the whole batch.
                _logger.warning(f"API PDF文件验证失败,提单号: {bill_number}, 错误: {str(e)}")

        if not pdf_file_arr:
            _logger.error("所有API PDF文件验证都失败")
            raise ValidationError(_('All API PDF files failed validation'))

        _logger.info(f"API调用成功,获取到{len(pdf_file_arr)}个有效PDF文件")
        return pdf_file_arr
    except requests.exceptions.RequestException as e:
        _logger.error(f"API请求异常: {str(e)}")
        raise ValidationError(_('API request failed: %s') % str(e))
def
_write_pdf_file
(
self
,
processed_files
):
"""
...
...
@@ -109,7 +262,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
bl
=
file_info
[
'bl'
]
file_name
=
file_info
[
'file_name'
]
file_data
=
file_info
[
'file_data'
]
try
:
# 查找或创建清关文件记录
clearance_file
=
self
.
env
[
'cc.clearance.file'
]
.
sudo
()
.
search_clearance_file
(
bl
.
id
,
'尾程交接POD(待大包数量和箱号)'
)
...
...
@@ -128,13 +280,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
'attachment_name'
:
file_name
,
'file'
:
file_data
})
except
Exception
as
e
:
raise
ValidationError
(
_
(
'Failed to write PDF file
%
s:
%
s'
)
%
(
file_name
,
str
(
e
)))
def
_match_bl_by_file_name
(
self
,
pdf_file_arr
):
"""
Match BL by file name and return processed array # 根据文件名匹配提单并返回处理后的数组
:param pdf_file_arr: PDF文件数组 [{'b
l_no':'', 'file_
name':'', 'file_data':''}]
:param pdf_file_arr: PDF文件数组 [{'b
ill_number':'', 'file
name':'', 'file_data':''}]
:return: 处理后的数组 [{'bl': bl_obj, 'file_name': 'xxx.pdf', 'file_data': 'xxx', 'matched': True/False}]
"""
bl_obj
=
self
.
get_order
()
# 获取当前选中的提单对象
...
...
@@ -142,13 +292,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
for
bl
in
bl_obj
:
select_bl_no
=
self
.
env
[
'common.common'
]
.
sudo
()
.
process_match_str
(
bl
.
bl_no
)
for
pdf_file
in
pdf_file_arr
:
file_name
=
pdf_file
.
get
(
'file_name'
,
''
)
# 获取文件名
file_data
=
pdf_file
.
get
(
'file_data'
,
''
)
# 获取文件数据
bl_no
=
pdf_file
.
get
(
'bl_no'
,
''
)
# 获取提单号
if
not
bl_no
:
# 从文件名获取提单号 合并提单_436-10259804_20251008.pdf
split_bl_no
=
file_name
.
split
(
'_'
)[
1
]
bl_no
=
self
.
env
[
'common.common'
]
.
sudo
()
.
process_match_str
(
split_bl_no
)
# 尝试不同的字段名(API可能使用不同的字段名)
file_name
=
pdf_file
.
get
(
'file_name'
)
# 获取文件名
file_data
=
pdf_file
.
get
(
'file_data'
)
# 获取文件数据
bl_no
=
pdf_file
.
get
(
'bl_no'
)
# 获取提单号
if
bl_no
and
select_bl_no
==
bl_no
:
# 构建处理后的文件信息
processed_file
=
{
...
...
@@ -156,10 +303,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
'file_name'
:
file_name
,
'file_data'
:
file_data
,
'bl_no'
:
bl_no
,
'original_data'
:
pdf_file
# 保留原始数据
}
processed_files
.
append
(
processed_file
)
break
_logger
.
info
(
f
"匹配完成,成功匹配{len(processed_files)}个文件,匹配结果: {processed_files}"
)
return
processed_files
def
_sync_last_mile_pod
(
self
,
processed_files
):
...
...
@@ -169,14 +316,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
"""
# 回写PDF文件到清关文件
self
.
_write_pdf_file
(
processed_files
)
return
False
#测试 先不同步
# 同步尾程POD信息
for
file_info
in
processed_files
:
if
not
file_info
[
'bl'
]:
continue
bl
=
file_info
[
'bl'
]
try
:
# 查找清关文件并执行同步
clearance_files
=
self
.
env
[
'cc.clearance.file'
]
.
sudo
()
.
search_clearance_file
(
bl
.
id
,
'尾程交接POD(待大包数量和箱号)'
)
...
...
@@ -184,10 +330,6 @@ class BatchGetPodInfoWizard(models.TransientModel):
clearance_file
.
action_sync
()
# 同步尾程POD
_logger
.
info
(
f
"Successfully synced POD for BL {bl.bl_no}"
)
except
Exception
as
e
:
_logger
.
error
(
f
"Failed to sync POD for BL {bl.bl_no}: {str(e)}"
)
raise
ValidationError
(
_
(
'Failed to sync POD for BL
%
s:
%
s'
)
%
(
bl
.
bl_no
,
str
(
e
)))
def
_remove_specified_text
(
self
,
processed_files
):
"""
Remove specified text from PDF files using OCR recognition # 使用OCR识别涂抹指定文字
...
...
@@ -205,12 +347,34 @@ class BatchGetPodInfoWizard(models.TransientModel):
file_data
=
file_info
[
'file_data'
]
processed_file_data
=
file_data
# 默认使用原始数据
try
:
# 使用OCR识别和删除指定文字
if
file_data
and
file_data
!=
'base64_data'
:
# 跳过测试数据
if
file_data
:
# 将base64数据转换为二进制
import
base64
try
:
pdf_binary
=
base64
.
b64decode
(
file_data
)
_logger
.
info
(
f
"Base64解码成功,数据大小: {len(pdf_binary)}字节,提单号: {bl.bl_no}"
)
# 验证PDF文件头
if
not
pdf_binary
.
startswith
(
b
'
%
PDF-'
):
_logger
.
error
(
f
"解码后的数据不是有效的PDF文件,提单号: {bl.bl_no}"
)
_logger
.
error
(
f
"文件头: {pdf_binary[:20]}"
)
raise
ValidationError
(
_
(
'Decoded data is not a valid PDF file for BL
%
s'
)
%
bl
.
bl_no
)
# 验证PDF可以打开
try
:
import
fitz
test_doc
=
fitz
.
open
(
stream
=
pdf_binary
,
filetype
=
"pdf"
)
page_count
=
len
(
test_doc
)
test_doc
.
close
()
_logger
.
info
(
f
"PDF验证成功,页数: {page_count},提单号: {bl.bl_no}"
)
except
Exception
as
e
:
_logger
.
error
(
f
"PDF文件无法打开,提单号: {bl.bl_no}, 错误: {str(e)}"
)
raise
ValidationError
(
_
(
'PDF file cannot be opened for BL
%
s:
%
s'
)
%
(
bl
.
bl_no
,
str
(
e
)))
except
Exception
as
e
:
_logger
.
error
(
f
"Base64解码失败,提单号: {bl.bl_no}, 错误: {str(e)}"
)
raise
ValidationError
(
_
(
'Failed to decode base64 data for BL
%
s:
%
s'
)
%
(
bl
.
bl_no
,
str
(
e
)))
# 使用OCR方法处理PDF
processed_pdf
=
self
.
_process_pdf_with_ocr
(
...
...
@@ -219,12 +383,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
)
# 将处理后的PDF转换回base64
processed_file_data
=
base64
.
b64encode
(
processed_pdf
)
processed_file_data
=
base64
.
b64encode
(
processed_pdf
)
.
decode
(
'utf-8'
)
_logger
.
info
(
f
"Successfully removed specified text from PDF for BL {bl.bl_no}"
)
except
Exception
as
e
:
_logger
.
error
(
f
"Failed to remove text from PDF for BL {bl.bl_no}: {str(e)}"
)
raise
ValidationError
(
_
(
'Failed to remove text from PDF for BL
%
s:
%
s'
)
%
(
bl
.
bl_no
,
str
(
e
)))
_logger
.
info
(
f
"处理后的PDF base64数据长度: {len(processed_file_data)}"
)
# 更新文件信息,使用处理后的PDF数据
updated_file_info
=
file_info
.
copy
()
...
...
@@ -235,22 +396,27 @@ class BatchGetPodInfoWizard(models.TransientModel):
def
_process_pdf_with_ocr
(
self
,
pdf_data
,
bl_no
):
"""
Process PDF with OCR recognition and text removal # 使用OCR识别处理PDF并删除文字
Process PDF with OCR recognition and text removal
(完全按照HTML逻辑)
# 使用OCR识别处理PDF并删除文字
:param pdf_data: PDF二进制数据
:param bl_no: 提单号(用于日志)
:return: 处理后的PDF二进制数据
"""
import
os
import
fitz
# PyMuPDF
import
cv2
import
numpy
as
np
from
PIL
import
Image
import
pytesseract
import
base64
import
io
# 定义目标文字和排除文字(与HTML文件保持一致)
TARGET_TEXTS
=
[
'AGN'
,
'UCLINK LOGISITICS LTD'
,
'UCLINK LOGISITICS'
,
'UCLINK'
,
'LOGISITICS'
,
'LOGISTICS'
,
'LTD'
]
EXCLUDE_TEXTS
=
[
'AIR EQK'
,
'ARN'
,
'EQK'
,
'AIR'
,
'Page 1 of 1'
,
'Page 2 of 2'
,
'Page 3 of 3'
,
'Page 4 of 4'
,
'Page 5 of 5'
]
# 设置Tesseract路径
self
.
_setup_tesseract_path
()
# 验证PDF数据
if
not
pdf_data
or
not
pdf_data
.
startswith
(
b
'
%
PDF-'
):
_logger
.
error
(
f
"PDF数据无效,提单号: {bl_no}"
)
raise
ValidationError
(
_
(
'Invalid PDF data for BL
%
s'
)
%
bl_no
)
_logger
.
info
(
f
"开始OCR处理PDF,提单号: {bl_no}"
)
# 打开PDF文档
pdf_document
=
fitz
.
open
(
stream
=
pdf_data
,
filetype
=
"pdf"
)
...
...
@@ -259,15 +425,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
detected_texts
=
[]
all_recognized_texts
=
[]
_logger
.
info
(
f
"开始OCR处理PDF,共{len(pdf_document)}页,提单号: {bl_no}"
)
# 处理每一页
# 处理每一页(完全按照HTML逻辑)
for
page_num
in
range
(
len
(
pdf_document
)):
page
=
pdf_document
[
page_num
]
_logger
.
info
(
f
"正在OCR识别第{page_num + 1}页"
)
try
:
# 将页面转换为图像(提高分辨率,与HTML文件保持一致)
# 将页面转换为图像(与HTML完全一致)
mat
=
fitz
.
Matrix
(
2.0
,
2.0
)
# 提高分辨率
pix
=
page
.
get_pixmap
(
matrix
=
mat
)
img_data
=
pix
.
tobytes
(
"png"
)
...
...
@@ -279,15 +442,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
# 转换为PIL图像
pil_img
=
Image
.
fromarray
(
cv2
.
cvtColor
(
img
,
cv2
.
COLOR_BGR2RGB
))
# 使用Tesseract进行OCR识别(优化配置,与HTML文件保持一致)
# OCR配置(与HTML完全一致)
config
=
'--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1 -c tessedit_do_invert=0 -c textord_min_linesize=1.0 -c classify_bln_numeric_mode=0 -c textord_force_make_prop_words=F -c textord_min_xheight=8 -c textord_tabfind_show_vlines=0'
# 使用Tesseract进行OCR识别
try
:
ocr_data
=
pytesseract
.
image_to_data
(
pil_img
,
output_type
=
pytesseract
.
Output
.
DICT
,
lang
=
'eng'
,
config
=
'--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- '
config
=
config
)
except
Exception
as
e
:
_logger
.
error
(
f
"OCR识别失败: {str(e)}"
)
continue
# 处理OCR结果
# 处理OCR结果(与HTML完全一致)
page_width
=
page
.
rect
.
width
page_height
=
page
.
rect
.
height
viewport_width
=
pil_img
.
width
...
...
@@ -312,24 +482,25 @@ class BatchGetPodInfoWizard(models.TransientModel):
all_recognized_texts
.
extend
(
page_recognized_texts
)
# 查找目标文字
# 查找目标文字(完全按照HTML逻辑)
page_texts
=
self
.
_find_target_texts
(
page_recognized_texts
,
page_num
,
viewport_width
,
viewport_height
,
page_width
,
page_height
,
TARGET_TEXTS
,
EXCLUDE_TEXTS
page_height
)
detected_texts
.
extend
(
page_texts
)
_logger
.
info
(
f
"第{page_num + 1}页OCR完成,找到{len(page_texts)}个目标文字"
)
# 在页面上绘制删除矩形
# 根据OCR结果删除文字(完全按照HTML逻辑)
if
page_texts
:
for
text_info
in
page_texts
:
# 超精确删除模式(与HTML
文件保持
一致)
# 超精确删除模式(与HTML
完全
一致)
rect
=
{
'x'
:
text_info
[
'x'
],
'y'
:
text_info
[
'y'
],
...
...
@@ -337,61 +508,162 @@ class BatchGetPodInfoWizard(models.TransientModel):
'height'
:
text_info
[
'height'
]
}
# 绘制白色矩形覆盖文字
try
:
page
.
draw_rect
(
fitz
.
Rect
(
rect
[
'x'
],
rect
[
'y'
],
rect
[
'x'
]
+
rect
[
'width'
],
rect
[
'y'
]
+
rect
[
'height'
]),
color
=
(
1
,
1
,
1
),
fill
=
(
1
,
1
,
1
)
color
=
(
1
,
1
,
1
),
# 白色
fill
=
(
1
,
1
,
1
)
# 填充白色
)
_logger
.
info
(
f
"删除目标文字: {text_info['text']}"
)
total_rectangles
+=
1
processed_pages
+=
1
except
Exception
as
e
:
_logger
.
warning
(
f
"第{page_num + 1}页OCR失败: {str(e)}"
)
# 使用回退策略:预设坐标
self
.
_apply_fallback_rectangles
(
page
,
page_num
)
_logger
.
error
(
f
"删除失败: {str(e)}"
)
else
:
_logger
.
warning
(
f
"第{page_num + 1}页没有找到目标文字"
)
processed_pages
+=
1
# 保存处理后的PDF
try
:
output_buffer
=
io
.
BytesIO
()
pdf_document
.
save
(
output_buffer
)
pdf_document
.
save
(
output_buffer
,
garbage
=
4
,
deflate
=
True
,
clean
=
True
)
pdf_document
.
close
()
result_data
=
output_buffer
.
getvalue
()
output_buffer
.
close
()
_logger
.
info
(
f
"PDF保存成功,数据大小: {len(result_data)}字节"
)
except
Exception
as
e
:
_logger
.
error
(
f
"PDF保存失败: {str(e)}"
)
pdf_document
.
close
()
raise
ValidationError
(
_
(
'Failed to save PDF:
%
s'
)
%
str
(
e
))
_logger
.
info
(
f
"PDF OCR处理完成,共处理{processed_pages}页,删除{total_rectangles}个文字区域,提单号: {bl_no}"
)
return
result_data
def
_
find_target_texts
(
self
,
words
,
page_num
,
viewport_width
,
viewport_height
,
page_width
,
page_height
,
target_texts
,
exclude_texts
):
def _setup_tesseract_path(self):
    """
    Point pytesseract at the Tesseract executable for the current OS.

    On Windows a fixed list of install locations is probed. On other systems
    the executable found on ``PATH`` is preferred, with a list of common
    install locations as fallback. The first existing candidate is assigned
    to ``pytesseract.pytesseract.tesseract_cmd``; if none exists a warning is
    logged and the setting is left untouched. Finally the tessdata check is
    run via ``self._check_tessdata_files()``.
    """
    import glob
    import os
    import shutil

    import pytesseract

    if os.name == 'nt':
        candidates = [
            r'C:\Program Files\Tesseract-OCR\tesseract.exe',
            r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
            # %USERNAME% must be expanded explicitly; os.path.exists does
            # not interpret environment-variable placeholders.
            os.path.expandvars(r'C:\Users\%USERNAME%\AppData\Local\Tesseract-OCR\tesseract.exe'),
        ]
        found = next((path for path in candidates if os.path.exists(path)), None)
        if found:
            pytesseract.pytesseract.tesseract_cmd = found
            _logger.info(f"设置Tesseract路径: {found}")
        else:
            _logger.warning("未找到Tesseract安装路径")
    else:
        tesseract_path = shutil.which('tesseract')
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            _logger.info(f"找到Tesseract路径: {tesseract_path}")
        else:
            patterns = [
                '/usr/bin/tesseract',
                '/usr/local/bin/tesseract',
                '/opt/homebrew/bin/tesseract',  # macOS on Apple silicon
                '/usr/local/Cellar/tesseract/*/bin/tesseract',  # macOS Homebrew, versioned dir
            ]
            # glob expands the '*' version directory; a pattern without
            # wildcards simply yields the path itself when it exists.
            matches = [path for pattern in patterns for path in sorted(glob.glob(pattern))]
            if matches:
                pytesseract.pytesseract.tesseract_cmd = matches[0]
                _logger.info(f"设置Tesseract路径: {matches[0]}")
            else:
                _logger.warning("未找到Tesseract,请确保已安装tesseract-ocr")

    # Verify that the language data files are present.
    self._check_tessdata_files()
def _check_tessdata_files(self):
    """
    Log whether the English Tesseract language data file can be found.

    The tessdata directory is first assumed to sit next to the configured
    Tesseract executable; if that directory does not exist, a list of common
    install locations is probed. The result is only logged (info when
    ``eng.traineddata`` exists, warnings otherwise); nothing is returned and
    no configuration is changed.
    """
    import glob
    import os
    import shutil

    import pytesseract

    tesseract_cmd = pytesseract.pytesseract.tesseract_cmd
    # A bare command name such as 'tesseract' has no directory part, so
    # resolve it through PATH before deriving the sibling tessdata directory.
    resolved_cmd = shutil.which(tesseract_cmd) or tesseract_cmd
    tessdata_dir = os.path.join(os.path.dirname(resolved_cmd), 'tessdata')

    if not os.path.exists(tessdata_dir):
        patterns = [
            '/usr/share/tesseract-ocr/tessdata',
            '/usr/local/share/tesseract-ocr/tessdata',
            '/opt/homebrew/share/tessdata',  # macOS on Apple silicon
            '/usr/local/Cellar/tesseract/*/share/tessdata',  # macOS Homebrew, versioned dir
        ]
        for pattern in patterns:
            # glob expands the '*' version directory, which os.path.exists
            # would treat as a literal character.
            matches = sorted(glob.glob(pattern))
            if matches:
                tessdata_dir = matches[0]
                break

    eng_data = os.path.join(tessdata_dir, 'eng.traineddata')
    if os.path.exists(eng_data):
        _logger.info(f"找到英语语言数据文件: {eng_data}")
    else:
        _logger.warning(f"未找到英语语言数据文件: {eng_data}")
        _logger.warning("请安装英语语言包: sudo apt-get install tesseract-ocr-eng")
def
_find_target_texts
(
self
,
words
,
page_num
,
viewport_width
,
viewport_height
,
page_width
,
page_height
):
"""
Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
"""
# 定义目标文字和排除文字(与HTML文件完全一致)
TARGET_TEXTS
=
[
'AGN'
,
'UCLINK LOGISITICS LTD'
,
'UCLINK LOGISITICS'
,
'UCLINK'
,
'LOGISITICS'
,
'LOGISTICS'
,
'LTD'
,
'UCLINKLOGISITICSLTD'
]
EXCLUDE_TEXTS
=
[
'AIR EQK'
,
'ARN'
,
'EQK'
,
'AIR'
,
'Page 1 of 1'
,
'Page 2 of 2'
,
'Page 3 of 3'
,
'Page 4 of 4'
,
'Page 5 of 5'
]
found_texts
=
[]
for
word
in
words
:
text
=
word
[
'text'
]
.
strip
()
.
upper
()
# 首先检查是否在排除列表中
# 首先检查是否在排除列表中
(与HTML完全一致)
is_excluded
=
False
for
exclude_text
in
exclude_texts
:
for
exclude_text
in
EXCLUDE_TEXTS
:
exclude_upper
=
exclude_text
.
upper
()
if
exclude_upper
in
text
or
text
in
exclude_upper
:
is_excluded
=
True
break
# 检查页码模式(Page X of Y)
# 检查页码模式(Page X of Y)
(与HTML完全一致)
import
re
if
not
is_excluded
and
(
re
.
match
(
r'^PAGE\s+\d+\s+OF\s+\d+$'
,
text
)
or
re
.
match
(
r'^\d+\s*/\s*\d+$'
,
text
)):
is_excluded
=
True
if
is_excluded
:
_logger
.
info
(
f
"排除文字: {word['text']}"
)
continue
# 检查目标文字匹配
for
target_text
in
target_texts
:
# 检查目标文字匹配
(与HTML完全一致)
for
target_text
in
TARGET_TEXTS
:
target_upper
=
target_text
.
upper
()
is_match
=
False
...
...
@@ -401,39 +673,30 @@ class BatchGetPodInfoWizard(models.TransientModel):
elif
target_text
==
'LTD'
:
# LTD使用精确匹配
is_match
=
text
==
'LTD'
elif
target_text
==
'UCLINK LOGISITICS LTD'
:
# 完整短语匹配
is_match
=
(
'UCLINK'
in
text
and
'LOGISITICS'
in
text
and
'LTD'
in
text
)
or
\
'UCLINK LOGISITICS LTD'
in
text
or
\
text
==
'UCLINK LOGISITICS LTD'
elif
target_text
==
'UCLINK LOGISITICS'
:
# 部分短语匹配
is_match
=
(
'UCLINK'
in
text
and
'LOGISITICS'
in
text
)
or
\
text
==
'UCLINK LOGISITICS'
elif
target_text
==
'UCLINK'
:
# 单独UCLINK匹配
is_match
=
text
==
'UCLINK'
or
text
.
startswith
(
'UCLINK '
)
elif
target_text
in
[
'LOGISITICS'
,
'LOGISTICS'
]:
# LOGISITICS/LOGISTICS匹配
is_match
=
text
in
[
'LOGISITICS'
,
'LOGISTICS'
]
or
\
text
.
startswith
(
'LOGISITICS'
)
or
text
.
startswith
(
'LOGISTICS'
)
else
:
# 其他文字使用包含匹配,但更严格
# 其他文字使用包含匹配,但更严格
(与HTML完全一致)
is_match
=
target_upper
in
text
and
\
'AIR'
not
in
text
and
\
'EQK'
not
in
text
and
\
'ARN'
not
in
text
# 如果精确匹配失败,尝试模糊匹配(与HTML完全一致)
if
not
is_match
and
target_text
!=
'AGN'
and
target_text
!=
'LTD'
:
is_match
=
self
.
_fuzzy_match
(
text
,
target_upper
)
if
is_match
:
# 坐标转换(
与HTML文件保持一致
)
# 坐标转换(
适配PyMuPDF坐标系统
)
scale_x
=
page_width
/
viewport_width
scale_y
=
page_height
/
viewport_height
# PyMuPDF使用左下角为原点,OCR使用左上角为原点
# 简化Y坐标转换:直接使用OCR的Y坐标,但调整到正确位置
converted_x
=
word
[
'bbox'
][
'x0'
]
*
scale_x
converted_y
=
(
viewport_height
-
word
[
'bbox'
][
'y1'
])
*
scale_y
converted_y
=
(
word
[
'bbox'
][
'y0'
]
*
scale_y
)
# 直接使用OCR的Y坐标
converted_width
=
(
word
[
'bbox'
][
'x1'
]
-
word
[
'bbox'
][
'x0'
])
*
scale_x
converted_height
=
(
word
[
'bbox'
][
'y1'
]
-
word
[
'bbox'
][
'y0'
])
*
scale_y
found_texts
.
append
({
'text'
:
target_text
,
'full_text'
:
word
[
'text'
],
...
...
@@ -449,26 +712,134 @@ class BatchGetPodInfoWizard(models.TransientModel):
return
found_texts
def
_
apply_fallback_rectangles
(
self
,
page
,
page_num
):
def
_
fuzzy_match
(
self
,
str1
,
str2
):
"""
Apply fallback rectangles when OCR fails # OCR失败时应用回退矩形
Fuzzy match function (与HTML完全一致) # 模糊匹配函数
"""
page_width
=
page
.
rect
.
width
page_height
=
page
.
rect
.
height
import
re
s1
=
re
.
sub
(
r'[^A-Z]'
,
''
,
str1
)
s2
=
re
.
sub
(
r'[^A-Z]'
,
''
,
str2
)
# 超精确的预设坐标覆盖(与HTML文件保持一致)
rectangles
=
[
{
'x'
:
50
,
'y'
:
page_height
-
200
,
'width'
:
60
,
'height'
:
10
},
# AGN
{
'x'
:
50
,
'y'
:
page_height
-
220
,
'width'
:
100
,
'height'
:
10
},
# UCLINK LOGISITICS
{
'x'
:
155
,
'y'
:
page_height
-
220
,
'width'
:
30
,
'height'
:
10
}
# LTD
]
if
len
(
s1
)
==
0
or
len
(
s2
)
==
0
:
return
False
# 计算编辑距离
distance
=
self
.
_levenshtein_distance
(
s1
,
s2
)
max_len
=
max
(
len
(
s1
),
len
(
s2
))
# 如果编辑距离小于等于最大长度的1/3,认为匹配
return
distance
<=
max_len
/
3
def
_levenshtein_distance
(
self
,
s1
,
s2
):
"""
Calculate Levenshtein distance (与HTML完全一致) # 计算编辑距离
"""
if
len
(
s1
)
<
len
(
s2
):
return
self
.
_levenshtein_distance
(
s2
,
s1
)
if
len
(
s2
)
==
0
:
return
len
(
s1
)
previous_row
=
list
(
range
(
len
(
s2
)
+
1
))
for
i
,
c1
in
enumerate
(
s1
):
current_row
=
[
i
+
1
]
for
j
,
c2
in
enumerate
(
s2
):
insertions
=
previous_row
[
j
+
1
]
+
1
deletions
=
current_row
[
j
]
+
1
substitutions
=
previous_row
[
j
]
+
(
c1
!=
c2
)
current_row
.
append
(
min
(
insertions
,
deletions
,
substitutions
))
previous_row
=
current_row
return
previous_row
[
-
1
]
def _save_and_return_download_link(self, file_info):
    """
    Save a processed PDF as an attachment and return a download action.

    :param file_info: dict with ``file_data`` (base64 str, or raw PDF bytes)
        and optionally ``file_name`` (defaults to ``'processed.pdf'``)
    :return: ``ir.actions.act_url`` action dict pointing at the new attachment
    :raises ValidationError: when no data is given, the data is not a PDF,
        PyMuPDF cannot open it, or the attachment cannot be created. Every
        failure is reported as 'Failed to save PDF attachment: ...'.
    """
    import base64

    try:
        file_data = file_info.get('file_data', '')
        file_name = file_info.get('file_name', 'processed.pdf')

        if not file_data:
            raise ValidationError(_('No processed file data available'))

        # A str is taken to be base64; anything else is used as raw bytes.
        if isinstance(file_data, str):
            _logger.info(f"输入是字符串类型,长度: {len(file_data)}")
            _logger.info(f"输入前50字符: {file_data[:50]}")
            pdf_binary = base64.b64decode(file_data)
        else:
            _logger.info(f"输入是bytes类型,长度: {len(file_data)}")
            _logger.info(f"输入前20字节: {file_data[:20]}")
            pdf_binary = file_data

        _logger.info(f"PDF二进制数据大小: {len(pdf_binary)}字节")
        _logger.info(f"PDF文件头: {pdf_binary[:20]}")
        _logger.info(f"PDF文件头(hex): {pdf_binary[:20].hex()}")

        # Reject anything without a PDF header. Decoding the same base64
        # string a second time cannot give a different result, so no retry.
        if not pdf_binary.startswith(b'%PDF-'):
            _logger.error(f"保存的PDF数据不是有效的PDF格式,文件头: {pdf_binary[:20]}")
            _logger.error(f"文件头(hex): {pdf_binary[:20].hex()}")
            _logger.error(f"文件大小: {len(pdf_binary)}字节")
            raise ValidationError(_('Invalid PDF data for saving: not a valid PDF format'))

        # Make sure the PDF can actually be opened.
        try:
            import fitz
            test_doc = fitz.open(stream=pdf_binary, filetype="pdf")
            _logger.info(f"PDF验证成功,页数: {len(test_doc)}")
            test_doc.close()
        except Exception as e:
            _logger.error(f"PDF验证失败: {str(e)}")
            raise ValidationError(_('Invalid PDF data for saving: cannot open PDF - %s') % str(e))

        attachment = self.env['ir.attachment'].create({
            'name': f'processed_{file_name}',
            'type': 'binary',
            'datas': base64.b64encode(pdf_binary),
            'mimetype': 'application/pdf',
            'res_model': 'batch.get.pod.info.wizard',
            'res_id': self.id,
        })

        _logger.info(f"成功保存处理后的PDF附件,文件名: {file_name}, 大小: {len(pdf_binary)}字节, 附件ID: {attachment.id}")

        return {
            'type': 'ir.actions.act_url',
            'url': f'/web/content/{attachment.id}?download=true',
            'target': 'new',
        }

    except Exception as e:
        # Includes the ValidationErrors raised above: they are re-wrapped so
        # the caller always sees the same message prefix.
        _logger.error(f"保存PDF附件失败: {str(e)}")
        raise ValidationError(_('Failed to save PDF attachment: %s') % str(e))
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论