贺阳 / hh_ccs · Commits · 2e0670ae

Commit 2e0670ae, authored Nov 06, 2025 by 贺阳
Commit message: Prompt on sync failure (同步失败的提示)
Parent: 9e6a63bc
Showing 1 changed file with 229 additions and 203 deletions (+229 -203)

ccs_base/wizard/batch_get_pod_info_wizard.py  +229 -203
@@ -6,12 +6,14 @@ import io
import json
import logging
import time
from datetime import datetime, timedelta
import requests
from odoo import models, fields, api, _
from odoo.exceptions import ValidationError
from .ai_image_edit_service import AIImageEditService
from datetime import datetime, timedelta

_logger = logging.getLogger(__name__)

@@ -44,7 +46,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
    skip_ocr_direct_ai = fields.Boolean(
        string='Skip OCR Direct AI',  # 跳过OCR直接使用AI
        default=False,
        help='Whether to skip OCR processing and directly use AI processing (for testing AI)'
        # 是否跳过OCR处理,直接使用AI处理(用于测试AI)
    )
    sync_match_node = fields.Boolean(

@@ -69,7 +72,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        help='Show error message')
    # PDF相关字段
    pdf_file = fields.Binary(string='PDF文件', help='涂抹后的所有pdf文件合并为一个pdf文件')
    pdf_filename = fields.Char(string='PDF文件名称')
    processed_files_data = fields.Text(string='已处理的文件数据', help='存储已处理的文件信息(JSON格式)')

@@ -80,12 +83,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
        try:
            bl_objs = self.get_order()
            _logger.info(f"开始预览操作,提单数量: {len(bl_objs)}")
            # 调用接口获取提单pdf文件
            pdf_file_arr = self._get_pdf_file_arr()
            # 处理PDF文件,匹配提单对象
            processed_files = self._match_bl_by_file_name(pdf_file_arr)
            # 把没有匹配到文件的进行提示
            error_bl = []
            matched_bl_ids = [f['bl'].id for f in processed_files if f.get('bl')]

@@ -97,11 +100,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
            if not self._context.get('is_skip_raise_error'):
                self.show_error_message = _('%s bill of loading cannot find release note file') % (', '.join([bl.bl_no for bl in error_bl]))
            # 如果启用了涂抹文字,进行处理
            if self.remove_specified_text and processed_files:
                processed_files = self._remove_specified_text(processed_files, debug_mode=False)
            # 分离成功和失败的文件
            successful_files = []
            failed_files = []

@@ -116,18 +119,18 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    error_msg = str(self.show_error_message)
                    if bl and bl.bl_no in error_msg:
                        has_error = True
                # 如果处理失败或者有错误,则认为失败
                if processing_failed or has_error or not file_data:
                    failed_files.append(file_info)
                else:
                    # 文件数据存在且处理成功
                    successful_files.append(file_info)
            # 只合并成功的文件
            if successful_files:
                self._merge_pdf_files(successful_files)
            # 如果所有文件都成功了(没有失败的文件),自动勾选"是否同步成功涂抹的提单"
            if len(successful_files) == len(processed_files) and not failed_files:
                self.sync_successful_processed = True

@@ -146,9 +149,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
                successful_bl_nos_str = '、'.join(successful_bl_nos) if successful_bl_nos else ''
                success_msg = f"\n成功处理的提单: {successful_bl_nos_str}" if successful_bl_nos_str else ''
                self.show_error_message = f"{existing_error}{success_msg}"
                _logger.info(f"部分提单处理失败(成功:{len(successful_files)},失败:{len(failed_files)}),成功处理的提单号已显示")
                self.sync_successful_processed = False
            # 序列化并存储处理后的文件数据(包括成功和失败的,但只有成功的才会合并PDF)
            if processed_files:
                self.processed_files_data = self._serialize_processed_files(processed_files)

@@ -172,7 +176,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
            else:
                self.processed_files_data = ''
                self.sync_successful_processed = False
            # 返回表单视图
            return {
                'type': 'ir.actions.act_window',

@@ -205,13 +209,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
        self.show_error_message = False
        bl_objs = self.get_order()
        _logger.info(f"%s提单开始执行批量获取POD信息操作" % len(bl_objs))
        # 优先使用已处理的文件数据(预览时已处理)
        processed_files = None
        if self.processed_files_data:
            processed_files = self._deserialize_processed_files(self.processed_files_data)
            _logger.info(f"使用已处理的文件数据,共 {len(processed_files)} 个文件")
            # 检查文件数据是否完整
            valid_files = []
            for file_info in processed_files:

@@ -221,7 +225,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    _logger.warning(f"提单 {file_info.get('bl', {}).get('bl_no', 'Unknown')} 的文件数据为空")
            processed_files = valid_files
            _logger.info(f"有效文件数量: {len(processed_files)}")
        # 如果没有已处理的数据,则执行处理流程
        if not processed_files:
            # 调用接口获取提单pdf文件

@@ -240,7 +244,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                if not self._context.get('is_skip_raise_error'):
                    self.show_error_message = _('%s bill of loading cannot find release note file') % (', '.join([bl.bl_no for bl in error_bl]))
            # 如果启用了涂抹文字,进行处理
            if self.remove_specified_text and processed_files:
                processed_files = self._remove_specified_text(processed_files, debug_mode=False)

@@ -257,11 +261,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
                'view_mode': 'form',
                'res_id': self.id,
                'target': 'new',
                'context': {'active_id': bl_objs.ids, }
            }
        # 检查是否有文字清除失败的错误
        if self.show_error_message and any('仍存在目标文字' in str(self.show_error_message) or '未完全清除文字' in str(self.show_error_message)):
            _logger.error(f"检测到文字清除失败,停止处理: {self.show_error_message}")
            return {
                'type': 'ir.actions.act_window',

@@ -272,7 +277,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                'target': 'new',
                'context': {'default_show_error_message': self.show_error_message, 'active_id': bl_objs.ids}
            }
        # 只处理成功涂抹的提单
        # 直接根据processed_files中的processing_failed标志筛选成功处理的文件,无需从文本解析
        successful_processed_files = []

@@ -291,19 +296,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
            _logger.info(f"从{len(processed_files)}个文件中筛选出{len(successful_processed_files)}个成功处理的文件")
        else:
            _logger.warning("没有找到已处理的文件数据")
        # 回写到附件信息
        if successful_processed_files and (self.sync_last_mile_pod or self.sync_match_node):
            # 回写PDF文件到清关文件
            self._write_pdf_file(successful_processed_files)
        # 再同步和回写
        if self.sync_last_mile_pod and successful_processed_files:
            self._sync_last_mile_pod(successful_processed_files)
        # 同步推送匹配节点
        if self.sync_match_node and successful_processed_files:
            # 且需先对比小包当前节点的操作时间是否小于提取时间(同时区对比)若大于则不能推送,
            # 若需补推节点,则需判断提取时间-写入节点(不取写入第一个节点)的前序间隔时间是否大于小包当前节点的操作时间。
            # 若不满足以上条件,则不执行生成和自动推送节点,并在小包上新增推送备注(新增该字段)回写备注信息:获取尾程POD,自动推送节点失败,有风险产生倒挂。请手动操作205-10-20 10:20:20(获取时间)
            valid_files = self._validate_node_push_conditions(successful_processed_files)
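The comment block above spells out the guard that _validate_node_push_conditions is expected to enforce: a node may only be pushed when the parcel's current node operation time is earlier than the time extracted from the POD file, compared in the same timezone. A minimal sketch of that comparison, assuming naive datetimes already normalized to one timezone and a hypothetical current_node_time attribute (not the wizard's actual field name):

    from datetime import datetime

    def filter_pushable(packages, extracted_time: datetime):
        """Split parcels into those safe to push and those that would go out of order."""
        pushable, blocked = [], []
        for pkg in packages:  # pkg.current_node_time is an assumed attribute
            if pkg.current_node_time and pkg.current_node_time < extracted_time:
                pushable.append(pkg)
            else:
                blocked.append(pkg)  # pushing would create a reversed node sequence (倒挂)
        return pushable, blocked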
@@ -311,11 +316,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
                self.get_date_sync_match_node(valid_files)
            else:
                _logger.info(f"没有满足条件的文件,不执行生成和自动推送节点")
        # 清理所有临时文件(包括数据库记录和物理文件),不能删,不然回写的时候没有文件了
        self._cleanup_temp_attachments(bl_objs)
        end_time = time.time()
        _logger.info(f"批量获取POD信息操作完成,耗时: {end_time - start_time}秒")
        if self.show_error_message and not self._context.get('is_skip_raise_error'):

@@ -326,10 +330,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
                'view_mode': 'form',
                'res_id': self.id,
                'target': 'new',
                'context': {'default_show_error_message': self.show_error_message, 'active_id': bl_objs.ids}
            }

    def _validate_node_push_conditions(self, processed_files):
        """
        验证节点推送条件

@@ -428,9 +431,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
            file_data = file_info.get('file_data', '')
            if not file_data:
                continue
            # 如果有文件为空的就回写,否则就创建新的清关文件记录
            fix_name = '尾程交接POD(待大包数量和箱号)'
            clearance_file = self.env['cc.clearance.file'].search(

@@ -463,7 +465,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        import tempfile
        import os
        import gc
        temp_file_path = None
        try:
            # 过滤有效的PDF文件

@@ -471,64 +473,64 @@ class BatchGetPodInfoWizard(models.TransientModel):
            for file_info in processed_files:
                if file_info.get('bl') and file_info.get('file_data'):
                    valid_files.append(file_info)
            if not valid_files:
                _logger.warning("没有有效的PDF文件可以合并")
                return
            # 如果只有一个PDF文件,直接使用,不需要合并
            if len(valid_files) == 1:
                file_info = valid_files[0]
                bl = file_info['bl']
                file_data = file_info['file_data']
                file_name = file_info.get('file_name', f"{bl.bl_no}.pdf")
                # 生成文件名(包含提单号和日期)
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                pdf_filename = f"POD文件_{bl.bl_no}_{timestamp}.pdf"
                # 直接保存到字段
                self.write({'pdf_file': file_data, 'pdf_filename': pdf_filename})
                _logger.info(f"单个PDF文件直接保存: {pdf_filename}")
                return
            # 多个PDF文件需要合并
            _logger.info(f"开始合并 {len(valid_files)} 个PDF文件")
            # 使用临时文件方式合并,避免内存占用过大
            temp_file_path = tempfile.mktemp(suffix='.pdf')
            merged_pdf = fitz.open()
            bl_numbers = []
            # 遍历所有处理后的PDF文件,分批处理以减少内存占用
            batch_size = 5  # 每批处理5个PDF
            for batch_start in range(0, len(valid_files), batch_size):
                batch_files = valid_files[batch_start: batch_start + batch_size]
                _logger.info(f"处理第 {batch_start // batch_size + 1} 批,共 {len(batch_files)} 个PDF")
                for file_info in batch_files:
                    bl = file_info['bl']
                    file_data = file_info['file_data']
                    bl_numbers.append(bl.bl_no)
                    source_pdf = None
                    pdf_binary = None
                    try:
                        # 将base64数据转换为二进制
                        pdf_binary = base64.b64decode(file_data)
                        # 打开PDF文档
                        source_pdf = fitz.open(stream=pdf_binary, filetype="pdf")
                        # 将源PDF的所有页面插入到合并的PDF中
                        merged_pdf.insert_pdf(source_pdf)
                        _logger.info(f"已添加提单 {bl.bl_no} 的PDF到合并文档({len(source_pdf)} 页)")
                    except Exception as e:
                        _logger.error(f"合并提单 {bl.bl_no} 的PDF失败: {str(e)}")
                        continue
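The hunk above decodes base64 PDFs and merges them in batches with PyMuPDF. A self-contained sketch of the same decode / open-as-stream / insert_pdf / save pattern, assuming a plain list of base64 strings instead of the wizard's file_info dicts:

    import base64
    import fitz  # PyMuPDF

    def merge_base64_pdfs(b64_pdfs, out_path):
        """Merge base64-encoded PDFs into a single output file."""
        merged = fitz.open()  # empty target document
        for b64 in b64_pdfs:
            src = fitz.open(stream=base64.b64decode(b64), filetype="pdf")
            merged.insert_pdf(src)  # append all pages of src
            src.close()
        merged.save(out_path, garbage=4, deflate=True, clean=True)
        merged.close()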
@@ -539,7 +541,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    source_pdf = None
                    pdf_binary = None
                    gc.collect()  # 强制垃圾回收
                # 每批处理完后,保存到临时文件并释放内存
                if batch_start + batch_size < len(valid_files):
                    # 保存当前合并结果到临时文件

@@ -548,7 +550,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    # 重新打开临时文件继续合并
                    merged_pdf = fitz.open(temp_file_path)
                    gc.collect()
            # 如果有页面,保存合并后的PDF
            if len(merged_pdf) > 0:
                # 使用临时文件保存,减少内存占用

@@ -556,39 +558,39 @@ class BatchGetPodInfoWizard(models.TransientModel):
                temp_file_path = tempfile.mktemp(suffix='.pdf')
                merged_pdf.save(temp_file_path, garbage=4, deflate=True, clean=True)
                merged_pdf.close()
                # 从临时文件读取并转换为base64
                with open(temp_file_path, 'rb') as f:
                    pdf_data = f.read()
                # 转换为base64
                merged_pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
                # 清理临时数据
                del pdf_data
                gc.collect()
                # 生成文件名(包含提单号和日期)
                bl_numbers_str = '_'.join(bl_numbers[:5])  # 最多显示5个提单号
                if len(bl_numbers) > 5:
                    bl_numbers_str += f'_等{len(bl_numbers)}个'
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                pdf_filename = f"合并POD文件_{bl_numbers_str}_{timestamp}.pdf"
                # 保存到字段
                self.write({'pdf_file': merged_pdf_base64, 'pdf_filename': pdf_filename})
                # 清理base64数据
                del merged_pdf_base64
                gc.collect()
                _logger.info(f"成功合并 {len(bl_numbers)} 个PDF文件,文件名: {pdf_filename}")
            else:
                _logger.warning("没有有效的PDF文件可以合并")
        except Exception as e:
            _logger.error(f"合并PDF文件失败: {str(e)}")
        finally:

@@ -634,16 +636,23 @@ class BatchGetPodInfoWizard(models.TransientModel):
        """
        # return False#测试 先不同步
        # 同步尾程POD信息
        is_fail = []  # 同步失败
        for file_info in processed_files:
            if not file_info['bl']:
                continue
            bl = file_info['bl']
            # 查找清关文件并执行同步
            clearance_file = file_info.get('clearance_file')
            if clearance_file:
                try:
                    clearance_file.action_sync()  # 同步尾程POD
                except Exception as e:
                    logging.info('_sync_last_mile_pod: %s' % e)
                    is_fail = True
                    break
                _logger.info(f"Successfully synced POD for BL {bl.bl_no}")
        if is_fail:
            raise ValidationError('本次同步失败,请重试!')
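The try/except around action_sync() plus the final ValidationError is the behaviour the commit title refers to: the batch stops at the first failed sync and the user gets a single retry prompt. A minimal sketch of that pattern in isolation, assuming records that expose an action_sync() which raises on failure:

    from odoo.exceptions import ValidationError

    def sync_all_or_prompt(clearance_files):
        """Stop at the first failed sync and surface one user-facing error."""
        failed = False
        for rec in clearance_files:
            try:
                rec.action_sync()  # assumed to raise when the sync fails
            except Exception:
                failed = True
                break  # do not keep syncing after a failure
        if failed:
            raise ValidationError('本次同步失败,请重试!')  # "This sync failed, please retry"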
    def _check_target_texts_exist(self, pdf_binary, bl_no):
        """

@@ -657,40 +666,41 @@ class BatchGetPodInfoWizard(models.TransientModel):
        import numpy as np
        from PIL import Image
        import re

        # 定义目标文字(与_find_target_texts一致)
        TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
                        'UCLINKLOGISITICSLTD']
        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
        try:
            # 设置Tesseract路径
            self._setup_tesseract_path()
            # 打开PDF文档
            pdf_document = fitz.open(stream=pdf_binary, filetype="pdf")
            found_texts = []
            # 尝试导入OpenCV,如果失败则使用PIL替代
            try:
                import cv2
                cv2_available = True
            except ImportError:
                cv2_available = False
            # 遍历每一页
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
                # 首先尝试从PDF文本层提取(如果是文本型PDF)
                page_text_pdf = page.get_text().upper()
                # 将页面转换为图像进行OCR识别
                mat = fitz.Matrix(3.0, 3.0)  # 进一步提高分辨率,从2.0提升到3.0
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")
                # 转换为PIL图像
                if cv2_available:
                    nparr = np.frombuffer(img_data, np.uint8)

@@ -700,7 +710,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    pil_img = Image.open(io.BytesIO(img_data))
                    if pil_img.mode != 'RGB':
                        pil_img = pil_img.convert('RGB')
                # OCR识别
                try:
                    config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'

@@ -708,14 +718,14 @@ class BatchGetPodInfoWizard(models.TransientModel):
                except Exception as e:
                    _logger.warning(f"OCR识别失败,第{page_num + 1}页,使用PDF文本: {str(e)}")
                    ocr_text = page_text_pdf
                # 合并PDF文本和OCR文本进行检查
                combined_text = (page_text_pdf + ' ' + ocr_text).upper()
                # 检查目标文字
                for target_text in TARGET_TEXTS:
                    target_upper = target_text.upper()
                    # 检查是否包含目标文字
                    is_match = False
                    if target_text == 'AGN':

@@ -736,7 +746,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        # 排除AIR、EQK、ARN等(需要这些词都不存在)
                        if 'AIR EQK' not in combined_text and 'ARN' not in combined_text:
                            is_match = True
                    # 如果匹配,检查是否在排除列表中
                    if is_match:
                        is_excluded = False

@@ -744,27 +754,28 @@ class BatchGetPodInfoWizard(models.TransientModel):
                            exclude_upper = exclude_text.upper()
                            if exclude_upper in combined_text and target_upper in combined_text:
                                # 检查是否是页码
                                if re.search(r'PAGE\s+\d+\s+OF\s+\d+', combined_text) or re.search(r'\d+\s*/\s*\d+', combined_text):
                                    is_excluded = True
                                    break
                                # 检查是否是AIR EQK等排除项
                                if 'AIR EQK' in combined_text or 'ARN' in combined_text:
                                    is_excluded = True
                                    break
                        if not is_excluded:
                            found_texts.append(f"第{page_num + 1}页: {target_text}")
                            break  # 找到就跳出,避免重复
            pdf_document.close()
            if found_texts:
                _logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}")
                return True, found_texts
            else:
                _logger.info(f"提单 {bl_no} 未发现目标文字")
                return False, []
        except Exception as e:
            _logger.error(f"检查目标文字失败,提单号: {bl_no}, 错误: {str(e)}")
            # 检查失败时,假设不存在(避免误报)
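The exclusion branch above treats page-number strings as false positives. A small standalone check in the same spirit (the two regular expressions are taken from the hunk; the helper name is illustrative):

    import re

    def looks_like_page_number(text: str) -> bool:
        """True when the combined PDF/OCR text contains a page-number pattern."""
        text = text.upper()
        return bool(re.search(r'PAGE\s+\d+\s+OF\s+\d+', text)
                    or re.search(r'\d+\s*/\s*\d+', text))

    # 'INVOICE PAGE 1 OF 2' -> True;  'UCLINK LOGISITICS LTD' -> False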
@@ -810,11 +821,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        )
                        if ai_processed_pdf:
                            processed_file_data = base64.b64encode(ai_processed_pdf).decode('utf-8')
                            # 检查是否还存在目标文字
                            final_check_pdf = base64.b64decode(processed_file_data)
                            text_still_exists, final_found_texts = self._check_target_texts_exist(final_check_pdf, bl.bl_no)
                            if text_still_exists:
                                error_msg = f"提单 {bl.bl_no} 经过AI处理后仍存在目标文字: {', '.join(final_found_texts)},请取消该提单操作,手动处理"
                                _logger.error(error_msg)

@@ -860,11 +872,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
                            )
                            if ai_processed_pdf:
                                processed_file_data = base64.b64encode(ai_processed_pdf).decode('utf-8')
                                # 第四步:再次检查是否还存在目标文字
                                final_check_pdf = base64.b64decode(processed_file_data)
                                text_still_exists, final_found_texts = self._check_target_texts_exist(final_check_pdf, bl.bl_no)
                                if text_still_exists:
                                    # 第五步:如果仍然存在,记录错误信息并停止处理
                                    error_msg = f"提单 {bl.bl_no} 经过系统处理后仍存在目标文字: {', '.join(final_found_texts)},请取消该提单操作,手动处理"

@@ -877,7 +890,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
                                _logger.warning(f"提单 {bl.bl_no} AI处理失败,检查OCR处理结果")
                                # AI处理失败,检查OCR结果是否真的清除了目标文字
                                ocr_check_pdf = base64.b64decode(processed_file_data)
                                text_still_exists, ocr_found_texts = self._check_target_texts_exist(ocr_check_pdf, bl.bl_no)
                                if text_still_exists:
                                    error_msg = f"提单 {bl.bl_no} 经过系统处理后仍存在目标文字: {', '.join(ocr_found_texts)},请取消该提单操作,手动处理"
                                    error_messages.append(error_msg)

@@ -890,7 +904,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
                            _logger.error(f"提单 {bl.bl_no} AI处理异常: {str(e)}")
                            # AI处理失败,使用OCR结果,但需要检查
                            final_check_pdf = base64.b64decode(processed_file_data)
                            text_still_exists, final_found_texts = self._check_target_texts_exist(final_check_pdf, bl.bl_no)
                            if text_still_exists:
                                error_msg = f"提单 {bl.bl_no} 经过系统处理后仍存在目标文字: {', '.join(final_found_texts)},请取消该提单操作,手动处理"
                                error_messages.append(error_msg)

@@ -991,26 +1006,25 @@ class BatchGetPodInfoWizard(models.TransientModel):
        """
        import fitz  # PyMuPDF
        import base64
        import mimetypes
        import gc
        import os
        import tempfile
        from PIL import Image
        import time

        start_time = time.time()
        _logger.info(f"开始使用AI图片编辑处理PDF,提单号: {bl_no}")
        # 初始化AI服务
        ai_service = AIImageEditService()
        # 打开PDF文档
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        total_pages = len(pdf_document)
        total_ai_time = 0.0  # 累计AI总耗时
        _logger.info(f"PDF总页数: {total_pages}")
        # 对于多页PDF,使用临时文件方式减少内存占用
        use_temp_file = total_pages > 5  # 超过5页使用临时文件
        temp_file_path = None

@@ -1018,16 +1032,16 @@ class BatchGetPodInfoWizard(models.TransientModel):
            import tempfile
            temp_file_path = tempfile.mktemp(suffix='.pdf')
            _logger.info(f"使用临时文件方式处理,减少内存占用: {temp_file_path}")
        processed_images = []  # 存储处理后的PIL图片对象(分批处理)
        batch_size = 5  # 每批处理5页图片
        # 遍历每一页(按照image-to-coordinate.py的逻辑)
        for page_num in range(total_pages):
            page_start_time = time.time()
            page = pdf_document[page_num]
            _logger.info(f"正在处理第{page_num + 1}页")
            # 将页面转换为图像(按照image-to-coordinate.py的pdf_to_images函数,使用dpi=150)
            # 对于内存优化,使用稍低的分辨率(120 DPI)以避免内存问题
            dpi = 120

@@ -1035,18 +1049,18 @@ class BatchGetPodInfoWizard(models.TransientModel):
            pix = None
            img = None
            img_bytes_io = None
            try:
                pix = page.get_pixmap(matrix=mat)
                # 将pixmap转换为PIL Image对象
                img_data = pix.tobytes("png")
                del pix  # 立即释放pixmap以节省内存
                pix = None
                gc.collect()  # 强制垃圾回收
                img = Image.open(io.BytesIO(img_data))
                # 获取图片尺寸(按照image-to-coordinate.py的逻辑)
                img_w, img_h = img.size
                _logger.info(f"第{page_num + 1}页页面尺寸: {img_w}x{img_h} 像素")

@@ -1060,12 +1074,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
                img_bytes_io = None
                del img_data  # 释放图片数据
                gc.collect()  # 强制垃圾回收
                # 使用AI编辑图片,移除指定文字(带重试机制)
                edited_img_base64 = None
                ai_processing_time = 0.0
                max_retries = 2  # 最多尝试2次(首次+1次重试)
                for attempt in range(1, max_retries + 1):
                    ai_start_time = time.time()
                    try:

@@ -1078,30 +1092,33 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        attempt_time = ai_end_time - ai_start_time
                        ai_processing_time += attempt_time  # 累计AI耗时
                        total_ai_time += attempt_time  # 累计总AI耗时
                        if edited_img_base64_raw:
                            edited_img_base64 = edited_img_base64_raw
                            _logger.info(f"第{page_num + 1}页AI处理成功(第{attempt}次尝试),耗时: {attempt_time:.2f}秒")
                            break
                        else:
                            if attempt < max_retries:
                                _logger.warning(f"第{page_num + 1}页AI处理失败(第{attempt}次尝试),将重试,耗时: {attempt_time:.2f}秒")
                            else:
                                _logger.warning(f"第{page_num + 1}页AI处理失败(第{attempt}次尝试,已用尽重试),耗时: {attempt_time:.2f}秒")
                    except Exception as e:
                        ai_end_time = time.time()
                        attempt_time = ai_end_time - ai_start_time
                        ai_processing_time += attempt_time
                        total_ai_time += attempt_time
                        _logger.error(f"第{page_num + 1}页AI处理异常(第{attempt}次尝试): {str(e)},耗时: {attempt_time:.2f}秒")
                        if attempt < max_retries:
                            _logger.info(f"第{page_num + 1}页将进行第{attempt + 1}次重试")
                        edited_img_base64 = None
                # 释放encoded_string以节省内存
                del encoded_string
                gc.collect()
                if edited_img_base64:
                    # 解码base64图片数据并转换为PIL Image对象(按照image-to-coordinate.py的逻辑)
                    edited_img_data = base64.b64decode(edited_img_base64)
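The per-page AI edit above is retried up to max_retries times while the elapsed time of every attempt is accumulated. A compact sketch of that retry-with-timing pattern, assuming a hypothetical edit_page() callable that returns a base64 string or None:

    import logging
    import time

    _logger = logging.getLogger(__name__)

    def edit_with_retry(edit_page, max_retries=2):
        """Call edit_page() up to max_retries times; return (result, elapsed_seconds)."""
        elapsed = 0.0
        for attempt in range(1, max_retries + 1):
            started = time.time()
            try:
                result = edit_page()
            except Exception as exc:
                result = None
                _logger.error("attempt %s failed: %s", attempt, exc)
            elapsed += time.time() - started
            if result:
                return result, elapsed
        return None, elapsed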
@@ -1111,17 +1128,18 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    processed_images.append(edited_img)
                    _logger.info(f"第{page_num + 1}页AI处理最终成功,总耗时: {ai_processing_time:.2f}秒")
                else:
                    _logger.warning(f"第{page_num + 1}页AI处理最终失败(已重试),使用原始页面,总耗时: {ai_processing_time:.2f}秒")
                    # 如果AI处理失败,使用原始图片
                    processed_images.append(img.convert('RGB'))
                # 释放原始图片对象
                if img:
                    img.close()
                    del img
                    img = None
                gc.collect()  # 强制垃圾回收
                # 分批处理:每处理batch_size页,就转换为PDF并保存到临时文件
                if use_temp_file and len(processed_images) >= batch_size:
                    _logger.info(f"达到批次大小 {batch_size},开始保存到临时文件")

@@ -1134,33 +1152,33 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    batch_buffer.seek(0)
                    pdf_bytes = batch_buffer.getvalue()
                    batch_buffer.close()
                    # 释放已处理的图片
                    for img_obj in processed_images:
                        if img_obj:
                            img_obj.close()
                    processed_images = []
                    gc.collect()
                    if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
                        # 追加到现有PDF:先读取现有内容,合并后保存到新文件,再替换
                        with open(temp_file_path, 'rb') as f:
                            existing_bytes = f.read()
                        existing_pdf = fitz.open(stream=existing_bytes, filetype="pdf")
                        new_pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
                        existing_pdf.insert_pdf(new_pdf)
                        new_pdf.close()
                        # 保存到新临时文件,避免"save to original must be incremental"错误
                        new_temp_path = tempfile.mktemp(suffix='.pdf')
                        existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
                        existing_pdf.close()
                        # 替换旧文件
                        os.remove(temp_file_path)
                        os.rename(new_temp_path, temp_file_path)
                        # 释放资源
                        del existing_bytes
                        del pdf_bytes
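The append step above works around PyMuPDF's "save to original must be incremental" restriction by saving the merged document to a fresh temporary file and then swapping it in. A minimal sketch of that append-and-replace idiom, assuming a file path plus raw PDF bytes rather than the in-memory batches used here:

    import os
    import tempfile
    import fitz  # PyMuPDF

    def append_pdf(target_path, extra_pdf_bytes):
        """Append pages to target_path without rewriting the open file in place."""
        existing = fitz.open(target_path)
        extra = fitz.open(stream=extra_pdf_bytes, filetype="pdf")
        existing.insert_pdf(extra)
        extra.close()
        new_path = tempfile.mktemp(suffix='.pdf')  # write to a new file first
        existing.save(new_path, garbage=4, deflate=True, clean=True)
        existing.close()
        os.replace(new_path, target_path)  # swap the new file into place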
@@ -1180,7 +1198,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        img_obj.close()
                processed_images = []
                gc.collect()
            except Exception as e:
                _logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
                # 确保资源被释放

@@ -1194,18 +1212,18 @@ class BatchGetPodInfoWizard(models.TransientModel):
                gc.collect()
                # 如果处理失败,跳过这一页或使用原始页面
                continue
            page_end_time = time.time()
            page_processing_time = page_end_time - page_start_time
            _logger.info(f"第{page_num + 1}页总处理时间: {page_processing_time:.2f}秒")
        pdf_document.close()
        # 将处理后的图片转换为PDF(按照image-to-coordinate.py的images_to_pdf函数逻辑)
        pdf_creation_start = time.time()
        result_data = None
        import os
        try:
            if use_temp_file and temp_file_path:
                # 如果还有剩余的图片,追加到临时文件

@@ -1220,35 +1238,35 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    batch_buffer.seek(0)
                    temp_pdf_bytes = batch_buffer.getvalue()
                    batch_buffer.close()
                    # 释放图片
                    for img_obj in processed_images:
                        if img_obj:
                            img_obj.close()
                    processed_images = None
                    gc.collect()
                    # 追加到临时文件
                    if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
                        # 如果临时文件已存在,先读取内容
                        with open(temp_file_path, 'rb') as f:
                            existing_pdf_bytes = f.read()
                        # 合并PDF:打开现有PDF和新PDF,然后合并
                        existing_pdf = fitz.open(stream=existing_pdf_bytes, filetype="pdf")
                        new_pdf = fitz.open(stream=temp_pdf_bytes, filetype="pdf")
                        existing_pdf.insert_pdf(new_pdf)
                        new_pdf.close()
                        # 保存到新的临时文件,避免"save to original must be incremental"错误
                        new_temp_path = tempfile.mktemp(suffix='.pdf')
                        existing_pdf.save(new_temp_path, garbage=4, deflate=True, clean=True)
                        existing_pdf.close()
                        # 删除旧临时文件,重命名新文件
                        os.remove(temp_file_path)
                        os.rename(new_temp_path, temp_file_path)
                        # 释放资源
                        del existing_pdf_bytes
                        del temp_pdf_bytes

@@ -1264,12 +1282,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    # 注意:processed_images 在这里已经被释放了,需要重新获取
                    # 如果还有剩余图片,需要重新处理(这种情况不应该发生,因为前面已经释放了)
                    _logger.warning("追加剩余图片失败,剩余图片已在之前释放")
                # 从临时文件读取最终结果
                if os.path.exists(temp_file_path):
                    with open(temp_file_path, 'rb') as f:
                        result_data = f.read()
                    # 删除临时文件
                    try:
                        os.remove(temp_file_path)

@@ -1279,7 +1297,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                else:
                    _logger.error("临时文件不存在,无法读取结果")
                    return None
            elif processed_images:
                # 使用内存方式处理(5页以内)
                output_buffer = io.BytesIO()

@@ -1289,10 +1307,10 @@ class BatchGetPodInfoWizard(models.TransientModel):
                # 即使rest是空列表,也直接传入(PIL会正确处理)
                first.save(output_buffer, format='PDF', save_all=True, append_images=rest)
                output_buffer.seek(0)
                result_data = output_buffer.getvalue()
                output_buffer.close()
                # 释放所有图片对象
                for img_obj in processed_images:
                    if img_obj:

@@ -1303,9 +1321,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
            else:
                _logger.error("没有需要写入PDF的图片")
                return None
            gc.collect()  # 强制垃圾回收
        except Exception as e:
            _logger.error(f"PDF创建失败: {str(e)}")
            # 确保资源被释放

@@ -1320,18 +1338,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    pass
            gc.collect()
            return None
        pdf_creation_end = time.time()
        total_time = time.time() - start_time
        pdf_creation_time = pdf_creation_end - pdf_creation_start
        _logger.info(f"AI图片编辑PDF处理完成,提单号: {bl_no}")
        _logger.info(f"总处理时间: {total_time:.2f}秒")
        _logger.info(f"AI总耗时: {total_ai_time:.2f}秒(累计所有页面的AI处理时间)")
        _logger.info(f"PDF创建时间: {pdf_creation_time:.2f}秒")
        _logger.info(f"平均每页AI处理时间: {total_ai_time / total_pages:.2f}秒" if total_pages > 0 else "平均每页AI处理时间: 0.00秒")
        return result_data

    def _process_pdf_with_ocr(self, pdf_data, bl_no, debug_mode=False):

@@ -1371,7 +1390,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        all_recognized_texts = []
        result_data = False
        total_pages = len(pdf_document)
        # 处理每一页(完全按照HTML逻辑)
        for page_num in range(total_pages):
            page_start_time = time.time()

@@ -1516,12 +1535,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 计算总处理时间
            total_time = time.time() - start_time
            # 输出处理总结
            _logger.info(f"OCR处理完成 - 提单号: {bl_no}, 处理页数: {processed_pages}, 删除矩形数: {total_rectangles}, 检测到文字数: {len(detected_texts)}")
            _logger.info(f"OCR总处理时间: {total_time:.2f}秒")
            _logger.info(f"PDF保存时间: {pdf_save_time:.2f}秒")
            _logger.info(f"平均每页OCR处理时间: {total_time / total_pages:.2f}秒")
            if detected_texts:
                _logger.info(f"检测到的目标文字: {[text['text'] for text in detected_texts]}")
        except Exception as e:

@@ -1609,7 +1629,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
        Find target texts using OCR results (完全按照HTML逻辑) # 使用OCR结果查找目标文字
        """
        # 定义目标文字和排除文字(与HTML文件完全一致)
        TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
                        'UCLINKLOGISITICSLTD']
        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']

@@ -1782,13 +1803,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
            if not pod_node:
                _logger.info(f"未找到尾程POD节点匹配的节点,提单号: {bl.bl_no}")
                continue
            # 只使用满足条件的小包(经过验证的valid_packages)
            valid_packages = file_info.get('valid_packages', [])
            if not valid_packages:
                _logger.warning(f"提单 {bl.bl_no} 没有满足条件的小包,跳过节点推送")
                continue
            # 从valid_packages中提取小包ID(记录集对象或列表)
            if hasattr(valid_packages, 'ids'):
                # 如果是记录集对象,直接获取IDs

@@ -1799,13 +1820,13 @@ class BatchGetPodInfoWizard(models.TransientModel):
            else:
                _logger.warning(f"提单 {bl.bl_no} valid_packages格式不正确: {type(valid_packages)}")
                valid_package_ids = []
            _logger.info(f"提单 {bl.bl_no} 满足条件的小包ID: {valid_package_ids} (共 {len(valid_package_ids)} 个)")
            if not valid_package_ids:
                _logger.warning(f"提单 {bl.bl_no} 满足条件的小包ID为空,跳过节点推送")
                continue
            # 从PDF文件提取红色框的时间
            file_data = file_info.get('file_data')
            if not file_data:

@@ -1819,7 +1840,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
            if extracted_times:
                # 取最早的时间作为节点操作时间
                earliest_time = min(extracted_times)
                _logger.info(f"提取到最早时间: {earliest_time},将作为节点操作时间,满足条件的小包数量: {len(valid_package_ids)},小包ID: {valid_package_ids}")
                ship_packages.append({
                    'bl_id': bl.id,
                    'id': valid_package_ids,  # 只包含满足条件的小包ID

@@ -1833,11 +1855,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
                error_bl.append(bl)
        if error_bl:
            _logger.warning(f"提单 {', '.join([bl.bl_no for bl in error_bl])} 没有提取到时间信息")
            if not self._context.get('is_skip_raise_error'):
                raise ValidationError(
                    _('%s bill of loading cannot get node operation time,please manually upload push tk') % (
                        ', '.join([bl.bl_no for bl in error_bl])))  # xx提单号没有获取到节点操作时间,请手动上传推送提单到TK
        return ship_packages, pod_node.id

    def _sync_match_node(self, ship_packages, pod_node_id):
@@ -2054,29 +2076,30 @@ class BatchGetPodInfoWizard(models.TransientModel):
        from PIL import Image
        import re
        import gc

        # 定义目标文字(与_find_target_texts一致)
        TARGET_TEXTS = ['AGN', 'ACN', 'UCLINK LOGISITICS LTD', 'UCLINK LOGISITICS', 'UCLINK', 'LOGISITICS', 'LOGISTICS', 'LTD',
                        'UCLINKLOGISITICSLTD']
        EXCLUDE_TEXTS = ['AIR EQK', 'ARN', 'EQK', 'AIR', 'Page 1 of 1', 'Page 2 of 2', 'Page 3 of 3', 'Page 4 of 4', 'Page 5 of 5']
        pdf_document = None
        try:
            # 设置Tesseract路径
            self._setup_tesseract_path()
            # 打开PDF文档
            pdf_document = fitz.open(stream=pdf_binary, filetype="pdf")
            found_texts = []
            # 尝试导入OpenCV,如果失败则使用PIL替代
            try:
                import cv2
                cv2_available = True
            except ImportError:
                cv2_available = False
            # 遍历每一页
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]

@@ -2085,11 +2108,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
                img = None
                nparr = None
                img_data = None
                try:
                    # 首先尝试从PDF文本层提取(如果是文本型PDF)
                    page_text_pdf = page.get_text().upper()
                    # 将页面转换为图像进行OCR识别(降低分辨率以节省内存)
                    # 使用 2.0 倍分辨率(约 144 DPI)而不是 3.0 倍(约 216 DPI)
                    mat = fitz.Matrix(2.0, 2.0)

@@ -2098,7 +2121,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    del pix  # 立即释放pixmap
                    pix = None
                    gc.collect()  # 强制垃圾回收
                    # 转换为PIL图像
                    if cv2_available:
                        nparr = np.frombuffer(img_data, np.uint8)

@@ -2113,12 +2136,12 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        pil_img = Image.open(io.BytesIO(img_data))
                        if pil_img.mode != 'RGB':
                            pil_img = pil_img.convert('RGB')
                    # 释放img_data
                    del img_data
                    img_data = None
                    gc.collect()
                    # OCR识别
                    try:
                        config = '--psm 6 --oem 1 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- -c preserve_interword_spaces=1'

@@ -2126,15 +2149,15 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    except Exception as e:
                        _logger.warning(f"OCR识别失败,第{page_num + 1}页,使用PDF文本: {str(e)}")
                        ocr_text = page_text_pdf
                    # 合并PDF文本和OCR文本进行检查
                    combined_text = (page_text_pdf + ' ' + ocr_text).upper()
                    # 使用与_find_target_texts完全相同的逻辑:先进行OCR单词识别
                    try:
                        # 获取OCR识别的单词列表
                        words = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, lang='eng')
                        # 过滤出有效的单词
                        valid_words = []
                        for i in range(len(words['text'])):

@@ -2150,27 +2173,27 @@ class BatchGetPodInfoWizard(models.TransientModel):
                                    'y1': words['top'][i] + words['height'][i]
                                }
                            })
                        # 释放words字典以节省内存
                        del words
                        gc.collect()
                        # 使用与_find_target_texts相同的匹配逻辑
                        page_found_texts = self._find_target_texts(valid_words, page_num, 800, 600, 800, 600)
                        del valid_words  # 释放valid_words列表
                        gc.collect()
                        if page_found_texts:
                            for found_text in page_found_texts:
                                found_texts.append(f"第{page_num + 1}页: {found_text['text']}")
                            break  # 找到就跳出,避免重复
                    except Exception as e:
                        _logger.warning(f"OCR单词识别失败,第{page_num + 1}页,使用文本匹配: {str(e)}")
                        # 如果OCR单词识别失败,回退到文本匹配
                        for target_text in TARGET_TEXTS:
                            target_upper = target_text.upper()
                            # 检查是否包含目标文字
                            is_match = False
                            if target_text == 'AGN':

@@ -2191,7 +2214,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                                # 排除AIR、EQK、ARN等(需要这些词都不存在)
                                if 'AIR EQK' not in combined_text and 'ARN' not in combined_text:
                                    is_match = True
                            # 如果匹配,检查是否在排除列表中
                            if is_match:
                                is_excluded = False

@@ -2199,18 +2222,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
                                    exclude_upper = exclude_text.upper()
                                    if exclude_upper in combined_text and target_upper in combined_text:
                                        # 检查是否是页码
                                        if re.search(r'PAGE\s+\d+\s+OF\s+\d+', combined_text) or re.search(r'\d+\s*/\s*\d+', combined_text):
                                            is_excluded = True
                                            break
                                        # 检查是否是AIR EQK等排除项
                                        if 'AIR EQK' in combined_text or 'ARN' in combined_text:
                                            is_excluded = True
                                            break
                                if not is_excluded:
                                    found_texts.append(f"第{page_num + 1}页: {target_text}")
                                    break  # 找到就跳出,避免重复
                    # 释放PIL图像和文本变量
                    if pil_img:
                        pil_img.close()

@@ -2220,7 +2244,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    del combined_text
                    pil_img = None
                    gc.collect()  # 强制垃圾回收
                except Exception as e:
                    _logger.error(f"第{page_num + 1}页处理异常: {str(e)}")
                    # 确保资源被释放

@@ -2237,19 +2261,19 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        del img_data
                    gc.collect()
                    continue
            if pdf_document:
                pdf_document.close()
                pdf_document = None
            gc.collect()
            if found_texts:
                _logger.warning(f"提单 {bl_no} 仍存在目标文字: {', '.join(found_texts)}")
                return True, found_texts
            else:
                _logger.info(f"提单 {bl_no} 未发现目标文字")
                return False, []
        except Exception as e:
            _logger.error(f"检查目标文字失败,提单号: {bl_no}, 错误: {str(e)}")
            # 确保资源被释放
@@ -2262,8 +2286,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
            # 检查失败时,假设不存在(避免误报)
            return False, []

    def _cleanup_temp_attachments(self, bl_objs=None):
        """
        清理与当前向导相关的临时附件,包括服务器和本地开发环境的物理文件
        """

@@ -2273,11 +2296,11 @@ class BatchGetPodInfoWizard(models.TransientModel):
                ('res_id', 'in', bl_objs.ids),
                ('name', 'like', 'temp_pod_%')
            ])
            if attachments:
                # 删除数据库记录
                attachments.unlink()
        except Exception as e:
            _logger.error(f"清理临时附件失败: {str(e)}")

@@ -2289,7 +2312,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        """
        # 注意:不在这里清理临时附件,因为预览时需要保留附件数据
        # 只有在确认操作完成后才清理临时附件
        serialized_data = []
        for file_info in processed_files:
            if not file_info.get('bl'):

@@ -2310,7 +2333,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    })
                    attachment_id = attachment.id
                    _logger.info(f"已创建临时附件存储文件: {attachment.name}, ID: {attachment_id}")
                    # 验证附件创建后数据是否正确
                    created_attachment = self.env['ir.attachment'].browse(attachment_id)
                    if created_attachment.datas:

@@ -2318,21 +2341,22 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        try:
                            original_decoded = base64.b64decode(file_data)
                            attachment_decoded = base64.b64decode(created_attachment.datas)
                            if len(original_decoded) == len(attachment_decoded):
                                _logger.info(f"附件数据验证成功,解码后长度: {len(original_decoded)}")
                            else:
                                _logger.warning(f"附件数据长度不匹配: 原始={len(original_decoded)}, 附件={len(attachment_decoded)}")
                        except Exception as e:
                            _logger.warning(f"附件数据验证失败: {str(e)}")
                    else:
                        _logger.error(f"附件数据为空")
                except Exception as e:
                    _logger.error(f"创建临时附件失败: {str(e)}")
            else:
                _logger.warning(f"提单 {bl.bl_no} 的文件数据为空,无法创建附件")
            data = {
                'bl_id': bl.id,
                'bl_no': bl.bl_no,

@@ -2353,7 +2377,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    data['valid_package_ids'] = [p.id for p in valid_packages if hasattr(p, 'id')]
            else:
                data['valid_package_ids'] = []
            _logger.info(f"序列化时保存valid_packages: 提单 {bl.bl_no}, 满足条件的小包ID: {data['valid_package_ids']}")
            serialized_data.append(data)
        return json.dumps(serialized_data, ensure_ascii=False)

@@ -2382,8 +2407,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    if attachment.exists():
                        # attachment.datas 已经是 base64 编码的字符串
                        file_data = attachment.datas
                        _logger.info(f"从附件读取文件: {attachment.name}, ID: {attachment_id}, 数据长度: {len(file_data) if file_data else 0}")
                        # 验证数据格式
                        if file_data:
                            _logger.info(f"附件数据格式: 前100个字符: {file_data[:100]}")

@@ -2403,7 +2429,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                    _logger.error(f"读取附件失败: {str(e)}")
            else:
                _logger.warning(f"提单 {bl.bl_no} 没有附件ID,无法读取文件数据")
            file_info = {
                'bl': bl,
                'bl_no': data.get('bl_no', ''),

@@ -2419,7 +2445,8 @@ class BatchGetPodInfoWizard(models.TransientModel):
                # 重建记录集对象
                valid_packages = self.env['cc.ship.package'].browse(valid_package_ids)
                file_info['valid_packages'] = valid_packages
                _logger.info(f"反序列化时恢复valid_packages: 提单 {bl.bl_no}, 满足条件的小包ID: {valid_package_ids}, 数量: {len(valid_packages)}")
                processed_files.append(file_info)
            return processed_files
        except Exception as e:

@@ -2435,7 +2462,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
        try:
            # 计算1天前的时间(前一天23:59:59)
            today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
            one_day_ago = today + timedelta(days=2) - timedelta(seconds=1)  # 前一天23:59:59
            _logger.info(f"开始执行定时清理临时附件任务,清理时间点: {one_day_ago.strftime('%Y-%m-%d %H:%M:%S')}")
            # 构建SQL查询
            sql_query = """

@@ -2445,7 +2472,7 @@ class BatchGetPodInfoWizard(models.TransientModel):
                AND create_date < '%s'
                ORDER BY create_date DESC
            """ % (one_day_ago.strftime('%Y-%m-%d %H:%M:%S'))
            # 执行SQL查询
            self.env.cr.execute(sql_query)
            sql_results = self.env.cr.fetchall()
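The cutoff interpolated into the query above is built from midnight of the current day plus timedelta(days=2) minus one second. A short worked example of what that expression evaluates to (the date is illustrative only):

    from datetime import datetime, timedelta

    today = datetime(2025, 11, 6)  # already at 00:00:00
    cutoff = today + timedelta(days=2) - timedelta(seconds=1)
    print(cutoff)  # 2025-11-07 23:59:59, i.e. the end of the following day rather than the previous day named in the inline comment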
@@ -2456,9 +2483,9 @@ class BatchGetPodInfoWizard(models.TransientModel):
                attachment_count = len(temp_attachments)
                attachment_names = [att.name for att in temp_attachments]
                _logger.info(f"找到 {attachment_count} 个{one_day_ago.strftime('%Y-%m-%d')}之前创建的临时附件,开始清理")
                # 删除物理文件
                for attachment in temp_attachments:
                    try:

@@ -2472,24 +2499,23 @@ class BatchGetPodInfoWizard(models.TransientModel):
                        else:
                            # 尝试从 name 字段构建路径
                            file_path = attachment.name
                        # 构建完整的文件路径
                        import os
                        from odoo.tools import config
                        # 获取 Odoo 数据目录
                        data_dir = config.filestore(self.env.cr.dbname)
                        if data_dir and file_path:
                            full_path = os.path.join(data_dir, file_path)
                            if os.path.exists(full_path):
                                os.remove(full_path)
                    except Exception as file_e:
                        _logger.warning(f"删除物理文件失败 {attachment.name}: {str(file_e)}")
                # 删除数据库记录
                temp_attachments.unlink()
        except Exception as e:
            _logger.error(f"定时清理临时附件失败: {str(e)}")