提交 5430de23 authored 作者: 刘擎阳's avatar 刘擎阳

1.优化邮件pdf识别

上级 02db7a0f
......@@ -26,6 +26,19 @@
<field name="value">20</field>
</record>
<!-- System parameters holding the Baidu Cloud OCR app id, API key and secret key. -->
<!-- NOTE(review): the values below look like real credentials stored in plain text in
     version control. Confirm whether they should be rotated and replaced by
     placeholders that are filled in per environment through the system-parameter UI. -->
<record id="baidu_ocr_app_id" model="ir.config_parameter">
<field name="key">baidu_ocr_app_id</field>
<field name="value">118782515</field>
</record>
<!-- API key passed to the Baidu OCR client together with the app id. -->
<record id="baidu_ocr_api_key" model="ir.config_parameter">
<field name="key">baidu_ocr_api_key</field>
<field name="value">gWnGCmjJYzaYwhph8sJEdiRJ</field>
</record>
<!-- Secret key passed to the Baidu OCR client; treat as sensitive. -->
<record id="baidu_ocr_secret_key" model="ir.config_parameter">
<field name="key">baidu_ocr_secret_key</field>
<field name="value">mjgUUgbxXK8UHcRi5MTlPrb4BWM8NrOu</field>
</record>
</data>
</odoo>
\ No newline at end of file
......@@ -12,6 +12,9 @@ import pdfplumber
import xlrd
from odoo import models
from odoo.exceptions import ValidationError
import tempfile
from aip.ocr import AipOcr
from pdf2image import convert_from_path
_logger = logging.getLogger(__name__)
import html
......@@ -161,12 +164,25 @@ class OrderStateChangeRule(models.Model):
_logger.info(f"上传文件 {file_obj.attachment_name} 失败,已尝试 {max_retries} 次,仍然失败。")
break # 超过最大重试次数后跳出循环
def get_pdf_order_data(self, attachment_data):
    """Extract the waybill number and date string from a PDF attachment.

    The PDF is first parsed with ``self.read_pdf``. Only when that yields no
    waybill number is the Baidu OCR fallback
    (``self.get_pdf_waybill_content_by_baidu``) invoked.

    :param attachment_data: PDF content, passed unchanged to both readers.
    :return: ``(order_no, date_str)``; either element may be empty/``None``
        when it could not be recognised.
    """
    order_no, date_str = self.read_pdf(attachment_data)
    if order_no:
        return order_no, date_str

    _logger.info('未识别到提单号,开始调用百度OCR识别')
    error_msg, result = self.get_pdf_waybill_content_by_baidu(attachment_data)

    # The OCR helper may hand back something other than the parsed
    # {'order_no', 'date_str'} dict (a raw Baidu error payload, a string or
    # False). Indexing such a value blindly raised KeyError/TypeError.
    if not error_msg and isinstance(result, dict):
        error_msg = result.get('error_msg')
    if error_msg or not isinstance(result, dict):
        _logger.error('百度OCR识别错误 : %s', error_msg or result)
        return order_no, date_str

    # Keep whatever read_pdf already found when OCR has nothing better, so a
    # date extracted from the PDF text is not replaced by None.
    order_no = result.get('order_no') or order_no
    date_str = result.get('date_str') or date_str
    return order_no, date_str
def fetch_mail_dlv_attachment(self, **kwargs):
attachment_arr = kwargs['attachment_arr']
for attachment_tuple in attachment_arr:
try:
attachment_name, attachment_data = attachment_tuple
order_no, date_str = self.read_pdf(attachment_data)
order_no, date_str = self.get_pdf_order_data(attachment_data)
if order_no:
# 转换为 datetime 对象
local_time = datetime.strptime(date_str.replace(' ', ''), '%d/%m/%Y%H:%M:%S')
......@@ -303,3 +319,70 @@ class OrderStateChangeRule(models.Model):
if month in pick_date_text:
return int(month_abbr_arr.index(mon))
return 0
def convert_image_by_pdf_attachment(self, attachment):
    """Render the first page of a PDF into a temporary JPEG file.

    The PDF content is written to a temporary ``.pdf`` file, its first page
    is rasterised with ``pdf2image`` and saved next to it as
    ``<pdf_path>_0.jpg``. The temporary PDF is removed before returning; the
    JPEG is left on disk for the caller, who is responsible for deleting it.

    :param attachment: PDF content as a bytes-like object (it is written to a
        binary temporary file).
    :return: ``(img_path, error_msg)``. ``img_path`` is ``''`` when nothing
        was rendered; ``error_msg`` is ``''`` on success, otherwise the text
        of the exception raised during conversion.
    """
    # Local import: the file-level import block is not fully visible here.
    import os

    img_path = error_msg = ''
    if not attachment:
        return img_path, error_msg

    # delete=False because pdf2image needs to reopen the file by path.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
    pdf_path = temp_pdf.name
    try:
        with temp_pdf:
            temp_pdf.write(attachment)
        try:
            # Only the first page is used, so do not rasterise the rest.
            # A corrupt PDF makes convert_from_path raise.
            pages = convert_from_path(pdf_path, first_page=1, last_page=1)
            if pages:
                img_path = f"{pdf_path}_0.jpg"
                pages[0].save(img_path, 'JPEG')
        except Exception as e:
            error_msg = '%s' % str(e)
            if img_path:
                # Do not leave a partially written image behind.
                try:
                    os.remove(img_path)
                except OSError:
                    pass
                img_path = ''
    finally:
        # The PDF copy is only an intermediate artefact; always clean it up.
        try:
            os.remove(pdf_path)
        except OSError:
            pass
    return img_path, error_msg
def get_pdf_waybill_content_by_baidu(self, attachment):
    """Recognise the waybill number and date in a PDF via Baidu OCR.

    The first page of the PDF is rendered to an image
    (``self.convert_image_by_pdf_attachment``), sent to Baidu's general
    text recognition, and the recognised lines are scanned for a waybill
    number (``\\d{3}-\\d{8,}``) and a ``dd/mm/yyyy hh:mm:ss`` timestamp.

    :param attachment: PDF content, passed to the image conversion helper.
    :return: ``(error_msg, result)``. On success ``error_msg`` is ``''`` and
        ``result`` is ``{'order_no': ..., 'date_str': ...}`` (values are
        ``None`` when not found). On failure ``error_msg`` is non-empty and
        ``result`` is ``False``.
    """
    # Local import: the file-level import block is not fully visible here.
    import os

    img_path, error_msg = self.convert_image_by_pdf_attachment(attachment)
    if error_msg:
        return error_msg, False
    if not img_path:
        # Empty attachment or a PDF without pages: nothing to send to OCR.
        return 'no image could be rendered from the PDF attachment', False

    try:
        with open(img_path, 'rb') as image_file:
            image_bytes = image_file.read()
    finally:
        # The rendered page is a temporary artefact; remove it once read.
        try:
            os.remove(img_path)
        except OSError:
            pass

    params = self.env['ir.config_parameter'].sudo()
    app_id = params.get_param('baidu_ocr_app_id')
    api_key = params.get_param('baidu_ocr_api_key')
    secret_key = params.get_param('baidu_ocr_secret_key')
    if not (app_id and api_key and secret_key):
        return 'Baidu OCR credentials are not configured', False

    response = AipOcr(app_id, api_key, secret_key).basicGeneral(image_bytes)

    # Surface OCR failures through error_msg instead of returning the raw
    # response with an empty error, which callers would try to index.
    if not isinstance(response, dict):
        return str(response) or 'unexpected Baidu OCR response', False
    if response.get('error_msg'):
        return str(response['error_msg']), False

    lines = [item.get('words') or '' for item in response.get('words_result') or []]

    order_no_re = re.compile(r'\d{3}-\d{8,}')
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}\s*\d{2}:\d{2}:\d{2}')
    order_no = None  # waybill number
    date_str = None  # customs clearance start timestamp
    for text in lines:
        if order_no and date_str:
            break
        # Until both fields are found, a later match replaces an earlier
        # one; this mirrors the original selection behaviour.
        order_match = order_no_re.search(text)
        if order_match:
            order_no = order_match.group(0)
        date_match = date_re.search(text)
        if date_match:
            date_str = date_match.group(0)

    return '', {'order_no': order_no, 'date_str': date_str}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论