1.优化邮件pdf识别

5430de23 · 刘擎阳 · 02db7a0f · 5430de23 · 5430de23
--- a/ccs_base/data/data.xml
+++ b/ccs_base/data/data.xml
@@ -26,6 +26,19 @@
            <field name="value">20</field>
        </record>

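+        <!-- Baidu OCR system parameters (app id, API key, secret key). SECURITY(review): live credentials should not be committed to the repository; rotate these and set them per environment. -->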
+        <record id="baidu_ocr_app_id" model="ir.config_parameter">
+            <field name="key">baidu_ocr_app_id</field>
+            <field name="value">118782515</field>
+        </record>
+        <record id="baidu_ocr_api_key" model="ir.config_parameter">
+            <field name="key">baidu_ocr_api_key</field>
+            <field name="value">gWnGCmjJYzaYwhph8sJEdiRJ</field>
+        </record>
+        <record id="baidu_ocr_secret_key" model="ir.config_parameter">
+            <field name="key">baidu_ocr_secret_key</field>
+            <field name="value">mjgUUgbxXK8UHcRi5MTlPrb4BWM8NrOu</field>
+        </record>

    </data>
 </odoo>
\ No newline at end of file
--- a/ccs_base/models/order_state_change_rule.py
+++ b/ccs_base/models/order_state_change_rule.py
@@ -12,6 +12,9 @@ import pdfplumber
 import xlrd
 from odoo import models
 from odoo.exceptions import ValidationError
+import tempfile
+from aip.ocr import AipOcr
+from pdf2image import convert_from_path

 _logger = logging.getLogger(__name__)
 import html
@@ -161,12 +164,25 @@ class OrderStateChangeRule(models.Model):
                        _logger.info(f"上传文件 {file_obj.attachment_name} 失败，已尝试 {max_retries} 次，仍然失败。")
                        break  # 超过最大重试次数后跳出循环

+    def get_pdf_order_data(self, attachment_data):
+        """Return (order_no, date_str) for a PDF; use Baidu OCR if read_pdf finds no order number."""
+        order_no, date_str = self.read_pdf(attachment_data)
+        if not order_no:
+            _logger.info('未识别到提单号,开始调用百度OCR识别')
+            error_msg, result = self.get_pdf_waybill_content_by_baidu(attachment_data)
+            if error_msg:
+                _logger.error('百度OCR识别错误 : %s', error_msg)
+            elif isinstance(result, dict):  # may be False or a raw OCR error payload
+                order_no = result.get('order_no')
+                date_str = result.get('date_str')
+        return order_no, date_str
+
    def fetch_mail_dlv_attachment(self, **kwargs):
        attachment_arr = kwargs['attachment_arr']
        for attachment_tuple in attachment_arr:
            try:
                attachment_name, attachment_data = attachment_tuple
-                order_no, date_str = self.read_pdf(attachment_data)
+                order_no, date_str = self.get_pdf_order_data(attachment_data)
                if order_no:
                    # 转换为 datetime 对象
                    local_time = datetime.strptime(date_str.replace(' ', ''), '%d/%m/%Y%H:%M:%S')
@@ -303,3 +319,70 @@ class OrderStateChangeRule(models.Model):
                if month in pick_date_text:
                    return int(month_abbr_arr.index(mon))
        return 0
+
+    def convert_image_by_pdf_attachment(self, attachment):
+        """Render page 1 of a PDF (raw bytes) to a temporary JPEG file.
+
+        :param attachment: PDF content as bytes
+        :return: (img_path, error_msg); img_path is '' whenever error_msg is set
+        """
+        import os  # local import: not among the imports visible for this module
+        if not attachment:
+            return '', 'empty PDF attachment'
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
+            temp_pdf.write(attachment)
+            pdf_path = temp_pdf.name
+        try:
+            # only page 1 is used, so skip rasterizing the rest (raises on corrupt PDFs)
+            images = convert_from_path(pdf_path, first_page=1, last_page=1)
+            if not images:
+                return '', 'PDF has no pages'
+            img_path = f"{pdf_path}_0.jpg"
+            images[0].save(img_path, 'JPEG')
+            return img_path, ''
+        except Exception as e:  # best-effort: report the failure text instead of raising
+            return '', str(e)
+        finally:
+            os.remove(pdf_path)  # the intermediate PDF was previously left behind on disk
+
+    def get_pdf_waybill_content_by_baidu(self, attachment):
+        """Extract the bill-of-lading number and clearance date from a PDF via Baidu OCR.
+
+        Page 1 is rendered to an image, sent to basicGeneral, and the text lines are regex-scanned.
+
+        :param attachment: PDF content as bytes
+        :return: (error_msg, result); result is a dict with keys 'order_no' and
+                 'date_str' (each None when not found), or False when error_msg is set
+        """
+        import os  # local import: not among the imports visible for this module
+        img_path, error_msg = self.convert_image_by_pdf_attachment(attachment)
+        if error_msg or not img_path:
+            return error_msg or 'PDF could not be converted to an image', False
+        try:
+            with open(img_path, 'rb') as img_file:  # close the handle deterministically
+                image_bytes = img_file.read()
+        finally:
+            os.remove(img_path)  # the rendered page is only needed for this request
+        params = self.env['ir.config_parameter'].sudo()
+        ocr = AipOcr(params.get_param('baidu_ocr_app_id'), params.get_param('baidu_ocr_api_key'),
+                     params.get_param('baidu_ocr_secret_key'))
+        response = ocr.basicGeneral(image_bytes)
+        if not isinstance(response, dict):
+            return 'unexpected Baidu OCR response: %r' % (response,), False
+        if response.get('error_msg'):
+            # surface API failures as an error instead of handing back the raw payload
+            return response['error_msg'], False
+        order_no_re = re.compile(r'\d{3}-\d{8,}')  # bill-of-lading number, e.g. 123-12345678
+        date_re = re.compile(r'\d{2}/\d{2}/\d{4}\s*\d{2}:\d{2}:\d{2}')  # clearance start time
+        order_no = date_str = None
+        for item in response.get('words_result') or []:
+            if order_no and date_str:
+                break
+            words = item['words']
+            order_match = order_no_re.search(words)
+            if order_match:
+                order_no = order_match.group(0)
+            date_match = date_re.search(words)
+            if date_match:
+                date_str = date_match.group(0)
+        return '', {'order_no': order_no, 'date_str': date_str}