提交

2026-06-01 10:25:16 +08:00
parent dcc66aa4a9
commit ffcb62cece
2 changed files with 229 additions and 42 deletions
--- a/fad-hrm/pom.xml
+++ b/fad-hrm/pom.xml
@@ -50,5 +50,28 @@
            <artifactId>pdfbox</artifactId>
            <version>2.0.29</version>
        </dependency>
+        <!-- 二维码识别（发票二维码兜底） -->
+        <dependency>
+            <groupId>com.google.zxing</groupId>
+            <artifactId>core</artifactId>
+            <version>3.5.1</version>
+        </dependency>
+        <dependency>
+            <groupId>com.google.zxing</groupId>
+            <artifactId>javase</artifactId>
+            <version>3.5.1</version>
+        </dependency>
+        <!-- 本地 OCR（Tesseract JNA 绑定）：仅在 PDF 没有文本层且二维码不可读时启用 -->
+        <dependency>
+            <groupId>net.sourceforge.tess4j</groupId>
+            <artifactId>tess4j</artifactId>
+            <version>5.11.0</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
    </dependencies>
 </project>
--- a/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java
+++ b/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java
@@ -1,6 +1,12 @@
 package com.ruoyi.hrm.service.impl;

 import cn.hutool.core.io.IoUtil;
+import com.google.zxing.BinaryBitmap;
+import com.google.zxing.DecodeHintType;
+import com.google.zxing.MultiFormatReader;
+import com.google.zxing.Result;
+import com.google.zxing.client.j2se.BufferedImageLuminanceSource;
+import com.google.zxing.common.HybridBinarizer;
 import com.ruoyi.common.exception.ServiceException;
 import com.ruoyi.hrm.domain.vo.HrmInvoiceOcrResultVo;
 import com.ruoyi.hrm.service.IHrmInvoiceOcrService;
@@ -9,22 +15,41 @@ import com.ruoyi.system.domain.vo.SysOssVo;
 import com.ruoyi.system.mapper.SysOssMapper;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
+import net.sourceforge.tess4j.Tesseract;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.text.PDFTextStripper;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;

+import javax.annotation.PostConstruct;
+import java.awt.image.BufferedImage;
 import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.io.InputStream;
 import java.math.BigDecimal;
+import java.net.URL;
+import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.EnumMap;
 import java.util.List;
+import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
- * 发票识别服务实现：直接解析电子发票 PDF 文本，无外部模型依赖。
- * 仅支持 PDF（电子普通发票 / 电子专用发票 / 全电数电票）。
+ * 发票识别服务实现：本地三段式管线，无任何外部 API 调用。
+ *
+ * <ol>
+ *   <li>PDFBox 文本层抽取：原生电子发票直接搞定（毫秒级，几乎零算力）</li>
+ *   <li>ZXing 二维码识别：拍照/扫描发票 PDF，从二维码读结构化字段</li>
+ *   <li>Tesseract OCR（仅在前两步失败时触发）：本地 chi_sim 字库，无网络</li>
+ * </ol>
+ *
+ * Tesseract 字库路径默认 {jar 同级目录}/tessdata，可用 fad.ocr.tessdata-path 覆盖。
+ * 系统需预装：apt install -y tesseract-ocr tesseract-ocr-chi-sim
 */
@Slf4j
@RequiredArgsConstructor
@@ -33,28 +58,48 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {

    private final SysOssMapper sysOssMapper;

-    /** "价税合计（小写）¥123.45" 这种小写金额 */
+    /** 可通过 application.yml 覆盖；默认 jar 同级目录下的 tessdata */
+    @Value("${fad.ocr.tessdata-path:}")
+    private String tessdataPathConfig;
+
+    private String tessdataPath;
+
+    /** Resolves the tessdata directory after injection: the configured path, else {@code <jarDir()>/tessdata}. */
+    @PostConstruct
+    void resolveTessdataPath() {
+        boolean configured = StringUtils.isNotBlank(tessdataPathConfig);
+        tessdataPath = configured
+            ? tessdataPathConfig
+            : Paths.get(jarDir(), "tessdata").toString();
+        log.info("[Invoice] tessdata path = {}", tessdataPath);
+    }
+
+    /** Directory containing the jar (or the classes directory itself); falls back to user.dir on any failure. */
+    private static String jarDir() {
+        try {
+            URL location = HrmInvoiceOcrServiceImpl.class.getProtectionDomain().getCodeSource().getLocation();
+            File codeSource = new File(location.toURI());
+            return (codeSource.isFile() ? codeSource.getParentFile() : codeSource).getAbsolutePath();
+        } catch (Exception e) {
+            // NOTE(review): also reached when the location is not a plain file: URL (e.g. a nested jar) - confirm.
+            return System.getProperty("user.dir");
+        }
+    }
+
+    // === 字段抽取正则 ===
    private static final Pattern P_TOTAL = Pattern.compile(
        "(?:价税合计|小写)[^0-9¥￥]{0,30}[¥￥]?\\s*([0-9,]+\\.[0-9]{2})");
-
-    /** 开票日期：2024年01月01日 或 2024-01-01 */
    private static final Pattern P_DATE = Pattern.compile(
        "开票日期[：: ]*([0-9]{4}[年\\-/][0-9]{1,2}[月\\-/][0-9]{1,2}日?)");
-
-    /** 发票类型抬头 */
    private static final Pattern P_TYPE = Pattern.compile(
        "(电子(?:普通)?发票|增值税电子(?:普通|专用)发票|电子发票（[^）]+）|数电(?:普通)?发票|普通发票|专用发票)");
-
-    /** 销售方名称：兼顾 "销售方名称：xxx"、"销 售 方 名称：xxx"、新版"销售方信息名称：xxx" */
    private static final Pattern P_SELLER = Pattern.compile(
        "销\\s*售\\s*方[^名]*名\\s*称[：: ]*([^\\n\\r]+?)(?=\\s{2,}|纳税人|统一社会|地址|开户|$)");
-
-    /** 明细行金额（行末两列：金额  税率%  税额 或 金额 税率% 价税合计） */
    private static final Pattern P_LINE_AMOUNT = Pattern.compile(
-        "([\\u4e00-\\u9fa5A-Za-z0-9（）()\\-·.\\*\\s]{2,40}?)\\s+" +     // 名称
-        "([0-9,]+\\.[0-9]{2})\\s+" +                                       // 金额（不含税）
-        "(\\d{1,2}%|免税|不征税|\\*)\\s+" +                                 // 税率
-        "([0-9,]+\\.[0-9]{2})");                                            // 税额
+        "([\\u4e00-\\u9fa5A-Za-z0-9（）()\\-·.\\*\\s]{2,40}?)\\s+"
+        + "([0-9,]+\\.[0-9]{2})\\s+"
+        + "(\\d{1,2}%|免税|不征税|\\*)\\s+"
+        + "([0-9,]+\\.[0-9]{2})");

    @Override
    public HrmInvoiceOcrResultVo recognizeByOssId(Long ossId) {
@@ -62,7 +107,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
        if (oss == null) {
            throw new ServiceException("附件不存在: " + ossId);
        }
-
        String suffix = StringUtils.defaultIfBlank(oss.getFileSuffix(), "").toLowerCase().replace(".", "");
        if (!"pdf".equals(suffix)) {
            throw new ServiceException("仅支持 PDF 电子发票，当前文件类型: " + suffix);
@@ -75,26 +119,154 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
            throw new ServiceException("读取附件失败: " + e.getMessage());
        }

-        return parsePdf(fileBytes);
+        try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(fileBytes))) {
+            // 第一步：文本层
+            String text = extractText(doc);
+            if (StringUtils.isNotBlank(text) && looksLikeInvoice(text)) {
+                log.debug("[Invoice] hit text layer");
+                return buildFromText(text);
+            }
+
+            // 第二步：二维码
+            HrmInvoiceOcrResultVo qr = tryDecodeQrFromPdf(doc);
+            if (qr != null) {
+                log.info("[Invoice] hit QR code");
+                return qr;
+            }
+
+            // 第三步：本地 OCR
+            log.info("[Invoice] fallback to local OCR");
+            String ocrText = runTesseract(doc);
+            if (StringUtils.isBlank(ocrText)) {
+                throw new ServiceException("无法识别该 PDF，请上传开票平台下载的正规 PDF 电子发票。");
+            }
+            return buildFromText(ocrText);
+        } catch (ServiceException e) {
+            throw e;
+        } catch (Exception e) {
+            log.error("[Invoice] 解析失败", e);
+            throw new ServiceException("发票解析失败: " + e.getMessage());
+        }
    }

-    /** 直接从 PDF 中抽文本并按发票常见版面解析字段 */
-    private HrmInvoiceOcrResultVo parsePdf(byte[] bytes) {
-        String text;
-        try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(bytes))) {
-            PDFTextStripper stripper = new PDFTextStripper();
-            stripper.setSortByPosition(true);
-            stripper.setLineSeparator("\n");
-            text = stripper.getText(doc);
-        } catch (Exception e) {
-            log.error("[Invoice] PDF 解析失败", e);
-            throw new ServiceException("PDF 解析失败: " + e.getMessage()
-                + "。若为扫描件，请提供电子发票原始 PDF。");
-        }
-        if (StringUtils.isBlank(text)) {
-            throw new ServiceException("无法从 PDF 提取文本，可能为扫描件，请上传电子发票原始 PDF。");
-        }
+    /** Extracts the PDF text layer, sorted by position, using a newline as the line separator. */
+    private String extractText(PDDocument doc) throws Exception {
+        PDFTextStripper textStripper = new PDFTextStripper();
+        textStripper.setLineSeparator("\n");
+        textStripper.setSortByPosition(true);
+        return textStripper.getText(doc);
+    }

+    /** Heuristic gate: true when the text contains "发票" or matches the total, date or type pattern. */
+    private boolean looksLikeInvoice(String text) {
+        if (text.contains("发票")) {
+            return true;
+        }
+        return P_TOTAL.matcher(text).find() || P_DATE.matcher(text).find() || P_TYPE.matcher(text).find();
+    }
+
+    /**
+     * Step 2: renders each page to a grayscale image and scans it with ZXing; returns null when no page yields a result.
+     */
+    private HrmInvoiceOcrResultVo tryDecodeQrFromPdf(PDDocument doc) {
+        try {
+            PDFRenderer pageRenderer = new PDFRenderer(doc);
+            for (int page = 0, total = doc.getNumberOfPages(); page < total; page++) {
+                // 200 DPI was chosen by the original author as sharp enough for QR codes while staying light on CPU.
+                String payload = decodeQr(pageRenderer.renderImageWithDPI(page, 200, ImageType.GRAY));
+                HrmInvoiceOcrResultVo parsed = StringUtils.isNotBlank(payload) ? parseInvoiceQr(payload) : null;
+                if (parsed != null) {
+                    return parsed;
+                }
+            }
+        } catch (Exception e) {
+            log.debug("[Invoice] QR decode failed: {}", e.getMessage());
+        }
+        return null;
+    }
+
+    /** Decodes one image with ZXing and returns the decoded text, or null when nothing decodes or any error occurs. */
+    private String decodeQr(BufferedImage img) {
+        Map<DecodeHintType, Object> hints = new EnumMap<>(DecodeHintType.class);
+        hints.put(DecodeHintType.TRY_HARDER, Boolean.TRUE);
+        hints.put(DecodeHintType.CHARACTER_SET, "UTF-8");
+        try {
+            BufferedImageLuminanceSource source = new BufferedImageLuminanceSource(img);
+            Result decoded = new MultiFormatReader().decode(new BinaryBitmap(new HybridBinarizer(source)), hints);
+            return decoded == null ? null : decoded.getText();
+        } catch (Exception ignored) {
+            // Best effort: every failure, including "no code on this page", is reported as null.
+            return null;
+        }
+    }
+
+    /**
+     * Parses a VAT invoice QR payload (comma-separated); returns null when it is not a usable invoice payload.
+     * Fields: 01,version,invoice code,invoice number,pre-tax amount,issue date (YYYYMMDD),last 6 digits of check code,...
+     */
+    private HrmInvoiceOcrResultVo parseInvoiceQr(String raw) {
+        if (raw == null) return null;
+        String[] parts = raw.split(",");
+        if (parts.length < 7 || !parts[0].startsWith("0")) return null;
+        try {
+            BigDecimal amt = parseBigDecimal(parts[4]);
+            if (amt == null) return null; // no usable amount: report "no hit" so the caller falls through to OCR
+            HrmInvoiceOcrResultVo vo = new HrmInvoiceOcrResultVo();
+            vo.setTotalAmount(amt); // pre-tax amount stands in for the total (paper-invoice QR has no tax-inclusive total)
+            String d = parts[5];
+            if (d.matches("\\d{8}")) {
+                vo.setInvoiceDate(d.substring(0, 4) + "-" + d.substring(4, 6) + "-" + d.substring(6, 8));
+            }
+            vo.setInvoiceType("增值税发票（二维码识别）");
+            HrmInvoiceOcrResultVo.Item it = new HrmInvoiceOcrResultVo.Item();
+            it.setItemName("发票款项（金额请核对原票，二维码仅含不含税金额）");
+            it.setAmount(amt);
+            List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
+            items.add(it);
+            vo.setItems(items);
+            return vo;
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+    /**
+     * Step 3: local Tesseract OCR; the text of all pages is concatenated and handed to the regex extraction.
+     */
+    private String runTesseract(PDDocument doc) {
+        try {
+            if (!new File(tessdataPath).isDirectory()) {
+                log.warn("[Invoice] tessdata 目录不存在: {}，无法走 OCR 兜底", tessdataPath);
+                throw new ServiceException(
+                    "本地 OCR 字库未配置。请在 jar 同级目录建 tessdata/ 并放入 chi_sim.traineddata，"
+                    + "或通过 fad.ocr.tessdata-path 指定路径。");
+            }
+            Tesseract ocr = new Tesseract();
+            ocr.setDatapath(tessdataPath);
+            ocr.setLanguage("chi_sim");
+            ocr.setPageSegMode(6);   // assume a single block of text
+            ocr.setOcrEngineMode(1); // LSTM only
+            PDFRenderer renderer = new PDFRenderer(doc);
+            StringBuilder recognized = new StringBuilder();
+            for (int page = 0, total = doc.getNumberOfPages(); page < total; page++) {
+                BufferedImage img = renderer.renderImageWithDPI(page, 300, ImageType.GRAY);
+                recognized.append(ocr.doOCR(img)).append('\n');
+            }
+            return recognized.toString();
+        } catch (ServiceException e) {
+            throw e;
+        } catch (Throwable e) {
+            // Pass the throwable itself (not only its message) so the stack trace reaches the log.
+            log.error("[Invoice] tesseract 失败", e);
+            throw new ServiceException(
+                "本地 OCR 失败：" + e.getMessage()
+                + "。请确认服务器已 apt install tesseract-ocr tesseract-ocr-chi-sim，"
+                + "且 jar 同级 tessdata/ 下有 chi_sim.traineddata。");
+        }
+    }
+
+    /** 文本 → VO 的统一抽取逻辑 */
+    private HrmInvoiceOcrResultVo buildFromText(String text) {
        HrmInvoiceOcrResultVo result = new HrmInvoiceOcrResultVo();
        result.setInvoiceType(firstGroup(P_TYPE, text));
        result.setInvoiceDate(firstGroup(P_DATE, text));
@@ -102,8 +274,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
        result.setTotalAmount(parseBigDecimal(firstGroup(P_TOTAL, text)));

        List<HrmInvoiceOcrResultVo.Item> items = parseLineItems(text);
-
-        // 兜底：解析不到明细但有总价 → 生成一条汇总
        if (items.isEmpty() && result.getTotalAmount() != null) {
            HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
            item.setItemName(StringUtils.defaultIfBlank(result.getSellerName(), "发票款项"));
@@ -114,21 +284,18 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
        return result;
    }

-    /** 抽明细行：在"货物名称 … 合计"之间逐行匹配 */
    private List<HrmInvoiceOcrResultVo.Item> parseLineItems(String text) {
        List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
-        // 定位明细表区间，找不到也没关系，直接全文匹配也能跑
        int begin = indexOfAny(text, "项目名称", "货物或应税劳务", "货物名称");
        int end   = indexOfAny(text, "合\\s*计", "价税合计", "（大写）", "(大写)");
        String area = (begin >= 0 && end > begin) ? text.substring(begin, end) : text;
-
        for (String line : area.split("\\n")) {
            line = line.trim();
            if (line.length() < 6) continue;
            Matcher m = P_LINE_AMOUNT.matcher(line);
            if (!m.find()) continue;
            HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
-            String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", ""); // 去掉 *类别* 前缀
+            String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", "");
            BigDecimal preTax = parseBigDecimal(m.group(2));
            String rate = m.group(3);
            BigDecimal tax = parseBigDecimal(m.group(4));
@@ -162,9 +329,7 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {

    private static String cleanSeller(String s) {
        if (s == null) return null;
-        // 去掉抬头里常见的"名称："残留 + 末尾空白和半角空格序列
        s = s.replaceAll("^[：:\\s]+", "").trim();
-        // 截断到第一个非中文/字母/数字/常见公司符号块
        String[] tail = s.split("\\s{2,}");
        return tail.length > 0 ? tail[0].trim() : s;
    }
@@ -177,5 +342,4 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
            return null;
        }
    }
-
 }