From ffcb62ceceed868a266cec655fa6c435834a7c49 Mon Sep 17 00:00:00 2001
From: wangyu <823267011@qq.com>
Date: Mon, 1 Jun 2026 10:25:16 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
fad-hrm/pom.xml | 23 ++
.../impl/HrmInvoiceOcrServiceImpl.java | 248 +++++++++++++++---
2 files changed, 229 insertions(+), 42 deletions(-)
diff --git a/fad-hrm/pom.xml b/fad-hrm/pom.xml
index 8ac94b0..d6a7e30 100644
--- a/fad-hrm/pom.xml
+++ b/fad-hrm/pom.xml
@@ -50,5 +50,28 @@
pdfbox
2.0.29
+
+
+ com.google.zxing
+ core
+ 3.5.1
+
+
+ com.google.zxing
+ javase
+ 3.5.1
+
+
+
+ net.sourceforge.tess4j
+ tess4j
+ 5.11.0
+
+
+ org.slf4j
+ slf4j-log4j12
+
+
+
diff --git a/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java b/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java
index fd1eba2..b5f1af7 100644
--- a/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java
+++ b/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java
@@ -1,6 +1,12 @@
package com.ruoyi.hrm.service.impl;
import cn.hutool.core.io.IoUtil;
+import com.google.zxing.BinaryBitmap;
+import com.google.zxing.DecodeHintType;
+import com.google.zxing.MultiFormatReader;
+import com.google.zxing.Result;
+import com.google.zxing.client.j2se.BufferedImageLuminanceSource;
+import com.google.zxing.common.HybridBinarizer;
import com.ruoyi.common.exception.ServiceException;
import com.ruoyi.hrm.domain.vo.HrmInvoiceOcrResultVo;
import com.ruoyi.hrm.service.IHrmInvoiceOcrService;
@@ -9,22 +15,41 @@ import com.ruoyi.system.domain.vo.SysOssVo;
import com.ruoyi.system.mapper.SysOssMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
+import net.sourceforge.tess4j.Tesseract;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
+import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
+import javax.annotation.PostConstruct;
+import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
+import java.io.File;
import java.io.InputStream;
import java.math.BigDecimal;
+import java.net.URL;
+import java.nio.file.Paths;
import java.util.ArrayList;
+import java.util.EnumMap;
import java.util.List;
+import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
- * 发票识别服务实现:直接解析电子发票 PDF 文本,无外部模型依赖。
- * 仅支持 PDF(电子普通发票 / 电子专用发票 / 全电数电票)。
+ * 发票识别服务实现:本地三段式管线,无任何外部 API 调用。
+ *
+ *
+ * - PDFBox 文本层抽取:原生电子发票直接搞定(毫秒级,几乎零算力)
+ * - ZXing 二维码识别:拍照/扫描发票 PDF,从二维码读结构化字段
+ * - Tesseract OCR(仅在前两步失败时触发):本地 chi_sim 字库,无网络
+ *
+ *
+ * Tesseract 字库路径默认 {jar 同级目录}/tessdata,可用 fad.ocr.tessdata-path 覆盖。
+ * 系统需预装:apt install -y tesseract-ocr tesseract-ocr-chi-sim
*/
@Slf4j
@RequiredArgsConstructor
@@ -33,28 +58,48 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
private final SysOssMapper sysOssMapper;
- /** "价税合计(小写)¥123.45" 这种小写金额 */
+ /** 可通过 application.yml 覆盖;默认 jar 同级目录下的 tessdata */
+ @Value("${fad.ocr.tessdata-path:}")
+ private String tessdataPathConfig;
+
+ private String tessdataPath;
+
+ /**
+  * Resolves the effective tessdata directory once after dependency injection.
+  * A non-blank {@code fad.ocr.tessdata-path} value wins; otherwise the path is
+  * {@code <jarDir()>/tessdata}. The chosen path is logged at INFO level.
+  */
+ @PostConstruct
+ void resolveTessdataPath() {
+     // jarDir() is only evaluated when no explicit path is configured.
+     tessdataPath = StringUtils.isNotBlank(tessdataPathConfig)
+             ? tessdataPathConfig
+             : Paths.get(jarDir(), "tessdata").toString();
+     log.info("[Invoice] tessdata path = {}", tessdataPath);
+ }
+
+ /**
+  * Returns the directory used as the base for the default tessdata location.
+  *
+  * <ul>
+  *   <li>Class loaded from a plain jar file: the directory containing that jar.</li>
+  *   <li>Class loaded from a jar nested inside another jar
+  *       ({@code jar:file:/opt/app.jar!/BOOT-INF/lib/x.jar!/}): the directory containing
+  *       the outer jar.</li>
+  *   <li>Class loaded from a directory (e.g. exploded classes): that directory itself.</li>
+  *   <li>Anything that cannot be resolved to a local file: the {@code user.dir} system property.</li>
+  * </ul>
+  *
+  * @return absolute path of the resolved directory, or the working directory as a fallback
+  */
+ private static String jarDir() {
+     try {
+         URL url = HrmInvoiceOcrServiceImpl.class
+                 .getProtectionDomain().getCodeSource().getLocation();
+         String spec = url.toString();
+         if (spec.startsWith("jar:")) {
+             // A "jar:" URL is opaque, so new File(url.toURI()) would throw and we would
+             // silently end up in user.dir. Keep only the outer archive URL instead:
+             // jar:file:/opt/app.jar!/BOOT-INF/lib/x.jar!/  ->  file:/opt/app.jar
+             int bang = spec.indexOf("!/");
+             spec = spec.substring("jar:".length(), bang >= 0 ? bang : spec.length());
+             url = new URL(spec);
+         }
+         File f = new File(url.toURI());
+         return (f.isFile() ? f.getParentFile() : f).getAbsolutePath();
+     } catch (Exception e) {
+         // Missing code source, non-file URL scheme, malformed URL, ...: use the working directory.
+         return System.getProperty("user.dir");
+     }
+ }
+
+ // === 字段抽取正则 ===
private static final Pattern P_TOTAL = Pattern.compile(
"(?:价税合计|小写)[^0-9¥¥]{0,30}[¥¥]?\\s*([0-9,]+\\.[0-9]{2})");
-
- /** 开票日期:2024年01月01日 或 2024-01-01 */
private static final Pattern P_DATE = Pattern.compile(
"开票日期[:: ]*([0-9]{4}[年\\-/][0-9]{1,2}[月\\-/][0-9]{1,2}日?)");
-
- /** 发票类型抬头 */
private static final Pattern P_TYPE = Pattern.compile(
"(电子(?:普通)?发票|增值税电子(?:普通|专用)发票|电子发票([^)]+)|数电(?:普通)?发票|普通发票|专用发票)");
-
- /** 销售方名称:兼顾 "销售方名称:xxx"、"销 售 方 名称:xxx"、新版"销售方信息名称:xxx" */
private static final Pattern P_SELLER = Pattern.compile(
"销\\s*售\\s*方[^名]*名\\s*称[:: ]*([^\\n\\r]+?)(?=\\s{2,}|纳税人|统一社会|地址|开户|$)");
-
- /** 明细行金额(行末两列:金额 税率% 税额 或 金额 税率% 价税合计) */
private static final Pattern P_LINE_AMOUNT = Pattern.compile(
- "([\\u4e00-\\u9fa5A-Za-z0-9()()\\-·.\\*\\s]{2,40}?)\\s+" + // 名称
- "([0-9,]+\\.[0-9]{2})\\s+" + // 金额(不含税)
- "(\\d{1,2}%|免税|不征税|\\*)\\s+" + // 税率
- "([0-9,]+\\.[0-9]{2})"); // 税额
+ "([\\u4e00-\\u9fa5A-Za-z0-9()()\\-·.\\*\\s]{2,40}?)\\s+"
+ + "([0-9,]+\\.[0-9]{2})\\s+"
+ + "(\\d{1,2}%|免税|不征税|\\*)\\s+"
+ + "([0-9,]+\\.[0-9]{2})");
@Override
public HrmInvoiceOcrResultVo recognizeByOssId(Long ossId) {
@@ -62,7 +107,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
if (oss == null) {
throw new ServiceException("附件不存在: " + ossId);
}
-
String suffix = StringUtils.defaultIfBlank(oss.getFileSuffix(), "").toLowerCase().replace(".", "");
if (!"pdf".equals(suffix)) {
throw new ServiceException("仅支持 PDF 电子发票,当前文件类型: " + suffix);
@@ -75,26 +119,154 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
throw new ServiceException("读取附件失败: " + e.getMessage());
}
- return parsePdf(fileBytes);
+ try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(fileBytes))) {
+ // 第一步:文本层
+ String text = extractText(doc);
+ if (StringUtils.isNotBlank(text) && looksLikeInvoice(text)) {
+ log.debug("[Invoice] hit text layer");
+ return buildFromText(text);
+ }
+
+ // 第二步:二维码
+ HrmInvoiceOcrResultVo qr = tryDecodeQrFromPdf(doc);
+ if (qr != null) {
+ log.info("[Invoice] hit QR code");
+ return qr;
+ }
+
+ // 第三步:本地 OCR
+ log.info("[Invoice] fallback to local OCR");
+ String ocrText = runTesseract(doc);
+ if (StringUtils.isBlank(ocrText)) {
+ throw new ServiceException("无法识别该 PDF,请上传开票平台下载的正规 PDF 电子发票。");
+ }
+ return buildFromText(ocrText);
+ } catch (ServiceException e) {
+ throw e;
+ } catch (Exception e) {
+ log.error("[Invoice] 解析失败", e);
+ throw new ServiceException("发票解析失败: " + e.getMessage());
+ }
}
- /** 直接从 PDF 中抽文本并按发票常见版面解析字段 */
- private HrmInvoiceOcrResultVo parsePdf(byte[] bytes) {
- String text;
- try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(bytes))) {
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.setSortByPosition(true);
- stripper.setLineSeparator("\n");
- text = stripper.getText(doc);
- } catch (Exception e) {
- log.error("[Invoice] PDF 解析失败", e);
- throw new ServiceException("PDF 解析失败: " + e.getMessage()
- + "。若为扫描件,请提供电子发票原始 PDF。");
- }
- if (StringUtils.isBlank(text)) {
- throw new ServiceException("无法从 PDF 提取文本,可能为扫描件,请上传电子发票原始 PDF。");
- }
+ /** 文本层抽取 */
+ private String extractText(PDDocument doc) throws Exception {
+ PDFTextStripper stripper = new PDFTextStripper();
+ stripper.setSortByPosition(true);
+ stripper.setLineSeparator("\n");
+ return stripper.getText(doc);
+ }
+ /**
+  * Heuristic check that the text looks like an invoice: true when the total-amount,
+  * date or invoice-type pattern matches, or when the text contains the literal "发票".
+  *
+  * @param text extracted text (the caller passes non-blank text)
+  * @return true if any of the checks succeeds
+  */
+ private boolean looksLikeInvoice(String text) {
+     for (Pattern keyField : new Pattern[] {P_TOTAL, P_DATE, P_TYPE}) {
+         if (keyField.matcher(text).find()) {
+             return true;
+         }
+     }
+     return text.contains("发票");
+ }
+
+ /** 第二步:把每页渲染成图,再用 ZXing 扫二维码 */
+ private HrmInvoiceOcrResultVo tryDecodeQrFromPdf(PDDocument doc) {
+ try {
+ PDFRenderer renderer = new PDFRenderer(doc);
+ // 二维码 200dpi 已经够清晰,CPU 也轻
+ int pages = doc.getNumberOfPages();
+ for (int i = 0; i < pages; i++) {
+ BufferedImage img = renderer.renderImageWithDPI(i, 200, ImageType.GRAY);
+ String content = decodeQr(img);
+ if (StringUtils.isNotBlank(content)) {
+ HrmInvoiceOcrResultVo vo = parseInvoiceQr(content);
+ if (vo != null) return vo;
+ }
+ }
+ } catch (Exception e) {
+ log.debug("[Invoice] QR decode failed: {}", e.getMessage());
+ }
+ return null;
+ }
+
+ /**
+  * Decodes a QR code from a single rendered page image with ZXing.
+  *
+  * @param img the rendered page image
+  * @return the decoded QR text, or null when no QR code is found or decoding fails
+  */
+ private String decodeQr(BufferedImage img) {
+     try {
+         BinaryBitmap bitmap = new BinaryBitmap(
+                 new HybridBinarizer(new BufferedImageLuminanceSource(img)));
+         Map<DecodeHintType, Object> hints = new EnumMap<>(DecodeHintType.class);
+         hints.put(DecodeHintType.TRY_HARDER, Boolean.TRUE);
+         hints.put(DecodeHintType.CHARACTER_SET, "UTF-8");
+         // Restrict the search to QR codes: without this hint MultiFormatReader tries every
+         // supported format and may hand back a 1D barcode (or a spurious match) found on
+         // the page instead of the invoice QR code.
+         hints.put(DecodeHintType.POSSIBLE_FORMATS,
+                 java.util.EnumSet.of(com.google.zxing.BarcodeFormat.QR_CODE));
+         Result r = new MultiFormatReader().decode(bitmap, hints);
+         return r != null ? r.getText() : null;
+     } catch (Exception e) {
+         // Deliberately best effort: "no code found" and any decode error both mean "no QR text".
+         return null;
+     }
+ }
+
+ /**
+  * Parses the content of a VAT invoice QR code. Expected comma-separated layout
+  * (as documented by the original author):
+  * {@code 01,version,invoice code,invoice number,pre-tax amount,issue date (YYYYMMDD),
+  * last 6 digits of the check code,...}
+  *
+  * @param raw decoded QR text, may be null
+  * @return a result holding the amount, the date (only when the date field is 8 digits),
+  *         a fixed invoice type and one summary item; null when the payload does not have
+  *         at least 7 fields, does not start with "0", or carries no parsable amount
+  */
+ private HrmInvoiceOcrResultVo parseInvoiceQr(String raw) {
+     if (raw == null) {
+         return null;
+     }
+     String[] parts = raw.split(",");
+     if (parts.length < 7 || !parts[0].startsWith("0")) {
+         return null;
+     }
+     try {
+         // parts[4] is the pre-tax amount; the QR code of a paper invoice carries no
+         // tax-inclusive total, so it is put into totalAmount for now and the front end
+         // can adjust it.
+         BigDecimal amt = parseBigDecimal(parts[4]);
+         if (amt == null) {
+             // No usable amount means this is not an invoice payload we understand.
+             // Returning null lets the caller continue with its next recognition step
+             // instead of handing back a result without any amount.
+             return null;
+         }
+         HrmInvoiceOcrResultVo vo = new HrmInvoiceOcrResultVo();
+         vo.setTotalAmount(amt);
+         String d = parts[5];
+         // Require exactly 8 digits so a malformed field cannot produce a bogus date string.
+         if (d != null && d.matches("[0-9]{8}")) {
+             vo.setInvoiceDate(d.substring(0, 4) + "-" + d.substring(4, 6) + "-" + d.substring(6, 8));
+         }
+         vo.setInvoiceType("增值税发票(二维码识别)");
+         List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
+         HrmInvoiceOcrResultVo.Item it = new HrmInvoiceOcrResultVo.Item();
+         it.setItemName("发票款项(金额请核对原票,二维码仅含不含税金额)");
+         it.setAmount(amt);
+         items.add(it);
+         vo.setItems(items);
+         return vo;
+     } catch (Exception e) {
+         // Any unexpected failure while building the result is treated as "not an invoice QR".
+         return null;
+     }
+ }
+
+ /**
+  * Step 3: local Tesseract OCR fallback. Renders every page at 300 DPI in grayscale,
+  * runs OCR with the {@code chi_sim} language data and concatenates the per-page text
+  * (each page followed by a newline) so the regex-based extraction can parse it.
+  *
+  * @param doc the already-opened PDF document
+  * @return the concatenated OCR text of all pages
+  * @throws ServiceException when the tessdata directory does not exist or OCR fails
+  */
+ private String runTesseract(PDDocument doc) {
+     try {
+         File td = new File(tessdataPath);
+         if (!td.isDirectory()) {
+             log.warn("[Invoice] tessdata 目录不存在: {},无法走 OCR 兜底", tessdataPath);
+             throw new ServiceException(
+                     "本地 OCR 字库未配置。请在 jar 同级目录建 tessdata/ 并放入 chi_sim.traineddata,"
+                             + "或通过 fad.ocr.tessdata-path 指定路径。");
+         }
+         Tesseract t = new Tesseract();
+         t.setDatapath(tessdataPath);
+         t.setLanguage("chi_sim");
+         t.setPageSegMode(6); // assume a single block of text
+         t.setOcrEngineMode(1); // LSTM only
+
+         PDFRenderer renderer = new PDFRenderer(doc);
+         StringBuilder sb = new StringBuilder();
+         int pages = doc.getNumberOfPages();
+         for (int i = 0; i < pages; i++) {
+             BufferedImage img = renderer.renderImageWithDPI(i, 300, ImageType.GRAY);
+             sb.append(t.doOCR(img)).append('\n');
+         }
+         return sb.toString();
+     } catch (ServiceException e) {
+         // Already a user-facing error (missing tessdata): pass it through untouched.
+         throw e;
+     } catch (Throwable e) {
+         // NOTE(review): Throwable (not Exception) is presumably caught so that native
+         // library load failures, which are Errors, are also reported — confirm intent.
+         // Log the throwable itself so the stack trace is kept, not just the message.
+         log.error("[Invoice] tesseract 失败", e);
+         throw new ServiceException(
+                 "本地 OCR 失败:" + e.getMessage()
+                         + "。请确认服务器已 apt install tesseract-ocr tesseract-ocr-chi-sim,"
+                         + "且 jar 同级 tessdata/ 下有 chi_sim.traineddata。");
+     }
+ }
+
+ /** 文本 → VO 的统一抽取逻辑 */
+ private HrmInvoiceOcrResultVo buildFromText(String text) {
HrmInvoiceOcrResultVo result = new HrmInvoiceOcrResultVo();
result.setInvoiceType(firstGroup(P_TYPE, text));
result.setInvoiceDate(firstGroup(P_DATE, text));
@@ -102,8 +274,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
result.setTotalAmount(parseBigDecimal(firstGroup(P_TOTAL, text)));
List items = parseLineItems(text);
-
- // 兜底:解析不到明细但有总价 → 生成一条汇总
if (items.isEmpty() && result.getTotalAmount() != null) {
HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
item.setItemName(StringUtils.defaultIfBlank(result.getSellerName(), "发票款项"));
@@ -114,21 +284,18 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
return result;
}
- /** 抽明细行:在"货物名称 … 合计"之间逐行匹配 */
private List parseLineItems(String text) {
List items = new ArrayList<>();
- // 定位明细表区间,找不到也没关系,直接全文匹配也能跑
int begin = indexOfAny(text, "项目名称", "货物或应税劳务", "货物名称");
int end = indexOfAny(text, "合\\s*计", "价税合计", "(大写)", "(大写)");
String area = (begin >= 0 && end > begin) ? text.substring(begin, end) : text;
-
for (String line : area.split("\\n")) {
line = line.trim();
if (line.length() < 6) continue;
Matcher m = P_LINE_AMOUNT.matcher(line);
if (!m.find()) continue;
HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
- String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", ""); // 去掉 *类别* 前缀
+ String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", "");
BigDecimal preTax = parseBigDecimal(m.group(2));
String rate = m.group(3);
BigDecimal tax = parseBigDecimal(m.group(4));
@@ -162,9 +329,7 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
private static String cleanSeller(String s) {
if (s == null) return null;
- // 去掉抬头里常见的"名称:"残留 + 末尾空白和半角空格序列
s = s.replaceAll("^[::\\s]+", "").trim();
- // 截断到第一个非中文/字母/数字/常见公司符号块
String[] tail = s.split("\\s{2,}");
return tail.length > 0 ? tail[0].trim() : s;
}
@@ -177,5 +342,4 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
return null;
}
}
-
}