This commit is contained in:
2026-06-01 10:25:16 +08:00
parent dcc66aa4a9
commit ffcb62cece
2 changed files with 229 additions and 42 deletions

View File

@@ -50,5 +50,28 @@
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
<!-- 二维码识别(发票二维码兜底) -->
<dependency>
<groupId>com.google.zxing</groupId>
<artifactId>core</artifactId>
<version>3.5.1</version>
</dependency>
<dependency>
<groupId>com.google.zxing</groupId>
<artifactId>javase</artifactId>
<version>3.5.1</version>
</dependency>
<!-- 本地 OCR(Tesseract JNA 绑定):仅在 PDF 没有文本层且二维码不可读时启用 -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.11.0</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>

View File

@@ -1,6 +1,12 @@
package com.ruoyi.hrm.service.impl;
import cn.hutool.core.io.IoUtil;
import com.google.zxing.BinaryBitmap;
import com.google.zxing.DecodeHintType;
import com.google.zxing.MultiFormatReader;
import com.google.zxing.Result;
import com.google.zxing.client.j2se.BufferedImageLuminanceSource;
import com.google.zxing.common.HybridBinarizer;
import com.ruoyi.common.exception.ServiceException;
import com.ruoyi.hrm.domain.vo.HrmInvoiceOcrResultVo;
import com.ruoyi.hrm.service.IHrmInvoiceOcrService;
@@ -9,22 +15,41 @@ import com.ruoyi.system.domain.vo.SysOssVo;
import com.ruoyi.system.mapper.SysOssMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.Tesseract;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import javax.annotation.PostConstruct;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.math.BigDecimal;
import java.net.URL;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 发票识别服务实现:直接解析电子发票 PDF 文本,无外部模型依赖
* 仅支持 PDF电子普通发票 / 电子专用发票 / 全电数电票)。
* 发票识别服务实现:本地三段式管线,无任何外部 API 调用
*
* <ol>
* <li>PDFBox 文本层抽取:原生电子发票直接搞定(毫秒级,几乎零算力)</li>
* <li>ZXing 二维码识别:拍照/扫描发票 PDF,从二维码读结构化字段</li>
* <li>Tesseract OCR:仅在前两步失败时触发,本地 chi_sim 字库,无网络</li>
* </ol>
*
* Tesseract 字库路径默认 {jar 同级目录}/tessdata,可用 fad.ocr.tessdata-path 覆盖。
* 系统需预装:apt install -y tesseract-ocr tesseract-ocr-chi-sim
*/
@Slf4j
@RequiredArgsConstructor
@@ -33,28 +58,48 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
private final SysOssMapper sysOssMapper;
/** "价税合计小写¥123.45" 这种小写金额 */
/** 可通过 application.yml 覆盖;默认 jar 同级目录下的 tessdata */
@Value("${fad.ocr.tessdata-path:}")
private String tessdataPathConfig;
private String tessdataPath;
@PostConstruct
void resolveTessdataPath() {
if (StringUtils.isNotBlank(tessdataPathConfig)) {
tessdataPath = tessdataPathConfig;
} else {
tessdataPath = Paths.get(jarDir(), "tessdata").toString();
}
log.info("[Invoice] tessdata path = {}", tessdataPath);
}
/** 取 jar 所在目录IDE 调试时 fall back 到工作目录 */
private static String jarDir() {
try {
URL url = HrmInvoiceOcrServiceImpl.class
.getProtectionDomain().getCodeSource().getLocation();
File f = new File(url.toURI());
return (f.isFile() ? f.getParentFile() : f).getAbsolutePath();
} catch (Exception e) {
return System.getProperty("user.dir");
}
}
// === 字段抽取正则 ===
private static final Pattern P_TOTAL = Pattern.compile(
"(?:价税合计|小写)[^0-9¥¥]{0,30}[¥¥]?\\s*([0-9,]+\\.[0-9]{2})");
/** 开票日期2024年01月01日 或 2024-01-01 */
private static final Pattern P_DATE = Pattern.compile(
"开票日期[: ]*([0-9]{4}[年\\-/][0-9]{1,2}[月\\-/][0-9]{1,2}日?)");
/** 发票类型抬头 */
private static final Pattern P_TYPE = Pattern.compile(
"(电子(?:普通)?发票|增值税电子(?:普通|专用)发票|电子发票([^]+|数电(?:普通)?发票|普通发票|专用发票)");
/** 销售方名称:兼顾 "销售方名称xxx"、"销 售 方 名称xxx"、新版"销售方信息名称xxx" */
private static final Pattern P_SELLER = Pattern.compile(
"\\s*售\\s*方[^名]*名\\s*称[: ]*([^\\n\\r]+?)(?=\\s{2,}|纳税人|统一社会|地址|开户|$)");
/** 明细行金额(行末两列:金额 税率% 税额 或 金额 税率% 价税合计) */
private static final Pattern P_LINE_AMOUNT = Pattern.compile(
"([\\u4e00-\\u9fa5A-Za-z0-9()\\-·.\\*\\s]{2,40}?)\\s+" + // 名称
"([0-9,]+\\.[0-9]{2})\\s+" + // 金额(不含税)
"(\\d{1,2}%|免税|不征税|\\*)\\s+" + // 税率
"([0-9,]+\\.[0-9]{2})"); // 税额
"([\\u4e00-\\u9fa5A-Za-z0-9()\\-·.\\*\\s]{2,40}?)\\s+"
+ "([0-9,]+\\.[0-9]{2})\\s+"
+ "(\\d{1,2}%|免税|不征税|\\*)\\s+"
+ "([0-9,]+\\.[0-9]{2})");
@Override
public HrmInvoiceOcrResultVo recognizeByOssId(Long ossId) {
@@ -62,7 +107,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
if (oss == null) {
throw new ServiceException("附件不存在: " + ossId);
}
String suffix = StringUtils.defaultIfBlank(oss.getFileSuffix(), "").toLowerCase().replace(".", "");
if (!"pdf".equals(suffix)) {
throw new ServiceException("仅支持 PDF 电子发票,当前文件类型: " + suffix);
@@ -75,26 +119,154 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
throw new ServiceException("读取附件失败: " + e.getMessage());
}
return parsePdf(fileBytes);
try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(fileBytes))) {
// 第一步:文本层
String text = extractText(doc);
if (StringUtils.isNotBlank(text) && looksLikeInvoice(text)) {
log.debug("[Invoice] hit text layer");
return buildFromText(text);
}
// 第二步:二维码
HrmInvoiceOcrResultVo qr = tryDecodeQrFromPdf(doc);
if (qr != null) {
log.info("[Invoice] hit QR code");
return qr;
}
// 第三步:本地 OCR
log.info("[Invoice] fallback to local OCR");
String ocrText = runTesseract(doc);
if (StringUtils.isBlank(ocrText)) {
throw new ServiceException("无法识别该 PDF请上传开票平台下载的正规 PDF 电子发票。");
}
return buildFromText(ocrText);
} catch (ServiceException e) {
throw e;
} catch (Exception e) {
log.error("[Invoice] 解析失败", e);
throw new ServiceException("发票解析失败: " + e.getMessage());
}
}
/** 直接从 PDF 中抽文本并按发票常见版面解析字段 */
private HrmInvoiceOcrResultVo parsePdf(byte[] bytes) {
String text;
try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(bytes))) {
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(true);
stripper.setLineSeparator("\n");
text = stripper.getText(doc);
} catch (Exception e) {
log.error("[Invoice] PDF 解析失败", e);
throw new ServiceException("PDF 解析失败: " + e.getMessage()
+ "。若为扫描件,请提供电子发票原始 PDF。");
}
if (StringUtils.isBlank(text)) {
throw new ServiceException("无法从 PDF 提取文本,可能为扫描件,请上传电子发票原始 PDF。");
}
/** 文本层抽取 */
private String extractText(PDDocument doc) throws Exception {
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(true);
stripper.setLineSeparator("\n");
return stripper.getText(doc);
}
/** 是否长得像发票:要至少出现金额/日期/抬头其中一个关键字段 */
private boolean looksLikeInvoice(String text) {
return P_TOTAL.matcher(text).find()
|| P_DATE.matcher(text).find()
|| P_TYPE.matcher(text).find()
|| text.contains("发票");
}
/** 第二步:把每页渲染成图,再用 ZXing 扫二维码 */
private HrmInvoiceOcrResultVo tryDecodeQrFromPdf(PDDocument doc) {
try {
PDFRenderer renderer = new PDFRenderer(doc);
// 二维码 200dpi 已经够清晰CPU 也轻
int pages = doc.getNumberOfPages();
for (int i = 0; i < pages; i++) {
BufferedImage img = renderer.renderImageWithDPI(i, 200, ImageType.GRAY);
String content = decodeQr(img);
if (StringUtils.isNotBlank(content)) {
HrmInvoiceOcrResultVo vo = parseInvoiceQr(content);
if (vo != null) return vo;
}
}
} catch (Exception e) {
log.debug("[Invoice] QR decode failed: {}", e.getMessage());
}
return null;
}
/** ZXing 解码一张图,返回二维码文本(无则 null */
private String decodeQr(BufferedImage img) {
try {
BinaryBitmap bitmap = new BinaryBitmap(
new HybridBinarizer(new BufferedImageLuminanceSource(img)));
Map<DecodeHintType, Object> hints = new EnumMap<>(DecodeHintType.class);
hints.put(DecodeHintType.TRY_HARDER, Boolean.TRUE);
hints.put(DecodeHintType.CHARACTER_SET, "UTF-8");
Result r = new MultiFormatReader().decode(bitmap, hints);
return r != null ? r.getText() : null;
} catch (Exception e) {
return null;
}
}
/**
* 解析增值税发票二维码内容。格式(逗号分隔):
* 01,版本号,发票代码,发票号码,金额(不含税),开票日期(YYYYMMDD),校验码后6位,...
*/
private HrmInvoiceOcrResultVo parseInvoiceQr(String raw) {
if (raw == null) return null;
String[] parts = raw.split(",");
if (parts.length < 7 || !parts[0].startsWith("0")) return null;
try {
HrmInvoiceOcrResultVo vo = new HrmInvoiceOcrResultVo();
// parts[4] 是不含税金额;纸质票二维码无价税合计,先填到 totalAmount 让前端可调
BigDecimal amt = parseBigDecimal(parts[4]);
vo.setTotalAmount(amt);
String d = parts[5];
if (d != null && d.length() == 8) {
vo.setInvoiceDate(d.substring(0, 4) + "-" + d.substring(4, 6) + "-" + d.substring(6, 8));
}
vo.setInvoiceType("增值税发票(二维码识别)");
List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
HrmInvoiceOcrResultVo.Item it = new HrmInvoiceOcrResultVo.Item();
it.setItemName("发票款项(金额请核对原票,二维码仅含不含税金额)");
it.setAmount(amt);
items.add(it);
vo.setItems(items);
return vo;
} catch (Exception e) {
return null;
}
}
/** 第三步Tesseract 本地 OCR整文本拼接交给 regex 解析 */
private String runTesseract(PDDocument doc) {
try {
File td = new File(tessdataPath);
if (!td.isDirectory()) {
log.warn("[Invoice] tessdata 目录不存在: {},无法走 OCR 兜底", tessdataPath);
throw new ServiceException(
"本地 OCR 字库未配置。请在 jar 同级目录建 tessdata/ 并放入 chi_sim.traineddata"
+ "或通过 fad.ocr.tessdata-path 指定路径。");
}
Tesseract t = new Tesseract();
t.setDatapath(tessdataPath);
t.setLanguage("chi_sim");
t.setPageSegMode(6); // 假设单块文本
t.setOcrEngineMode(1); // LSTM only
PDFRenderer renderer = new PDFRenderer(doc);
StringBuilder sb = new StringBuilder();
int pages = doc.getNumberOfPages();
for (int i = 0; i < pages; i++) {
BufferedImage img = renderer.renderImageWithDPI(i, 300, ImageType.GRAY);
sb.append(t.doOCR(img)).append('\n');
}
return sb.toString();
} catch (ServiceException e) {
throw e;
} catch (Throwable e) {
log.error("[Invoice] tesseract 失败: {}", e.getMessage());
throw new ServiceException(
"本地 OCR 失败:" + e.getMessage()
+ "。请确认服务器已 apt install tesseract-ocr tesseract-ocr-chi-sim"
+ "且 jar 同级 tessdata/ 下有 chi_sim.traineddata。");
}
}
/** 文本 → VO 的统一抽取逻辑 */
private HrmInvoiceOcrResultVo buildFromText(String text) {
HrmInvoiceOcrResultVo result = new HrmInvoiceOcrResultVo();
result.setInvoiceType(firstGroup(P_TYPE, text));
result.setInvoiceDate(firstGroup(P_DATE, text));
@@ -102,8 +274,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
result.setTotalAmount(parseBigDecimal(firstGroup(P_TOTAL, text)));
List<HrmInvoiceOcrResultVo.Item> items = parseLineItems(text);
// 兜底:解析不到明细但有总价 → 生成一条汇总
if (items.isEmpty() && result.getTotalAmount() != null) {
HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
item.setItemName(StringUtils.defaultIfBlank(result.getSellerName(), "发票款项"));
@@ -114,21 +284,18 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
return result;
}
/** 抽明细行:在"货物名称 … 合计"之间逐行匹配 */
private List<HrmInvoiceOcrResultVo.Item> parseLineItems(String text) {
List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
// 定位明细表区间,找不到也没关系,直接全文匹配也能跑
int begin = indexOfAny(text, "项目名称", "货物或应税劳务", "货物名称");
int end = indexOfAny(text, "\\s*计", "价税合计", "(大写)", "(大写)");
String area = (begin >= 0 && end > begin) ? text.substring(begin, end) : text;
for (String line : area.split("\\n")) {
line = line.trim();
if (line.length() < 6) continue;
Matcher m = P_LINE_AMOUNT.matcher(line);
if (!m.find()) continue;
HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", ""); // 去掉 *类别* 前缀
String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", "");
BigDecimal preTax = parseBigDecimal(m.group(2));
String rate = m.group(3);
BigDecimal tax = parseBigDecimal(m.group(4));
@@ -162,9 +329,7 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
private static String cleanSeller(String s) {
if (s == null) return null;
// 去掉抬头里常见的"名称:"残留 + 末尾空白和半角空格序列
s = s.replaceAll("^[:\\s]+", "").trim();
// 截断到第一个非中文/字母/数字/常见公司符号块
String[] tail = s.split("\\s{2,}");
return tail.length > 0 ? tail[0].trim() : s;
}
@@ -177,5 +342,4 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
return null;
}
}
}