提交
This commit is contained in:
@@ -50,5 +50,28 @@
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>2.0.29</version>
|
||||
</dependency>
|
||||
<!-- 二维码识别(发票二维码兜底) -->
|
||||
<dependency>
|
||||
<groupId>com.google.zxing</groupId>
|
||||
<artifactId>core</artifactId>
|
||||
<version>3.5.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.zxing</groupId>
|
||||
<artifactId>javase</artifactId>
|
||||
<version>3.5.1</version>
|
||||
</dependency>
|
||||
<!-- 本地 OCR(Tesseract JNA 绑定):仅在 PDF 没有文本层且二维码不可读时启用 -->
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.tess4j</groupId>
|
||||
<artifactId>tess4j</artifactId>
|
||||
<version>5.11.0</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
package com.ruoyi.hrm.service.impl;
|
||||
|
||||
import cn.hutool.core.io.IoUtil;
|
||||
import com.google.zxing.BinaryBitmap;
|
||||
import com.google.zxing.DecodeHintType;
|
||||
import com.google.zxing.MultiFormatReader;
|
||||
import com.google.zxing.Result;
|
||||
import com.google.zxing.client.j2se.BufferedImageLuminanceSource;
|
||||
import com.google.zxing.common.HybridBinarizer;
|
||||
import com.ruoyi.common.exception.ServiceException;
|
||||
import com.ruoyi.hrm.domain.vo.HrmInvoiceOcrResultVo;
|
||||
import com.ruoyi.hrm.service.IHrmInvoiceOcrService;
|
||||
@@ -9,22 +15,41 @@ import com.ruoyi.system.domain.vo.SysOssVo;
|
||||
import com.ruoyi.system.mapper.SysOssMapper;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.tess4j.Tesseract;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.math.BigDecimal;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* 发票识别服务实现:直接解析电子发票 PDF 文本,无外部模型依赖。
|
||||
* 仅支持 PDF(电子普通发票 / 电子专用发票 / 全电数电票)。
|
||||
* 发票识别服务实现:本地三段式管线,无任何外部 API 调用。
|
||||
*
|
||||
* <ol>
|
||||
* <li>PDFBox 文本层抽取:原生电子发票直接搞定(毫秒级,几乎零算力)</li>
|
||||
* <li>ZXing 二维码识别:拍照/扫描发票 PDF,从二维码读结构化字段</li>
|
||||
* <li>Tesseract OCR(仅在前两步失败时触发):本地 chi_sim 字库,无网络</li>
|
||||
* </ol>
|
||||
*
|
||||
* Tesseract 字库路径默认 {jar 同级目录}/tessdata,可用 fad.ocr.tessdata-path 覆盖。
|
||||
* 系统需预装:apt install -y tesseract-ocr tesseract-ocr-chi-sim
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@@ -33,28 +58,48 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
|
||||
private final SysOssMapper sysOssMapper;
|
||||
|
||||
/** "价税合计(小写)¥123.45" 这种小写金额 */
|
||||
/** 可通过 application.yml 覆盖;默认 jar 同级目录下的 tessdata */
|
||||
@Value("${fad.ocr.tessdata-path:}")
|
||||
private String tessdataPathConfig;
|
||||
|
||||
private String tessdataPath;
|
||||
|
||||
@PostConstruct
|
||||
void resolveTessdataPath() {
|
||||
if (StringUtils.isNotBlank(tessdataPathConfig)) {
|
||||
tessdataPath = tessdataPathConfig;
|
||||
} else {
|
||||
tessdataPath = Paths.get(jarDir(), "tessdata").toString();
|
||||
}
|
||||
log.info("[Invoice] tessdata path = {}", tessdataPath);
|
||||
}
|
||||
|
||||
/** 取 jar 所在目录;IDE 调试时 fall back 到工作目录 */
|
||||
private static String jarDir() {
|
||||
try {
|
||||
URL url = HrmInvoiceOcrServiceImpl.class
|
||||
.getProtectionDomain().getCodeSource().getLocation();
|
||||
File f = new File(url.toURI());
|
||||
return (f.isFile() ? f.getParentFile() : f).getAbsolutePath();
|
||||
} catch (Exception e) {
|
||||
return System.getProperty("user.dir");
|
||||
}
|
||||
}
|
||||
|
||||
// === 字段抽取正则 ===
|
||||
private static final Pattern P_TOTAL = Pattern.compile(
|
||||
"(?:价税合计|小写)[^0-9¥¥]{0,30}[¥¥]?\\s*([0-9,]+\\.[0-9]{2})");
|
||||
|
||||
/** 开票日期:2024年01月01日 或 2024-01-01 */
|
||||
private static final Pattern P_DATE = Pattern.compile(
|
||||
"开票日期[:: ]*([0-9]{4}[年\\-/][0-9]{1,2}[月\\-/][0-9]{1,2}日?)");
|
||||
|
||||
/** 发票类型抬头 */
|
||||
private static final Pattern P_TYPE = Pattern.compile(
|
||||
"(电子(?:普通)?发票|增值税电子(?:普通|专用)发票|电子发票([^)]+)|数电(?:普通)?发票|普通发票|专用发票)");
|
||||
|
||||
/** 销售方名称:兼顾 "销售方名称:xxx"、"销 售 方 名称:xxx"、新版"销售方信息名称:xxx" */
|
||||
private static final Pattern P_SELLER = Pattern.compile(
|
||||
"销\\s*售\\s*方[^名]*名\\s*称[:: ]*([^\\n\\r]+?)(?=\\s{2,}|纳税人|统一社会|地址|开户|$)");
|
||||
|
||||
/** 明细行金额(行末两列:金额 税率% 税额 或 金额 税率% 价税合计) */
|
||||
private static final Pattern P_LINE_AMOUNT = Pattern.compile(
|
||||
"([\\u4e00-\\u9fa5A-Za-z0-9()()\\-·.\\*\\s]{2,40}?)\\s+" + // 名称
|
||||
"([0-9,]+\\.[0-9]{2})\\s+" + // 金额(不含税)
|
||||
"(\\d{1,2}%|免税|不征税|\\*)\\s+" + // 税率
|
||||
"([0-9,]+\\.[0-9]{2})"); // 税额
|
||||
"([\\u4e00-\\u9fa5A-Za-z0-9()()\\-·.\\*\\s]{2,40}?)\\s+"
|
||||
+ "([0-9,]+\\.[0-9]{2})\\s+"
|
||||
+ "(\\d{1,2}%|免税|不征税|\\*)\\s+"
|
||||
+ "([0-9,]+\\.[0-9]{2})");
|
||||
|
||||
@Override
|
||||
public HrmInvoiceOcrResultVo recognizeByOssId(Long ossId) {
|
||||
@@ -62,7 +107,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
if (oss == null) {
|
||||
throw new ServiceException("附件不存在: " + ossId);
|
||||
}
|
||||
|
||||
String suffix = StringUtils.defaultIfBlank(oss.getFileSuffix(), "").toLowerCase().replace(".", "");
|
||||
if (!"pdf".equals(suffix)) {
|
||||
throw new ServiceException("仅支持 PDF 电子发票,当前文件类型: " + suffix);
|
||||
@@ -75,26 +119,154 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
throw new ServiceException("读取附件失败: " + e.getMessage());
|
||||
}
|
||||
|
||||
return parsePdf(fileBytes);
|
||||
try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(fileBytes))) {
|
||||
// 第一步:文本层
|
||||
String text = extractText(doc);
|
||||
if (StringUtils.isNotBlank(text) && looksLikeInvoice(text)) {
|
||||
log.debug("[Invoice] hit text layer");
|
||||
return buildFromText(text);
|
||||
}
|
||||
|
||||
// 第二步:二维码
|
||||
HrmInvoiceOcrResultVo qr = tryDecodeQrFromPdf(doc);
|
||||
if (qr != null) {
|
||||
log.info("[Invoice] hit QR code");
|
||||
return qr;
|
||||
}
|
||||
|
||||
// 第三步:本地 OCR
|
||||
log.info("[Invoice] fallback to local OCR");
|
||||
String ocrText = runTesseract(doc);
|
||||
if (StringUtils.isBlank(ocrText)) {
|
||||
throw new ServiceException("无法识别该 PDF,请上传开票平台下载的正规 PDF 电子发票。");
|
||||
}
|
||||
return buildFromText(ocrText);
|
||||
} catch (ServiceException e) {
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
log.error("[Invoice] 解析失败", e);
|
||||
throw new ServiceException("发票解析失败: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/** 直接从 PDF 中抽文本并按发票常见版面解析字段 */
|
||||
private HrmInvoiceOcrResultVo parsePdf(byte[] bytes) {
|
||||
String text;
|
||||
try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(bytes))) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setLineSeparator("\n");
|
||||
text = stripper.getText(doc);
|
||||
} catch (Exception e) {
|
||||
log.error("[Invoice] PDF 解析失败", e);
|
||||
throw new ServiceException("PDF 解析失败: " + e.getMessage()
|
||||
+ "。若为扫描件,请提供电子发票原始 PDF。");
|
||||
}
|
||||
if (StringUtils.isBlank(text)) {
|
||||
throw new ServiceException("无法从 PDF 提取文本,可能为扫描件,请上传电子发票原始 PDF。");
|
||||
}
|
||||
/** 文本层抽取 */
|
||||
private String extractText(PDDocument doc) throws Exception {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setLineSeparator("\n");
|
||||
return stripper.getText(doc);
|
||||
}
|
||||
|
||||
/** 是否长得像发票:要至少出现金额/日期/抬头其中一个关键字段 */
|
||||
private boolean looksLikeInvoice(String text) {
|
||||
return P_TOTAL.matcher(text).find()
|
||||
|| P_DATE.matcher(text).find()
|
||||
|| P_TYPE.matcher(text).find()
|
||||
|| text.contains("发票");
|
||||
}
|
||||
|
||||
/** 第二步:把每页渲染成图,再用 ZXing 扫二维码 */
|
||||
private HrmInvoiceOcrResultVo tryDecodeQrFromPdf(PDDocument doc) {
|
||||
try {
|
||||
PDFRenderer renderer = new PDFRenderer(doc);
|
||||
// 二维码 200dpi 已经够清晰,CPU 也轻
|
||||
int pages = doc.getNumberOfPages();
|
||||
for (int i = 0; i < pages; i++) {
|
||||
BufferedImage img = renderer.renderImageWithDPI(i, 200, ImageType.GRAY);
|
||||
String content = decodeQr(img);
|
||||
if (StringUtils.isNotBlank(content)) {
|
||||
HrmInvoiceOcrResultVo vo = parseInvoiceQr(content);
|
||||
if (vo != null) return vo;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("[Invoice] QR decode failed: {}", e.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** ZXing 解码一张图,返回二维码文本(无则 null) */
|
||||
private String decodeQr(BufferedImage img) {
|
||||
try {
|
||||
BinaryBitmap bitmap = new BinaryBitmap(
|
||||
new HybridBinarizer(new BufferedImageLuminanceSource(img)));
|
||||
Map<DecodeHintType, Object> hints = new EnumMap<>(DecodeHintType.class);
|
||||
hints.put(DecodeHintType.TRY_HARDER, Boolean.TRUE);
|
||||
hints.put(DecodeHintType.CHARACTER_SET, "UTF-8");
|
||||
Result r = new MultiFormatReader().decode(bitmap, hints);
|
||||
return r != null ? r.getText() : null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析增值税发票二维码内容。格式(逗号分隔):
|
||||
* 01,版本号,发票代码,发票号码,金额(不含税),开票日期(YYYYMMDD),校验码后6位,...
|
||||
*/
|
||||
private HrmInvoiceOcrResultVo parseInvoiceQr(String raw) {
|
||||
if (raw == null) return null;
|
||||
String[] parts = raw.split(",");
|
||||
if (parts.length < 7 || !parts[0].startsWith("0")) return null;
|
||||
try {
|
||||
HrmInvoiceOcrResultVo vo = new HrmInvoiceOcrResultVo();
|
||||
// parts[4] 是不含税金额;纸质票二维码无价税合计,先填到 totalAmount 让前端可调
|
||||
BigDecimal amt = parseBigDecimal(parts[4]);
|
||||
vo.setTotalAmount(amt);
|
||||
String d = parts[5];
|
||||
if (d != null && d.length() == 8) {
|
||||
vo.setInvoiceDate(d.substring(0, 4) + "-" + d.substring(4, 6) + "-" + d.substring(6, 8));
|
||||
}
|
||||
vo.setInvoiceType("增值税发票(二维码识别)");
|
||||
List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
|
||||
HrmInvoiceOcrResultVo.Item it = new HrmInvoiceOcrResultVo.Item();
|
||||
it.setItemName("发票款项(金额请核对原票,二维码仅含不含税金额)");
|
||||
it.setAmount(amt);
|
||||
items.add(it);
|
||||
vo.setItems(items);
|
||||
return vo;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** 第三步:Tesseract 本地 OCR,整文本拼接交给 regex 解析 */
|
||||
private String runTesseract(PDDocument doc) {
|
||||
try {
|
||||
File td = new File(tessdataPath);
|
||||
if (!td.isDirectory()) {
|
||||
log.warn("[Invoice] tessdata 目录不存在: {},无法走 OCR 兜底", tessdataPath);
|
||||
throw new ServiceException(
|
||||
"本地 OCR 字库未配置。请在 jar 同级目录建 tessdata/ 并放入 chi_sim.traineddata,"
|
||||
+ "或通过 fad.ocr.tessdata-path 指定路径。");
|
||||
}
|
||||
Tesseract t = new Tesseract();
|
||||
t.setDatapath(tessdataPath);
|
||||
t.setLanguage("chi_sim");
|
||||
t.setPageSegMode(6); // 假设单块文本
|
||||
t.setOcrEngineMode(1); // LSTM only
|
||||
|
||||
PDFRenderer renderer = new PDFRenderer(doc);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int pages = doc.getNumberOfPages();
|
||||
for (int i = 0; i < pages; i++) {
|
||||
BufferedImage img = renderer.renderImageWithDPI(i, 300, ImageType.GRAY);
|
||||
sb.append(t.doOCR(img)).append('\n');
|
||||
}
|
||||
return sb.toString();
|
||||
} catch (ServiceException e) {
|
||||
throw e;
|
||||
} catch (Throwable e) {
|
||||
log.error("[Invoice] tesseract 失败: {}", e.getMessage());
|
||||
throw new ServiceException(
|
||||
"本地 OCR 失败:" + e.getMessage()
|
||||
+ "。请确认服务器已 apt install tesseract-ocr tesseract-ocr-chi-sim,"
|
||||
+ "且 jar 同级 tessdata/ 下有 chi_sim.traineddata。");
|
||||
}
|
||||
}
|
||||
|
||||
/** 文本 → VO 的统一抽取逻辑 */
|
||||
private HrmInvoiceOcrResultVo buildFromText(String text) {
|
||||
HrmInvoiceOcrResultVo result = new HrmInvoiceOcrResultVo();
|
||||
result.setInvoiceType(firstGroup(P_TYPE, text));
|
||||
result.setInvoiceDate(firstGroup(P_DATE, text));
|
||||
@@ -102,8 +274,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
result.setTotalAmount(parseBigDecimal(firstGroup(P_TOTAL, text)));
|
||||
|
||||
List<HrmInvoiceOcrResultVo.Item> items = parseLineItems(text);
|
||||
|
||||
// 兜底:解析不到明细但有总价 → 生成一条汇总
|
||||
if (items.isEmpty() && result.getTotalAmount() != null) {
|
||||
HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
|
||||
item.setItemName(StringUtils.defaultIfBlank(result.getSellerName(), "发票款项"));
|
||||
@@ -114,21 +284,18 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
return result;
|
||||
}
|
||||
|
||||
/** 抽明细行:在"货物名称 … 合计"之间逐行匹配 */
|
||||
private List<HrmInvoiceOcrResultVo.Item> parseLineItems(String text) {
|
||||
List<HrmInvoiceOcrResultVo.Item> items = new ArrayList<>();
|
||||
// 定位明细表区间,找不到也没关系,直接全文匹配也能跑
|
||||
int begin = indexOfAny(text, "项目名称", "货物或应税劳务", "货物名称");
|
||||
int end = indexOfAny(text, "合\\s*计", "价税合计", "(大写)", "(大写)");
|
||||
String area = (begin >= 0 && end > begin) ? text.substring(begin, end) : text;
|
||||
|
||||
for (String line : area.split("\\n")) {
|
||||
line = line.trim();
|
||||
if (line.length() < 6) continue;
|
||||
Matcher m = P_LINE_AMOUNT.matcher(line);
|
||||
if (!m.find()) continue;
|
||||
HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item();
|
||||
String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", ""); // 去掉 *类别* 前缀
|
||||
String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", "");
|
||||
BigDecimal preTax = parseBigDecimal(m.group(2));
|
||||
String rate = m.group(3);
|
||||
BigDecimal tax = parseBigDecimal(m.group(4));
|
||||
@@ -162,9 +329,7 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
|
||||
private static String cleanSeller(String s) {
|
||||
if (s == null) return null;
|
||||
// 去掉抬头里常见的"名称:"残留 + 末尾空白和半角空格序列
|
||||
s = s.replaceAll("^[::\\s]+", "").trim();
|
||||
// 截断到第一个非中文/字母/数字/常见公司符号块
|
||||
String[] tail = s.split("\\s{2,}");
|
||||
return tail.length > 0 ? tail[0].trim() : s;
|
||||
}
|
||||
@@ -177,5 +342,4 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user