diff --git a/fad-hrm/pom.xml b/fad-hrm/pom.xml index 8ac94b0..d6a7e30 100644 --- a/fad-hrm/pom.xml +++ b/fad-hrm/pom.xml @@ -50,5 +50,28 @@ pdfbox 2.0.29 + + + com.google.zxing + core + 3.5.1 + + + com.google.zxing + javase + 3.5.1 + + + + net.sourceforge.tess4j + tess4j + 5.11.0 + + + org.slf4j + slf4j-log4j12 + + + diff --git a/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java b/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java index fd1eba2..b5f1af7 100644 --- a/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java +++ b/fad-hrm/src/main/java/com/ruoyi/hrm/service/impl/HrmInvoiceOcrServiceImpl.java @@ -1,6 +1,12 @@ package com.ruoyi.hrm.service.impl; import cn.hutool.core.io.IoUtil; +import com.google.zxing.BinaryBitmap; +import com.google.zxing.DecodeHintType; +import com.google.zxing.MultiFormatReader; +import com.google.zxing.Result; +import com.google.zxing.client.j2se.BufferedImageLuminanceSource; +import com.google.zxing.common.HybridBinarizer; import com.ruoyi.common.exception.ServiceException; import com.ruoyi.hrm.domain.vo.HrmInvoiceOcrResultVo; import com.ruoyi.hrm.service.IHrmInvoiceOcrService; @@ -9,22 +15,41 @@ import com.ruoyi.system.domain.vo.SysOssVo; import com.ruoyi.system.mapper.SysOssMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import net.sourceforge.tess4j.Tesseract; import org.apache.commons.lang3.StringUtils; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; +import javax.annotation.PostConstruct; +import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.InputStream; import java.math.BigDecimal; +import java.net.URL; +import 
java.nio.file.Paths; import java.util.ArrayList; +import java.util.EnumMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * 发票识别服务实现:直接解析电子发票 PDF 文本,无外部模型依赖。 - * 仅支持 PDF(电子普通发票 / 电子专用发票 / 全电数电票)。 + * 发票识别服务实现:本地三段式管线,无任何外部 API 调用。 + * + *
<ol>
+ *   <li>PDFBox 文本层抽取:原生电子发票直接搞定(毫秒级,几乎零算力)</li>
+ *   <li>ZXing 二维码识别:拍照/扫描发票 PDF,从二维码读结构化字段</li>
+ *   <li>Tesseract OCR(仅在前两步失败时触发):本地 chi_sim 字库,无网络</li>
+ * </ol>
+ * + * Tesseract 字库路径默认 {jar 同级目录}/tessdata,可用 fad.ocr.tessdata-path 覆盖。 + * 系统需预装:apt install -y tesseract-ocr tesseract-ocr-chi-sim */ @Slf4j @RequiredArgsConstructor @@ -33,28 +58,48 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { private final SysOssMapper sysOssMapper; - /** "价税合计(小写)¥123.45" 这种小写金额 */ + /** 可通过 application.yml 覆盖;默认 jar 同级目录下的 tessdata */ + @Value("${fad.ocr.tessdata-path:}") + private String tessdataPathConfig; + + private String tessdataPath; + + @PostConstruct + void resolveTessdataPath() { + if (StringUtils.isNotBlank(tessdataPathConfig)) { + tessdataPath = tessdataPathConfig; + } else { + tessdataPath = Paths.get(jarDir(), "tessdata").toString(); + } + log.info("[Invoice] tessdata path = {}", tessdataPath); + } + + /** 取 jar 所在目录;IDE 调试时 fall back 到工作目录 */ + private static String jarDir() { + try { + URL url = HrmInvoiceOcrServiceImpl.class + .getProtectionDomain().getCodeSource().getLocation(); + File f = new File(url.toURI()); + return (f.isFile() ? 
f.getParentFile() : f).getAbsolutePath(); + } catch (Exception e) { + return System.getProperty("user.dir"); + } + } + + // === 字段抽取正则 === private static final Pattern P_TOTAL = Pattern.compile( "(?:价税合计|小写)[^0-9¥¥]{0,30}[¥¥]?\\s*([0-9,]+\\.[0-9]{2})"); - - /** 开票日期:2024年01月01日 或 2024-01-01 */ private static final Pattern P_DATE = Pattern.compile( "开票日期[:: ]*([0-9]{4}[年\\-/][0-9]{1,2}[月\\-/][0-9]{1,2}日?)"); - - /** 发票类型抬头 */ private static final Pattern P_TYPE = Pattern.compile( "(电子(?:普通)?发票|增值税电子(?:普通|专用)发票|电子发票([^)]+)|数电(?:普通)?发票|普通发票|专用发票)"); - - /** 销售方名称:兼顾 "销售方名称:xxx"、"销 售 方 名称:xxx"、新版"销售方信息名称:xxx" */ private static final Pattern P_SELLER = Pattern.compile( "销\\s*售\\s*方[^名]*名\\s*称[:: ]*([^\\n\\r]+?)(?=\\s{2,}|纳税人|统一社会|地址|开户|$)"); - - /** 明细行金额(行末两列:金额 税率% 税额 或 金额 税率% 价税合计) */ private static final Pattern P_LINE_AMOUNT = Pattern.compile( - "([\\u4e00-\\u9fa5A-Za-z0-9()()\\-·.\\*\\s]{2,40}?)\\s+" + // 名称 - "([0-9,]+\\.[0-9]{2})\\s+" + // 金额(不含税) - "(\\d{1,2}%|免税|不征税|\\*)\\s+" + // 税率 - "([0-9,]+\\.[0-9]{2})"); // 税额 + "([\\u4e00-\\u9fa5A-Za-z0-9()()\\-·.\\*\\s]{2,40}?)\\s+" + + "([0-9,]+\\.[0-9]{2})\\s+" + + "(\\d{1,2}%|免税|不征税|\\*)\\s+" + + "([0-9,]+\\.[0-9]{2})"); @Override public HrmInvoiceOcrResultVo recognizeByOssId(Long ossId) { @@ -62,7 +107,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { if (oss == null) { throw new ServiceException("附件不存在: " + ossId); } - String suffix = StringUtils.defaultIfBlank(oss.getFileSuffix(), "").toLowerCase().replace(".", ""); if (!"pdf".equals(suffix)) { throw new ServiceException("仅支持 PDF 电子发票,当前文件类型: " + suffix); @@ -75,26 +119,154 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { throw new ServiceException("读取附件失败: " + e.getMessage()); } - return parsePdf(fileBytes); + try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(fileBytes))) { + // 第一步:文本层 + String text = extractText(doc); + if (StringUtils.isNotBlank(text) && looksLikeInvoice(text)) { + 
log.debug("[Invoice] hit text layer"); + return buildFromText(text); + } + + // 第二步:二维码 + HrmInvoiceOcrResultVo qr = tryDecodeQrFromPdf(doc); + if (qr != null) { + log.info("[Invoice] hit QR code"); + return qr; + } + + // 第三步:本地 OCR + log.info("[Invoice] fallback to local OCR"); + String ocrText = runTesseract(doc); + if (StringUtils.isBlank(ocrText)) { + throw new ServiceException("无法识别该 PDF,请上传开票平台下载的正规 PDF 电子发票。"); + } + return buildFromText(ocrText); + } catch (ServiceException e) { + throw e; + } catch (Exception e) { + log.error("[Invoice] 解析失败", e); + throw new ServiceException("发票解析失败: " + e.getMessage()); + } } - /** 直接从 PDF 中抽文本并按发票常见版面解析字段 */ - private HrmInvoiceOcrResultVo parsePdf(byte[] bytes) { - String text; - try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(bytes))) { - PDFTextStripper stripper = new PDFTextStripper(); - stripper.setSortByPosition(true); - stripper.setLineSeparator("\n"); - text = stripper.getText(doc); - } catch (Exception e) { - log.error("[Invoice] PDF 解析失败", e); - throw new ServiceException("PDF 解析失败: " + e.getMessage() - + "。若为扫描件,请提供电子发票原始 PDF。"); - } - if (StringUtils.isBlank(text)) { - throw new ServiceException("无法从 PDF 提取文本,可能为扫描件,请上传电子发票原始 PDF。"); - } + /** 文本层抽取 */ + private String extractText(PDDocument doc) throws Exception { + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setSortByPosition(true); + stripper.setLineSeparator("\n"); + return stripper.getText(doc); + } + /** 是否长得像发票:要至少出现金额/日期/抬头其中一个关键字段 */ + private boolean looksLikeInvoice(String text) { + return P_TOTAL.matcher(text).find() + || P_DATE.matcher(text).find() + || P_TYPE.matcher(text).find() + || text.contains("发票"); + } + + /** 第二步:把每页渲染成图,再用 ZXing 扫二维码 */ + private HrmInvoiceOcrResultVo tryDecodeQrFromPdf(PDDocument doc) { + try { + PDFRenderer renderer = new PDFRenderer(doc); + // 二维码 200dpi 已经够清晰,CPU 也轻 + int pages = doc.getNumberOfPages(); + for (int i = 0; i < pages; i++) { + BufferedImage img = 
renderer.renderImageWithDPI(i, 200, ImageType.GRAY); + String content = decodeQr(img); + if (StringUtils.isNotBlank(content)) { + HrmInvoiceOcrResultVo vo = parseInvoiceQr(content); + if (vo != null) return vo; + } + } + } catch (Exception e) { + log.debug("[Invoice] QR decode failed: {}", e.getMessage()); + } + return null; + } + + /** ZXing 解码一张图,返回二维码文本(无则 null) */ + private String decodeQr(BufferedImage img) { + try { + BinaryBitmap bitmap = new BinaryBitmap( + new HybridBinarizer(new BufferedImageLuminanceSource(img))); + Map hints = new EnumMap<>(DecodeHintType.class); + hints.put(DecodeHintType.TRY_HARDER, Boolean.TRUE); + hints.put(DecodeHintType.CHARACTER_SET, "UTF-8"); + Result r = new MultiFormatReader().decode(bitmap, hints); + return r != null ? r.getText() : null; + } catch (Exception e) { + return null; + } + } + + /** + * 解析增值税发票二维码内容。格式(逗号分隔): + * 01,版本号,发票代码,发票号码,金额(不含税),开票日期(YYYYMMDD),校验码后6位,... + */ + private HrmInvoiceOcrResultVo parseInvoiceQr(String raw) { + if (raw == null) return null; + String[] parts = raw.split(","); + if (parts.length < 7 || !parts[0].startsWith("0")) return null; + try { + HrmInvoiceOcrResultVo vo = new HrmInvoiceOcrResultVo(); + // parts[4] 是不含税金额;纸质票二维码无价税合计,先填到 totalAmount 让前端可调 + BigDecimal amt = parseBigDecimal(parts[4]); + vo.setTotalAmount(amt); + String d = parts[5]; + if (d != null && d.length() == 8) { + vo.setInvoiceDate(d.substring(0, 4) + "-" + d.substring(4, 6) + "-" + d.substring(6, 8)); + } + vo.setInvoiceType("增值税发票(二维码识别)"); + List items = new ArrayList<>(); + HrmInvoiceOcrResultVo.Item it = new HrmInvoiceOcrResultVo.Item(); + it.setItemName("发票款项(金额请核对原票,二维码仅含不含税金额)"); + it.setAmount(amt); + items.add(it); + vo.setItems(items); + return vo; + } catch (Exception e) { + return null; + } + } + + /** 第三步:Tesseract 本地 OCR,整文本拼接交给 regex 解析 */ + private String runTesseract(PDDocument doc) { + try { + File td = new File(tessdataPath); + if (!td.isDirectory()) { + log.warn("[Invoice] tessdata 目录不存在: {},无法走 
OCR 兜底", tessdataPath); + throw new ServiceException( + "本地 OCR 字库未配置。请在 jar 同级目录建 tessdata/ 并放入 chi_sim.traineddata," + + "或通过 fad.ocr.tessdata-path 指定路径。"); + } + Tesseract t = new Tesseract(); + t.setDatapath(tessdataPath); + t.setLanguage("chi_sim"); + t.setPageSegMode(6); // 假设单块文本 + t.setOcrEngineMode(1); // LSTM only + + PDFRenderer renderer = new PDFRenderer(doc); + StringBuilder sb = new StringBuilder(); + int pages = doc.getNumberOfPages(); + for (int i = 0; i < pages; i++) { + BufferedImage img = renderer.renderImageWithDPI(i, 300, ImageType.GRAY); + sb.append(t.doOCR(img)).append('\n'); + } + return sb.toString(); + } catch (ServiceException e) { + throw e; + } catch (Throwable e) { + log.error("[Invoice] tesseract 失败: {}", e.getMessage()); + throw new ServiceException( + "本地 OCR 失败:" + e.getMessage() + + "。请确认服务器已 apt install tesseract-ocr tesseract-ocr-chi-sim," + + "且 jar 同级 tessdata/ 下有 chi_sim.traineddata。"); + } + } + + /** 文本 → VO 的统一抽取逻辑 */ + private HrmInvoiceOcrResultVo buildFromText(String text) { HrmInvoiceOcrResultVo result = new HrmInvoiceOcrResultVo(); result.setInvoiceType(firstGroup(P_TYPE, text)); result.setInvoiceDate(firstGroup(P_DATE, text)); @@ -102,8 +274,6 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { result.setTotalAmount(parseBigDecimal(firstGroup(P_TOTAL, text))); List items = parseLineItems(text); - - // 兜底:解析不到明细但有总价 → 生成一条汇总 if (items.isEmpty() && result.getTotalAmount() != null) { HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item(); item.setItemName(StringUtils.defaultIfBlank(result.getSellerName(), "发票款项")); @@ -114,21 +284,18 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { return result; } - /** 抽明细行:在"货物名称 … 合计"之间逐行匹配 */ private List parseLineItems(String text) { List items = new ArrayList<>(); - // 定位明细表区间,找不到也没关系,直接全文匹配也能跑 int begin = indexOfAny(text, "项目名称", "货物或应税劳务", "货物名称"); int end = indexOfAny(text, "合\\s*计", "价税合计", "(大写)", "(大写)"); String 
area = (begin >= 0 && end > begin) ? text.substring(begin, end) : text; - for (String line : area.split("\\n")) { line = line.trim(); if (line.length() < 6) continue; Matcher m = P_LINE_AMOUNT.matcher(line); if (!m.find()) continue; HrmInvoiceOcrResultVo.Item item = new HrmInvoiceOcrResultVo.Item(); - String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", ""); // 去掉 *类别* 前缀 + String name = m.group(1).trim().replaceAll("^\\*[^*]+\\*", ""); BigDecimal preTax = parseBigDecimal(m.group(2)); String rate = m.group(3); BigDecimal tax = parseBigDecimal(m.group(4)); @@ -162,9 +329,7 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { private static String cleanSeller(String s) { if (s == null) return null; - // 去掉抬头里常见的"名称:"残留 + 末尾空白和半角空格序列 s = s.replaceAll("^[::\\s]+", "").trim(); - // 截断到第一个非中文/字母/数字/常见公司符号块 String[] tail = s.split("\\s{2,}"); return tail.length > 0 ? tail[0].trim() : s; } @@ -177,5 +342,4 @@ public class HrmInvoiceOcrServiceImpl implements IHrmInvoiceOcrService { return null; } } - }