From dc67788f51a31658dc244f641aedd4ac6b52cefa Mon Sep 17 00:00:00 2001
From: wangyu <823267011@qq.com>
Date: Tue, 5 May 2026 20:15:54 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90ocr=E8=AF=86=E5=88=AB?=
=?UTF-8?q?=E5=8F=91=E7=A5=A8=E7=9A=84=E5=89=8D=E6=8F=90=E6=9D=A1=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ruoyi-oa/pom.xml | 9 +-
.../oa/controller/InvoiceOcrController.java | 36 +++
.../ruoyi/oa/domain/vo/InvoiceOcrItemVo.java | 27 ++
.../oa/domain/vo/InvoiceOcrResultVo.java | 56 ++++
.../ruoyi/oa/service/IInvoiceOcrService.java | 18 ++
.../service/impl/InvoiceOcrServiceImpl.java | 286 ++++++++++++++++++
6 files changed, 429 insertions(+), 3 deletions(-)
create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java
create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java
create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java
create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java
create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java
diff --git a/ruoyi-oa/pom.xml b/ruoyi-oa/pom.xml
index 2827d04..6b7611c 100644
--- a/ruoyi-oa/pom.xml
+++ b/ruoyi-oa/pom.xml
@@ -68,17 +68,20 @@
1.0.0
-
com.xuxueli
xxl-job-core
-
+
com.ruoyi
fad-hrm
-
+
+ org.apache.pdfbox
+ pdfbox
+ 2.0.29
+
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java
new file mode 100644
index 0000000..4227862
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java
@@ -0,0 +1,36 @@
+package com.ruoyi.oa.controller;
+
+import com.ruoyi.common.annotation.Log;
+import com.ruoyi.common.core.controller.BaseController;
+import com.ruoyi.common.core.domain.R;
+import com.ruoyi.common.enums.BusinessType;
+import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
+import com.ruoyi.oa.service.IInvoiceOcrService;
+import lombok.RequiredArgsConstructor;
+import org.springframework.validation.annotation.Validated;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * REST controller for invoice PDF recognition.
+ */
+@Validated
+@RequiredArgsConstructor
+@RestController
+@RequestMapping("/oa/invoiceOcr")
+public class InvoiceOcrController extends BaseController {
+
+ private final IInvoiceOcrService invoiceOcrService;
+
+ /**
+ * Accepts an uploaded e-invoice PDF (request part "file") and returns its recognized core fields, such as amounts and content.
+ */
+ @Log(title = "发票识别", businessType = BusinessType.OTHER)
+ @PostMapping("/recognize")
+ public R recognize(@RequestParam("file") MultipartFile file) {
+ return R.ok(invoiceOcrService.recognizePdf(file));
+ }
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java
new file mode 100644
index 0000000..49a2748
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java
@@ -0,0 +1,27 @@
+package com.ruoyi.oa.domain.vo;
+
+import lombok.Data;
+
+import java.io.Serializable;
+import java.math.BigDecimal;
+
+/**
+ * One detail line recognized from an invoice.
+ */
+@Data
+public class InvoiceOcrItemVo implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ /** Name of the goods, taxable labor or service */
+ private String itemName;
+
+ /** Amount */
+ private BigDecimal amount;
+
+ /** Tax rate, kept as text */
+ private String taxRate;
+
+ /** Tax amount */
+ private BigDecimal taxAmount;
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java
new file mode 100644
index 0000000..6b2b365
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java
@@ -0,0 +1,56 @@
+package com.ruoyi.oa.domain.vo;
+
+import lombok.Data;
+
+import java.io.Serializable;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Result of recognizing one invoice PDF.
+ */
+@Data
+public class InvoiceOcrResultVo implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** File name */
+    private String fileName;
+
+    /** Invoice type */
+    private String invoiceType;
+
+    /** Invoice code */
+    private String invoiceCode;
+
+    /** Invoice number */
+    private String invoiceNumber;
+
+    /** Invoice date, kept as the text that was matched */
+    private String invoiceDate;
+
+    /** Buyer name */
+    private String buyerName;
+
+    /** Seller name */
+    private String sellerName;
+
+    /** Amount excluding tax */
+    private BigDecimal amountWithoutTax;
+
+    /** Tax amount */
+    private BigDecimal taxAmount;
+
+    /** Total of amount and tax */
+    private BigDecimal totalAmount;
+
+    /** Short summary of the invoice content */
+    private String contentSummary;
+
+    /** Raw text extracted from the PDF */
+    private String rawText;
+
+    /** Detail lines; the element type is spelled out so callers do not work with a raw List */
+    private List<InvoiceOcrItemVo> items = new ArrayList<>();
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java
new file mode 100644
index 0000000..b6f6eef
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java
@@ -0,0 +1,18 @@
+package com.ruoyi.oa.service;
+
+import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * Invoice PDF recognition service.
+ */
+public interface IInvoiceOcrService {
+
+ /**
+ * Recognizes the core fields of an invoice PDF.
+ *
+ * @param file the uploaded PDF file
+ * @return the recognition result
+ */
+ InvoiceOcrResultVo recognizePdf(MultipartFile file);
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java
new file mode 100644
index 0000000..b64f3e9
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java
@@ -0,0 +1,286 @@
+package com.ruoyi.oa.service.impl;
+
+import com.ruoyi.common.exception.ServiceException;
+import com.ruoyi.oa.domain.vo.InvoiceOcrItemVo;
+import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
+import com.ruoyi.oa.service.IInvoiceOcrService;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.springframework.stereotype.Service;
+import org.springframework.web.multipart.MultipartFile;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * 发票 PDF 识别服务实现
+ *
+ * 当前实现为方案 A:
+ * 1. 优先解析 PDF 文字层
+ * 2. 使用规则抽取金额、日期、发票代码、发票号码、内容摘要等字段
+ */
+@Service
+public class InvoiceOcrServiceImpl implements IInvoiceOcrService {
+
+ private static final Pattern INVOICE_CODE_PATTERN = Pattern.compile("发票代码[::\\s]*([0-9]{10,12})"); // group 1: 10-12 digit invoice code
+ private static final Pattern INVOICE_NUMBER_PATTERN = Pattern.compile("发票号码[::\\s]*([0-9]{8})"); // group 1: first 8 digits after the label; NOTE(review): a longer number would be cut to 8 digits - confirm the expected invoice formats
+ private static final Pattern DATE_PATTERN = Pattern.compile("(?:开票日期|票据日期)[::\\s]*([0-9]{4}[年\\-/.][0-9]{1,2}[月\\-/.][0-9]{1,2}日?)"); // group 1: date with 年/月 or - / . separators and an optional trailing 日
+ private static final Pattern TAX_AMOUNT_PATTERN = Pattern.compile("(?:税额|税\\s*额)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)"); // group 1: number right after a tax-amount label
+ private static final Pattern TOTAL_AMOUNT_PATTERN = Pattern.compile("(?:价税合计|小写)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)"); // group 1: number right after the label; NOTE(review): brackets between label and digits are not skipped - verify against real invoice text
+ private static final Pattern AMOUNT_WITHOUT_TAX_PATTERN = Pattern.compile("(?:金额|合计|不含税金额)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)"); // group 1: number right after any of the three amount labels
+ private static final Pattern MONEY_LINE_PATTERN = Pattern.compile("([0-9,]+(?:\\.[0-9]{1,2})?)"); // any run of digits/commas with optional 1-2 decimals
+ private static final Pattern BUYER_PATTERN = Pattern.compile("购买方(?:名称)?[::\\s]*([^\\n]+)"); // group 1: rest of the line after the buyer label
+ private static final Pattern SELLER_PATTERN = Pattern.compile("销售方(?:名称)?[::\\s]*([^\\n]+)"); // group 1: rest of the line after the seller label
+ private static final Pattern ITEM_HEADER_PATTERN = Pattern.compile("(货物或应税劳务、服务名称|项目名称|服务名称)"); // header text used to locate the start of the item table
+ private static final Pattern TAX_RATE_PATTERN = Pattern.compile("([0-9]{1,2}%|免税)"); // 1-2 digit percentage or the tax-exempt marker
+
+    @Override
+    public InvoiceOcrResultVo recognizePdf(MultipartFile file) {
+        validateFile(file); // throws ServiceException for a missing, empty or non-.pdf upload
+        final String extractedText = extractPdfText(file);
+        if (StringUtils.isNotBlank(extractedText)) { // blank text means nothing usable could be extracted
+            return parseInvoice(file.getOriginalFilename(), normalizeText(extractedText));
+        }
+        throw new ServiceException("PDF 未解析到文本内容,请确认上传的是电子发票 PDF");
+    }
+
+    /** Rejects a null or empty upload, and any upload whose original filename does not end in ".pdf" (case-insensitive). */
+    private void validateFile(MultipartFile file) {
+        if (file == null || file.isEmpty()) {
+            throw new ServiceException("请上传 PDF 文件");
+        }
+        if (!StringUtils.endsWithIgnoreCase(file.getOriginalFilename(), ".pdf")) { // null-safe: a null or blank name never matches
+            throw new ServiceException("仅支持 PDF 文件识别");
+        }
+    }
+
+    /** Loads the upload as a PDF and returns the stripper's text with position sorting enabled; an IOException is rethrown as ServiceException. */
+    private String extractPdfText(MultipartFile file) {
+        try (InputStream in = file.getInputStream(); PDDocument pdf = PDDocument.load(in)) {
+            PDFTextStripper textStripper = new PDFTextStripper();
+            textStripper.setSortByPosition(true);
+            return textStripper.getText(pdf);
+        } catch (IOException ioe) {
+            throw new ServiceException("PDF 解析失败: " + ioe.getMessage());
+        }
+    }
+
+    /** Fills a result object from the normalized text by applying each field extractor; the text itself is stored as rawText. */
+    private InvoiceOcrResultVo parseInvoice(String fileName, String rawText) {
+        InvoiceOcrResultVo invoice = new InvoiceOcrResultVo();
+        invoice.setFileName(fileName);
+        invoice.setRawText(rawText);
+        invoice.setInvoiceType(detectInvoiceType(rawText));
+        invoice.setInvoiceCode(firstGroup(rawText, INVOICE_CODE_PATTERN));
+        invoice.setInvoiceNumber(firstGroup(rawText, INVOICE_NUMBER_PATTERN));
+        invoice.setInvoiceDate(firstGroup(rawText, DATE_PATTERN));
+        invoice.setBuyerName(cleanLineValue(firstGroup(rawText, BUYER_PATTERN)));
+        invoice.setSellerName(cleanLineValue(firstGroup(rawText, SELLER_PATTERN)));
+        invoice.setTaxAmount(parseMoney(firstGroup(rawText, TAX_AMOUNT_PATTERN)));
+        invoice.setTotalAmount(parseMoney(firstGroup(rawText, TOTAL_AMOUNT_PATTERN)));
+        // Must follow the tax and total setters: both values feed the plausibility check and the subtraction fallback.
+        invoice.setAmountWithoutTax(resolveAmountWithoutTax(rawText, invoice.getTaxAmount(), invoice.getTotalAmount()));
+        invoice.setItems(extractItems(rawText));
+        invoice.setContentSummary(buildContentSummary(invoice.getItems(), rawText));
+        return invoice;
+    }
+
+    /**
+     * Returns the first known invoice-type marker contained in the text, or "未知发票类型" when none is present.
+     */
+    private String detectInvoiceType(String text) {
+        // Markers are tried in this order; the generic "电子发票" marker comes last.
+        for (String marker : new String[] {"增值税专用发票", "增值税普通发票", "电子发票"}) {
+            if (text.contains(marker)) {
+                return marker;
+            }
+        }
+        return "未知发票类型";
+    }
+
+    /** Picks the net amount: the labelled value when plausible, else total minus tax when both are known, else the labelled value as-is. */
+    private BigDecimal resolveAmountWithoutTax(String text, BigDecimal taxAmount, BigDecimal totalAmount) {
+        BigDecimal labelled = parseMoney(firstGroup(text, AMOUNT_WITHOUT_TAX_PATTERN));
+        boolean plausible = labelled != null && isReasonableAmount(labelled, taxAmount, totalAmount);
+        boolean derivable = totalAmount != null && taxAmount != null;
+        if (plausible || !derivable) {
+            return labelled; // may be null, or an implausible value that nothing better can replace
+        }
+        return totalAmount.subtract(taxAmount);
+    }
+
+    /**
+     * Plausibility check for a labelled net amount: a null amount is rejected, an unknown total accepts
+     * any amount, otherwise the amount plus the tax (when known) must not exceed the total.
+     */
+    private boolean isReasonableAmount(BigDecimal directAmount, BigDecimal taxAmount, BigDecimal totalAmount) {
+        // With either side missing there is nothing to compare; only a present amount can pass.
+        if (directAmount == null || totalAmount == null) {
+            return directAmount != null;
+        }
+        BigDecimal withTax = taxAmount == null ? directAmount : directAmount.add(taxAmount);
+        return withTax.compareTo(totalAmount) <= 0;
+    }
+
+    /**
+     * Collects item rows: parsing starts after the first item-table header line and stops at the first end-marker line; rows that do not parse are skipped and duplicates are removed.
+     */
+    private List<InvoiceOcrItemVo> extractItems(String text) {
+        List<InvoiceOcrItemVo> parsedRows = new ArrayList<>();
+        boolean insideDetail = false;
+        for (String rawLine : text.split("\\n")) {
+            String line = normalizeInlineWhitespace(rawLine);
+            if (StringUtils.isBlank(line)) {
+                continue;
+            }
+            if (!insideDetail) {
+                insideDetail = ITEM_HEADER_PATTERN.matcher(line).find(); // the header row itself is never parsed as an item
+                continue;
+            }
+            if (isDetailEndLine(line)) {
+                break;
+            }
+            InvoiceOcrItemVo item = parseItemLine(line);
+            if (item != null) {
+                parsedRows.add(item);
+            }
+        }
+        return deduplicateItems(parsedRows);
+    }
+
+    private boolean isDetailEndLine(String line) { // true once a totals, buyer/seller or remark row is reached
+        return line.contains("备注") || line.contains("购买方") || line.contains("销售方") || line.contains("价税合计") || line.contains("合计");
+    }
+
+    /**
+     * Parses one row of the item table; returns null when the row holds no number or no item name.
+     * The last number on the row becomes the tax amount and the one before it the amount (a single number fills both).
+     * The tax-rate token is removed before numbers are collected, so the "13" of "13%" is neither read as a money
+     * value nor leaves a stray "%" in the name. Numbers are blanked out of the name by match position, because
+     * replacing them as plain text would corrupt a longer number that contains a shorter one ("1" inside "1000.00").
+     */
+    private InvoiceOcrItemVo parseItemLine(String line) {
+        if (line.length() < 2) {
+            return null;
+        }
+        Matcher rateMatcher = TAX_RATE_PATTERN.matcher(line);
+        String taxRate = rateMatcher.find() ? rateMatcher.group(1) : null;
+        String withoutRate = taxRate == null ? line : line.replace(taxRate, " ");
+        List<String> numbers = new ArrayList<>();
+        Matcher moneyMatcher = MONEY_LINE_PATTERN.matcher(withoutRate);
+        while (moneyMatcher.find()) {
+            numbers.add(moneyMatcher.group(1));
+        }
+        if (numbers.isEmpty()) {
+            return null;
+        }
+        // Blank out every numeric token and the "*" delimiters; what remains is the item name.
+        String itemName = MONEY_LINE_PATTERN.matcher(withoutRate).replaceAll(" ").replace("*", " ");
+        itemName = normalizeInlineWhitespace(itemName);
+        if (StringUtils.isBlank(itemName)) {
+            return null;
+        }
+        InvoiceOcrItemVo item = new InvoiceOcrItemVo();
+        item.setItemName(itemName);
+        item.setAmount(parseMoney(numbers.get(Math.max(0, numbers.size() - 2))));
+        item.setTaxAmount(parseMoney(numbers.get(numbers.size() - 1)));
+        item.setTaxRate(taxRate);
+        return item;
+    }
+
+    /** Drops repeated rows, keeping first occurrences in order; two rows are duplicates when name, amount and tax amount print identically. */
+    private List<InvoiceOcrItemVo> deduplicateItems(List<InvoiceOcrItemVo> items) {
+        List<InvoiceOcrItemVo> distinct = new ArrayList<>();
+        Set<String> seenKeys = new LinkedHashSet<>();
+        for (InvoiceOcrItemVo item : items) {
+            if (seenKeys.add(item.getItemName() + "|" + item.getAmount() + "|" + item.getTaxAmount())) {
+                distinct.add(item);
+            }
+        }
+        return distinct;
+    }
+
+    /**
+     * Builds a short content summary: up to three non-blank item names joined by "、". When no item yields a
+     * name, falls back to the (at most three) lines after an item-table header, stopping at a blank or
+     * end-marker line, joined by ";". Returns null when neither source produces any text.
+     */
+    private String buildContentSummary(List<InvoiceOcrItemVo> items, String text) {
+        if (items != null && !items.isEmpty()) {
+            List<String> names = new ArrayList<>();
+            for (InvoiceOcrItemVo item : items) {
+                if (names.size() < 3 && StringUtils.isNotBlank(item.getItemName())) {
+                    names.add(item.getItemName());
+                }
+            }
+            if (!names.isEmpty()) {
+                return String.join("、", names);
+            }
+        }
+        String[] lines = text.split("\\n");
+        for (int i = 0; i < lines.length; i++) {
+            if (!ITEM_HEADER_PATTERN.matcher(lines[i]).find()) {
+                continue;
+            }
+            List<String> followUp = new ArrayList<>();
+            for (int j = i + 1; j < lines.length && j <= i + 3; j++) {
+                String value = normalizeInlineWhitespace(lines[j]);
+                if (StringUtils.isBlank(value) || isDetailEndLine(value)) {
+                    break;
+                }
+                followUp.add(value);
+            }
+            if (!followUp.isEmpty()) {
+                return String.join(";", followUp);
+            }
+        }
+        return null;
+    }
+
+    private String normalizeText(String text) { // null becomes ""; CRLF and lone CR both become LF
+        return text == null ? "" : text.replaceAll("\\r\\n?", "\n");
+    }
+
+ private String normalizeInlineWhitespace(String value) { // collapses each whitespace run to a single space, then trims; null stays null
+ return value == null ? null : value.replaceAll("\\s+", " ").trim();
+ }
+
+    /**
+     * Returns capture group 1 of the first match of the pattern in the text, or null when there is no match.
+     */
+    private String firstGroup(String text, Pattern pattern) {
+        Matcher found = pattern.matcher(text);
+        return found.find() ? found.group(1) : null;
+    }
+
+    /**
+     * Normalizes the whitespace of a captured line value and caps it at 80 characters; null stays null.
+     */
+    private String cleanLineValue(String value) {
+        if (value == null) {
+            return null;
+        }
+        // StringUtils.substring clamps the end index, so values of 80 characters or fewer pass through unchanged.
+        return StringUtils.substring(normalizeInlineWhitespace(value), 0, 80);
+    }
+
+    /** Parses an amount after removing thousands separators and currency signs; returns null for blank or non-numeric input. */
+    private BigDecimal parseMoney(String raw) {
+        if (StringUtils.isBlank(raw)) {
+            return null;
+        }
+        try {
+            return new BigDecimal(raw.replace(",", "").replace("¥", "").replace("¥", "").trim());
+        } catch (NumberFormatException ignored) { // the only failure BigDecimal(String) signals for a non-null argument
+            return null;
+        }
+    }
+}