From dc67788f51a31658dc244f641aedd4ac6b52cefa Mon Sep 17 00:00:00 2001 From: wangyu <823267011@qq.com> Date: Tue, 5 May 2026 20:15:54 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90ocr=E8=AF=86=E5=88=AB?= =?UTF-8?q?=E5=8F=91=E7=A5=A8=E7=9A=84=E5=89=8D=E6=8F=90=E6=9D=A1=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ruoyi-oa/pom.xml | 9 +- .../oa/controller/InvoiceOcrController.java | 36 +++ .../ruoyi/oa/domain/vo/InvoiceOcrItemVo.java | 27 ++ .../oa/domain/vo/InvoiceOcrResultVo.java | 56 ++++ .../ruoyi/oa/service/IInvoiceOcrService.java | 18 ++ .../service/impl/InvoiceOcrServiceImpl.java | 286 ++++++++++++++++++ 6 files changed, 429 insertions(+), 3 deletions(-) create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java create mode 100644 ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java diff --git a/ruoyi-oa/pom.xml b/ruoyi-oa/pom.xml index 2827d04..6b7611c 100644 --- a/ruoyi-oa/pom.xml +++ b/ruoyi-oa/pom.xml @@ -68,17 +68,20 @@ 1.0.0 - com.xuxueli xxl-job-core - + com.ruoyi fad-hrm - + + org.apache.pdfbox + pdfbox + 2.0.29 + diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java new file mode 100644 index 0000000..4227862 --- /dev/null +++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java @@ -0,0 +1,36 @@ +package com.ruoyi.oa.controller; + +import com.ruoyi.common.annotation.Log; +import com.ruoyi.common.core.controller.BaseController; +import com.ruoyi.common.core.domain.R; +import com.ruoyi.common.enums.BusinessType; +import 
com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
+import com.ruoyi.oa.service.IInvoiceOcrService;
+import lombok.RequiredArgsConstructor;
+import org.springframework.validation.annotation.Validated;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * REST controller for invoice PDF recognition, mapped under {@code /oa/invoiceOcr}.
+ */
+@Validated
+@RequiredArgsConstructor
+@RestController
+@RequestMapping("/oa/invoiceOcr")
+public class InvoiceOcrController extends BaseController {
+
+    private final IInvoiceOcrService invoiceOcrService;
+
+    /**
+     * Accepts an uploaded invoice PDF (multipart field {@code file}) and returns the recognized core fields such as amounts and content.
+     */
+    @Log(title = "发票识别", businessType = BusinessType.OTHER)
+    @PostMapping("/recognize")
+    public R<InvoiceOcrResultVo> recognize(@RequestParam("file") MultipartFile file) {
+        return R.ok(invoiceOcrService.recognizePdf(file));
+    }
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java
new file mode 100644
index 0000000..49a2748
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrItemVo.java
@@ -0,0 +1,27 @@
+package com.ruoyi.oa.domain.vo;
+
+import lombok.Data;
+
+import java.io.Serializable;
+import java.math.BigDecimal;
+
+/**
+ * One recognized detail (line item) row of an invoice.
+ */
+@Data
+public class InvoiceOcrItemVo implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Name of the goods, taxable labor or service. */
+    private String itemName;
+
+    /** Amount of this row. */
+    private BigDecimal amount;
+
+    /** Tax rate, kept as text (for example a percentage). */
+    private String taxRate;
+
+    /** Tax amount of this row. */
+    private BigDecimal taxAmount;
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java
new file mode 100644
index 0000000..6b2b365
--- /dev/null
+++ 
b/ruoyi-oa/src/main/java/com/ruoyi/oa/domain/vo/InvoiceOcrResultVo.java
@@ -0,0 +1,56 @@
+package com.ruoyi.oa.domain.vo;
+
+import lombok.Data;
+
+import java.io.Serializable;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Result of recognizing one invoice; any field that could not be extracted is left {@code null}.
+ */
+@Data
+public class InvoiceOcrResultVo implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** File name. */
+    private String fileName;
+
+    /** Invoice type. */
+    private String invoiceType;
+
+    /** Invoice code. */
+    private String invoiceCode;
+
+    /** Invoice number. */
+    private String invoiceNumber;
+
+    /** Issue date, kept as text. */
+    private String invoiceDate;
+
+    /** Buyer name. */
+    private String buyerName;
+
+    /** Seller name. */
+    private String sellerName;
+
+    /** Amount excluding tax. */
+    private BigDecimal amountWithoutTax;
+
+    /** Tax amount. */
+    private BigDecimal taxAmount;
+
+    /** Total amount including tax. */
+    private BigDecimal totalAmount;
+
+    /** Short summary of the invoice content. */
+    private String contentSummary;
+
+    /** Raw text extracted from the document. */
+    private String rawText;
+
+    /** Detail rows; typed (instead of a raw List) so callers get InvoiceOcrItemVo elements without casts. Never null by default. */
+    private List<InvoiceOcrItemVo> items = new ArrayList<>();
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java
new file mode 100644
index 0000000..b6f6eef
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java
@@ -0,0 +1,18 @@
+package com.ruoyi.oa.service;
+
+import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * Invoice PDF recognition service.
+ */
+public interface IInvoiceOcrService {
+
+    /**
+     * Recognizes the core fields of an invoice PDF.
+     *
+     * @param file the uploaded PDF file
+     * @return the recognition result
+     */
+    InvoiceOcrResultVo recognizePdf(MultipartFile file);
+}
diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java
new file mode 100644
index 0000000..b64f3e9
--- /dev/null
+++ b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java
@@ -0,0 +1,286 
@@
+package com.ruoyi.oa.service.impl;
+
+import com.ruoyi.common.exception.ServiceException;
+import com.ruoyi.oa.domain.vo.InvoiceOcrItemVo;
+import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
+import com.ruoyi.oa.service.IInvoiceOcrService;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.springframework.stereotype.Service;
+import org.springframework.web.multipart.MultipartFile;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Invoice PDF recognition service implementation.
+ *
+ * <p>Current approach ("plan A"):
+ * <ol>
+ *   <li>read the text layer of the PDF with PDFBox (no image recognition is done here);</li>
+ *   <li>extract amounts, date, invoice code, invoice number, parties and a content
+ *       summary from that text with regular-expression rules.</li>
+ * </ol>
+ *
+ * <p>All extraction is heuristic: a field that no rule matches is left {@code null}.
+ */
+@Service
+public class InvoiceOcrServiceImpl implements IInvoiceOcrService {
+
+    /**
+     * Characters allowed between a label and its value: full-width colon (U+FF1A),
+     * ASCII colon and whitespace. The full-width form is written as an escape so it
+     * cannot be silently folded into a second ASCII colon.
+     */
+    private static final String LABEL_SEPARATOR = "[\\uFF1A:\\s]*";
+
+    /** {@link #LABEL_SEPARATOR} plus the currency signs U+00A5 and U+FFE5. */
+    private static final String MONEY_SEPARATOR = "[\\uFF1A:\\s\\u00A5\\uFFE5]*";
+
+    /** A number with optional thousands separators and at most two decimals (one capture group). */
+    private static final String MONEY = "([0-9,]+(?:\\.[0-9]{1,2})?)";
+
+    /** Maximum number of characters kept for a buyer / seller name. */
+    private static final int MAX_PARTY_NAME_LENGTH = 80;
+
+    private static final Pattern INVOICE_CODE_PATTERN =
+            Pattern.compile("发票代码" + LABEL_SEPARATOR + "([0-9]{10,12})");
+
+    /**
+     * Captures the whole digit run (8 to 20 digits). A fixed {8} quantifier would return
+     * only the first eight digits of a longer number, e.g. a 20-digit one.
+     */
+    private static final Pattern INVOICE_NUMBER_PATTERN =
+            Pattern.compile("发票号码" + LABEL_SEPARATOR + "([0-9]{8,20})");
+
+    private static final Pattern DATE_PATTERN = Pattern.compile(
+            "(?:开票日期|票据日期)" + LABEL_SEPARATOR + "([0-9]{4}[年\\-/.][0-9]{1,2}[月\\-/.][0-9]{1,2}日?)");
+
+    private static final Pattern TAX_AMOUNT_PATTERN =
+            Pattern.compile("(?:税额|税\\s*额)" + MONEY_SEPARATOR + MONEY);
+
+    /**
+     * The label may be followed by a closing bracket (ASCII or full-width U+FF09), so
+     * that a bracketed label such as "(小写)" directly followed by the amount matches.
+     */
+    private static final Pattern TOTAL_AMOUNT_PATTERN =
+            Pattern.compile("(?:价税合计|小写)[)\\uFF09]?" + MONEY_SEPARATOR + MONEY);
+
+    private static final Pattern AMOUNT_WITHOUT_TAX_PATTERN =
+            Pattern.compile("(?:金额|合计|不含税金额)" + MONEY_SEPARATOR + MONEY);
+
+    private static final Pattern MONEY_LINE_PATTERN = Pattern.compile(MONEY);
+    private static final Pattern BUYER_PATTERN =
+            Pattern.compile("购买方(?:名称)?" + LABEL_SEPARATOR + "([^\\n]+)");
+    private static final Pattern SELLER_PATTERN =
+            Pattern.compile("销售方(?:名称)?" + LABEL_SEPARATOR + "([^\\n]+)");
+    private static final Pattern ITEM_HEADER_PATTERN =
+            Pattern.compile("(货物或应税劳务、服务名称|项目名称|服务名称)");
+    private static final Pattern TAX_RATE_PATTERN = Pattern.compile("([0-9]{1,2}%|免税)");
+
+    /**
+     * Validates the upload, extracts its text layer and parses the invoice fields.
+     *
+     * @param file the uploaded PDF file
+     * @return the recognition result, including the normalized raw text
+     * @throws ServiceException if the file is missing, is not named {@code *.pdf},
+     *                          cannot be parsed, or contains no text
+     */
+    @Override
+    public InvoiceOcrResultVo recognizePdf(MultipartFile file) {
+        validateFile(file);
+        String rawText = extractPdfText(file);
+        if (StringUtils.isBlank(rawText)) {
+            throw new ServiceException("PDF 未解析到文本内容,请确认上传的是电子发票 PDF");
+        }
+        return parseInvoice(file.getOriginalFilename(), normalizeText(rawText));
+    }
+
+    /** Rejects a missing or empty upload and any file whose name does not end in ".pdf" (case-insensitive). */
+    private void validateFile(MultipartFile file) {
+        if (file == null || file.isEmpty()) {
+            throw new ServiceException("请上传 PDF 文件");
+        }
+        String fileName = file.getOriginalFilename();
+        if (StringUtils.isBlank(fileName) || !StringUtils.endsWithIgnoreCase(fileName, ".pdf")) {
+            throw new ServiceException("仅支持 PDF 文件识别");
+        }
+    }
+
+    /**
+     * Reads the text layer of the PDF, sorted by position on the page.
+     *
+     * @throws ServiceException if the stream cannot be read or parsed as a PDF
+     */
+    private String extractPdfText(MultipartFile file) {
+        try (InputStream inputStream = file.getInputStream();
+             PDDocument document = PDDocument.load(inputStream)) {
+            PDFTextStripper stripper = new PDFTextStripper();
+            stripper.setSortByPosition(true);
+            return stripper.getText(document);
+        } catch (IOException e) {
+            ServiceException failure = new ServiceException("PDF 解析失败: " + e.getMessage());
+            // Keep the original stack trace. ServiceException is only used with a message
+            // argument in this file, so the I/O error is attached as a suppressed exception
+            // rather than assuming a constructor that accepts a cause.
+            failure.addSuppressed(e);
+            throw failure;
+        }
+    }
+
+    /** Fills a result object from the normalized text using the extraction rules of this class. */
+    private InvoiceOcrResultVo parseInvoice(String fileName, String rawText) {
+        InvoiceOcrResultVo result = new InvoiceOcrResultVo();
+        result.setFileName(fileName);
+        result.setRawText(rawText);
+        result.setInvoiceType(detectInvoiceType(rawText));
+        result.setInvoiceCode(firstGroup(rawText, INVOICE_CODE_PATTERN));
+        result.setInvoiceNumber(firstGroup(rawText, INVOICE_NUMBER_PATTERN));
+        result.setInvoiceDate(firstGroup(rawText, DATE_PATTERN));
+        result.setBuyerName(cleanLineValue(firstGroup(rawText, BUYER_PATTERN)));
+        result.setSellerName(cleanLineValue(firstGroup(rawText, SELLER_PATTERN)));
+        result.setTaxAmount(parseMoney(firstGroup(rawText, TAX_AMOUNT_PATTERN)));
+        result.setTotalAmount(parseMoney(firstGroup(rawText, TOTAL_AMOUNT_PATTERN)));
+        result.setAmountWithoutTax(
+                resolveAmountWithoutTax(rawText, result.getTaxAmount(), result.getTotalAmount()));
+
+        List<InvoiceOcrItemVo> items = extractItems(rawText);
+        result.setItems(items);
+        result.setContentSummary(buildContentSummary(items, rawText));
+        return result;
+    }
+
+    /** Returns the first invoice-type keyword found in the text, or the "unknown type" label. */
+    private String detectInvoiceType(String text) {
+        if (text.contains("增值税专用发票")) {
+            return "增值税专用发票";
+        }
+        if (text.contains("增值税普通发票")) {
+            return "增值税普通发票";
+        }
+        if (text.contains("电子发票")) {
+            return "电子发票";
+        }
+        return "未知发票类型";
+    }
+
+    /**
+     * Determines the amount excluding tax: the directly matched value when it is plausible,
+     * otherwise total minus tax when both are known, otherwise the direct match (may be null).
+     */
+    private BigDecimal resolveAmountWithoutTax(String text, BigDecimal taxAmount, BigDecimal totalAmount) {
+        BigDecimal directAmount = parseMoney(firstGroup(text, AMOUNT_WITHOUT_TAX_PATTERN));
+        if (directAmount != null && isReasonableAmount(directAmount, taxAmount, totalAmount)) {
+            return directAmount;
+        }
+        if (totalAmount != null && taxAmount != null) {
+            return totalAmount.subtract(taxAmount);
+        }
+        return directAmount;
+    }
+
+    /** A direct amount is plausible when it (plus the tax, if known) does not exceed the total. */
+    private boolean isReasonableAmount(BigDecimal directAmount, BigDecimal taxAmount, BigDecimal totalAmount) {
+        if (directAmount == null) {
+            return false;
+        }
+        if (totalAmount == null) {
+            return true;
+        }
+        if (taxAmount == null) {
+            return directAmount.compareTo(totalAmount) <= 0;
+        }
+        return directAmount.add(taxAmount).compareTo(totalAmount) <= 0;
+    }
+
+    /**
+     * Collects detail rows: every parsable line after the first item-header line and
+     * before the first line that ends the detail section. Duplicates are removed.
+     */
+    private List<InvoiceOcrItemVo> extractItems(String text) {
+        List<InvoiceOcrItemVo> items = new ArrayList<>();
+        boolean detailStarted = false;
+        for (String originalLine : text.split("\\n")) {
+            String line = normalizeInlineWhitespace(originalLine);
+            if (StringUtils.isBlank(line)) {
+                continue;
+            }
+            if (!detailStarted) {
+                // Skip everything up to and including the header line itself.
+                detailStarted = ITEM_HEADER_PATTERN.matcher(line).find();
+                continue;
+            }
+            if (isDetailEndLine(line)) {
+                break;
+            }
+            InvoiceOcrItemVo item = parseItemLine(line);
+            if (item != null) {
+                items.add(item);
+            }
+        }
+        return deduplicateItems(items);
+    }
+
+    /** True for a line carrying a totals, party or remarks keyword ("合计" also covers "价税合计"). */
+    private boolean isDetailEndLine(String line) {
+        return line.contains("合计")
+                || line.contains("销售方")
+                || line.contains("购买方")
+                || line.contains("备注");
+    }
+
+    /**
+     * Parses one detail line into an item, or returns {@code null} when the line has no
+     * number or no name text left.
+     *
+     * <p>The tax-rate token is removed before numbers are collected, so the digits of
+     * "13%" are not mistaken for the amount. The last two numbers are then taken as
+     * amount and tax amount; with a single number only the amount is set.
+     *
+     * <p>NOTE(review): a line whose tax column is not numeric (for example masked with
+     * "***") still maps its last two numbers to amount / tax - confirm the wanted handling
+     * against sample invoices. Digits inside the item name are also treated as numbers
+     * and removed from the name.
+     */
+    private InvoiceOcrItemVo parseItemLine(String line) {
+        if (line == null || line.length() < 2) {
+            return null;
+        }
+
+        // Use the LAST rate token: this assumes the rate column is printed after the
+        // item name, so an earlier "75%"-style fragment is part of the name.
+        String taxRate = null;
+        int rateStart = -1;
+        int rateEnd = -1;
+        Matcher taxRateMatcher = TAX_RATE_PATTERN.matcher(line);
+        while (taxRateMatcher.find()) {
+            taxRate = taxRateMatcher.group(1);
+            rateStart = taxRateMatcher.start(1);
+            rateEnd = taxRateMatcher.end(1);
+        }
+        String withoutRate = taxRate == null
+                ? line
+                : line.substring(0, rateStart) + " " + line.substring(rateEnd);
+
+        List<String> numbers = new ArrayList<>();
+        Matcher moneyMatcher = MONEY_LINE_PATTERN.matcher(withoutRate);
+        while (moneyMatcher.find()) {
+            numbers.add(moneyMatcher.group(1));
+        }
+        if (numbers.isEmpty()) {
+            return null;
+        }
+
+        // Blank out exactly the matched spans. Replacing by value would also hit the
+        // same digits inside other numbers ("1" inside "100.00") and leave fragments.
+        String itemName = MONEY_LINE_PATTERN.matcher(withoutRate).replaceAll(" ");
+        itemName = normalizeInlineWhitespace(itemName.replace("*", " "));
+        if (StringUtils.isBlank(itemName)) {
+            return null;
+        }
+
+        InvoiceOcrItemVo item = new InvoiceOcrItemVo();
+        item.setItemName(itemName);
+        item.setTaxRate(taxRate);
+        int last = numbers.size() - 1;
+        if (last == 0) {
+            // A single number cannot be both the amount and the tax amount.
+            item.setAmount(parseMoney(numbers.get(0)));
+        } else {
+            item.setAmount(parseMoney(numbers.get(last - 1)));
+            item.setTaxAmount(parseMoney(numbers.get(last)));
+        }
+        return item;
+    }
+
+    /** Keeps the first occurrence of each (name, amount, tax amount) combination, in order. */
+    private List<InvoiceOcrItemVo> deduplicateItems(List<InvoiceOcrItemVo> items) {
+        List<InvoiceOcrItemVo> result = new ArrayList<>();
+        Set<String> uniqueKeys = new LinkedHashSet<>();
+        for (InvoiceOcrItemVo item : items) {
+            String key = item.getItemName() + "|" + item.getAmount() + "|" + item.getTaxAmount();
+            if (uniqueKeys.add(key)) {
+                result.add(item);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Builds the content summary: up to three item names joined with "、"; when there are
+     * no named items, up to three raw lines following an item-header line joined with ";".
+     *
+     * @return the summary, or {@code null} when nothing usable is found
+     */
+    private String buildContentSummary(List<InvoiceOcrItemVo> items, String text) {
+        if (items != null && !items.isEmpty()) {
+            List<String> names = new ArrayList<>();
+            for (InvoiceOcrItemVo item : items) {
+                if (StringUtils.isNotBlank(item.getItemName())) {
+                    names.add(item.getItemName());
+                }
+                if (names.size() >= 3) {
+                    break;
+                }
+            }
+            if (!names.isEmpty()) {
+                return String.join("、", names);
+            }
+        }
+        String[] lines = text.split("\\n");
+        for (int i = 0; i < lines.length; i++) {
+            if (!ITEM_HEADER_PATTERN.matcher(lines[i]).find()) {
+                continue;
+            }
+            StringBuilder builder = new StringBuilder();
+            for (int j = i + 1; j < lines.length && j <= i + 3; j++) {
+                String value = normalizeInlineWhitespace(lines[j]);
+                if (StringUtils.isBlank(value) || isDetailEndLine(value)) {
+                    break;
+                }
+                if (builder.length() > 0) {
+                    builder.append(";");
+                }
+                builder.append(value);
+            }
+            if (builder.length() > 0) {
+                return builder.toString();
+            }
+        }
+        return null;
+    }
+
+    /** Converts CRLF and lone CR line endings to LF; {@code null} becomes the empty string. */
+    private String normalizeText(String text) {
+        return text == null ? "" : text.replace("\r\n", "\n").replace('\r', '\n');
+    }
+
+    /** Collapses every whitespace run to one space and trims; {@code null} stays {@code null}. */
+    private String normalizeInlineWhitespace(String value) {
+        return value == null ? null : value.replaceAll("\\s+", " ").trim();
+    }
+
+    /** Returns capture group 1 of the first match of the pattern, or {@code null} without a match. */
+    private String firstGroup(String text, Pattern pattern) {
+        Matcher matcher = pattern.matcher(text);
+        return matcher.find() ? matcher.group(1) : null;
+    }
+
+    /** Normalizes whitespace in a captured line value and caps it at {@link #MAX_PARTY_NAME_LENGTH} characters. */
+    private String cleanLineValue(String value) {
+        if (value == null) {
+            return null;
+        }
+        String cleaned = normalizeInlineWhitespace(value);
+        if (StringUtils.length(cleaned) > MAX_PARTY_NAME_LENGTH) {
+            cleaned = StringUtils.substring(cleaned, 0, MAX_PARTY_NAME_LENGTH);
+        }
+        return cleaned;
+    }
+
+    /**
+     * Parses a captured amount, ignoring thousands separators and the currency signs
+     * U+00A5 / U+FFE5.
+     *
+     * @return the value, or {@code null} for blank or non-numeric input
+     */
+    private BigDecimal parseMoney(String raw) {
+        if (StringUtils.isBlank(raw)) {
+            return null;
+        }
+        String cleaned = raw.replace(",", "").replace("\u00A5", "").replace("\uFFE5", "").trim();
+        try {
+            return new BigDecimal(cleaned);
+        } catch (NumberFormatException ignored) {
+            // Best effort: an unparsable token simply yields "no value" for this field.
+            return null;
+        }
+    }
+}