完成ocr识别发票的前提条件

This commit is contained in:
2026-05-05 20:15:54 +08:00
parent 04c84a3ed3
commit dc67788f51
6 changed files with 429 additions and 3 deletions

View File

@@ -68,7 +68,6 @@
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>com.xuxueli</groupId>
<artifactId>xxl-job-core</artifactId>
</dependency>
@@ -78,7 +77,11 @@
<artifactId>fad-hrm</artifactId>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
</dependencies>

View File

@@ -0,0 +1,36 @@
package com.ruoyi.oa.controller;
import com.ruoyi.common.annotation.Log;
import com.ruoyi.common.core.controller.BaseController;
import com.ruoyi.common.core.domain.R;
import com.ruoyi.common.enums.BusinessType;
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
import com.ruoyi.oa.service.IInvoiceOcrService;
import lombok.RequiredArgsConstructor;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
/**
 * REST controller exposing invoice PDF recognition under {@code /oa/invoiceOcr}.
 */
@Validated
@RequiredArgsConstructor
@RestController
@RequestMapping("/oa/invoiceOcr")
public class InvoiceOcrController extends BaseController {

    /** Recognition service; supplied through the constructor generated by {@code @RequiredArgsConstructor}. */
    private final IInvoiceOcrService invoiceOcrService;

    /**
     * Receives an uploaded electronic invoice PDF and returns the fields recognized from it
     * (amounts, content and other core fields).
     *
     * @param file the multipart part named {@code file}
     * @return the recognition result wrapped by {@code R.ok}
     */
    @Log(title = "发票识别", businessType = BusinessType.OTHER)
    @PostMapping("/recognize")
    public R<InvoiceOcrResultVo> recognize(@RequestParam("file") MultipartFile file) {
        InvoiceOcrResultVo recognized = invoiceOcrService.recognizePdf(file);
        return R.ok(recognized);
    }
}

View File

@@ -0,0 +1,27 @@
package com.ruoyi.oa.domain.vo;
import lombok.Data;
import java.io.Serializable;
import java.math.BigDecimal;
/**
* One detail line recognized from an invoice.
*/
@Data
public class InvoiceOcrItemVo implements Serializable {
private static final long serialVersionUID = 1L;
/** Name of the goods, taxable labor or service. */
private String itemName;
/** Amount of this line. */
private BigDecimal amount;
/** Tax rate, kept as text. */
private String taxRate;
/** Tax amount of this line. */
private BigDecimal taxAmount;
}

View File

@@ -0,0 +1,56 @@
package com.ruoyi.oa.domain.vo;
import lombok.Data;
import java.io.Serializable;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
/**
* Result of recognizing one invoice.
*/
@Data
public class InvoiceOcrResultVo implements Serializable {
private static final long serialVersionUID = 1L;
/** File name. */
private String fileName;
/** Invoice type. */
private String invoiceType;
/** Invoice code. */
private String invoiceCode;
/** Invoice number. */
private String invoiceNumber;
/** Invoice (issue) date, kept as text. */
private String invoiceDate;
/** Buyer name. */
private String buyerName;
/** Seller name. */
private String sellerName;
/** Amount excluding tax. */
private BigDecimal amountWithoutTax;
/** Tax amount. */
private BigDecimal taxAmount;
/** Total amount including tax. */
private BigDecimal totalAmount;
/** Summary of the invoice content. */
private String contentSummary;
/** Raw text that was extracted. */
private String rawText;
/** Detail lines; starts out as an empty list. */
private List<InvoiceOcrItemVo> items = new ArrayList<>();
}

View File

@@ -0,0 +1,18 @@
package com.ruoyi.oa.service;
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
import org.springframework.web.multipart.MultipartFile;
/**
* Invoice PDF recognition service.
*/
public interface IInvoiceOcrService {
/**
* Recognizes the core fields of an invoice PDF.
*
* @param file the uploaded PDF file
* @return the recognition result
*/
InvoiceOcrResultVo recognizePdf(MultipartFile file);
}

View File

@@ -0,0 +1,286 @@
package com.ruoyi.oa.service.impl;
import com.ruoyi.common.exception.ServiceException;
import com.ruoyi.oa.domain.vo.InvoiceOcrItemVo;
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
import com.ruoyi.oa.service.IInvoiceOcrService;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
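/**
* Implementation of the invoice PDF recognition service.
*
* Current implementation ("plan A"):
* 1. Read the text layer of the PDF; no image OCR is performed in this class.
* 2. Use rule-based (regular expression) extraction for the amounts, date, invoice code,
*    invoice number, content summary and other fields.
*/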
@Service
public class InvoiceOcrServiceImpl implements IInvoiceOcrService {
// Notes shared by the patterns below:
//  - \uFF1A is the full-width colon and \uFFE5 the full-width yen sign; both are accepted next to
//    their ASCII / Latin-1 counterparts (\u00A5 is the half-width yen sign).
//  - Numeric captures must start with a digit, so a lone "," is never captured as a number.

/** Invoice code: 10 to 12 digits after the label, not followed by a further digit. */
private static final Pattern INVOICE_CODE_PATTERN = Pattern.compile("发票代码[:\\uFF1A\\s]*([0-9]{10,12})(?![0-9])");
/**
 * Invoice number: either 20 digits (the length used by fully digitalized e-invoices) or the
 * classic 8 digits. The trailing look-ahead prevents a longer number from being cut short.
 */
private static final Pattern INVOICE_NUMBER_PATTERN = Pattern.compile("发票号码[:\\uFF1A\\s]*([0-9]{20}|[0-9]{8})(?![0-9])");
/** Issue date such as 2024年1月2日, 2024-01-02, 2024/1/2 or 2024.1.2. */
private static final Pattern DATE_PATTERN = Pattern.compile("(?:开票日期|票据日期)[:\\uFF1A\\s]*([0-9]{4}[年\\-/.][0-9]{1,2}[月\\-/.][0-9]{1,2}日?)");
/** Tax amount; the two characters of the label may be separated by whitespace. */
private static final Pattern TAX_AMOUNT_PATTERN = Pattern.compile("税\\s*额[:\\uFF1A\\s\\u00A5\\uFFE5]*([0-9][0-9,]*(?:\\.[0-9]{1,2})?)");
/** Total including tax; a closing parenthesis directly after the label, as in "(小写)", is tolerated. */
private static final Pattern TOTAL_AMOUNT_PATTERN = Pattern.compile("(?:价税合计|小写)[)\\uFF09]?[:\\uFF1A\\s\\u00A5\\uFFE5]*([0-9][0-9,]*(?:\\.[0-9]{1,2})?)");
/** Amount excluding tax as labelled on the invoice. */
private static final Pattern AMOUNT_WITHOUT_TAX_PATTERN = Pattern.compile("(?:金额|合计|不含税金额)[:\\uFF1A\\s\\u00A5\\uFFE5]*([0-9][0-9,]*(?:\\.[0-9]{1,2})?)");
/** Any number with optional thousands separators and up to two decimals. */
private static final Pattern MONEY_LINE_PATTERN = Pattern.compile("([0-9][0-9,]*(?:\\.[0-9]{1,2})?)");
/** Buyer name: rest of the line after the label. */
private static final Pattern BUYER_PATTERN = Pattern.compile("购买方(?:名称)?[:\\uFF1A\\s]*([^\\n]+)");
/** Seller name: rest of the line after the label. */
private static final Pattern SELLER_PATTERN = Pattern.compile("销售方(?:名称)?[:\\uFF1A\\s]*([^\\n]+)");
/** Column header that marks the start of the detail lines. */
private static final Pattern ITEM_HEADER_PATTERN = Pattern.compile("(货物或应税劳务、服务名称|项目名称|服务名称)");
/** Tax rate: a percentage (fractional rates such as 1.5% included) or one of the textual markers. */
private static final Pattern TAX_RATE_PATTERN = Pattern.compile("([0-9]{1,2}(?:\\.[0-9]+)?%|免税|不征税)");
/**
 * Validates the upload, extracts its text and parses the invoice fields out of that text.
 *
 * @param file the uploaded PDF
 * @return the parsed invoice
 * @throws ServiceException if validation fails or no text could be extracted
 */
@Override
public InvoiceOcrResultVo recognizePdf(MultipartFile file) {
    validateFile(file);

    final String extracted = extractPdfText(file);
    if (StringUtils.isNotBlank(extracted)) {
        String originalName = file.getOriginalFilename();
        String normalized = normalizeText(extracted);
        return parseInvoice(originalName, normalized);
    }
    throw new ServiceException("PDF 未解析到文本内容,请确认上传的是电子发票 PDF");
}
/**
 * Rejects a missing or empty upload, and any upload whose original file name does not end
 * in {@code .pdf} (case-insensitive).
 *
 * @param file the uploaded file, may be {@code null}
 * @throws ServiceException if either check fails
 */
private void validateFile(MultipartFile file) {
    boolean nothingUploaded = file == null || file.isEmpty();
    if (nothingUploaded) {
        throw new ServiceException("请上传 PDF 文件");
    }

    String uploadedName = file.getOriginalFilename();
    boolean namedAsPdf = StringUtils.isNotBlank(uploadedName)
        && StringUtils.endsWithIgnoreCase(uploadedName, ".pdf");
    if (!namedAsPdf) {
        throw new ServiceException("仅支持 PDF 文件识别");
    }
}
/**
 * Loads the uploaded PDF and returns the text of its text layer, with position sorting
 * enabled on the stripper.
 *
 * @param file the uploaded PDF
 * @return the extracted text
 * @throws ServiceException if the stream cannot be read or the document cannot be parsed;
 *                          the underlying {@link IOException} is attached as the cause
 */
private String extractPdfText(MultipartFile file) {
    try (InputStream inputStream = file.getInputStream();
         PDDocument document = PDDocument.load(inputStream)) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        return stripper.getText(document);
    } catch (IOException e) {
        ServiceException failure = new ServiceException("PDF 解析失败: " + e.getMessage());
        // Keep the original stack trace reachable for whoever logs this exception.
        // NOTE(review): assumes ServiceException(String) leaves the cause uninitialised — confirm.
        failure.initCause(e);
        throw failure;
    }
}
/**
 * Builds the recognition result from the normalized invoice text.
 *
 * @param fileName original file name, stored as-is
 * @param rawText  normalized text of the PDF, stored as-is and used for all extraction
 * @return the populated result
 */
private InvoiceOcrResultVo parseInvoice(String fileName, String rawText) {
    // Amounts first: the tax-exclusive amount is resolved from the other two.
    BigDecimal tax = parseMoney(firstGroup(rawText, TAX_AMOUNT_PATTERN));
    BigDecimal total = parseMoney(firstGroup(rawText, TOTAL_AMOUNT_PATTERN));
    BigDecimal net = resolveAmountWithoutTax(rawText, tax, total);
    List<InvoiceOcrItemVo> detailLines = extractItems(rawText);

    InvoiceOcrResultVo vo = new InvoiceOcrResultVo();
    vo.setFileName(fileName);
    vo.setRawText(rawText);
    vo.setInvoiceType(detectInvoiceType(rawText));
    vo.setInvoiceCode(firstGroup(rawText, INVOICE_CODE_PATTERN));
    vo.setInvoiceNumber(firstGroup(rawText, INVOICE_NUMBER_PATTERN));
    vo.setInvoiceDate(firstGroup(rawText, DATE_PATTERN));
    vo.setBuyerName(cleanLineValue(firstGroup(rawText, BUYER_PATTERN)));
    vo.setSellerName(cleanLineValue(firstGroup(rawText, SELLER_PATTERN)));
    vo.setTaxAmount(tax);
    vo.setTotalAmount(total);
    vo.setAmountWithoutTax(net);
    vo.setItems(detailLines);
    vo.setContentSummary(buildContentSummary(detailLines, rawText));
    return vo;
}
/**
 * Determines the invoice type from the title text found in the document.
 *
 * <p>The candidates are probed in order and the first one contained in the text is returned.
 * The two "电子" VAT titles are probed explicitly because neither of them contains
 * "增值税专用发票", "增值税普通发票" or "电子发票" as a substring, so they would otherwise be
 * reported as unknown.
 *
 * @param text normalized invoice text
 * @return the matched title, or "未知发票类型" if none is present
 */
private String detectInvoiceType(String text) {
    String[] knownTitles = {
        "增值税电子专用发票",
        "增值税电子普通发票",
        "增值税专用发票",
        "增值税普通发票",
        "电子发票"
    };
    for (String title : knownTitles) {
        if (text.contains(title)) {
            return title;
        }
    }
    return "未知发票类型";
}
/**
 * Resolves the tax-exclusive amount.
 *
 * <p>The amount labelled in the text wins when it passes {@code isReasonableAmount}. Otherwise
 * the value is derived as total minus tax when both are known; failing that, the labelled
 * amount (possibly {@code null}) is returned unchanged.
 *
 * @param text        normalized invoice text
 * @param taxAmount   recognized tax amount, may be {@code null}
 * @param totalAmount recognized total including tax, may be {@code null}
 * @return the resolved amount, or {@code null} if nothing usable was found
 */
private BigDecimal resolveAmountWithoutTax(String text, BigDecimal taxAmount, BigDecimal totalAmount) {
    BigDecimal labelled = parseMoney(firstGroup(text, AMOUNT_WITHOUT_TAX_PATTERN));

    boolean trustLabelled = labelled != null && isReasonableAmount(labelled, taxAmount, totalAmount);
    boolean canDerive = totalAmount != null && taxAmount != null;

    if (!trustLabelled && canDerive) {
        return totalAmount.subtract(taxAmount);
    }
    return labelled;
}
/**
 * Plausibility check for a tax-exclusive amount read directly from the text.
 *
 * @param directAmount candidate amount
 * @param taxAmount    recognized tax amount, may be {@code null}
 * @param totalAmount  recognized total, may be {@code null}
 * @return {@code false} for a {@code null} candidate; {@code true} when there is no total to
 *         compare against; otherwise whether candidate (plus tax, when known) does not exceed
 *         the total
 */
private boolean isReasonableAmount(BigDecimal directAmount, BigDecimal taxAmount, BigDecimal totalAmount) {
    if (directAmount == null) {
        return false;
    }
    if (totalAmount == null) {
        return true;
    }
    BigDecimal gross = (taxAmount == null) ? directAmount : directAmount.add(taxAmount);
    return gross.compareTo(totalAmount) <= 0;
}
/**
 * Collects the detail lines of the invoice.
 *
 * <p>Lines are scanned top to bottom, skipping blank ones. Scanning for items begins on the
 * line after the first one matching the item header pattern and stops at the first
 * detail-end line. Lines that cannot be parsed into an item are ignored, and the collected
 * items are de-duplicated before being returned.
 *
 * @param text normalized invoice text
 * @return the de-duplicated items, possibly empty
 */
private List<InvoiceOcrItemVo> extractItems(String text) {
    List<InvoiceOcrItemVo> collected = new ArrayList<>();
    boolean insideDetail = false;

    for (String rawLine : text.split("\\n")) {
        String line = normalizeInlineWhitespace(rawLine);
        if (StringUtils.isBlank(line)) {
            continue;
        }
        if (!insideDetail) {
            // Still looking for the header; the header line itself is never parsed as an item.
            insideDetail = ITEM_HEADER_PATTERN.matcher(line).find();
            continue;
        }
        if (isDetailEndLine(line)) {
            break;
        }
        InvoiceOcrItemVo parsed = parseItemLine(line);
        if (parsed != null) {
            collected.add(parsed);
        }
    }
    return deduplicateItems(collected);
}
/**
 * Tells whether a line marks the end of the detail section, i.e. contains one of the
 * end-marker keywords.
 *
 * @param line a normalized line
 * @return {@code true} if any marker occurs in the line
 */
private boolean isDetailEndLine(String line) {
    String[] endMarkers = {"合计", "价税合计", "销售方", "购买方", "备注"};
    for (String marker : endMarkers) {
        if (line.contains(marker)) {
            return true;
        }
    }
    return false;
}
/**
 * Parses one detail line into an item.
 *
 * <p>The last tax-rate token on the line splits it in two. This keeps the digits of the rate
 * (for example the "6" of "6%") from being counted as a money value, which would otherwise
 * be picked up as the amount. Taking the last rate match rather than the first leaves a
 * percentage that is part of the item name alone.
 * <ul>
 *   <li>Rate found: the amount is the last number before the rate and the tax amount is the
 *       first number after it. NOTE(review): assumes the columns read "amount, rate, tax"
 *       from left to right — confirm against real extracted text.</li>
 *   <li>No rate: the last two numbers are taken as amount and tax; with a single number only
 *       the amount is set.</li>
 * </ul>
 * Numbers are removed from the item name by match position, so a short number such as "1"
 * no longer eats digits out of longer ones.
 *
 * @param line a normalized, non-blank line from the detail section
 * @return the parsed item, or {@code null} if the line has no number or no name left
 */
private InvoiceOcrItemVo parseItemLine(String line) {
    if (line == null || line.length() < 2) {
        return null;
    }

    String taxRate = null;
    int rateStart = line.length();
    int rateEnd = line.length();
    Matcher rateMatcher = TAX_RATE_PATTERN.matcher(line);
    while (rateMatcher.find()) {
        taxRate = rateMatcher.group(1);
        rateStart = rateMatcher.start(1);
        rateEnd = rateMatcher.end(1);
    }
    // Without a rate, beforeRate is the whole line and afterRate is empty.
    String beforeRate = line.substring(0, rateStart);
    String afterRate = line.substring(rateEnd);

    List<String> leading = collectMoneyTokens(beforeRate);
    List<String> trailing = collectMoneyTokens(afterRate);
    if (leading.isEmpty() && trailing.isEmpty()) {
        return null;
    }

    String itemName = MONEY_LINE_PATTERN.matcher(beforeRate).replaceAll(" ")
        + " " + MONEY_LINE_PATTERN.matcher(afterRate).replaceAll(" ");
    itemName = normalizeInlineWhitespace(itemName.replace("*", " "));
    if (StringUtils.isBlank(itemName)) {
        return null;
    }

    String amountToken;
    String taxToken;
    if (taxRate != null) {
        amountToken = leading.isEmpty() ? null : leading.get(leading.size() - 1);
        taxToken = trailing.isEmpty() ? null : trailing.get(0);
    } else {
        // trailing is empty here, so leading holds at least one token.
        int count = leading.size();
        amountToken = count >= 2 ? leading.get(count - 2) : leading.get(0);
        taxToken = count >= 2 ? leading.get(count - 1) : null;
    }

    InvoiceOcrItemVo item = new InvoiceOcrItemVo();
    item.setItemName(itemName);
    item.setAmount(parseMoney(amountToken));
    item.setTaxAmount(parseMoney(taxToken));
    item.setTaxRate(taxRate);
    return item;
}

/** Returns, in order of appearance, the money-pattern matches in {@code text} that contain at least one digit. */
private List<String> collectMoneyTokens(String text) {
    List<String> tokens = new ArrayList<>();
    Matcher moneyMatcher = MONEY_LINE_PATTERN.matcher(text);
    while (moneyMatcher.find()) {
        String token = moneyMatcher.group(1);
        if (token.chars().anyMatch(Character::isDigit)) {
            tokens.add(token);
        }
    }
    return tokens;
}
/**
 * Drops repeated items while keeping first-seen order. Two items count as the same when
 * their name, amount and tax amount render to the same text.
 *
 * @param items parsed items
 * @return a new list without repeats
 */
private List<InvoiceOcrItemVo> deduplicateItems(List<InvoiceOcrItemVo> items) {
    Set<String> seen = new LinkedHashSet<>();
    List<InvoiceOcrItemVo> distinct = new ArrayList<>();
    items.forEach(candidate -> {
        String fingerprint = new StringBuilder()
            .append(candidate.getItemName())
            .append('|')
            .append(candidate.getAmount())
            .append('|')
            .append(candidate.getTaxAmount())
            .toString();
        if (seen.add(fingerprint)) {
            distinct.add(candidate);
        }
    });
    return distinct;
}
/**
 * Builds a short summary of what the invoice is for.
 *
 * <p>Preferred source: the names of the parsed items, collection stopping once three names
 * have been gathered. Fallback: up to three lines following an item header in the raw text,
 * stopping at a blank or detail-end line. In both cases the parts are joined with "、";
 * previously the separator was the empty string, which ran the names together.
 *
 * @param items parsed detail items, may be {@code null} or empty
 * @param text  normalized invoice text used for the fallback
 * @return the summary, or {@code null} if nothing could be found
 */
private String buildContentSummary(List<InvoiceOcrItemVo> items, String text) {
    final String separator = "、";

    if (items != null && !items.isEmpty()) {
        List<String> names = new ArrayList<>();
        for (InvoiceOcrItemVo item : items) {
            if (StringUtils.isNotBlank(item.getItemName())) {
                names.add(item.getItemName());
            }
            if (names.size() >= 3) {
                break;
            }
        }
        if (!names.isEmpty()) {
            return String.join(separator, names);
        }
    }

    String[] lines = text.split("\\n");
    for (int i = 0; i < lines.length; i++) {
        if (!ITEM_HEADER_PATTERN.matcher(lines[i]).find()) {
            continue;
        }
        StringBuilder builder = new StringBuilder();
        for (int j = i + 1; j < lines.length && j <= i + 3; j++) {
            String value = normalizeInlineWhitespace(lines[j]);
            if (StringUtils.isBlank(value) || isDetailEndLine(value)) {
                break;
            }
            if (builder.length() > 0) {
                builder.append(separator);
            }
            builder.append(value);
        }
        if (builder.length() > 0) {
            return builder.toString();
        }
        // Nothing usable under this header; keep looking for a later header line.
    }
    return null;
}
/**
 * Normalizes line endings so that every {@code \r\n} pair and every lone {@code \r} becomes
 * a single {@code \n}.
 *
 * @param text text to normalize, may be {@code null}
 * @return the normalized text; the empty string for {@code null}
 */
private String normalizeText(String text) {
    if (text == null) {
        return "";
    }
    // One pass: a carriage return, optionally followed by a line feed, collapses to a line feed.
    return text.replaceAll("\\r\\n?", "\n");
}
/**
 * Collapses every run of whitespace into a single ASCII space and trims both ends.
 *
 * <p>Besides what {@code \s} matches, the no-break space (U+00A0) and the ideographic space
 * (U+3000) are treated as whitespace. Neither is covered by {@code \s} in its default mode
 * nor removed by {@link String#trim()}.
 *
 * @param value text to normalize, may be {@code null}
 * @return the normalized text, or {@code null} for {@code null} input
 */
private String normalizeInlineWhitespace(String value) {
    if (value == null) {
        return null;
    }
    return value.replaceAll("[\\s\\u00A0\\u3000]+", " ").trim();
}
/**
 * Returns capture group 1 of the first match of {@code pattern} in {@code text}.
 *
 * @param text    text to search
 * @param pattern pattern with at least one capture group
 * @return the captured text, or {@code null} if the pattern does not match
 */
private String firstGroup(String text, Pattern pattern) {
    Matcher found = pattern.matcher(text);
    return found.find() ? found.group(1) : null;
}
/**
 * Normalizes the whitespace of a captured value and caps it at 80 characters.
 *
 * @param value captured value, may be {@code null}
 * @return the cleaned value, or {@code null} for {@code null} input
 */
private String cleanLineValue(String value) {
    if (value == null) {
        return null;
    }
    String normalized = normalizeInlineWhitespace(value);
    // StringUtils.substring clamps the end index, so shorter values pass through unchanged.
    return StringUtils.substring(normalized, 0, 80);
}
/**
 * Converts a captured money token to a {@link BigDecimal}.
 *
 * <p>Thousands separators and currency signs are stripped first, in both half-width and
 * full-width form: "," and U+FF0C, U+00A5 and U+FFE5. The previous {@code replace("", "")}
 * call had no effect and has been replaced by these explicit characters.
 *
 * @param raw captured text, may be {@code null} or blank
 * @return the parsed value, or {@code null} if the input is blank or not a valid number
 */
private BigDecimal parseMoney(String raw) {
    if (StringUtils.isBlank(raw)) {
        return null;
    }
    String cleaned = raw
        .replace(",", "")
        .replace("\uFF0C", "")
        .replace("\u00A5", "")
        .replace("\uFFE5", "")
        .trim();
    if (cleaned.isEmpty()) {
        return null;
    }
    try {
        return new BigDecimal(cleaned);
    } catch (NumberFormatException ignored) {
        // Best-effort extraction: an unparseable token simply means "no value recognized".
        return null;
    }
}
}