完成ocr识别发票的前提条件
This commit is contained in:
@@ -68,17 +68,20 @@
|
||||
<version>1.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
||||
<groupId>com.xuxueli</groupId>
|
||||
<artifactId>xxl-job-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<dependency>
|
||||
<groupId>com.ruoyi</groupId>
|
||||
<artifactId>fad-hrm</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>2.0.29</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
package com.ruoyi.oa.controller;
|
||||
|
||||
import com.ruoyi.common.annotation.Log;
|
||||
import com.ruoyi.common.core.controller.BaseController;
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.ruoyi.common.enums.BusinessType;
|
||||
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
|
||||
import com.ruoyi.oa.service.IInvoiceOcrService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.validation.annotation.Validated;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
/**
|
||||
* 发票 PDF 识别
|
||||
*/
|
||||
@Validated
|
||||
@RequiredArgsConstructor
|
||||
@RestController
|
||||
@RequestMapping("/oa/invoiceOcr")
|
||||
public class InvoiceOcrController extends BaseController {
|
||||
|
||||
private final IInvoiceOcrService invoiceOcrService;
|
||||
|
||||
/**
|
||||
* 上传电子发票 PDF 并识别金额、内容等核心字段
|
||||
*/
|
||||
@Log(title = "发票识别", businessType = BusinessType.OTHER)
|
||||
@PostMapping("/recognize")
|
||||
public R<InvoiceOcrResultVo> recognize(@RequestParam("file") MultipartFile file) {
|
||||
return R.ok(invoiceOcrService.recognizePdf(file));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
package com.ruoyi.oa.domain.vo;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.math.BigDecimal;
|
||||
|
||||
/**
|
||||
* 发票识别明细行
|
||||
*/
|
||||
@Data
|
||||
public class InvoiceOcrItemVo implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/** 货物或应税劳务、服务名称 */
|
||||
private String itemName;
|
||||
|
||||
/** 金额 */
|
||||
private BigDecimal amount;
|
||||
|
||||
/** 税率 */
|
||||
private String taxRate;
|
||||
|
||||
/** 税额 */
|
||||
private BigDecimal taxAmount;
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
package com.ruoyi.oa.domain.vo;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.math.BigDecimal;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 发票识别结果
|
||||
*/
|
||||
@Data
|
||||
public class InvoiceOcrResultVo implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/** 文件名 */
|
||||
private String fileName;
|
||||
|
||||
/** 发票类型 */
|
||||
private String invoiceType;
|
||||
|
||||
/** 发票代码 */
|
||||
private String invoiceCode;
|
||||
|
||||
/** 发票号码 */
|
||||
private String invoiceNumber;
|
||||
|
||||
/** 开票日期 */
|
||||
private String invoiceDate;
|
||||
|
||||
/** 购买方名称 */
|
||||
private String buyerName;
|
||||
|
||||
/** 销售方名称 */
|
||||
private String sellerName;
|
||||
|
||||
/** 不含税金额 */
|
||||
private BigDecimal amountWithoutTax;
|
||||
|
||||
/** 税额 */
|
||||
private BigDecimal taxAmount;
|
||||
|
||||
/** 价税合计 */
|
||||
private BigDecimal totalAmount;
|
||||
|
||||
/** 发票内容摘要 */
|
||||
private String contentSummary;
|
||||
|
||||
/** 提取到的原始文本 */
|
||||
private String rawText;
|
||||
|
||||
/** 明细 */
|
||||
private List<InvoiceOcrItemVo> items = new ArrayList<>();
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.ruoyi.oa.service;
|
||||
|
||||
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
/**
|
||||
* 发票 PDF 识别服务
|
||||
*/
|
||||
public interface IInvoiceOcrService {
|
||||
|
||||
/**
|
||||
* 识别发票 PDF 中的核心字段
|
||||
*
|
||||
* @param file 上传的 PDF 文件
|
||||
* @return 识别结果
|
||||
*/
|
||||
InvoiceOcrResultVo recognizePdf(MultipartFile file);
|
||||
}
|
||||
@@ -0,0 +1,286 @@
|
||||
package com.ruoyi.oa.service.impl;
|
||||
|
||||
import com.ruoyi.common.exception.ServiceException;
|
||||
import com.ruoyi.oa.domain.vo.InvoiceOcrItemVo;
|
||||
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo;
|
||||
import com.ruoyi.oa.service.IInvoiceOcrService;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.math.BigDecimal;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* 发票 PDF 识别服务实现
|
||||
*
|
||||
* 当前实现为方案 A:
|
||||
* 1. 优先解析 PDF 文字层
|
||||
* 2. 使用规则抽取金额、日期、发票代码、发票号码、内容摘要等字段
|
||||
*/
|
||||
@Service
|
||||
public class InvoiceOcrServiceImpl implements IInvoiceOcrService {
|
||||
|
||||
private static final Pattern INVOICE_CODE_PATTERN = Pattern.compile("发票代码[::\\s]*([0-9]{10,12})");
|
||||
private static final Pattern INVOICE_NUMBER_PATTERN = Pattern.compile("发票号码[::\\s]*([0-9]{8})");
|
||||
private static final Pattern DATE_PATTERN = Pattern.compile("(?:开票日期|票据日期)[::\\s]*([0-9]{4}[年\\-/.][0-9]{1,2}[月\\-/.][0-9]{1,2}日?)");
|
||||
private static final Pattern TAX_AMOUNT_PATTERN = Pattern.compile("(?:税额|税\\s*额)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)");
|
||||
private static final Pattern TOTAL_AMOUNT_PATTERN = Pattern.compile("(?:价税合计|小写)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)");
|
||||
private static final Pattern AMOUNT_WITHOUT_TAX_PATTERN = Pattern.compile("(?:金额|合计|不含税金额)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)");
|
||||
private static final Pattern MONEY_LINE_PATTERN = Pattern.compile("([0-9,]+(?:\\.[0-9]{1,2})?)");
|
||||
private static final Pattern BUYER_PATTERN = Pattern.compile("购买方(?:名称)?[::\\s]*([^\\n]+)");
|
||||
private static final Pattern SELLER_PATTERN = Pattern.compile("销售方(?:名称)?[::\\s]*([^\\n]+)");
|
||||
private static final Pattern ITEM_HEADER_PATTERN = Pattern.compile("(货物或应税劳务、服务名称|项目名称|服务名称)");
|
||||
private static final Pattern TAX_RATE_PATTERN = Pattern.compile("([0-9]{1,2}%|免税)");
|
||||
|
||||
@Override
|
||||
public InvoiceOcrResultVo recognizePdf(MultipartFile file) {
|
||||
validateFile(file);
|
||||
String rawText = extractPdfText(file);
|
||||
if (StringUtils.isBlank(rawText)) {
|
||||
throw new ServiceException("PDF 未解析到文本内容,请确认上传的是电子发票 PDF");
|
||||
}
|
||||
return parseInvoice(file.getOriginalFilename(), normalizeText(rawText));
|
||||
}
|
||||
|
||||
private void validateFile(MultipartFile file) {
|
||||
if (file == null || file.isEmpty()) {
|
||||
throw new ServiceException("请上传 PDF 文件");
|
||||
}
|
||||
String fileName = file.getOriginalFilename();
|
||||
if (StringUtils.isBlank(fileName) || !StringUtils.endsWithIgnoreCase(fileName, ".pdf")) {
|
||||
throw new ServiceException("仅支持 PDF 文件识别");
|
||||
}
|
||||
}
|
||||
|
||||
private String extractPdfText(MultipartFile file) {
|
||||
try (InputStream inputStream = file.getInputStream();
|
||||
PDDocument document = PDDocument.load(inputStream)) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
stripper.setSortByPosition(true);
|
||||
return stripper.getText(document);
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException("PDF 解析失败: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private InvoiceOcrResultVo parseInvoice(String fileName, String rawText) {
|
||||
InvoiceOcrResultVo result = new InvoiceOcrResultVo();
|
||||
result.setFileName(fileName);
|
||||
result.setRawText(rawText);
|
||||
result.setInvoiceType(detectInvoiceType(rawText));
|
||||
result.setInvoiceCode(firstGroup(rawText, INVOICE_CODE_PATTERN));
|
||||
result.setInvoiceNumber(firstGroup(rawText, INVOICE_NUMBER_PATTERN));
|
||||
result.setInvoiceDate(firstGroup(rawText, DATE_PATTERN));
|
||||
result.setBuyerName(cleanLineValue(firstGroup(rawText, BUYER_PATTERN)));
|
||||
result.setSellerName(cleanLineValue(firstGroup(rawText, SELLER_PATTERN)));
|
||||
result.setTaxAmount(parseMoney(firstGroup(rawText, TAX_AMOUNT_PATTERN)));
|
||||
result.setTotalAmount(parseMoney(firstGroup(rawText, TOTAL_AMOUNT_PATTERN)));
|
||||
result.setAmountWithoutTax(resolveAmountWithoutTax(rawText, result.getTaxAmount(), result.getTotalAmount()));
|
||||
|
||||
List<InvoiceOcrItemVo> items = extractItems(rawText);
|
||||
result.setItems(items);
|
||||
result.setContentSummary(buildContentSummary(items, rawText));
|
||||
return result;
|
||||
}
|
||||
|
||||
private String detectInvoiceType(String text) {
|
||||
if (text.contains("增值税专用发票")) {
|
||||
return "增值税专用发票";
|
||||
}
|
||||
if (text.contains("增值税普通发票")) {
|
||||
return "增值税普通发票";
|
||||
}
|
||||
if (text.contains("电子发票")) {
|
||||
return "电子发票";
|
||||
}
|
||||
return "未知发票类型";
|
||||
}
|
||||
|
||||
private BigDecimal resolveAmountWithoutTax(String text, BigDecimal taxAmount, BigDecimal totalAmount) {
|
||||
BigDecimal directAmount = parseMoney(firstGroup(text, AMOUNT_WITHOUT_TAX_PATTERN));
|
||||
if (directAmount != null && isReasonableAmount(directAmount, taxAmount, totalAmount)) {
|
||||
return directAmount;
|
||||
}
|
||||
if (totalAmount != null && taxAmount != null) {
|
||||
return totalAmount.subtract(taxAmount);
|
||||
}
|
||||
return directAmount;
|
||||
}
|
||||
|
||||
private boolean isReasonableAmount(BigDecimal directAmount, BigDecimal taxAmount, BigDecimal totalAmount) {
|
||||
if (directAmount == null) {
|
||||
return false;
|
||||
}
|
||||
if (totalAmount == null) {
|
||||
return true;
|
||||
}
|
||||
if (taxAmount == null) {
|
||||
return directAmount.compareTo(totalAmount) <= 0;
|
||||
}
|
||||
return directAmount.add(taxAmount).compareTo(totalAmount) <= 0;
|
||||
}
|
||||
|
||||
private List<InvoiceOcrItemVo> extractItems(String text) {
|
||||
List<InvoiceOcrItemVo> items = new ArrayList<>();
|
||||
String[] lines = text.split("\\n");
|
||||
boolean detailStarted = false;
|
||||
for (String originalLine : lines) {
|
||||
String line = normalizeInlineWhitespace(originalLine);
|
||||
if (StringUtils.isBlank(line)) {
|
||||
continue;
|
||||
}
|
||||
if (!detailStarted && ITEM_HEADER_PATTERN.matcher(line).find()) {
|
||||
detailStarted = true;
|
||||
continue;
|
||||
}
|
||||
if (detailStarted && isDetailEndLine(line)) {
|
||||
break;
|
||||
}
|
||||
if (detailStarted) {
|
||||
InvoiceOcrItemVo item = parseItemLine(line);
|
||||
if (item != null) {
|
||||
items.add(item);
|
||||
}
|
||||
}
|
||||
}
|
||||
return deduplicateItems(items);
|
||||
}
|
||||
|
||||
private boolean isDetailEndLine(String line) {
|
||||
return line.contains("合计") || line.contains("价税合计") || line.contains("销售方") || line.contains("购买方") || line.contains("备注");
|
||||
}
|
||||
|
||||
private InvoiceOcrItemVo parseItemLine(String line) {
|
||||
if (line.length() < 2) {
|
||||
return null;
|
||||
}
|
||||
Matcher moneyMatcher = MONEY_LINE_PATTERN.matcher(line);
|
||||
List<String> numbers = new ArrayList<>();
|
||||
while (moneyMatcher.find()) {
|
||||
numbers.add(moneyMatcher.group(1));
|
||||
}
|
||||
if (numbers.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String itemName = line;
|
||||
for (String number : numbers) {
|
||||
itemName = itemName.replace(number, " ");
|
||||
}
|
||||
Matcher taxRateMatcher = TAX_RATE_PATTERN.matcher(line);
|
||||
String taxRate = taxRateMatcher.find() ? taxRateMatcher.group(1) : null;
|
||||
if (taxRate != null) {
|
||||
itemName = itemName.replace(taxRate, " ");
|
||||
}
|
||||
itemName = itemName.replace("*", " ").trim();
|
||||
itemName = normalizeInlineWhitespace(itemName);
|
||||
if (StringUtils.isBlank(itemName)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
InvoiceOcrItemVo item = new InvoiceOcrItemVo();
|
||||
item.setItemName(itemName);
|
||||
item.setAmount(parseMoney(numbers.get(Math.max(0, numbers.size() - 2))));
|
||||
item.setTaxAmount(parseMoney(numbers.get(numbers.size() - 1)));
|
||||
item.setTaxRate(taxRate);
|
||||
return item;
|
||||
}
|
||||
|
||||
private List<InvoiceOcrItemVo> deduplicateItems(List<InvoiceOcrItemVo> items) {
|
||||
List<InvoiceOcrItemVo> result = new ArrayList<>();
|
||||
Set<String> uniqueKeys = new LinkedHashSet<>();
|
||||
for (InvoiceOcrItemVo item : items) {
|
||||
String key = item.getItemName() + "|" + item.getAmount() + "|" + item.getTaxAmount();
|
||||
if (uniqueKeys.add(key)) {
|
||||
result.add(item);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private String buildContentSummary(List<InvoiceOcrItemVo> items, String text) {
|
||||
if (items != null && !items.isEmpty()) {
|
||||
List<String> names = new ArrayList<>();
|
||||
for (InvoiceOcrItemVo item : items) {
|
||||
if (StringUtils.isNotBlank(item.getItemName())) {
|
||||
names.add(item.getItemName());
|
||||
}
|
||||
if (names.size() >= 3) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!names.isEmpty()) {
|
||||
return String.join("、", names);
|
||||
}
|
||||
}
|
||||
String[] lines = text.split("\\n");
|
||||
for (int i = 0; i < lines.length; i++) {
|
||||
if (ITEM_HEADER_PATTERN.matcher(lines[i]).find()) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int j = i + 1; j < lines.length && j <= i + 3; j++) {
|
||||
String value = normalizeInlineWhitespace(lines[j]);
|
||||
if (StringUtils.isBlank(value) || isDetailEndLine(value)) {
|
||||
break;
|
||||
}
|
||||
if (builder.length() > 0) {
|
||||
builder.append(";");
|
||||
}
|
||||
builder.append(value);
|
||||
}
|
||||
if (builder.length() > 0) {
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String normalizeText(String text) {
|
||||
return text == null ? "" : text.replace("\r\n", "\n").replace('\r', '\n');
|
||||
}
|
||||
|
||||
private String normalizeInlineWhitespace(String value) {
|
||||
return value == null ? null : value.replaceAll("\\s+", " ").trim();
|
||||
}
|
||||
|
||||
private String firstGroup(String text, Pattern pattern) {
|
||||
Matcher matcher = pattern.matcher(text);
|
||||
if (matcher.find()) {
|
||||
return matcher.group(1);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String cleanLineValue(String value) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
value = normalizeInlineWhitespace(value);
|
||||
if (StringUtils.length(value) > 80) {
|
||||
value = StringUtils.substring(value, 0, 80);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
private BigDecimal parseMoney(String raw) {
|
||||
if (StringUtils.isBlank(raw)) {
|
||||
return null;
|
||||
}
|
||||
String cleaned = raw.replace(",", "").replace("¥", "").replace("¥", "").trim();
|
||||
try {
|
||||
return new BigDecimal(cleaned);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user