diff --git a/ruoyi-admin/src/main/resources/application.yml b/ruoyi-admin/src/main/resources/application.yml index 0f869b1..73f1909 100644 --- a/ruoyi-admin/src/main/resources/application.yml +++ b/ruoyi-admin/src/main/resources/application.yml @@ -329,7 +329,7 @@ fad: securityKey: 6f9171724396deb5f8c42ef256b3cbc5 ocr: # 发票OCR服务地址(ai-ocr Python服务) - url: http://127.0.0.1:8000 + url: http://127.0.0.1:8810 # OCR服务 API Key api-key: change-me-debug-key diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java deleted file mode 100644 index 4227862..0000000 --- a/ruoyi-oa/src/main/java/com/ruoyi/oa/controller/InvoiceOcrController.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.ruoyi.oa.controller; - -import com.ruoyi.common.annotation.Log; -import com.ruoyi.common.core.controller.BaseController; -import com.ruoyi.common.core.domain.R; -import com.ruoyi.common.enums.BusinessType; -import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo; -import com.ruoyi.oa.service.IInvoiceOcrService; -import lombok.RequiredArgsConstructor; -import org.springframework.validation.annotation.Validated; -import org.springframework.web.bind.annotation.PostMapping; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.multipart.MultipartFile; - -/** - * 发票 PDF 识别 - */ -@Validated -@RequiredArgsConstructor -@RestController -@RequestMapping("/oa/invoiceOcr") -public class InvoiceOcrController extends BaseController { - - private final IInvoiceOcrService invoiceOcrService; - - /** - * 上传电子发票 PDF 并识别金额、内容等核心字段 - */ - @Log(title = "发票识别", businessType = BusinessType.OTHER) - @PostMapping("/recognize") - public R recognize(@RequestParam("file") MultipartFile file) { - return R.ok(invoiceOcrService.recognizePdf(file)); - } -} diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java deleted file mode 100644 index b6f6eef..0000000 --- a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/IInvoiceOcrService.java +++ /dev/null @@ -1,18 +0,0 @@ -package com.ruoyi.oa.service; - -import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo; -import org.springframework.web.multipart.MultipartFile; - -/** - * 发票 PDF 识别服务 - */ -public interface IInvoiceOcrService { - - /** - * 识别发票 PDF 中的核心字段 - * - * @param file 上传的 PDF 文件 - * @return 识别结果 - */ - InvoiceOcrResultVo recognizePdf(MultipartFile file); -} diff --git a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java b/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java deleted file mode 100644 index b64f3e9..0000000 --- a/ruoyi-oa/src/main/java/com/ruoyi/oa/service/impl/InvoiceOcrServiceImpl.java +++ /dev/null @@ -1,286 +0,0 @@ -package com.ruoyi.oa.service.impl; - -import com.ruoyi.common.exception.ServiceException; -import com.ruoyi.oa.domain.vo.InvoiceOcrItemVo; -import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo; -import com.ruoyi.oa.service.IInvoiceOcrService; -import org.apache.commons.lang3.StringUtils; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; -import org.springframework.stereotype.Service; -import org.springframework.web.multipart.MultipartFile; - -import java.io.IOException; -import java.io.InputStream; -import java.math.BigDecimal; -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * 发票 PDF 识别服务实现 - * - * 当前实现为方案 A: - * 1. 优先解析 PDF 文字层 - * 2. 使用规则抽取金额、日期、发票代码、发票号码、内容摘要等字段 - */ -@Service -public class InvoiceOcrServiceImpl implements IInvoiceOcrService { - - private static final Pattern INVOICE_CODE_PATTERN = Pattern.compile("发票代码[::\\s]*([0-9]{10,12})"); - private static final Pattern INVOICE_NUMBER_PATTERN = Pattern.compile("发票号码[::\\s]*([0-9]{8})"); - private static final Pattern DATE_PATTERN = Pattern.compile("(?:开票日期|票据日期)[::\\s]*([0-9]{4}[年\\-/.][0-9]{1,2}[月\\-/.][0-9]{1,2}日?)"); - private static final Pattern TAX_AMOUNT_PATTERN = Pattern.compile("(?:税额|税\\s*额)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)"); - private static final Pattern TOTAL_AMOUNT_PATTERN = Pattern.compile("(?:价税合计|小写)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)"); - private static final Pattern AMOUNT_WITHOUT_TAX_PATTERN = Pattern.compile("(?:金额|合计|不含税金额)[::\\s¥¥]*([0-9,]+(?:\\.[0-9]{1,2})?)"); - private static final Pattern MONEY_LINE_PATTERN = Pattern.compile("([0-9,]+(?:\\.[0-9]{1,2})?)"); - private static final Pattern BUYER_PATTERN = Pattern.compile("购买方(?:名称)?[::\\s]*([^\\n]+)"); - private static final Pattern SELLER_PATTERN = Pattern.compile("销售方(?:名称)?[::\\s]*([^\\n]+)"); - private static final Pattern ITEM_HEADER_PATTERN = Pattern.compile("(货物或应税劳务、服务名称|项目名称|服务名称)"); - private static final Pattern TAX_RATE_PATTERN = Pattern.compile("([0-9]{1,2}%|免税)"); - - @Override - public InvoiceOcrResultVo recognizePdf(MultipartFile file) { - validateFile(file); - String rawText = extractPdfText(file); - if (StringUtils.isBlank(rawText)) { - throw new ServiceException("PDF 未解析到文本内容,请确认上传的是电子发票 PDF"); - } - return parseInvoice(file.getOriginalFilename(), normalizeText(rawText)); - } - - private void validateFile(MultipartFile file) { - if (file == null || file.isEmpty()) { - throw new ServiceException("请上传 PDF 文件"); - } - String fileName = file.getOriginalFilename(); - if (StringUtils.isBlank(fileName) || !StringUtils.endsWithIgnoreCase(fileName, ".pdf")) { - throw new ServiceException("仅支持 PDF 文件识别"); - } - } - - private String extractPdfText(MultipartFile file) { - try (InputStream inputStream = file.getInputStream(); - PDDocument document = PDDocument.load(inputStream)) { - PDFTextStripper stripper = new PDFTextStripper(); - stripper.setSortByPosition(true); - return stripper.getText(document); - } catch (IOException e) { - throw new ServiceException("PDF 解析失败: " + e.getMessage()); - } - } - - private InvoiceOcrResultVo parseInvoice(String fileName, String rawText) { - InvoiceOcrResultVo result = new InvoiceOcrResultVo(); - result.setFileName(fileName); - result.setRawText(rawText); - result.setInvoiceType(detectInvoiceType(rawText)); - result.setInvoiceCode(firstGroup(rawText, INVOICE_CODE_PATTERN)); - result.setInvoiceNumber(firstGroup(rawText, INVOICE_NUMBER_PATTERN)); - result.setInvoiceDate(firstGroup(rawText, DATE_PATTERN)); - result.setBuyerName(cleanLineValue(firstGroup(rawText, BUYER_PATTERN))); - result.setSellerName(cleanLineValue(firstGroup(rawText, SELLER_PATTERN))); - result.setTaxAmount(parseMoney(firstGroup(rawText, TAX_AMOUNT_PATTERN))); - result.setTotalAmount(parseMoney(firstGroup(rawText, TOTAL_AMOUNT_PATTERN))); - result.setAmountWithoutTax(resolveAmountWithoutTax(rawText, result.getTaxAmount(), result.getTotalAmount())); - - List items = extractItems(rawText); - result.setItems(items); - result.setContentSummary(buildContentSummary(items, rawText)); - return result; - } - - private String detectInvoiceType(String text) { - if (text.contains("增值税专用发票")) { - return "增值税专用发票"; - } - if (text.contains("增值税普通发票")) { - return "增值税普通发票"; - } - if (text.contains("电子发票")) { - return "电子发票"; - } - return "未知发票类型"; - } - - private BigDecimal resolveAmountWithoutTax(String text, BigDecimal taxAmount, BigDecimal totalAmount) { - BigDecimal directAmount = parseMoney(firstGroup(text, AMOUNT_WITHOUT_TAX_PATTERN)); - if (directAmount != null && isReasonableAmount(directAmount, taxAmount, totalAmount)) { - return directAmount; - } - if (totalAmount != null && taxAmount != null) { - return totalAmount.subtract(taxAmount); - } - return directAmount; - } - - private boolean isReasonableAmount(BigDecimal directAmount, BigDecimal taxAmount, BigDecimal totalAmount) { - if (directAmount == null) { - return false; - } - if (totalAmount == null) { - return true; - } - if (taxAmount == null) { - return directAmount.compareTo(totalAmount) <= 0; - } - return directAmount.add(taxAmount).compareTo(totalAmount) <= 0; - } - - private List extractItems(String text) { - List items = new ArrayList<>(); - String[] lines = text.split("\\n"); - boolean detailStarted = false; - for (String originalLine : lines) { - String line = normalizeInlineWhitespace(originalLine); - if (StringUtils.isBlank(line)) { - continue; - } - if (!detailStarted && ITEM_HEADER_PATTERN.matcher(line).find()) { - detailStarted = true; - continue; - } - if (detailStarted && isDetailEndLine(line)) { - break; - } - if (detailStarted) { - InvoiceOcrItemVo item = parseItemLine(line); - if (item != null) { - items.add(item); - } - } - } - return deduplicateItems(items); - } - - private boolean isDetailEndLine(String line) { - return line.contains("合计") || line.contains("价税合计") || line.contains("销售方") || line.contains("购买方") || line.contains("备注"); - } - - private InvoiceOcrItemVo parseItemLine(String line) { - if (line.length() < 2) { - return null; - } - Matcher moneyMatcher = MONEY_LINE_PATTERN.matcher(line); - List numbers = new ArrayList<>(); - while (moneyMatcher.find()) { - numbers.add(moneyMatcher.group(1)); - } - if (numbers.isEmpty()) { - return null; - } - - String itemName = line; - for (String number : numbers) { - itemName = itemName.replace(number, " "); - } - Matcher taxRateMatcher = TAX_RATE_PATTERN.matcher(line); - String taxRate = taxRateMatcher.find() ? taxRateMatcher.group(1) : null; - if (taxRate != null) { - itemName = itemName.replace(taxRate, " "); - } - itemName = itemName.replace("*", " ").trim(); - itemName = normalizeInlineWhitespace(itemName); - if (StringUtils.isBlank(itemName)) { - return null; - } - - InvoiceOcrItemVo item = new InvoiceOcrItemVo(); - item.setItemName(itemName); - item.setAmount(parseMoney(numbers.get(Math.max(0, numbers.size() - 2)))); - item.setTaxAmount(parseMoney(numbers.get(numbers.size() - 1))); - item.setTaxRate(taxRate); - return item; - } - - private List deduplicateItems(List items) { - List result = new ArrayList<>(); - Set uniqueKeys = new LinkedHashSet<>(); - for (InvoiceOcrItemVo item : items) { - String key = item.getItemName() + "|" + item.getAmount() + "|" + item.getTaxAmount(); - if (uniqueKeys.add(key)) { - result.add(item); - } - } - return result; - } - - private String buildContentSummary(List items, String text) { - if (items != null && !items.isEmpty()) { - List names = new ArrayList<>(); - for (InvoiceOcrItemVo item : items) { - if (StringUtils.isNotBlank(item.getItemName())) { - names.add(item.getItemName()); - } - if (names.size() >= 3) { - break; - } - } - if (!names.isEmpty()) { - return String.join("、", names); - } - } - String[] lines = text.split("\\n"); - for (int i = 0; i < lines.length; i++) { - if (ITEM_HEADER_PATTERN.matcher(lines[i]).find()) { - StringBuilder builder = new StringBuilder(); - for (int j = i + 1; j < lines.length && j <= i + 3; j++) { - String value = normalizeInlineWhitespace(lines[j]); - if (StringUtils.isBlank(value) || isDetailEndLine(value)) { - break; - } - if (builder.length() > 0) { - builder.append(";"); - } - builder.append(value); - } - if (builder.length() > 0) { - return builder.toString(); - } - } - } - return null; - } - - private String normalizeText(String text) { - return text == null ? "" : text.replace("\r\n", "\n").replace('\r', '\n'); - } - - private String normalizeInlineWhitespace(String value) { - return value == null ? null : value.replaceAll("\\s+", " ").trim(); - } - - private String firstGroup(String text, Pattern pattern) { - Matcher matcher = pattern.matcher(text); - if (matcher.find()) { - return matcher.group(1); - } - return null; - } - - private String cleanLineValue(String value) { - if (value == null) { - return null; - } - value = normalizeInlineWhitespace(value); - if (StringUtils.length(value) > 80) { - value = StringUtils.substring(value, 0, 80); - } - return value; - } - - private BigDecimal parseMoney(String raw) { - if (StringUtils.isBlank(raw)) { - return null; - } - String cleaned = raw.replace(",", "").replace("¥", "").replace("¥", "").trim(); - try { - return new BigDecimal(cleaned); - } catch (Exception e) { - return null; - } - } -}