@@ -1,286 +0,0 @@
package com.ruoyi.oa.service.impl ;
import com.ruoyi.common.exception.ServiceException ;
import com.ruoyi.oa.domain.vo.InvoiceOcrItemVo ;
import com.ruoyi.oa.domain.vo.InvoiceOcrResultVo ;
import com.ruoyi.oa.service.IInvoiceOcrService ;
import org.apache.commons.lang3.StringUtils ;
import org.apache.pdfbox.pdmodel.PDDocument ;
import org.apache.pdfbox.text.PDFTextStripper ;
import org.springframework.stereotype.Service ;
import org.springframework.web.multipart.MultipartFile ;
import java.io.IOException ;
import java.io.InputStream ;
import java.math.BigDecimal ;
import java.util.ArrayList ;
import java.util.LinkedHashSet ;
import java.util.List ;
import java.util.Set ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
/**
* 发票 PDF 识别服务实现
*
* 当前实现为方案 A:
* 1. 优先解析 PDF 文字层
* 2. 使用规则抽取金额、日期、发票代码、发票号码、内容摘要等字段
*/
@Service
public class InvoiceOcrServiceImpl implements IInvoiceOcrService {
private static final Pattern INVOICE_CODE_PATTERN = Pattern . compile ( " 发票代码[:: \\ s]*([0-9]{10,12}) " ) ;
private static final Pattern INVOICE_NUMBER_PATTERN = Pattern . compile ( " 发票号码[:: \\ s]*([0-9]{8}) " ) ;
private static final Pattern DATE_PATTERN = Pattern . compile ( " (?:开票日期|票据日期)[:: \\ s]*([0-9]{4}[年 \\ -/.][0-9]{1,2}[月 \\ -/.][0-9]{1,2}日?) " ) ;
private static final Pattern TAX_AMOUNT_PATTERN = Pattern . compile ( " (?:税额|税 \\ s*额)[:: \\ s¥¥]*([0-9,]+(?: \\ .[0-9]{1,2})?) " ) ;
private static final Pattern TOTAL_AMOUNT_PATTERN = Pattern . compile ( " (?:价税合计|小写)[:: \\ s¥¥]*([0-9,]+(?: \\ .[0-9]{1,2})?) " ) ;
private static final Pattern AMOUNT_WITHOUT_TAX_PATTERN = Pattern . compile ( " (?:金额|合计|不含税金额)[:: \\ s¥¥]*([0-9,]+(?: \\ .[0-9]{1,2})?) " ) ;
private static final Pattern MONEY_LINE_PATTERN = Pattern . compile ( " ([0-9,]+(?: \\ .[0-9]{1,2})?) " ) ;
private static final Pattern BUYER_PATTERN = Pattern . compile ( " 购买方(?:名称)?[:: \\ s]*([^ \\ n]+) " ) ;
private static final Pattern SELLER_PATTERN = Pattern . compile ( " 销售方(?:名称)?[:: \\ s]*([^ \\ n]+) " ) ;
private static final Pattern ITEM_HEADER_PATTERN = Pattern . compile ( " (货物或应税劳务、服务名称|项目名称|服务名称) " ) ;
private static final Pattern TAX_RATE_PATTERN = Pattern . compile ( " ([0-9]{1,2}%|免税) " ) ;
@Override
public InvoiceOcrResultVo recognizePdf ( MultipartFile file ) {
validateFile ( file ) ;
String rawText = extractPdfText ( file ) ;
if ( StringUtils . isBlank ( rawText ) ) {
throw new ServiceException ( " PDF 未解析到文本内容,请确认上传的是电子发票 PDF " ) ;
}
return parseInvoice ( file . getOriginalFilename ( ) , normalizeText ( rawText ) ) ;
}
private void validateFile ( MultipartFile file ) {
if ( file = = null | | file . isEmpty ( ) ) {
throw new ServiceException ( " 请上传 PDF 文件 " ) ;
}
String fileName = file . getOriginalFilename ( ) ;
if ( StringUtils . isBlank ( fileName ) | | ! StringUtils . endsWithIgnoreCase ( fileName , " .pdf " ) ) {
throw new ServiceException ( " 仅支持 PDF 文件识别 " ) ;
}
}
private String extractPdfText ( MultipartFile file ) {
try ( InputStream inputStream = file . getInputStream ( ) ;
PDDocument document = PDDocument . load ( inputStream ) ) {
PDFTextStripper stripper = new PDFTextStripper ( ) ;
stripper . setSortByPosition ( true ) ;
return stripper . getText ( document ) ;
} catch ( IOException e ) {
throw new ServiceException ( " PDF 解析失败: " + e . getMessage ( ) ) ;
}
}
private InvoiceOcrResultVo parseInvoice ( String fileName , String rawText ) {
InvoiceOcrResultVo result = new InvoiceOcrResultVo ( ) ;
result . setFileName ( fileName ) ;
result . setRawText ( rawText ) ;
result . setInvoiceType ( detectInvoiceType ( rawText ) ) ;
result . setInvoiceCode ( firstGroup ( rawText , INVOICE_CODE_PATTERN ) ) ;
result . setInvoiceNumber ( firstGroup ( rawText , INVOICE_NUMBER_PATTERN ) ) ;
result . setInvoiceDate ( firstGroup ( rawText , DATE_PATTERN ) ) ;
result . setBuyerName ( cleanLineValue ( firstGroup ( rawText , BUYER_PATTERN ) ) ) ;
result . setSellerName ( cleanLineValue ( firstGroup ( rawText , SELLER_PATTERN ) ) ) ;
result . setTaxAmount ( parseMoney ( firstGroup ( rawText , TAX_AMOUNT_PATTERN ) ) ) ;
result . setTotalAmount ( parseMoney ( firstGroup ( rawText , TOTAL_AMOUNT_PATTERN ) ) ) ;
result . setAmountWithoutTax ( resolveAmountWithoutTax ( rawText , result . getTaxAmount ( ) , result . getTotalAmount ( ) ) ) ;
List < InvoiceOcrItemVo > items = extractItems ( rawText ) ;
result . setItems ( items ) ;
result . setContentSummary ( buildContentSummary ( items , rawText ) ) ;
return result ;
}
private String detectInvoiceType ( String text ) {
if ( text . contains ( " 增值税专用发票 " ) ) {
return " 增值税专用发票 " ;
}
if ( text . contains ( " 增值税普通发票 " ) ) {
return " 增值税普通发票 " ;
}
if ( text . contains ( " 电子发票 " ) ) {
return " 电子发票 " ;
}
return " 未知发票类型 " ;
}
private BigDecimal resolveAmountWithoutTax ( String text , BigDecimal taxAmount , BigDecimal totalAmount ) {
BigDecimal directAmount = parseMoney ( firstGroup ( text , AMOUNT_WITHOUT_TAX_PATTERN ) ) ;
if ( directAmount ! = null & & isReasonableAmount ( directAmount , taxAmount , totalAmount ) ) {
return directAmount ;
}
if ( totalAmount ! = null & & taxAmount ! = null ) {
return totalAmount . subtract ( taxAmount ) ;
}
return directAmount ;
}
private boolean isReasonableAmount ( BigDecimal directAmount , BigDecimal taxAmount , BigDecimal totalAmount ) {
if ( directAmount = = null ) {
return false ;
}
if ( totalAmount = = null ) {
return true ;
}
if ( taxAmount = = null ) {
return directAmount . compareTo ( totalAmount ) < = 0 ;
}
return directAmount . add ( taxAmount ) . compareTo ( totalAmount ) < = 0 ;
}
private List < InvoiceOcrItemVo > extractItems ( String text ) {
List < InvoiceOcrItemVo > items = new ArrayList < > ( ) ;
String [ ] lines = text . split ( " \\ n " ) ;
boolean detailStarted = false ;
for ( String originalLine : lines ) {
String line = normalizeInlineWhitespace ( originalLine ) ;
if ( StringUtils . isBlank ( line ) ) {
continue ;
}
if ( ! detailStarted & & ITEM_HEADER_PATTERN . matcher ( line ) . find ( ) ) {
detailStarted = true ;
continue ;
}
if ( detailStarted & & isDetailEndLine ( line ) ) {
break ;
}
if ( detailStarted ) {
InvoiceOcrItemVo item = parseItemLine ( line ) ;
if ( item ! = null ) {
items . add ( item ) ;
}
}
}
return deduplicateItems ( items ) ;
}
private boolean isDetailEndLine ( String line ) {
return line . contains ( " 合计 " ) | | line . contains ( " 价税合计 " ) | | line . contains ( " 销售方 " ) | | line . contains ( " 购买方 " ) | | line . contains ( " 备注 " ) ;
}
private InvoiceOcrItemVo parseItemLine ( String line ) {
if ( line . length ( ) < 2 ) {
return null ;
}
Matcher moneyMatcher = MONEY_LINE_PATTERN . matcher ( line ) ;
List < String > numbers = new ArrayList < > ( ) ;
while ( moneyMatcher . find ( ) ) {
numbers . add ( moneyMatcher . group ( 1 ) ) ;
}
if ( numbers . isEmpty ( ) ) {
return null ;
}
String itemName = line ;
for ( String number : numbers ) {
itemName = itemName . replace ( number , " " ) ;
}
Matcher taxRateMatcher = TAX_RATE_PATTERN . matcher ( line ) ;
String taxRate = taxRateMatcher . find ( ) ? taxRateMatcher . group ( 1 ) : null ;
if ( taxRate ! = null ) {
itemName = itemName . replace ( taxRate , " " ) ;
}
itemName = itemName . replace ( " * " , " " ) . trim ( ) ;
itemName = normalizeInlineWhitespace ( itemName ) ;
if ( StringUtils . isBlank ( itemName ) ) {
return null ;
}
InvoiceOcrItemVo item = new InvoiceOcrItemVo ( ) ;
item . setItemName ( itemName ) ;
item . setAmount ( parseMoney ( numbers . get ( Math . max ( 0 , numbers . size ( ) - 2 ) ) ) ) ;
item . setTaxAmount ( parseMoney ( numbers . get ( numbers . size ( ) - 1 ) ) ) ;
item . setTaxRate ( taxRate ) ;
return item ;
}
private List < InvoiceOcrItemVo > deduplicateItems ( List < InvoiceOcrItemVo > items ) {
List < InvoiceOcrItemVo > result = new ArrayList < > ( ) ;
Set < String > uniqueKeys = new LinkedHashSet < > ( ) ;
for ( InvoiceOcrItemVo item : items ) {
String key = item . getItemName ( ) + " | " + item . getAmount ( ) + " | " + item . getTaxAmount ( ) ;
if ( uniqueKeys . add ( key ) ) {
result . add ( item ) ;
}
}
return result ;
}
private String buildContentSummary ( List < InvoiceOcrItemVo > items , String text ) {
if ( items ! = null & & ! items . isEmpty ( ) ) {
List < String > names = new ArrayList < > ( ) ;
for ( InvoiceOcrItemVo item : items ) {
if ( StringUtils . isNotBlank ( item . getItemName ( ) ) ) {
names . add ( item . getItemName ( ) ) ;
}
if ( names . size ( ) > = 3 ) {
break ;
}
}
if ( ! names . isEmpty ( ) ) {
return String . join ( " 、 " , names ) ;
}
}
String [ ] lines = text . split ( " \\ n " ) ;
for ( int i = 0 ; i < lines . length ; i + + ) {
if ( ITEM_HEADER_PATTERN . matcher ( lines [ i ] ) . find ( ) ) {
StringBuilder builder = new StringBuilder ( ) ;
for ( int j = i + 1 ; j < lines . length & & j < = i + 3 ; j + + ) {
String value = normalizeInlineWhitespace ( lines [ j ] ) ;
if ( StringUtils . isBlank ( value ) | | isDetailEndLine ( value ) ) {
break ;
}
if ( builder . length ( ) > 0 ) {
builder . append ( " ; " ) ;
}
builder . append ( value ) ;
}
if ( builder . length ( ) > 0 ) {
return builder . toString ( ) ;
}
}
}
return null ;
}
private String normalizeText ( String text ) {
return text = = null ? " " : text . replace ( " \ r \ n " , " \ n " ) . replace ( '\r' , '\n' ) ;
}
private String normalizeInlineWhitespace ( String value ) {
return value = = null ? null : value . replaceAll ( " \\ s+ " , " " ) . trim ( ) ;
}
private String firstGroup ( String text , Pattern pattern ) {
Matcher matcher = pattern . matcher ( text ) ;
if ( matcher . find ( ) ) {
return matcher . group ( 1 ) ;
}
return null ;
}
private String cleanLineValue ( String value ) {
if ( value = = null ) {
return null ;
}
value = normalizeInlineWhitespace ( value ) ;
if ( StringUtils . length ( value ) > 80 ) {
value = StringUtils . substring ( value , 0 , 80 ) ;
}
return value ;
}
private BigDecimal parseMoney ( String raw ) {
if ( StringUtils . isBlank ( raw ) ) {
return null ;
}
String cleaned = raw . replace ( " , " , " " ) . replace ( " ¥ " , " " ) . replace ( " ¥ " , " " ) . trim ( ) ;
try {
return new BigDecimal ( cleaned ) ;
} catch ( Exception e ) {
return null ;
}
}
}