From 831695e2363e555c058b507d3a19cb9639e3d52c Mon Sep 17 00:00:00 2001 From: JR <3573153686@qq.com> Date: Mon, 4 Aug 2025 10:18:17 +0800 Subject: [PATCH] =?UTF-8?q?(ocrfeat):=20=E5=A2=9E=E5=8A=A0=20PDF=20?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E6=96=87=E5=AD=97=E8=AF=86=E5=88=AB=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 实现了 ITesseractOcrService接口中的 recognizePdfText 方法- 添加了 PDFBox 依赖用于处理 PDF 文件 - 在 TesseractOcrServiceImpl 中实现了 PDF 文件的文字提取和清理 - 在 WmsPurchasePlanController 中添加了识别 PDF 文件文字的 API 接口 --- klp-wms/pom.xml | 7 ++- .../controller/WmsPurchasePlanController.java | 12 ++++ .../com/klp/service/ITesseractOcrService.java | 21 +++++-- .../service/impl/TesseractOcrServiceImpl.java | 59 +++++++++++++++++++ 4 files changed, 94 insertions(+), 5 deletions(-) diff --git a/klp-wms/pom.xml b/klp-wms/pom.xml index af8a4730..00773272 100644 --- a/klp-wms/pom.xml +++ b/klp-wms/pom.xml @@ -28,7 +28,12 @@ tess4j 5.11.0 - + + + org.apache.pdfbox + pdfbox + 2.0.29 + diff --git a/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java b/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java index e51706d7..e86cdbb5 100644 --- a/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java +++ b/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java @@ -68,6 +68,18 @@ public class WmsPurchasePlanController extends BaseController { return R.ok(new RecognizeTextVo(text)); } + /** + * Recognizes the text in a PDF file. + * @param request request body; its "pdfUrl" entry is passed to the OCR service as the PDF address + * @return the recognized text wrapped in a RecognizeTextVo + */ + @PostMapping("/recognizePdfText") + public R recognizePdf(@RequestBody Map request) { + String pdfUrl = request.get("pdfUrl"); + String text = iTesseractOcrService.recognizePdfText(pdfUrl); + return R.ok(new RecognizeTextVo(text)); + } + /** * 查询采购计划主列表 */ diff --git a/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java b/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java index 
9b3ad1c8..6cb6f18d 100644
--- a/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java
+++ b/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java
@@ -1,11 +1,24 @@
 package com.klp.service;
 
+/**
+ * Service interface for OCR text recognition.
+ *
+ * @author Joshi
+ * @date 2025-07-18
+ */
 public interface ITesseractOcrService {
 
     /**
-     * 识别网络图片
-     * @param imageUrl 图片URL
-     * @return 识别结果
+     * Recognizes the text in an image.
+     * @param imgUrl URL of the image
+     * @return the recognized text
      */
-    String recognizeText(String imageUrl);
+    String recognizeText(String imgUrl);
+
+    /**
+     * Recognizes the text in a PDF file.
+     * @param pdfUrl URL of the PDF file
+     * @return the recognized text
+     */
+    String recognizePdfText(String pdfUrl);
 }
diff --git a/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java b/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java
index aef63eec..66e23311 100644
--- a/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java
+++ b/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java
@@ -5,6 +5,8 @@ import com.klp.service.ITesseractOcrService;
 import net.sourceforge.tess4j.ITesseract;
 import net.sourceforge.tess4j.Tesseract;
 import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
 import org.springframework.stereotype.Service;
 
 import javax.annotation.Resource;
@@ -20,6 +22,7 @@ public class TesseractOcrServiceImpl implements ITesseractOcrService {
 
     @Resource
     private TesseractConfig tesseractConfig;
+
     @Override
     public String recognizeText(String imageUrl){
         // 读取网络图片为 BufferedImage
@@ -61,6 +64,62 @@ public class TesseractOcrServiceImpl implements ITesseractOcrService {
         return cleanedResult;
     }
 
+    /**
+     * Downloads the PDF at {@code pdfUrl} and returns the text PDFBox extracts from it.
+     * Tesseract is not involved: only text found by PDFTextStripper is returned.
+     *
+     * @param pdfUrl URL of the PDF file
+     * @return the text of all pages, exactly as PDFTextStripper produced it
+     * @throws IllegalArgumentException if pdfUrl is null or blank
+     * @throws RuntimeException if downloading or parsing the PDF fails
+     */
+    @Override
+    public String recognizePdfText(String pdfUrl) {
+        if (pdfUrl == null || pdfUrl.trim().isEmpty()) {
+            throw new IllegalArgumentException("PDF文件地址不能为空");
+        }
+        System.out.println("开始处理PDF文件: " + pdfUrl);
+
+        // NOTE(review): pdfUrl reaches this method from the request body, so opening it
+        // server-side is an SSRF risk; restrict scheme/host before exposing this widely.
+        // try-with-resources closes the document and the stream even when loading or
+        // extraction throws (closing them only on the success path leaked both).
+        try (InputStream inputStream = new URL(pdfUrl).openStream();
+             PDDocument document = PDDocument.load(inputStream)) {
+            String text = new PDFTextStripper().getText(document);
+
+            // NOTE(review): dumps the whole document to stdout; consider a logger.
+            System.out.println("\n=== PDF文字提取结果 ===");
+            System.out.println(text);
+            // The raw text is returned; cleanPdfText is not applied here.
+            return text;
+        } catch (IOException e) {
+            throw new RuntimeException("处理PDF文件失败: " + e.getMessage(), e);
+        } catch (Exception e) {
+            throw new RuntimeException("PDF文字提取失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Normalizes text extracted from a PDF: whitespace runs become one space,
+     * page breaks (form feeds) become one newline, and the result is trimmed.
+     *
+     * @param text raw extracted text, may be null
+     * @return the cleaned text, or "" when text is null or blank
+     */
+    private static String cleanPdfText(String text) {
+        if (text == null || text.trim().isEmpty()) {
+            return "";
+        }
+
+        return text
+            // collapse whitespace runs but keep form feeds; \s would swallow them
+            .replaceAll("[^\\S\\f]+", " ")
+            // a page break plus the spaces around it becomes a single newline
+            .replaceAll(" ?(?:\\f ?)+", "\n")
+            .trim();
+    }
+
     /**
      * 灰度化图像(直接处理 BufferedImage)
      * @param image 原始图片