diff --git a/klp-wms/pom.xml b/klp-wms/pom.xml
index af8a4730..00773272 100644
--- a/klp-wms/pom.xml
+++ b/klp-wms/pom.xml
@@ -28,7 +28,12 @@
<artifactId>tess4j</artifactId>
<version>5.11.0</version>
-
+
+
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ <version>2.0.29</version>
+
diff --git a/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java b/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java
index e51706d7..e86cdbb5 100644
--- a/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java
+++ b/klp-wms/src/main/java/com/klp/controller/WmsPurchasePlanController.java
@@ -68,6 +68,18 @@ public class WmsPurchasePlanController extends BaseController {
return R.ok(new RecognizeTextVo(text));
}
+ /**
+ * Recognizes the text in a PDF file.
+ * @param request request body; the PDF file address is read from its "pdfUrl" entry
+ * @return a success response wrapping the recognized text
+ */
+ @PostMapping("/recognizePdfText")
+ public R recognizePdf(@RequestBody Map request) {
+ String pdfUrl = request.get("pdfUrl");
+ String text = iTesseractOcrService.recognizePdfText(pdfUrl);
+ return R.ok(new RecognizeTextVo(text));
+ }
+
/**
* 查询采购计划主列表
*/
diff --git a/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java b/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java
index 9b3ad1c8..6cb6f18d 100644
--- a/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java
+++ b/klp-wms/src/main/java/com/klp/service/ITesseractOcrService.java
@@ -1,11 +1,24 @@
package com.klp.service;
+/**
+ * Service interface for OCR text recognition.
+ *
+ * @author Joshi
+ * @date 2025-07-18
+ */
public interface ITesseractOcrService {
/**
- * 识别网络图片
- * @param imageUrl 图片URL
- * @return 识别结果
+ * Recognizes the text in an image.
+ * @param imgUrl URL of the image
+ * @return the recognized text
*/
- String recognizeText(String imageUrl);
+ String recognizeText(String imgUrl);
+
+ /**
+ * Recognizes the text in a PDF file.
+ * @param pdfUrl URL of the PDF file
+ * @return the recognized text
+ */
+ String recognizePdfText(String pdfUrl);
}
diff --git a/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java b/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java
index aef63eec..66e23311 100644
--- a/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java
+++ b/klp-wms/src/main/java/com/klp/service/impl/TesseractOcrServiceImpl.java
@@ -5,6 +5,8 @@ import com.klp.service.ITesseractOcrService;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
@@ -20,6 +22,7 @@ public class TesseractOcrServiceImpl implements ITesseractOcrService {
@Resource
private TesseractConfig tesseractConfig;
+
@Override
public String recognizeText(String imageUrl){
// 读取网络图片为 BufferedImage
@@ -61,6 +64,62 @@ public class TesseractOcrServiceImpl implements ITesseractOcrService {
return cleanedResult;
}
+    /**
+     * Downloads the PDF at {@code pdfUrl} and returns the text stored in it.
+     *
+     * <p>The text comes from PDFBox's {@code PDFTextStripper}; Tesseract is not involved, so
+     * only text already embedded in the PDF is returned (scanned pages are not OCR'd).
+     *
+     * <p>The URL and the extracted text are also printed to standard output.
+     *
+     * @param pdfUrl URL of the PDF file to read
+     * @return the text extracted from all pages of the document
+     * @throws RuntimeException if the PDF cannot be downloaded or its text cannot be extracted
+     */
+    @Override
+    public String recognizePdfText(String pdfUrl) {
+        System.out.println("开始处理PDF文件: " + pdfUrl);
+
+        // NOTE(review): pdfUrl is caller-supplied (the controller passes it straight from the
+        // request body) and URL.openStream() also accepts file:/ftp: URLs and internal hosts;
+        // consider restricting it to an http/https allow-list.
+        // try-with-resources closes the document and the stream even when loading or text
+        // extraction throws; previously both leaked on any failure.
+        try (InputStream inputStream = new URL(pdfUrl).openStream();
+             PDDocument document = PDDocument.load(inputStream)) {
+            // Extract the text of every page.
+            String text = new PDFTextStripper().getText(document);
+
+            System.out.println("\n=== PDF文字提取结果 ===");
+            System.out.println(text);
+
+            return text;
+        } catch (IOException e) {
+            throw new RuntimeException("处理PDF文件失败: " + e.getMessage(), e);
+        } catch (Exception e) {
+            throw new RuntimeException("PDF文字提取失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Cleans text extracted from a PDF: collapses whitespace runs to one space and turns each
+     * page separator (form feed) into a newline.
+     * @param text raw text extracted from the PDF; may be {@code null}
+     * @return the cleaned text, or an empty string when {@code text} is null or blank
+     */
+    private static String cleanPdfText(String text) {
+        if (text == null || text.trim().isEmpty()) {
+            return "";
+        }
+        return text
+                // Collapse whitespace except form feeds: "\\s" also matches "\f", so collapsing
+                // everything first would erase the page separators before they can be mapped.
+                .replaceAll("[^\\S\\f]+", " ")
+                // Map each page separator (and any padding around it) to a single newline.
+                .replaceAll("\\s*\\f\\s*", "\n")
+                .trim();
+    }
+
/**
* 灰度化图像(直接处理 BufferedImage)
* @param image 原始图片