- 实现了 ITesseractOcrService 接口中的 recognizePdfText 方法
- 添加了 PDFBox 依赖用于处理 PDF 文件
- 在 TesseractOcrServiceImpl 中实现了 PDF 文件的文字提取和清理
- 在 WmsPurchasePlanController 中添加了识别 PDF 文件文字的 API 接口
184 lines
6.1 KiB
Java
184 lines
6.1 KiB
Java
package com.klp.service.impl;
|
||
|
||
import com.klp.common.config.TesseractConfig;
|
||
import com.klp.service.ITesseractOcrService;
|
||
import net.sourceforge.tess4j.ITesseract;
|
||
import net.sourceforge.tess4j.Tesseract;
|
||
import net.sourceforge.tess4j.TesseractException;
|
||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||
import org.apache.pdfbox.text.PDFTextStripper;
|
||
import org.springframework.stereotype.Service;
|
||
|
||
import javax.annotation.Resource;
|
||
import javax.imageio.ImageIO;
|
||
import java.awt.*;
|
||
import java.awt.image.BufferedImage;
|
||
import java.io.IOException;
|
||
import java.io.InputStream;
|
||
import java.net.URL;
|
||
|
||
@Service
|
||
public class TesseractOcrServiceImpl implements ITesseractOcrService {
|
||
|
||
@Resource
|
||
private TesseractConfig tesseractConfig;
|
||
|
||
@Override
|
||
public String recognizeText(String imageUrl){
|
||
// 读取网络图片为 BufferedImage
|
||
BufferedImage image = null;
|
||
try {
|
||
URL url = new URL(imageUrl);
|
||
InputStream inputStream = url.openStream();
|
||
image = ImageIO.read(inputStream);
|
||
inputStream.close();
|
||
} catch (IOException e) {
|
||
throw new RuntimeException(e);
|
||
}
|
||
// 预处理图片
|
||
BufferedImage bufferedImage = preprocessImage(image);
|
||
System.out.println("开始OCR识别...");
|
||
// 创建Tesseract对象
|
||
ITesseract tesseract = new Tesseract();
|
||
// 设置字体库路径(绝对路径)
|
||
tesseract.setDatapath(tesseractConfig.getDatapath());
|
||
// 设置语言简体中文
|
||
tesseract.setLanguage("chi_sim");
|
||
// 优化OCR配置
|
||
tesseract.setPageSegMode(6); // 假设统一的文本块
|
||
tesseract.setOcrEngineMode(3); // 使用默认引擎
|
||
// 执行OCR识别图片
|
||
String result = null;
|
||
try {
|
||
result = tesseract.doOCR(bufferedImage);
|
||
} catch (TesseractException e) {
|
||
throw new RuntimeException(e);
|
||
}
|
||
|
||
// 清理和格式化结果
|
||
String cleanedResult = cleanOcrResult(result);
|
||
|
||
System.out.println("\n=== 识别结果 ===");
|
||
System.out.println(cleanedResult);
|
||
|
||
return cleanedResult;
|
||
}
|
||
|
||
@Override
|
||
public String recognizePdfText(String pdfUrl) {
|
||
try {
|
||
System.out.println("开始处理PDF文件: " + pdfUrl);
|
||
|
||
// 下载PDF文件
|
||
URL url = new URL(pdfUrl);
|
||
InputStream inputStream = url.openStream();
|
||
|
||
// 使用PDFBox读取PDF
|
||
PDDocument document = PDDocument.load(inputStream);
|
||
|
||
// 创建文本提取器
|
||
PDFTextStripper textStripper = new PDFTextStripper();
|
||
|
||
// 提取所有页面的文字
|
||
String text = textStripper.getText(document);
|
||
|
||
// 关闭文档和流
|
||
document.close();
|
||
inputStream.close();
|
||
|
||
// 清理和格式化结果
|
||
// String cleanedResult = cleanPdfText(text);
|
||
|
||
System.out.println("\n=== PDF文字提取结果 ===");
|
||
System.out.println(text);
|
||
|
||
return text;
|
||
|
||
} catch (IOException e) {
|
||
throw new RuntimeException("处理PDF文件失败: " + e.getMessage(), e);
|
||
} catch (Exception e) {
|
||
throw new RuntimeException("PDF文字提取失败: " + e.getMessage(), e);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 清理PDF提取的文字
|
||
* @param text 原始PDF文字
|
||
* @return 清理后的文字
|
||
*/
|
||
private static String cleanPdfText(String text) {
|
||
if (text == null || text.trim().isEmpty()) {
|
||
return "";
|
||
}
|
||
|
||
return text
|
||
// 移除多余的空白字符
|
||
.replaceAll("\\s+", " ")
|
||
// 移除页面分隔符
|
||
.replaceAll("\\f", "\n")
|
||
// 清理行首行尾空白
|
||
.trim();
|
||
}
|
||
|
||
/**
|
||
* 灰度化图像(直接处理 BufferedImage)
|
||
* @param image 原始图片
|
||
* @return 灰度图
|
||
*/
|
||
private static BufferedImage preprocessImage(BufferedImage image) {
|
||
BufferedImage grayImage = new BufferedImage(
|
||
image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||
|
||
Graphics2D g2d = grayImage.createGraphics();
|
||
g2d.drawImage(image, 0, 0, null);
|
||
g2d.dispose();
|
||
|
||
return grayImage;
|
||
}
|
||
|
||
/**
|
||
* 清理OCR识别结果
|
||
* @param result 原始识别结果
|
||
* @return 清理后的结果
|
||
*/
|
||
private static String cleanOcrResult(String result) {
|
||
if (result == null || result.trim().isEmpty()) {
|
||
return "";
|
||
}
|
||
|
||
// 替换常见的OCR错误
|
||
String cleaned = result
|
||
// 修复常见的OCR错误
|
||
.replaceAll("英声租", "产品名称")
|
||
.replaceAll("库咤埕号", "产品型号")
|
||
.replaceAll("产晓序列号", "产品序列号")
|
||
.replaceAll("购买纳证缉号", "购买凭证编号")
|
||
.replaceAll("质保条孰", "质保条款")
|
||
.replaceAll("本亢命", "本产品")
|
||
.replaceAll("质僚服务", "质保服务")
|
||
.replaceAll("质保朝内", "质保期内")
|
||
.replaceAll("团素", "因素")
|
||
.replaceAll("质量闰题", "质量问题")
|
||
.replaceAll("免贵维修", "免费维修")
|
||
.replaceAll("更挺服", "更换服")
|
||
.replaceAll("不包挂", "不包括")
|
||
.replaceAll("溢用", "滥用")
|
||
.replaceAll("探环", "损坏")
|
||
.replaceAll("取葛保管", "妥善保管")
|
||
.replaceAll("雷凭吊证明", "需凭此证明")
|
||
.replaceAll("客户信恩", "客户信息")
|
||
.replaceAll("姆钗", "姓名")
|
||
.replaceAll("联系电语", "联系电话")
|
||
.replaceAll("电子邹件", "电子邮件")
|
||
.replaceAll("地抛", "地址")
|
||
.replaceAll("摒权代表", "授权代表")
|
||
.replaceAll("介为示例", "仅为示例")
|
||
.replaceAll("根揪实际情况", "根据实际情况")
|
||
.replaceAll("调教", "调整")
|
||
// 移除多余的空格
|
||
.trim();
|
||
|
||
return cleaned;
|
||
}
|
||
}
|