feat: 新增tesseract OCR服务接口

This commit is contained in:
JR
2025-08-01 15:15:17 +08:00
parent 21e354258e
commit 0e50b769e9
9 changed files with 142 additions and 202 deletions

View File

@@ -0,0 +1,11 @@
package com.klp.service;
/**
* Contract for an OCR service that extracts text from an image.
*/
public interface ITesseractOcrService {
/**
* Recognizes the text contained in an image located at the given URL.
*
* @param imageUrl URL of the image to recognize
* @return the recognition result
* @throws Exception if recognition fails; the concrete exception types are
*         determined by the implementation
*/
String recognizeText(String imageUrl) throws Exception;
}

View File

@@ -0,0 +1,112 @@
package com.klp.service.impl;
import com.klp.common.config.TesseractConfig;
import com.klp.service.ITesseractOcrService;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
/**
 * {@link ITesseractOcrService} implementation backed by Tess4J.
 *
 * <p>The image is downloaded from the supplied URL, converted to grayscale,
 * recognized as Simplified Chinese text, and finally passed through a table of
 * literal corrections for known misrecognitions.
 */
@Service
public class TesseractOcrServiceImpl implements ITesseractOcrService {

    /** Tesseract language code for Simplified Chinese. */
    private static final String OCR_LANGUAGE = "chi_sim";

    /** Page segmentation mode 6: assume a single uniform block of text. */
    private static final int PAGE_SEG_MODE_SINGLE_BLOCK = 6;

    /** OCR engine mode 3: use the default engine. */
    private static final int OCR_ENGINE_MODE_DEFAULT = 3;

    /**
     * Known misrecognitions and their corrections, as {misread, corrected} pairs.
     * The pairs are applied in declaration order as literal (non-regex) replacements.
     * NOTE(review): every pair is applied unconditionally to the whole text, so a
     * correctly recognized occurrence of a "misread" string is rewritten as well —
     * confirm this is acceptable for the documents being scanned.
     */
    private static final String[][] OCR_CORRECTIONS = {
        {"英声租", "产品名称"},
        {"库咤埕号", "产品型号"},
        {"产晓序列号", "产品序列号"},
        {"购买纳证缉号", "购买凭证编号"},
        {"质保条孰", "质保条款"},
        {"本亢命", "本产品"},
        {"质僚服务", "质保服务"},
        {"质保朝内", "质保期内"},
        {"团素", "因素"},
        {"质量闰题", "质量问题"},
        {"免贵维修", "免费维修"},
        {"更挺服", "更换服"},
        {"不包挂", "不包括"},
        {"溢用", "滥用"},
        {"探环", "损坏"},
        {"取葛保管", "妥善保管"},
        {"雷凭吊证明", "需凭此证明"},
        {"客户信恩", "客户信息"},
        {"姆钗", "姓名"},
        {"联系电语", "联系电话"},
        {"电子邹件", "电子邮件"},
        {"地抛", "地址"},
        {"摒权代表", "授权代表"},
        {"介为示例", "仅为示例"},
        {"根揪实际情况", "根据实际情况"},
        {"调教", "调整"},
    };

    @Resource
    private TesseractConfig tesseractConfig;

    /**
     * Downloads the image at {@code imageUrl} and returns the text recognized in it.
     *
     * @param imageUrl URL of the image to recognize
     * @return the cleaned recognition result; an empty string when nothing was recognized
     * @throws java.net.MalformedURLException if {@code imageUrl} is not a valid URL
     * @throws IOException if the image cannot be downloaded or cannot be decoded
     * @throws Exception if the OCR engine fails
     */
    @Override
    public String recognizeText(String imageUrl) throws Exception {
        // NOTE(review): the URL is opened exactly as supplied, with no scheme check and
        // no timeout. If imageUrl can come from untrusted callers, restrict it to
        // http/https and validate the target host before opening the stream.
        URL url = new URL(imageUrl);

        BufferedImage image;
        // try-with-resources closes the stream even when decoding throws.
        try (InputStream inputStream = url.openStream()) {
            image = ImageIO.read(inputStream);
        }
        // ImageIO.read returns null (rather than throwing) when no registered
        // reader understands the stream content.
        if (image == null) {
            throw new IOException("Unable to decode an image from URL: " + imageUrl);
        }

        BufferedImage grayImage = preprocessImage(image);

        // NOTE(review): console output kept as-is; consider switching to the
        // project's logger so the recognized text does not end up on stdout.
        System.out.println("开始OCR识别...");

        // A fresh engine instance is configured for every call.
        ITesseract tesseract = new Tesseract();
        // Location of the trained data files, taken from configuration.
        tesseract.setDatapath(tesseractConfig.getDatapath());
        tesseract.setLanguage(OCR_LANGUAGE);
        tesseract.setPageSegMode(PAGE_SEG_MODE_SINGLE_BLOCK);
        tesseract.setOcrEngineMode(OCR_ENGINE_MODE_DEFAULT);

        String rawResult = tesseract.doOCR(grayImage);
        String cleanedResult = cleanOcrResult(rawResult);

        System.out.println("\n=== 识别结果 ===");
        System.out.println(cleanedResult);
        return cleanedResult;
    }

    /**
     * Converts an image to grayscale by drawing it onto a new
     * {@link BufferedImage#TYPE_BYTE_GRAY} image of the same size.
     *
     * @param image the source image
     * @return the grayscale copy
     */
    private static BufferedImage preprocessImage(BufferedImage image) {
        BufferedImage grayImage = new BufferedImage(
                image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
        Graphics2D g2d = grayImage.createGraphics();
        try {
            g2d.drawImage(image, 0, 0, null);
        } finally {
            // Release the graphics context even if drawing fails.
            g2d.dispose();
        }
        return grayImage;
    }

    /**
     * Cleans a raw OCR result: applies the known-misrecognition corrections and
     * trims leading and trailing whitespace.
     *
     * @param result the raw recognition result, may be {@code null}
     * @return the cleaned text, or an empty string when {@code result} is
     *         {@code null} or contains only whitespace
     */
    private static String cleanOcrResult(String result) {
        if (result == null || result.trim().isEmpty()) {
            return "";
        }
        String cleaned = result;
        for (String[] correction : OCR_CORRECTIONS) {
            // Literal replacement: no regex compilation, and no special meaning
            // for characters such as '$' or '\' in the replacement text.
            cleaned = cleaned.replace(correction[0], correction[1]);
        }
        return cleaned.trim();
    }
}

View File

@@ -6,7 +6,6 @@ import com.klp.common.core.domain.PageQuery;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.klp.common.utils.StringUtils;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;