Merge branch '0.8.X' of https://gitee.com/hdka/klp-oa into 0.8.X

This commit is contained in:
砂糖
2025-08-01 12:57:34 +08:00
2 changed files with 158 additions and 0 deletions

View File

@@ -22,5 +22,13 @@
<artifactId>klp-common</artifactId>
<version>0.8.3</version>
</dependency>
<!-- Tess4J: Java bindings for the Tesseract OCR engine. Supplies the net.sourceforge.tess4j classes (ITesseract, Tesseract, TesseractException) imported by com.klp.controller.Application. -->
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.11.0</version>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,150 @@
package com.klp.controller;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
/**
 * Command-line demo that runs Tesseract OCR (through Tess4J) on a single image and prints
 * the raw recognised text, a cleaned-up version of it, and a numbered line-by-line listing.
 *
 * <p>Usage: {@code Application [imagePath [tessdataPath]]}. Both arguments are optional; when
 * omitted, the original hard-coded locations are used.
 */
public class Application {

    /** Image that is recognised when no path is given on the command line. */
    private static final String DEFAULT_IMAGE_PATH = "D:\\桌面\\照片\\11.png";

    /** Tesseract data directory used when none is given on the command line. */
    private static final String DEFAULT_DATA_PATH = "D:\\front";

    /** Tesseract language code: Simplified Chinese. */
    private static final String OCR_LANGUAGE = "chi_sim";

    /** Page segmentation mode 6: treat the page as a single uniform block of text. */
    private static final int PAGE_SEG_MODE_SINGLE_BLOCK = 6;

    /** OCR engine mode 3: let Tesseract pick its default engine. */
    private static final int OCR_ENGINE_MODE_DEFAULT = 3;

    /** Separator placed between lines of the cleaned OCR text. */
    private static final String LINE_SEPARATOR = "\n";

    /**
     * Known mis-recognitions and their corrections, as {@code {wrong, right}} pairs.
     * They are applied in order as literal (non-regex) substitutions.
     */
    private static final String[][] OCR_CORRECTIONS = {
        {"英声租", "产品名称"},
        {"库咤埕号", "产品型号"},
        {"产晓序列号", "产品序列号"},
        {"购买纳证缉号", "购买凭证编号"},
        {"质保条孰", "质保条款"},
        {"本亢命", "本产品"},
        {"质僚服务", "质保服务"},
        {"质保朝内", "质保期内"},
        {"团素", "因素"},
        {"质量闰题", "质量问题"},
        {"免贵维修", "免费维修"},
        {"更挺服", "更换服"},
        {"不包挂", "不包括"},
        {"溢用", "滥用"},
        {"探环", "损坏"},
        {"取葛保管", "妥善保管"},
        {"雷凭吊证明", "需凭此证明"},
        {"客户信恩", "客户信息"},
        {"姆钗", "姓名"},
        {"联系电语", "联系电话"},
        {"电子邹件", "电子邮件"},
        {"地抛", "地址"},
        {"摒权代表", "授权代表"},
        {"介为示例", "仅为示例"},
        {"根揪实际情况", "根据实际情况"},
        {"调教", "调整"},
    };

    /**
     * Recognises the text in an image and prints the raw, cleaned and per-line results.
     *
     * @param args optional; {@code args[0]} is the image path and {@code args[1]} is the
     *             Tesseract data directory. Missing values fall back to the built-in defaults.
     */
    public static void main(String[] args) {
        String imagePath = (args != null && args.length > 0) ? args[0] : DEFAULT_IMAGE_PATH;
        String dataPath = (args != null && args.length > 1) ? args[1] : DEFAULT_DATA_PATH;

        try {
            // Locate the image; bail out early with a clear message if it is not a regular file.
            File file = new File(imagePath);
            if (!file.isFile()) {
                System.err.println("图片文件不存在: " + file.getAbsolutePath());
                return;
            }
            System.out.println("开始OCR识别...");
            System.out.println("图片路径: " + file.getAbsolutePath());

            // Configure the OCR engine.
            ITesseract tesseract = new Tesseract();
            tesseract.setDatapath(dataPath);
            tesseract.setLanguage(OCR_LANGUAGE);
            tesseract.setPageSegMode(PAGE_SEG_MODE_SINGLE_BLOCK);
            tesseract.setOcrEngineMode(OCR_ENGINE_MODE_DEFAULT);

            // Run recognition.
            String result = tesseract.doOCR(file);
            System.out.println("\n=== 原始识别结果 ===");
            System.out.println(result);

            // Normalise whitespace and repair known mis-recognitions.
            String cleanedResult = cleanOcrResult(result);
            System.out.println("\n=== 清理后的结果 ===");
            System.out.println(cleanedResult);

            // Numbered listing. A running counter keeps the numbering contiguous even if an
            // entry is skipped, and splitting on the newline separator (instead of '-')
            // keeps hyphenated values such as phone or serial numbers on one line.
            System.out.println("\n=== 分行显示结果 ===");
            int lineNumber = 0;
            for (String line : cleanedResult.split(LINE_SEPARATOR)) {
                String trimmed = line.trim();
                if (!trimmed.isEmpty()) {
                    lineNumber++;
                    System.out.println(lineNumber + ". " + trimmed);
                }
            }
        } catch (TesseractException e) {
            System.err.println("Tesseract OCR错误: " + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            // Top-level boundary of a CLI tool: report anything unexpected instead of dying silently.
            System.err.println("其他错误: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Cleans raw OCR output.
     *
     * <p>The text is split on any line break, runs of whitespace inside each line are collapsed
     * to a single space, blank lines are dropped, and the substitutions in
     * {@link #OCR_CORRECTIONS} are applied. Hyphens in the recognised text are left untouched.
     *
     * @param result raw OCR text; may be {@code null}
     * @return the cleaned lines joined by {@code "\n"}, or an empty string when {@code result}
     *         is {@code null} or contains only whitespace
     */
    private static String cleanOcrResult(String result) {
        if (result == null || result.trim().isEmpty()) {
            return "";
        }
        StringBuilder cleaned = new StringBuilder(result.length());
        // \R matches \r\n as one break, so Windows line endings do not yield empty lines.
        for (String rawLine : result.split("\\R")) {
            String line = rawLine.replaceAll("\\s+", " ").trim();
            if (line.isEmpty()) {
                continue;
            }
            for (String[] correction : OCR_CORRECTIONS) {
                // Literal replacement: the table holds plain text, not regular expressions.
                line = line.replace(correction[0], correction[1]);
            }
            if (cleaned.length() > 0) {
                cleaned.append(LINE_SEPARATOR);
            }
            cleaned.append(line);
        }
        return cleaned.toString();
    }

    /**
     * Optional image pre-processing: converts an image to 8-bit grayscale and saves it as PNG.
     * Failures are reported on standard error; no exception is propagated for I/O problems.
     *
     * @param inputFile  image to read
     * @param outputFile destination of the grayscale PNG
     */
    private static void preprocessImage(File inputFile, File outputFile) {
        try {
            BufferedImage image = ImageIO.read(inputFile);
            // ImageIO.read returns null (rather than throwing) when no reader can decode the file.
            if (image == null) {
                System.err.println("图片预处理失败: 无法解码图片 " + inputFile.getAbsolutePath());
                return;
            }

            // Drawing onto a TYPE_BYTE_GRAY canvas performs the colour-to-gray conversion.
            BufferedImage grayImage = new BufferedImage(
                    image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
            Graphics2D g2d = grayImage.createGraphics();
            try {
                g2d.drawImage(image, 0, 0, null);
            } finally {
                g2d.dispose();
            }

            // ImageIO.write returns false when no suitable writer is available.
            if (!ImageIO.write(grayImage, "png", outputFile)) {
                System.err.println("图片预处理失败: 无法写入PNG " + outputFile.getAbsolutePath());
                return;
            }
            System.out.println("图片预处理完成: " + outputFile.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("图片预处理失败: " + e.getMessage());
        }
    }
}