提交 4fb10d3c authored 作者: 贺阳's avatar 贺阳

识别并删除

上级 d36eb406
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF OCR文字识别删除工具</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf-lib/1.17.1/pdf-lib.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/tesseract.js/4.1.1/tesseract.min.js"></script>
<style>
  /* ===== Page shell ===== */
  body {
    font-family: Arial, sans-serif;
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    min-height: 100vh;
  }
  .container {
    background: white;
    border-radius: 20px;
    padding: 30px;
    margin: 20px 0;
    box-shadow: 0 10px 30px rgba(0,0,0,0.2);
  }
  .header {
    text-align: center;
    margin-bottom: 30px;
    padding: 20px;
    background: linear-gradient(135deg, #9c27b0 0%, #673ab7 100%);
    color: white;
    border-radius: 15px;
  }

  /* ===== Upload drop zone ===== */
  .upload-area {
    border: 3px dashed #9c27b0;
    padding: 40px;
    text-align: center;
    border-radius: 15px;
    background: #f8f9fa;
    margin: 20px 0;
    transition: all 0.3s ease;
    cursor: pointer;
  }
  .upload-area:hover {
    border-color: #007bff;
    background: #e3f2fd;
  }
  /* Applied by the drag handlers while a file is held over the zone. */
  .upload-area.dragover {
    border-color: #007bff;
    background: #e3f2fd;
    transform: scale(1.02);
  }

  /* ===== Buttons ===== */
  button {
    background: linear-gradient(135deg, #9c27b0 0%, #673ab7 100%);
    color: white;
    padding: 15px 30px;
    border: none;
    border-radius: 10px;
    cursor: pointer;
    margin: 10px 5px;
    font-size: 16px;
    font-weight: bold;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px rgba(156,39,176,0.3);
  }
  button:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px rgba(156,39,176,0.4);
  }
  /* Must stay after button:hover so a disabled button does not lift on hover. */
  button:disabled {
    background: #6c757d;
    cursor: not-allowed;
    transform: none;
    box-shadow: none;
  }
  .success-btn {
    background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
  }
  .warning-btn {
    background: linear-gradient(135deg, #ffc107 0%, #fd7e14 100%);
  }
  .info-btn {
    background: linear-gradient(135deg, #17a2b8 0%, #138496 100%);
  }
  /* The native file input is hidden; the drop zone and button trigger it from script. */
  input[type="file"] {
    display: none;
  }

  /* ===== Status messages written into #result / #ocrResults ===== */
  .result {
    margin: 20px 0;
    padding: 20px;
    border-radius: 10px;
    font-weight: bold;
    font-size: 16px;
  }
  .success {
    background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
    color: #155724;
    border: 2px solid #c3e6cb;
  }
  .error {
    background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
    color: #721c24;
    border: 2px solid #f5c6cb;
  }
  .info {
    background: linear-gradient(135deg, #d1ecf1 0%, #bee5eb 100%);
    color: #0c5460;
    border: 2px solid #bee5eb;
  }
  .progress {
    background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%);
    color: #856404;
    border: 2px solid #ffeaa7;
  }
  /* The script emits <div class="warning"> boxes; without this rule they rendered unstyled. */
  .warning {
    background: linear-gradient(135deg, #fff3cd 0%, #ffe8a1 100%);
    color: #856404;
    border: 2px solid #ffe8a1;
  }

  /* ===== Strategy picker ===== */
  .strategy-options {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
    gap: 15px;
    margin: 20px 0;
  }
  .strategy-card {
    background: #f8f9fa;
    padding: 20px;
    border-radius: 10px;
    border: 2px solid #e9ecef;
    cursor: pointer;
    transition: all 0.3s ease;
    text-align: center;
  }
  .strategy-card:hover {
    border-color: #9c27b0;
    background: #f3e5f5;
  }
  .strategy-card.selected {
    border-color: #9c27b0;
    background: #f3e5f5;
  }

  /* ===== Information panels ===== */
  .stats {
    background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
    padding: 20px;
    border-radius: 10px;
    margin: 15px 0;
    border-left: 5px solid #2196f3;
  }
  .ocr-info {
    background: linear-gradient(135deg, #f3e5f5 0%, #e1bee7 100%);
    padding: 20px;
    border-radius: 10px;
    margin: 15px 0;
    border: 2px solid #9c27b0;
  }
  /* Merged from two separate .coordinate-info rules; the declarations are the union of both. */
  .coordinate-info {
    background: #f8f9fa;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    font-family: monospace;
    font-size: 12px;
    color: #666;
    border-left: 4px solid #9c27b0;
  }

  /* ===== OCR result lists ===== */
  .text-detection {
    background: #f8f9fa;
    padding: 20px;
    border-radius: 10px;
    margin: 15px 0;
    border-left: 5px solid #9c27b0;
  }
  .text-item {
    background: white;
    padding: 15px;
    margin: 10px 0;
    border-radius: 8px;
    border: 1px solid #ddd;
    display: flex;
    justify-content: space-between;
    align-items: center;
    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
  }
  .text-item.found {
    background: #d4edda;
    border-color: #28a745;
  }
  .text-item.not-found {
    background: #f8d7da;
    border-color: #dc3545;
  }
  .detected-text {
    padding: 5px 10px;
    border-radius: 5px;
    font-weight: bold;
    color: white;
  }
  .detected-text.agn {
    background: #ff9800;
  }
  .detected-text.uclink {
    background: #f44336;
  }
</style>
</head>
<body>
<!-- Page banner: semantic <header> landmark; the "header" class keeps the existing styling. -->
<header class="header">
  <h1>🔍 PDF OCR文字识别删除工具</h1>
  <p>使用OCR技术识别扫描件PDF中的文字,然后精确删除</p>
  <p>支持扫描件PDF的文字识别和智能删除</p>
</header>
<!-- Upload panel: drop zone, hidden native file input, and the file summary filled in by script. -->
<div class="container">
  <h3>📁 上传PDF文件</h3>
  <div class="upload-area" id="uploadArea">
    <h4>📄 拖拽PDF文件到这里</h4>
    <p>或点击下方按钮选择文件</p>
    <!-- Accept both the extension and the MIME type so pickers that filter by type also offer PDFs. -->
    <input type="file" id="pdfInput" accept=".pdf,application/pdf">
    <button type="button" id="selectFileBtn">选择PDF文件</button>
  </div>
  <!-- Hidden until a file has been chosen; the script toggles style.display. -->
  <div id="fileInfo" style="display: none;">
    <div class="stats">
      <h4>📋 文件信息</h4>
      <p id="fileName"></p>
      <p id="fileSize"></p>
      <p id="pageCount"></p>
    </div>
  </div>
</div>
<!-- Strategy panel. The precision panel, the multi-page notes and the #result status box are nested inside it. -->
<div class="container">
  <h3>🎯 OCR识别策略</h3>
  <div class="ocr-info">
    <h4>🔍 OCR文字识别说明</h4>
    <p>扫描件PDF的文字位置各不相同,必须使用OCR技术智能识别文字位置,然后根据识别结果精确删除目标文字。</p>
    <p><strong>为什么需要OCR:</strong>扫描件PDF中的文字是图像,无法直接获取坐标,必须通过OCR识别才能准确定位。</p>
  </div>
  <!-- Each card names its strategy in its inline click handler. -->
  <div class="strategy-options">
    <div class="strategy-card selected" onclick="selectStrategy('ocr')">
      <h4>🔍 OCR识别策略(推荐)</h4>
      <p>智能识别扫描件文字位置</p>
      <small>最精确,适应不同PDF布局</small>
    </div>
    <div class="strategy-card" onclick="selectStrategy('hybrid')">
      <h4>🔄 混合策略</h4>
      <p>OCR识别 + 预设坐标</p>
      <small>双重保障,确保识别成功</small>
    </div>
    <div class="strategy-card" onclick="selectStrategy('fallback')">
      <h4>🛡️ 回退策略</h4>
      <p>OCR失败时使用预设坐标</p>
      <small>最后的安全网</small>
    </div>
  </div>
  <!-- Explicit type="button" so these never act as submit buttons if the markup is later wrapped in a form. -->
  <div style="text-align: center; margin: 20px 0;">
    <button type="button" id="processBtn" onclick="processPDF()" disabled class="success-btn">🔍 开始OCR识别与删除</button>
    <button type="button" id="fallbackBtn" onclick="selectStrategy('fallback'); processPDF();" disabled class="info-btn" style="display: none;">🛡️ OCR卡住?使用回退策略</button>
    <button type="button" onclick="resetForm()" class="warning-btn">🔄 重置</button>
  </div>
  <div class="container">
    <h3>🎯 删除精度设置</h3>
    <div class="info">
      <h4>📏 删除范围调整:</h4>
      <p>当前设置:<strong id="precisionText">超精确模式(±0像素)</strong></p>
      <input type="range" id="precisionSlider" min="0" max="5" value="0" style="width: 100%; margin: 10px 0;">
      <div style="display: flex; justify-content: space-between; font-size: 12px; color: #666;">
        <span>最精确</span>
        <span>最安全</span>
      </div>
      <p><small>调整滑块可以控制删除范围的大小。超精确模式只删除文字本身,不扩展任何范围。</small></p>
      <div style="margin-top: 10px;">
        <button type="button" onclick="setPrecision(0)" style="background: #28a745; color: white; padding: 5px 10px; border: none; border-radius: 3px; margin: 2px;">超精确模式</button>
        <button type="button" onclick="setPrecision(1)" style="background: #17a2b8; color: white; padding: 5px 10px; border: none; border-radius: 3px; margin: 2px;">精确模式</button>
        <button type="button" onclick="setPrecision(2)" style="background: #ffc107; color: black; padding: 5px 10px; border: none; border-radius: 3px; margin: 2px;">平衡模式</button>
      </div>
    </div>
  </div>
  <div class="container">
    <h3>📄 多页处理设置</h3>
    <div class="info">
      <h4>🔧 页码处理:</h4>
      <p>✅ 自动排除页码文字(如"Page 1 of 1"、"2/30"等)</p>
      <p>✅ 智能识别页码模式,避免误删</p>
      <p>✅ 多页进度显示,实时更新处理状态</p>
      <p><small>对于多页PDF,工具会自动识别并排除页码相关的文字,确保只删除目标内容。</small></p>
    </div>
  </div>
  <!-- Status box rewritten by the script; announced politely to assistive technology. -->
  <div id="result" role="status" aria-live="polite"></div>
</div>
<div class="container">
  <h3>📊 OCR识别结果</h3>
  <!-- Filled by the script with the recognised words and the matched targets. -->
  <div id="ocrResults"></div>
</div>
<div class="container">
<h3>📋 使用说明</h3>
<div class="info">
<h4>🎯 OCR处理流程:</h4>
<ol>
<li><strong>上传文件:</strong>选择您的扫描件PDF文件</li>
<li><strong>选择策略:</strong>选择OCR识别策略(推荐OCR识别)</li>
<li><strong>开始处理:</strong>点击"开始OCR识别与删除"按钮</li>
<li><strong>等待识别:</strong>OCR需要时间识别文字(可能需要几分钟)</li>
<li><strong>查看结果:</strong>查看识别到的文字位置和坐标</li>
<li><strong>下载文件:</strong>获取处理后的PDF文件</li>
</ol>
<h4>💡 为什么必须使用OCR:</h4>
<ul>
<li><strong>扫描件特性:</strong>扫描件PDF中的文字是图像,不是可选择的文本</li>
<li><strong>位置不固定:</strong>每个PDF的文字位置都不同,无法使用固定坐标</li>
<li><strong>精确识别:</strong>只有OCR才能准确识别文字的具体位置</li>
<li><strong>智能适应:</strong>OCR可以适应不同的PDF布局和格式</li>
</ul>
<h4>✅ OCR识别优势:</h4>
<ul>
<li>智能识别文字位置,适应不同PDF布局</li>
<li>精确删除目标文字,不会误删其他内容</li>
<li>支持扫描件PDF处理</li>
<li>显示所有识别到的文字,便于验证</li>
<li>处理所有页面,确保完整删除</li>
</ul>
<h4>⚠️ 注意事项:</h4>
<ul>
<li>OCR识别需要时间,请耐心等待</li>
<li>识别精度取决于PDF质量和文字清晰度</li>
<li>如果OCR失败,可点击"使用回退策略"按钮改用预设坐标</li>
<li>建议使用高质量的扫描件PDF</li>
<li><strong>OCR是处理扫描件的唯一有效方法</strong></li>
<li>固定坐标无法适应不同PDF的文字位置</li>
</ul>
</div>
</div>
<script>
// pdf.js worker script, loaded from the same CDN release as the main library.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

// Currently loaded document: the pdf-lib handle is edited and saved, the pdf.js handle is rendered for OCR.
let pdfDoc = null;
let pdfjsDocument = null;

// Strategy key chosen in the UI: 'ocr', 'hybrid' or 'fallback'.
let selectedStrategy = 'ocr';

// DOM nodes shared by the handlers below (never reassigned).
const fileInput = document.getElementById('pdfInput');
const processBtn = document.getElementById('processBtn');
const resultDiv = document.getElementById('result');
const ocrResultsDiv = document.getElementById('ocrResults');

// Words/phrases to cover.
const TARGET_TEXTS = [
  'AGN',
  'UCLINK LOGISITICS LTD',
  'UCLINK LOGISITICS',
  'UCLINK',
  'LOGISITICS',
  'LOGISTICS',
  'LTD'
];

// Text that must never be covered.
const EXCLUDE_TEXTS = [
  'AIR EQK',
  'ARN',
  'EQK',
  'AIR',
  'Page 1 of 1',
  'Page 2 of 2',
  'Page 3 of 3',
  'Page 4 of 4',
  'Page 5 of 5'
];

// Extra margin added around each covered word; 0 is the default "ultra precise" mode.
let deletePrecision = 0;
// Wire up the upload controls (click, file picker, drag and drop) and the
// precision slider once the document has been parsed.
document.addEventListener('DOMContentLoaded', function() {
  console.log('页面加载完成');
  const fileInput = document.getElementById('pdfInput');
  const uploadArea = document.getElementById('uploadArea');
  const selectBtn = document.getElementById('selectFileBtn');
  const precisionSlider = document.getElementById('precisionSlider');
  const precisionText = document.getElementById('precisionText');
  console.log('元素检查:', {
    fileInput: !!fileInput,
    uploadArea: !!uploadArea,
    selectBtn: !!selectBtn,
    precisionSlider: !!precisionSlider
  });

  // Precision slider: keep the global margin and its label in sync.
  if (precisionSlider) {
    precisionSlider.addEventListener('input', function() {
      deletePrecision = parseInt(this.value, 10);
      const modes = ['超精确模式(±0像素)', '精确模式(±1像素)', '平衡模式(±2像素)', '安全模式(±3像素)', '宽松模式(±4像素)', '最安全模式(±5像素)'];
      if (precisionText) {
        precisionText.textContent = modes[deletePrecision];
      }
    });
  }

  // Clicking anywhere in the drop zone opens the file picker.
  uploadArea.onclick = function(e) {
    // fileInput.click() dispatches a click on the input, which bubbles back up
    // to this handler; ignore that echo instead of asking for the picker again.
    if (e.target === fileInput) {
      return;
    }
    console.log('点击上传区域');
    fileInput.click();
  };

  // The button sits inside the drop zone, so stop the click from also
  // reaching uploadArea.onclick; otherwise the picker is requested twice.
  selectBtn.onclick = function(e) {
    e.preventDefault();
    e.stopPropagation();
    console.log('点击选择按钮');
    fileInput.click();
  };

  // A file was chosen through the picker.
  fileInput.onchange = function() {
    console.log('文件选择事件');
    handleFileSelect();
  };

  // Drag and drop: highlight while hovering, then load the dropped file.
  uploadArea.ondragover = function(e) {
    e.preventDefault();
    uploadArea.classList.add('dragover');
  };
  uploadArea.ondragleave = function() {
    uploadArea.classList.remove('dragover');
  };
  uploadArea.ondrop = function(e) {
    e.preventDefault();
    uploadArea.classList.remove('dragover');
    const files = e.dataTransfer.files;
    if (files.length > 0) {
      // Assigning .files does not fire "change", so start the load explicitly.
      fileInput.files = files;
      handleFileSelect();
    }
  };
});
// Loads the file currently held by the file input into both pdf-lib (for
// editing) and pdf.js (for rendering), and updates the file summary panel.
// On failure no document stays loaded and the process button stays disabled.
async function handleFileSelect() {
  console.log('文件选择事件触发');
  const file = fileInput.files[0];
  if (!file) {
    console.log('没有选择文件');
    return;
  }
  console.log('选择的文件:', file.name, file.size);

  // Show the file summary.
  document.getElementById('fileName').textContent = `文件名: ${file.name}`;
  document.getElementById('fileSize').textContent = `文件大小: ${(file.size / 1024).toFixed(2)} KB`;
  document.getElementById('pageCount').textContent = `页面数: 加载中...`;
  document.getElementById('fileInfo').style.display = 'block';
  resultDiv.innerHTML = '<div class="progress">📄 正在加载PDF文件...</div>';
  ocrResultsDiv.innerHTML = '';
  processBtn.disabled = true;

  // Forget the previous document up front so a failed load can never leave a
  // stale file behind to be processed under the new file's name.
  pdfDoc = null;
  pdfjsDocument = null;

  try {
    const pdfBytes = await file.arrayBuffer();
    const editableDoc = await PDFLib.PDFDocument.load(pdfBytes);
    // NOTE(review): pdf.js documents that binary data passed to getDocument may
    // be transferred to its worker; give it a private copy so the buffer
    // pdf-lib parsed is not affected.
    const renderDoc = await pdfjsLib.getDocument({ data: pdfBytes.slice(0) }).promise;

    // Publish both handles only once both loads have succeeded.
    pdfDoc = editableDoc;
    pdfjsDocument = renderDoc;

    const pageCount = pdfDoc.getPageCount();
    document.getElementById('pageCount').textContent = `页面数: ${pageCount}`;
    resultDiv.innerHTML = '<div class="success">✅ PDF文件加载成功,可以开始处理</div>';
    processBtn.disabled = false;
    console.log('PDF加载成功,页面数:', pageCount);
  } catch (error) {
    document.getElementById('pageCount').textContent = `页面数: 加载失败`;
    // Build the error box with textContent so the message is never parsed as HTML.
    const errorBox = document.createElement('div');
    errorBox.className = 'error';
    errorBox.textContent = `❌ PDF加载失败: ${error.message}`;
    resultDiv.innerHTML = '';
    resultDiv.appendChild(errorBox);
    // Nothing usable is loaded, so processing stays unavailable.
    processBtn.disabled = true;
    console.error('PDF加载失败:', error);
  }
}
// Records the chosen strategy and highlights its card.
// strategy: 'ocr' | 'hybrid' | 'fallback'.
// This is also called from the fallback buttons, which are not inside any
// strategy card, so the card is looked up by strategy key rather than from
// the click target (which used to throw and abort the chained processPDF()).
function selectStrategy(strategy) {
  selectedStrategy = strategy;

  const cards = document.querySelectorAll('.strategy-card');
  cards.forEach(card => {
    card.classList.remove('selected');
  });

  // Each card's inline handler contains its quoted strategy key, e.g. selectStrategy('ocr').
  const quotedKey = `'${strategy}'`;
  const activeCard = Array.from(cards).find(card =>
    (card.getAttribute('onclick') || '').includes(quotedKey)
  );
  if (activeCard) {
    activeCard.classList.add('selected');
  }

  resultDiv.innerHTML = `<div class="info">✅ 已选择策略: ${getStrategyName(strategy)}</div>`;
}
// Maps a strategy key to its display label; unknown keys are returned unchanged.
function getStrategyName(strategy) {
  const label = ({
    'ocr': 'OCR识别策略',
    'hybrid': '混合策略',
    'fallback': '回退策略'
  })[strategy];
  if (label) {
    return label;
  }
  return strategy;
}
// Entry point for the "start" button: runs the selected strategy against the
// loaded document. The process button is disabled for the duration and any
// error that escapes the strategy is shown in the status box.
async function processPDF() {
  const documentsReady = Boolean(pdfDoc) && Boolean(pdfjsDocument);
  if (!documentsReady) {
    resultDiv.innerHTML = '<div class="error">请先上传PDF文件</div>';
    return;
  }

  try {
    resultDiv.innerHTML = '<div class="progress">🔍 正在启动OCR识别,请稍候...</div>';
    processBtn.disabled = true;

    switch (selectedStrategy) {
      case 'ocr':
        await processOCR();
        break;
      case 'hybrid':
        await processHybrid();
        break;
      case 'fallback':
        await processFallback();
        break;
      default:
        // Unknown strategy keys do nothing.
        break;
    }
  } catch (error) {
    resultDiv.innerHTML = `<div class="error">❌ 处理失败: ${error.message}</div>`;
    console.error('处理错误:', error);
  } finally {
    processBtn.disabled = false;
  }
}
// OCR strategy. For every page: render it with pdf.js at 2x scale, recognise
// the words with Tesseract, and collect the words matching TARGET_TEXTS. Then
// show the recognition results, cover each match with a white rectangle in the
// pdf-lib document, and save/download the result.
// A page whose OCR fails is skipped. Any other error is caught here and shown
// with a fallback-strategy button; this function does not reject for them.
async function processOCR() {
  const pages = pdfDoc.getPages();
  const fallbackBtn = document.getElementById('fallbackBtn');
  let processedPages = 0;
  let totalRectangles = 0;
  let detectedTexts = [];
  let allRecognizedTexts = []; // every word recognised on every page

  // OCR output and error messages are arbitrary text; neutralise markup
  // characters before they are placed into innerHTML.
  const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => ({
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;'
  })[ch]);

  try {
    // Offer the fallback button while OCR is running.
    fallbackBtn.style.display = 'inline-block';
    fallbackBtn.disabled = false;

    // Recognise every page.
    for (let pageNum = 0; pageNum < pdfjsDocument.numPages; pageNum++) {
      resultDiv.innerHTML = `<div class="progress">🔍 正在OCR识别第 ${pageNum + 1}/${pdfjsDocument.numPages} 页... (已找到 ${detectedTexts.length} 个目标文字)</div>`;
      try {
        const pdfjsPage = await pdfjsDocument.getPage(pageNum + 1);
        const viewport = pdfjsPage.getViewport({ scale: 2.0 }); // render at 2x for sharper OCR input

        // Render the page into an off-screen canvas.
        const canvas = document.createElement('canvas');
        const context = canvas.getContext('2d');
        canvas.height = viewport.height;
        canvas.width = viewport.width;
        await pdfjsPage.render({
          canvasContext: context,
          viewport: viewport
        }).promise;

        if (pageNum === 0) {
          resultDiv.innerHTML = `<div class="progress">🔍 正在启动OCR引擎,请稍候...</div>`;
        }

        // NOTE(review): the tessedit_*/textord_* keys below are passed in the
        // options argument of Tesseract.recognize; confirm against the
        // tesseract.js documentation that the convenience API applies them
        // (it may require worker.setParameters instead).
        const { data: { words } } = await Tesseract.recognize(canvas, 'eng', {
          logger: m => {
            console.log('OCR进度:', m);
            if (m.status === 'loading tesseract core') {
              resultDiv.innerHTML = `<div class="progress">🔍 正在加载OCR引擎... ${Math.round(m.progress * 100)}%</div>`;
            } else if (m.status === 'initializing tesseract') {
              resultDiv.innerHTML = `<div class="progress">🔍 正在初始化OCR... ${Math.round(m.progress * 100)}%</div>`;
            } else if (m.status === 'loading language traineddata') {
              resultDiv.innerHTML = `<div class="progress">🔍 正在加载语言包... ${Math.round(m.progress * 100)}%</div>`;
            } else if (m.status === 'initializing api') {
              resultDiv.innerHTML = `<div class="progress">🔍 正在初始化API... ${Math.round(m.progress * 100)}%</div>`;
            } else if (m.status === 'recognizing text') {
              resultDiv.innerHTML = `<div class="progress">🔍 正在识别第 ${pageNum + 1} 页文字... ${Math.round(m.progress * 100)}%</div>`;
            }
          },
          // Tuning intended to help with short words and isolated letters.
          tessedit_pageseg_mode: '6', // single uniform text block
          tessedit_ocr_engine_mode: '1', // LSTM engine
          tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- ', // restrict the character set
          preserve_interword_spaces: '1', // keep spacing between words
          tessedit_do_invert: '0', // do not invert the image
          textord_min_linesize: '1.0', // lower minimum line size to pick up smaller text
          classify_bln_numeric_mode: '0', // numeric mode off
          textord_force_make_prop_words: 'F', // do not force proportional words
          textord_min_xheight: '8', // minimum character height
          textord_tabfind_show_vlines: '0' // do not show vertical lines
        });

        // Keep every recognised word for the overview list.
        const pageRecognizedTexts = words.map(word => ({
          text: word.text,
          confidence: word.confidence,
          bbox: word.bbox,
          page: pageNum
        }));
        allRecognizedTexts = allRecognizedTexts.concat(pageRecognizedTexts);

        // Page size in the pdf-lib document, used to convert OCR pixels to PDF units.
        const page = pages[pageNum];
        const { width: pageWidth, height: pageHeight } = page.getSize();

        // Find the target words on this page.
        const pageTexts = findTargetTexts(words, pageNum, viewport.width, viewport.height, pageWidth, pageHeight);
        detectedTexts = detectedTexts.concat(pageTexts);
        resultDiv.innerHTML = `<div class="progress">🔍 第 ${pageNum + 1} 页OCR完成,找到 ${pageTexts.length} 个目标文字</div>`;
      } catch (error) {
        // Best effort: a page that cannot be recognised is skipped.
        console.warn(`第 ${pageNum + 1} 页OCR失败:`, error);
        resultDiv.innerHTML = `<div class="progress">⚠️ 第 ${pageNum + 1} 页OCR失败,继续处理其他页面...</div>`;
      }
    }

    // OCR finished: the "stuck?" button is no longer needed.
    fallbackBtn.style.display = 'none';

    // Show everything that was recognised, then the matched targets.
    displayAllRecognizedTexts(allRecognizedTexts);
    displayOCRResults(detectedTexts);

    // Report the words that hit the exclusion list.
    const excludedWords = allRecognizedTexts.filter(word => {
      const text = word.text.trim().toUpperCase();
      return EXCLUDE_TEXTS.some(excludeText => {
        const excludeUpper = excludeText.toUpperCase();
        return text.includes(excludeUpper) || excludeUpper.includes(text);
      });
    });
    if (excludedWords.length > 0) {
      console.log('已排除的文字:', excludedWords.map(w => w.text));
      resultDiv.innerHTML += `<div class="info">🛡️ 已排除 ${excludedWords.length} 个不应删除的文字: ${excludedWords.map(w => escapeHtml(w.text)).join(', ')}</div>`;
    }

    // Hint when "LTD" was not recognised anywhere.
    const hasLTD = detectedTexts.some(text => text.text === 'LTD');
    if (!hasLTD) {
      console.log('⚠️ 未识别到LTD,可能需要调整OCR参数');
      resultDiv.innerHTML += `<div class="warning">⚠️ 未识别到"LTD"文字,如果PDF中确实存在,请尝试调整OCR参数或使用回退策略</div>`;
    }

    // Count recognised page-number tokens.
    const pageNumbers = allRecognizedTexts.filter(word => {
      const text = word.text.trim();
      return text.match(/^PAGE\s+\d+\s+OF\s+\d+$/i) || text.match(/^\d+\s*\/\s*\d+$/);
    });
    if (pageNumbers.length > 0) {
      console.log('检测到页码:', pageNumbers.map(p => p.text));
      resultDiv.innerHTML += `<div class="info">📄 检测到 ${pageNumbers.length} 个页码,已自动排除</div>`;
    }

    // Cover the detected words on every page that has matches.
    if (detectedTexts.length > 0) {
      const textsByPage = {};
      detectedTexts.forEach(text => {
        if (!textsByPage[text.page]) {
          textsByPage[text.page] = [];
        }
        textsByPage[text.page].push(text);
      });

      Object.keys(textsByPage).forEach(pageNum => {
        const page = pages[pageNum];
        const pageTexts = textsByPage[pageNum];
        pageTexts.forEach(text => {
          // Grow the box by the precision margin on every side. The left and
          // bottom edges are clamped at 0, and the size is derived from the
          // clamped edge so clamping does not push the far edge outwards.
          // With a margin of 0 this is exactly the word's own box.
          const margin = deletePrecision;
          const left = Math.max(0, text.x - margin);
          const bottom = Math.max(0, text.y - margin);
          page.drawRectangle({
            x: left,
            y: bottom,
            width: text.width + (text.x - left) + margin,
            height: text.height + (text.y - bottom) + margin,
            color: PDFLib.rgb(1, 1, 1) // white cover
          });
          totalRectangles++;
        });
        processedPages++;
      });

      // Pass the recognised words along so the summary can report page numbers.
      await saveAndDownload(processedPages, totalRectangles, 'ocr_processed', allRecognizedTexts);
    } else {
      resultDiv.innerHTML = `
        <div class="warning">
          ⚠️ OCR未识别到目标文字<br>
          <button onclick="selectStrategy('fallback'); processPDF();" class="success-btn" style="margin-top: 10px;">
            🛡️ 使用回退策略
          </button>
        </div>
      `;
    }
  } catch (error) {
    resultDiv.innerHTML = `
      <div class="error">
        ❌ OCR识别失败: ${escapeHtml(error.message)}<br>
        <button onclick="selectStrategy('fallback'); processPDF();" class="success-btn" style="margin-top: 10px;">
          🛡️ 使用回退策略
        </button>
      </div>
    `;
  }
}
// Returns the OCR words on one page that should be covered.
//   words                         - Tesseract word objects ({ text, confidence, bbox: { x0, y0, x1, y1 } })
//   pageNum                       - zero-based page index stored on each result
//   viewportWidth/viewportHeight  - size of the rendered canvas the bboxes refer to
//   pageWidth/pageHeight          - size of the same page in the pdf-lib document
// Each result carries the matched target, the original word, the page, and a
// box converted to a bottom-left origin in page units. A word yields at most
// one result: the first entry of TARGET_TEXTS it matches.
function findTargetTexts(words, pageNum, viewportWidth, viewportHeight, pageWidth, pageHeight) {
  const foundTexts = [];

  // Canvas pixels -> page units; constant for the whole page.
  const scaleX = pageWidth / viewportWidth;
  const scaleY = pageHeight / viewportHeight;

  // Decides whether an upper-cased OCR word matches one target entry.
  const matchesTarget = (text, targetText) => {
    // The short targets are matched exactly to avoid false positives.
    if (targetText === 'AGN' || targetText === 'LTD') {
      return text === targetText;
    }
    const targetUpper = targetText.toUpperCase();
    const containsTarget = text.includes(targetUpper) &&
      !text.includes('AIR') &&
      !text.includes('EQK') &&
      !text.includes('ARN');
    // Fall back to a fuzzy comparison to tolerate OCR misreads.
    return containsTarget || fuzzyMatch(text, targetUpper);
  };

  words.forEach(word => {
    const text = word.text.trim().toUpperCase();

    // Exclusion list first: either string containing the other counts.
    let isExcluded = EXCLUDE_TEXTS.some(excludeText => {
      const excludeUpper = excludeText.toUpperCase();
      return text.includes(excludeUpper) || excludeUpper.includes(text);
    });
    // "Page X of Y" style page numbers.
    if (!isExcluded && text.match(/^PAGE\s+\d+\s+OF\s+\d+$/i)) {
      isExcluded = true;
      console.log('排除页码:', word.text);
    }
    // "X/Y" style page numbers.
    if (!isExcluded && text.match(/^\d+\s*\/\s*\d+$/)) {
      isExcluded = true;
      console.log('排除页码数字:', word.text);
    }
    if (isExcluded) {
      console.log('排除文字:', word.text);
      return; // skip this word
    }

    // Take only the first matching target. Previously every matching entry
    // produced its own result, so one word could be reported and covered
    // several times (e.g. both "LOGISITICS" and, fuzzily, "LOGISTICS").
    const targetText = TARGET_TEXTS.find(candidate => matchesTarget(text, candidate));
    if (!targetText) {
      return;
    }

    // OCR boxes have a top-left origin; PDF pages have a bottom-left origin,
    // so the box's bottom edge (y1) is flipped against the viewport height.
    const convertedX = word.bbox.x0 * scaleX;
    const convertedY = (viewportHeight - word.bbox.y1) * scaleY;
    const convertedWidth = (word.bbox.x1 - word.bbox.x0) * scaleX;
    const convertedHeight = (word.bbox.y1 - word.bbox.y0) * scaleY;

    // Debug output.
    console.log('找到目标文字:', {
      originalText: word.text,
      matchedText: targetText,
      fullText: text,
      originalBbox: word.bbox,
      viewportSize: { width: viewportWidth, height: viewportHeight },
      pageSize: { width: pageWidth, height: pageHeight },
      scaleFactors: { scaleX, scaleY },
      convertedCoords: { x: convertedX, y: convertedY, width: convertedWidth, height: convertedHeight }
    });

    foundTexts.push({
      text: targetText,
      fullText: word.text,
      page: pageNum,
      x: convertedX,
      y: convertedY,
      width: convertedWidth,
      height: convertedHeight,
      confidence: word.confidence / 100,
      type: targetText.includes('AGN') ? 'agn' : 'uclink'
    });
  });

  return foundTexts;
}
// Fuzzy comparison of two strings: everything except A-Z is stripped, then the
// strings match when their edit distance is at most max(1, 30% of the longer
// one's length). Returns false when either stripped string is empty.
function fuzzyMatch(str1, str2) {
  const lettersA = str1.replace(/[^A-Z]/g, '');
  const lettersB = str2.replace(/[^A-Z]/g, '');
  if (lettersA.length === 0 || lettersB.length === 0) {
    return false;
  }

  // Order the pair by length; on a tie the second string counts as the longer.
  const firstIsLonger = lettersA.length > lettersB.length;
  const longer = firstIsLonger ? lettersA : lettersB;
  const shorter = firstIsLonger ? lettersB : lettersA;

  const allowedEdits = Math.max(1, longer.length * 0.3);
  return levenshteinDistance(longer, shorter) <= allowedEdits;
}
// Levenshtein edit distance between str1 and str2 (insert, delete and
// substitute each cost 1), computed row by row with two rolling rows.
function levenshteinDistance(str1, str2) {
  // Distances from the empty prefix of str2 to every prefix of str1.
  let previousRow = Array.from({ length: str1.length + 1 }, (_, col) => col);

  for (let row = 1; row <= str2.length; row++) {
    const currentRow = [row];
    for (let col = 1; col <= str1.length; col++) {
      const sameChar = str2.charAt(row - 1) === str1.charAt(col - 1);
      currentRow[col] = sameChar
        ? previousRow[col - 1]
        : Math.min(previousRow[col - 1], currentRow[col - 1], previousRow[col]) + 1;
    }
    previousRow = currentRow;
  }

  return previousRow[str1.length];
}
// Prepends an overview of the recognised words to the OCR results panel: the
// 50 highest-confidence words with their pixel bounding boxes, target words
// highlighted. Does nothing for an empty list.
//   allRecognizedTexts - items of the form { text, confidence, bbox, page }
function displayAllRecognizedTexts(allRecognizedTexts) {
  if (allRecognizedTexts.length === 0) {
    return;
  }

  // OCR output is arbitrary text; escape it before building HTML from it.
  const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => ({
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;'
  })[ch]);

  // The list spans every processed page, so report how many pages it covers
  // instead of always claiming page 1.
  const pageCount = new Set(allRecognizedTexts.map(word => word.page)).size;

  let html = '<h4>📝 所有识别到的文字:</h4>';
  html += `<div class="text-detection">`;
  html += `<h5>📄 共 ${pageCount} 页 (OCR识别到 ${allRecognizedTexts.length} 个文字)</h5>`;

  // Sort a copy so the caller's array keeps its order; show the top 50 by confidence.
  const sortedTexts = allRecognizedTexts
    .slice()
    .sort((a, b) => b.confidence - a.confidence)
    .slice(0, 50);

  sortedTexts.forEach(word => {
    const confidence = word.confidence;
    // Compare in upper case, as the matching code does.
    const upperText = word.text.toUpperCase();
    const isTarget = TARGET_TEXTS.some(target => upperText.includes(target));
    const badgeClass = isTarget ? (upperText.includes('AGN') ? 'agn' : 'uclink') : '';
    const safeText = escapeHtml(word.text);
    html += `
      <div class="text-item ${isTarget ? 'found' : ''}">
        <div>
          <strong>${safeText}</strong>
          <div class="coordinate-info">
            坐标: X=${word.bbox.x0.toFixed(1)}, Y=${word.bbox.y0.toFixed(1)} |
            大小: ${(word.bbox.x1 - word.bbox.x0).toFixed(1)} x ${(word.bbox.y1 - word.bbox.y0).toFixed(1)} |
            置信度: ${confidence.toFixed(0)}%
          </div>
        </div>
        <div>
          <span class="detected-text ${badgeClass}">${safeText}</span>
        </div>
      </div>
    `;
  });

  if (allRecognizedTexts.length > 50) {
    html += `<div class="info">... 还有 ${allRecognizedTexts.length - 50} 个文字未显示</div>`;
  }
  html += `</div>`;

  // Insert the overview ahead of whatever the results panel already shows.
  ocrResultsDiv.innerHTML = html + ocrResultsDiv.innerHTML;
}
// Appends the matched target words to the OCR results panel, grouped by page
// in ascending page order, followed by a statistics box. An empty list
// appends a single warning instead.
function displayOCRResults(detectedTexts) {
  if (detectedTexts.length === 0) {
    ocrResultsDiv.innerHTML += '<div class="warning">⚠️ OCR未识别到目标文字</div>';
    return;
  }

  // Bucket the matches by their zero-based page index.
  const grouped = detectedTexts.reduce((buckets, item) => {
    if (!buckets[item.page]) {
      buckets[item.page] = [];
    }
    buckets[item.page].push(item);
    return buckets;
  }, {});
  const orderedPages = Object.keys(grouped).sort((a, b) => parseInt(a) - parseInt(b));

  const parts = ['<h4>🎯 目标文字识别结果:</h4>'];
  for (const pageKey of orderedPages) {
    const items = grouped[pageKey];
    parts.push(`<div class="text-detection">`);
    parts.push(`<h5>📄 第 ${parseInt(pageKey) + 1} 页 (找到 ${items.length} 个目标文字)</h5>`);
    for (const item of items) {
      parts.push(`
<div class="text-item found">
<div>
<strong>${item.text}</strong>
<div class="coordinate-info">
坐标: X=${item.x.toFixed(1)}, Y=${item.y.toFixed(1)} |
大小: ${item.width.toFixed(1)} x ${item.height.toFixed(1)} |
置信度: ${(item.confidence * 100).toFixed(0)}%
</div>
</div>
<div>
<span class="detected-text ${item.type}">${item.text}</span>
</div>
</div>
`);
    }
    parts.push(`</div>`);
  }

  // Totals per target family.
  let agnCount = 0;
  let uclinkCount = 0;
  for (const item of detectedTexts) {
    if (item.type === 'agn') {
      agnCount += 1;
    }
    if (item.type === 'uclink') {
      uclinkCount += 1;
    }
  }
  parts.push(`
<div class="stats">
<h4>📊 目标文字识别统计</h4>
<p>AGN 文字: ${agnCount} 个</p>
<p>UCLINK LOGISITICS LTD 文字: ${uclinkCount} 个</p>
<p>总识别数: ${detectedTexts.length} 个</p>
<p>涉及页面: ${orderedPages.length} 页</p>
</div>
`);

  ocrResultsDiv.innerHTML += parts.join('');
}
// Hybrid strategy: run OCR first and switch to the preset coordinates if the
// OCR run rejects.
// NOTE(review): processOCR catches its own errors and reports them in the UI,
// so the fallback branch below looks unreachable in practice; confirm whether
// hybrid mode should also fall back when OCR finds nothing.
async function processHybrid() {
  let ocrFailed = false;
  try {
    await processOCR();
  } catch (error) {
    ocrFailed = true;
  }

  if (ocrFailed) {
    resultDiv.innerHTML = '<div class="progress">🔄 OCR失败,使用预设坐标...</div>';
    await processFallback();
  }
}
// Fallback strategy: no OCR. Covers three preset boxes, positioned relative to
// the top edge, on every page, then saves and downloads the document.
async function processFallback() {
  const pages = pdfDoc.getPages();
  let processedPages = 0;
  let totalRectangles = 0;

  for (const page of pages) {
    const { height } = page.getSize();

    // Preset boxes as [x, y, width, height].
    const presetBoxes = [
      [50, height - 200, 60, 10],  // AGN
      [50, height - 220, 100, 10], // UCLINK LOGISITICS
      [155, height - 220, 30, 10]  // LTD
    ];
    for (const [boxX, boxY, boxWidth, boxHeight] of presetBoxes) {
      page.drawRectangle({
        x: boxX,
        y: boxY,
        width: boxWidth,
        height: boxHeight,
        color: PDFLib.rgb(1, 1, 1)
      });
      totalRectangles += 1;
    }

    processedPages += 1;
    resultDiv.innerHTML = `<div class="progress">🛡️ 回退策略:处理第 ${processedPages} 页,共 ${pages.length} 页...</div>`;
  }

  await saveAndDownload(processedPages, totalRectangles, 'fallback_processed');
}
// Serialises the edited pdf-lib document, triggers a browser download and
// shows the completion summary.
//   processedPages   - number of pages that received rectangles
//   totalRectangles  - number of rectangles drawn
//   suffix           - appended to the download file name
//   recognizedWords  - optional list of OCR words ({ text, ... }); when given,
//                      the summary reports how many page-number tokens it contains
// The original body read a variable that only exists inside processOCR, so it
// threw a ReferenceError after starting the download on every path.
async function saveAndDownload(processedPages, totalRectangles, suffix, recognizedWords = []) {
  const pdfBytesModified = await pdfDoc.save();
  const blob = new Blob([pdfBytesModified], { type: 'application/pdf' });
  const url = URL.createObjectURL(blob);
  const fileName = `TK_43610263735_20251015_${suffix}.pdf`;

  // Attach the link while clicking it; some browsers ignore clicks on detached anchors.
  const a = document.createElement('a');
  a.href = url;
  a.download = fileName;
  document.body.appendChild(a);
  a.click();
  a.remove();
  // Release the blob URL once the download has had time to start.
  setTimeout(() => URL.revokeObjectURL(url), 1000);

  const precisionModes = ['超精确模式(±0像素)', '精确模式(±1像素)', '平衡模式(±2像素)', '安全模式(±3像素)', '宽松模式(±4像素)', '最安全模式(±5像素)'];

  // Page-number statistics are available only when the caller supplied OCR words.
  const words = Array.isArray(recognizedWords) ? recognizedWords : [];
  const pageNumbers = words.filter(word => {
    const text = word.text.trim();
    return text.match(/^PAGE\s+\d+\s+OF\s+\d+$/i) || text.match(/^\d+\s*\/\s*\d+$/);
  });
  const pageNumberLine = words.length > 0
    ? `📄 检测到 ${pageNumbers.length} 个页码,已自动排除<br>`
    : '';

  resultDiv.innerHTML = `
    <div class="success">
      ✅ PDF OCR处理完成!<br>
      📄 已处理 ${processedPages} 页<br>
      🔲 添加了 ${totalRectangles} 个覆盖矩形<br>
      💾 文件已下载:${fileName}<br>
      🎯 使用策略:${getStrategyName(selectedStrategy)}<br>
      🔧 坐标转换:缩放转换方法(已验证)<br>
      📏 删除精度:${precisionModes[deletePrecision]}<br>
      ${pageNumberLine}
      <strong>✅ 目标文字已被精确删除!</strong>
    </div>
  `;
}
// Quick preset for the deletion margin: stores the value and mirrors it onto
// the slider and its label when those elements exist.
function setPrecision(value) {
  deletePrecision = value;

  const slider = document.getElementById('precisionSlider');
  const label = document.getElementById('precisionText');

  if (slider) {
    slider.value = value;
  }
  if (!label) {
    return;
  }
  const modes = ['超精确模式(±0像素)', '精确模式(±1像素)', '平衡模式(±2像素)', '安全模式(±3像素)', '宽松模式(±4像素)', '最安全模式(±5像素)'];
  label.textContent = modes[value];
}
// Returns the page to its initial state: no file, empty result panels, OCR
// strategy selected and the ultra-precise margin.
function resetForm() {
  // Clear the file selection and everything derived from it.
  fileInput.value = '';
  const fileInfoPanel = document.getElementById('fileInfo');
  fileInfoPanel.style.display = 'none';
  processBtn.disabled = true;
  resultDiv.innerHTML = '';
  ocrResultsDiv.innerHTML = '';

  // Drop the loaded documents and restore the default settings.
  pdfDoc = null;
  pdfjsDocument = null;
  selectedStrategy = 'ocr';
  deletePrecision = 0; // back to ultra-precise mode

  // Highlight only the first strategy card.
  const cards = document.querySelectorAll('.strategy-card');
  for (const card of cards) {
    card.classList.remove('selected');
  }
  document.querySelector('.strategy-card').classList.add('selected');

  // Sync the slider and its label with the default margin.
  setPrecision(0);
}
</script>
</body>
</html>
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论