"""
OCR benchmarking and evaluation system.
Provides comprehensive tools for evaluating OCR pipeline performance against
ground truth data with detailed per-document and aggregate metrics.
"""
import csv
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from biblicus import Corpus
def normalize_words(text: str) -> List[str]:
"""
Normalize text into words for comparison.
- Lowercases
- Removes punctuation
- Keeps only words longer than 2 characters
"""
# Lowercase and split
words = text.lower().split()
# Remove punctuation and filter short words
cleaned = []
for word in words:
# Remove all non-alphanumeric
clean = re.sub(r'[^a-z0-9]', '', word)
if len(clean) > 2:
cleaned.append(clean)
return cleaned
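# Illustrative example (not part of the pipeline): expected behaviour of
# normalize_words under the rules above.
#
#   normalize_words("Hello, World! It's page 42")
#   -> ['hello', 'world', 'its', 'page']   # "it's" -> "its" (3 chars, kept);
#                                          # "42" has only 2 chars and is dropped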
def calculate_word_metrics(ground_truth: str, extracted: str) -> Dict[str, Any]:
"""
Calculate word-level precision, recall, and F1 score.
Compares word sets after normalization (lowercase, remove punctuation).
Args:
ground_truth: Expected text
extracted: Actual OCR output
Returns:
Dictionary with precision, recall, f1_score, and counts
"""
gt_words = normalize_words(ground_truth)
ex_words = normalize_words(extracted)
gt_set = set(gt_words)
ex_set = set(ex_words)
true_positives = len(gt_set & ex_set)
false_positives = len(ex_set - gt_set)
false_negatives = len(gt_set - ex_set)
precision = (
true_positives / (true_positives + false_positives)
if (true_positives + false_positives) > 0
else 0.0
)
recall = (
true_positives / (true_positives + false_negatives)
if (true_positives + false_negatives) > 0
else 0.0
)
f1 = (
2 * precision * recall / (precision + recall)
if (precision + recall) > 0
else 0.0
)
return {
"precision": precision,
"recall": recall,
"f1_score": f1,
"true_positives": true_positives,
"false_positives": false_positives,
"false_negatives": false_negatives,
"word_count_gt": len(gt_words),
"word_count_ocr": len(ex_words),
}
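# Worked example (illustrative): ground truth "The quick brown fox" vs OCR
# output "The quick red fox". After normalization both contain 4 words;
# 3 are shared (true positives), "red" is a false positive and "brown" a
# false negative, so precision = recall = f1_score = 0.75.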
def calculate_character_accuracy(ground_truth: str, extracted: str) -> float:
"""
Calculate character-level accuracy using edit distance.
Uses Levenshtein distance and returns 1.0 - (distance / max_length), clamped to [0, 1].
Falls back to a coarse character-set overlap if the editdistance package is not installed.
Args:
ground_truth: Expected text
extracted: Actual OCR output
Returns:
Accuracy between 0.0 and 1.0
"""
try:
import editdistance
distance = editdistance.eval(ground_truth, extracted)
max_len = max(len(ground_truth), len(extracted))
if max_len == 0:
return 1.0
accuracy = 1.0 - (distance / max_len)
return max(0.0, accuracy) # Clamp to [0, 1]
except ImportError:
# editdistance not installed, use simple character overlap
gt_chars = set(ground_truth.lower())
ex_chars = set(extracted.lower())
if not gt_chars:
return 1.0 if not ex_chars else 0.0
overlap = len(gt_chars & ex_chars) / len(gt_chars)
return overlap
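# Illustrative example: with the editdistance package installed,
# calculate_character_accuracy("kitten", "sitting") uses a Levenshtein
# distance of 3 over a max length of 7, giving roughly 0.571.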
def calculate_word_order_metrics(ground_truth: str, extracted: str) -> Dict[str, Any]:
"""
Calculate order-aware metrics that measure reading sequence quality.
These metrics are critical for evaluating layout-aware OCR where the goal
is to preserve correct reading order (e.g., left column before right column).
Metrics:
- Word Error Rate (WER): word-level edit distance (insertions, deletions, substitutions) divided by the ground-truth word count
- Sequence accuracy: fraction of ground-truth positions where the extracted word matches exactly
- LCS ratio: length of the longest common subsequence of words (words in correct relative order) divided by the ground-truth word count
- Normalized edit distance: word-level edit distance divided by the longer of the two word counts
Args:
ground_truth: Expected text in correct reading order
extracted: Actual OCR output
Returns:
Dictionary with order-aware metrics
"""
gt_words = normalize_words(ground_truth)
ex_words = normalize_words(extracted)
if not gt_words and not ex_words:
return {
'word_error_rate': 0.0,
'sequence_accuracy': 1.0,
'lcs_ratio': 1.0,
'normalized_edit_distance': 0.0,
}
if not gt_words or not ex_words:
return {
'word_error_rate': 1.0,
'sequence_accuracy': 0.0,
'lcs_ratio': 0.0,
'normalized_edit_distance': 1.0,
}
# Calculate Word Error Rate (WER) using edit distance on word sequences
try:
import editdistance
edit_dist = editdistance.eval(gt_words, ex_words)
wer = edit_dist / len(gt_words)
normalized_edit_dist = edit_dist / max(len(gt_words), len(ex_words))
except ImportError:
# Fallback: simple implementation
edit_dist = _simple_edit_distance(gt_words, ex_words)
wer = edit_dist / len(gt_words)
normalized_edit_dist = edit_dist / max(len(gt_words), len(ex_words))
# Calculate Longest Common Subsequence (LCS)
lcs_length = _longest_common_subsequence(gt_words, ex_words)
lcs_ratio = lcs_length / len(gt_words) if gt_words else 0.0
# Sequence accuracy: ratio of correct words in correct positions
matches = sum(1 for i, word in enumerate(gt_words) if i < len(ex_words) and ex_words[i] == word)
sequence_accuracy = matches / len(gt_words)
return {
'word_error_rate': wer,
'sequence_accuracy': sequence_accuracy,
'lcs_ratio': lcs_ratio,
'normalized_edit_distance': normalized_edit_dist,
}
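# Illustrative example of why order-aware metrics matter: if the ground truth
# reads "alpha beta gamma delta" and OCR returns the columns swapped as
# "gamma delta alpha beta", the set-based F1 is a perfect 1.0, but here
# word_error_rate = 1.0, sequence_accuracy = 0.0, and lcs_ratio = 0.5.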
def calculate_ngram_overlap(ground_truth: str, extracted: str, n: int = 2) -> float:
"""
Calculate n-gram overlap to measure local word ordering.
N-grams capture short sequences of words. High n-gram overlap means
the extracted text preserves local word ordering, even if global order differs.
Args:
ground_truth: Expected text
extracted: Actual OCR output
n: N-gram size (default 2 for bigrams)
Returns:
N-gram overlap ratio (0.0 to 1.0)
"""
gt_words = normalize_words(ground_truth)
ex_words = normalize_words(extracted)
if len(gt_words) < n or len(ex_words) < n:
return 0.0
# Create n-grams
gt_ngrams = [tuple(gt_words[i:i+n]) for i in range(len(gt_words) - n + 1)]
ex_ngrams = [tuple(ex_words[i:i+n]) for i in range(len(ex_words) - n + 1)]
# Calculate overlap
gt_set = set(gt_ngrams)
ex_set = set(ex_ngrams)
overlap = len(gt_set & ex_set)
# Return as ratio of ground truth n-grams
return overlap / len(gt_ngrams)
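# Illustrative example (same column-swap case as above): the ground truth
# "alpha beta gamma delta" has bigrams (alpha, beta), (beta, gamma),
# (gamma, delta); the swapped output "gamma delta alpha beta" preserves two
# of the three, so calculate_ngram_overlap(..., n=2) returns ~0.667,
# showing that local order survives even when global order is wrong.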
def _simple_edit_distance(seq1: List[str], seq2: List[str]) -> int:
"""Compute edit distance for word sequences."""
m, n = len(seq1), len(seq2)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
for i in range(1, m + 1):
for j in range(1, n + 1):
if seq1[i-1] == seq2[j-1]:
dp[i][j] = dp[i-1][j-1]
else:
dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
return dp[m][n]
def _longest_common_subsequence(seq1: List[str], seq2: List[str]) -> int:
"""Calculate length of longest common subsequence."""
m, n = len(seq1), len(seq2)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if seq1[i-1] == seq2[j-1]:
dp[i][j] = dp[i-1][j-1] + 1
else:
dp[i][j] = max(dp[i-1][j], dp[i][j-1])
return dp[m][n]
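# Illustrative example for the two dynamic-programming helpers:
#   _simple_edit_distance(['a', 'b', 'c'], ['a', 'c']) -> 1        (one deletion)
#   _longest_common_subsequence(['a', 'b', 'c'], ['a', 'c']) -> 2  ('a', 'c')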
@dataclass
class OCREvaluationResult:
"""Results for evaluating a single document."""
document_id: str
image_path: str
ground_truth_text: str
extracted_text: str
# Set-based metrics (position-agnostic)
precision: float
recall: float
f1_score: float
character_accuracy: float
true_positives: int
false_positives: int
false_negatives: int
word_count_gt: int
word_count_ocr: int
# Order-aware metrics (sequence quality)
word_error_rate: float
sequence_accuracy: float
lcs_ratio: float
normalized_edit_distance: float
# N-gram overlap (local ordering)
bigram_overlap: float
trigram_overlap: float
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization."""
return asdict(self)
def print_summary(self):
"""Print a summary of this result."""
print(f"Document: {self.document_id[:16]}...")
print(f" Path: {self.image_path}")
print(f" Words: GT={self.word_count_gt}, OCR={self.word_count_ocr}")
print()
print(" Set-based Metrics (position-agnostic):")
print(f" Precision: {self.precision:.3f}")
print(f" Recall: {self.recall:.3f}")
print(f" F1 Score: {self.f1_score:.3f}")
print()
print(" Order-aware Metrics (sequence quality):")
print(f" Word Error Rate: {self.word_error_rate:.3f}")
print(f" Sequence Accuracy: {self.sequence_accuracy:.3f}")
print(f" LCS Ratio: {self.lcs_ratio:.3f}")
print()
print(" N-gram Overlap (local ordering):")
print(f" Bigram Overlap: {self.bigram_overlap:.3f}")
print(f" Trigram Overlap: {self.trigram_overlap:.3f}")
@dataclass
class BenchmarkReport:
"""Aggregate benchmark results across multiple documents."""
evaluation_timestamp: str
corpus_path: str
pipeline_configuration: Dict[str, Any]
total_documents: int
# Set-based metrics (position-agnostic)
avg_precision: float
avg_recall: float
avg_f1: float
median_precision: float
median_recall: float
median_f1: float
min_f1: float
max_f1: float
# Order-aware metrics
avg_word_error_rate: float
avg_sequence_accuracy: float
avg_lcs_ratio: float
median_word_error_rate: float
median_sequence_accuracy: float
median_lcs_ratio: float
# N-gram metrics
avg_bigram_overlap: float
avg_trigram_overlap: float
processing_time_seconds: float
per_document_results: List[Dict[str, Any]]
def to_json(self, path: Path):
"""Export report as JSON."""
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, 'w') as f:
json.dump(asdict(self), f, indent=2)
print(f"✓ JSON report saved to: {path}")
def to_csv(self, path: Path):
"""Export per-document results as CSV."""
path.parent.mkdir(parents=True, exist_ok=True)
if not self.per_document_results:
print("No results to export")
return
# Fixed column order for the per-document CSV
fieldnames = [
'document_id', 'image_path',
'word_count_gt', 'word_count_ocr',
'precision', 'recall', 'f1_score', 'character_accuracy',
'word_error_rate', 'sequence_accuracy', 'lcs_ratio',
'bigram_overlap', 'trigram_overlap',
'true_positives', 'false_positives', 'false_negatives'
]
with open(path, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
writer.writeheader()
writer.writerows(self.per_document_results)
print(f"✓ CSV report saved to: {path}")
def print_summary(self):
"""Print console summary."""
print("=" * 70)
print("BENCHMARK REPORT")
print("=" * 70)
print(f"Timestamp: {self.evaluation_timestamp}")
print(f"Corpus: {self.corpus_path}")
print(f"Documents: {self.total_documents}")
print(f"Processing Time: {self.processing_time_seconds:.2f}s")
print()
print("Set-based Metrics (position-agnostic):")
print(f" Average Precision: {self.avg_precision:.3f}")
print(f" Average Recall: {self.avg_recall:.3f}")
print(f" Average F1 Score: {self.avg_f1:.3f}")
print()
print(f" Median Precision: {self.median_precision:.3f}")
print(f" Median Recall: {self.median_recall:.3f}")
print(f" Median F1 Score: {self.median_f1:.3f}")
print()
print("Order-aware Metrics (sequence quality):")
print(f" Avg Word Error Rate: {self.avg_word_error_rate:.3f} (lower is better)")
print(f" Avg Sequence Accuracy: {self.avg_sequence_accuracy:.3f}")
print(f" Avg LCS Ratio: {self.avg_lcs_ratio:.3f}")
print()
print(f" Median Word Error Rate: {self.median_word_error_rate:.3f}")
print(f" Median Sequence Acc: {self.median_sequence_accuracy:.3f}")
print(f" Median LCS Ratio: {self.median_lcs_ratio:.3f}")
print()
print("N-gram Overlap (local ordering):")
print(f" Avg Bigram Overlap: {self.avg_bigram_overlap:.3f}")
print(f" Avg Trigram Overlap: {self.avg_trigram_overlap:.3f}")
print("=" * 70)
class OCRBenchmark:
"""
Runs OCR evaluation across multiple documents.
Evaluates extraction snapshots against ground truth data and generates
comprehensive reports with per-document and aggregate metrics.
"""
def __init__(self, corpus: Corpus):
"""
Initialize benchmark with a corpus.
Args:
corpus: Corpus containing documents and ground truth
"""
self.corpus = corpus