# Source code for biblicus.extractors
"""
Text extraction plugins for Biblicus.
"""
from __future__ import annotations
from typing import Dict
from .aldea_stt import AldeaSpeechToTextExtractor
from .audio_format_converter import AudioFormatConverterExtractor
from .aws_transcribe_stt import AwsTranscribeSpeechToTextExtractor
from .azure_speech_stt import AzureSpeechToTextExtractor
from .base import TextExtractor
from .deepgram_stt import DeepgramSpeechToTextExtractor
from .deepgram_transform import DeepgramTranscriptTransformExtractor
from .docling_granite_text import DoclingGraniteExtractor
from .docling_smol_text import DoclingSmolExtractor
from .faster_whisper_stt import FasterWhisperSpeechToTextExtractor
from .google_speech_stt import GoogleSpeechToTextExtractor
from .heron_layout import HeronLayoutExtractor
from .markitdown_text import MarkItDownExtractor
from .metadata_text import MetadataTextExtractor
from .mock_layout_detector import MockLayoutDetectorExtractor
from .openai_audio_stt import OpenAiAudioSpeechToTextExtractor
from .openai_stt import OpenAiSpeechToTextExtractor
from .paddleocr_layout import PaddleOCRLayoutExtractor
from .paddleocr_vl_text import PaddleOcrVlExtractor
from .pass_through_text import PassThroughTextExtractor
from .pdf_text import PortableDocumentFormatTextExtractor
from .pipeline import PipelineExtractor
from .rapidocr_text import RapidOcrExtractor
from .select_longest_text import SelectLongestTextExtractor
from .select_override import SelectOverrideExtractor
from .select_smart_override import SelectSmartOverrideExtractor
from .select_text import SelectTextExtractor
from .tesseract_text import TesseractExtractor
from .unstructured_text import UnstructuredExtractor
# [docs]
def get_extractor(extractor_id: str) -> TextExtractor:
    """
    Look up one of the built-in text extractors by its identifier.

    A fresh instance of every built-in extractor class is created on each
    call, keyed by the class's ``extractor_id`` attribute, and the instance
    matching the requested identifier is returned.

    :param extractor_id: Identifier of the extractor to resolve.
    :type extractor_id: str
    :return: Instance of the extractor registered under ``extractor_id``.
    :rtype: TextExtractor
    :raises KeyError: If no built-in extractor uses the given identifier.
    """
    # Order matters only if two classes ever share an identifier: the later
    # entry overrides the earlier one, exactly as in a dict literal.
    builtin_classes: tuple[type[TextExtractor], ...] = (
        MetadataTextExtractor,
        MockLayoutDetectorExtractor,
        MarkItDownExtractor,
        DoclingSmolExtractor,
        DoclingGraniteExtractor,
        PassThroughTextExtractor,
        PipelineExtractor,
        PortableDocumentFormatTextExtractor,
        OpenAiSpeechToTextExtractor,
        OpenAiAudioSpeechToTextExtractor,
        FasterWhisperSpeechToTextExtractor,
        AudioFormatConverterExtractor,
        AwsTranscribeSpeechToTextExtractor,
        AzureSpeechToTextExtractor,
        GoogleSpeechToTextExtractor,
        AldeaSpeechToTextExtractor,
        DeepgramSpeechToTextExtractor,
        DeepgramTranscriptTransformExtractor,
        RapidOcrExtractor,
        HeronLayoutExtractor,
        PaddleOCRLayoutExtractor,
        PaddleOcrVlExtractor,
        TesseractExtractor,
        SelectTextExtractor,
        SelectLongestTextExtractor,
        SelectSmartOverrideExtractor,
        SelectOverrideExtractor,
        UnstructuredExtractor,
    )

    # Map each identifier to a newly constructed instance of its class.
    registry: Dict[str, TextExtractor] = {
        extractor_cls.extractor_id: extractor_cls() for extractor_cls in builtin_classes
    }

    if extractor_id in registry:
        return registry[extractor_id]
    raise KeyError(f"Unknown extractor: {extractor_id!r}")