Tutorials
Concepts
Core Building Blocks
Extraction and Ingestion
Retrieval and Evaluation
Analysis and Modeling
Tools
Operations and Demos
Reference
Corpus
Corpus.analysis_dir
Corpus.analysis_run_dir()
Corpus.analysis_runs_dir
Corpus.catalog_generated_at()
Corpus.catalog_path
Corpus.create_crawl_id()
Corpus.delete_extraction_snapshot()
Corpus.extracted_dir
Corpus.extraction_snapshot_dir()
Corpus.extraction_snapshots_dir
Corpus.find()
Corpus.get_item()
Corpus.graph_dir
Corpus.graph_snapshot_dir()
Corpus.graph_snapshots_dir
Corpus.has_items()
Corpus.import_tree()
Corpus.ingest_crawled_payload()
Corpus.ingest_item()
Corpus.ingest_item_stream()
Corpus.ingest_note()
Corpus.ingest_source()
Corpus.init()
Corpus.latest_extraction_snapshot_reference()
Corpus.latest_snapshot_id
Corpus.list_extraction_snapshots()
Corpus.list_items()
Corpus.load_catalog()
Corpus.load_extraction_snapshot_manifest()
Corpus.load_snapshot()
Corpus.name
Corpus.open()
Corpus.pull_source()
Corpus.purge()
Corpus.read_extracted_text()
Corpus.reindex()
Corpus.retrieval_dir
Corpus.snapshots_dir
Corpus.uri
Corpus.write_snapshot()
KnowledgeBase
KnowledgeBase.context_pack()
KnowledgeBase.corpus
KnowledgeBase.defaults
KnowledgeBase.from_folder()
KnowledgeBase.query()
KnowledgeBase.retriever_id
KnowledgeBase.snapshot
KnowledgeBaseDefaults
KnowledgeBaseDefaults.configuration_name
KnowledgeBaseDefaults.model_config
KnowledgeBaseDefaults.query_budget
KnowledgeBaseDefaults.retriever_id
KnowledgeBaseDefaults.tags
CatalogItem
CatalogItem.bytes
CatalogItem.created_at
CatalogItem.id
CatalogItem.media_type
CatalogItem.metadata
CatalogItem.model_config
CatalogItem.relpath
CatalogItem.sha256
CatalogItem.source_uri
CatalogItem.tags
CatalogItem.title
CollectionMembership
CollectionMembership.collection_name
CollectionMembership.corpus_name
CollectionMembership.model_config
ConfigurationManifest
ConfigurationManifest.configuration
ConfigurationManifest.configuration_id
ConfigurationManifest.created_at
ConfigurationManifest.description
ConfigurationManifest.model_config
ConfigurationManifest.name
ConfigurationManifest.retriever_id
CorpusCatalog
CorpusCatalog.corpus_uri
CorpusCatalog.generated_at
CorpusCatalog.items
CorpusCatalog.latest_run_id
CorpusCatalog.latest_snapshot_id
CorpusCatalog.model_config
CorpusCatalog.order
CorpusCatalog.raw_dir
CorpusCatalog.schema_version
CorpusConfig
CorpusConfig.collection
CorpusConfig.corpus_uri
CorpusConfig.created_at
CorpusConfig.hooks
CorpusConfig.model_config
CorpusConfig.notes
CorpusConfig.raw_dir
CorpusConfig.schema_version
CorpusConfig.source
Evidence
Evidence.configuration_id
Evidence.content_ref
Evidence.hash
Evidence.item_id
Evidence.media_type
Evidence.metadata
Evidence.model_config
Evidence.rank
Evidence.score
Evidence.snapshot_id
Evidence.source_uri
Evidence.span_end
Evidence.span_start
Evidence.stage
Evidence.stage_scores
Evidence.text
ExtractedText
ExtractedText.confidence
ExtractedText.metadata
ExtractedText.model_config
ExtractedText.producer_extractor_id
ExtractedText.source_stage_index
ExtractedText.text
ExtractionSnapshotListEntry
ExtractionSnapshotListEntry.catalog_generated_at
ExtractionSnapshotListEntry.configuration_id
ExtractionSnapshotListEntry.configuration_name
ExtractionSnapshotListEntry.created_at
ExtractionSnapshotListEntry.extractor_id
ExtractionSnapshotListEntry.model_config
ExtractionSnapshotListEntry.snapshot_id
ExtractionSnapshotListEntry.stats
ExtractionSnapshotReference
ExtractionSnapshotReference.as_string()
ExtractionSnapshotReference.extractor_id
ExtractionSnapshotReference.model_config
ExtractionSnapshotReference.snapshot_id
ExtractionStageOutput
ExtractionStageOutput.confidence
ExtractionStageOutput.error_message
ExtractionStageOutput.error_type
ExtractionStageOutput.extractor_id
ExtractionStageOutput.metadata
ExtractionStageOutput.model_config
ExtractionStageOutput.producer_extractor_id
ExtractionStageOutput.source_stage_index
ExtractionStageOutput.stage_index
ExtractionStageOutput.status
ExtractionStageOutput.text
ExtractionStageOutput.text_characters
IngestResult
IngestResult.item_id
IngestResult.model_config
IngestResult.relpath
IngestResult.sha256
PipelineAnalysisConfig
PipelineAnalysisConfig.configuration
PipelineAnalysisConfig.kind
PipelineAnalysisConfig.model_config
PipelineCorpusSelector
PipelineCorpusSelector.collection
PipelineCorpusSelector.model_config
PipelineCorpusSelector.path
PipelineCorpusSelector.selector
PipelineExtractionConfig
PipelineExtractionConfig.model_config
PipelineExtractionConfig.recipe
PipelineMirrorConfig
PipelineMirrorConfig.collection
PipelineMirrorConfig.model_config
PipelineRecipeConfig
PipelineRecipeConfig.analysis
PipelineRecipeConfig.corpus
PipelineRecipeConfig.extraction
PipelineRecipeConfig.mirror
PipelineRecipeConfig.model_config
PipelineRecipeConfig.retrieval
PipelineRetrievalConfig
PipelineRetrievalConfig.configuration
PipelineRetrievalConfig.model_config
PipelineRetrievalConfig.retriever
QueryBudget
QueryBudget.max_items_per_source
QueryBudget.max_total_items
QueryBudget.maximum_total_characters
QueryBudget.model_config
QueryBudget.offset
RemoteCollectionPullResult
RemoteCollectionPullResult.archived
RemoteCollectionPullResult.created
RemoteCollectionPullResult.discovered
RemoteCollectionPullResult.errored
RemoteCollectionPullResult.mirrored
RemoteCollectionPullResult.model_config
RemoteCorpusCollectionConfig
RemoteCorpusCollectionConfig.auto_create
RemoteCorpusCollectionConfig.collection_name
RemoteCorpusCollectionConfig.corpus_root
RemoteCorpusCollectionConfig.created_at
RemoteCorpusCollectionConfig.deletion_policy
RemoteCorpusCollectionConfig.discovery
RemoteCorpusCollectionConfig.model_config
RemoteCorpusCollectionConfig.schema_version
RemoteCorpusCollectionConfig.source
RemoteCorpusCollectionDiscovery
RemoteCorpusCollectionDiscovery.depth
RemoteCorpusCollectionDiscovery.include_root_files
RemoteCorpusCollectionDiscovery.mode
RemoteCorpusCollectionDiscovery.model_config
RemoteCorpusSourceConfig
RemoteCorpusSourceConfig.bucket
RemoteCorpusSourceConfig.container
RemoteCorpusSourceConfig.kind
RemoteCorpusSourceConfig.model_config
RemoteCorpusSourceConfig.name
RemoteCorpusSourceConfig.prefix
RemoteCorpusSourceConfig.profile
RemoteSourceItem
RemoteSourceItem.content_type
RemoteSourceItem.etag
RemoteSourceItem.key
RemoteSourceItem.last_modified
RemoteSourceItem.model_config
RemoteSourceItem.size
RemoteSourceItem.source_uri
RemoteSourcePullResult
RemoteSourcePullResult.downloaded
RemoteSourcePullResult.errored
RemoteSourcePullResult.listed
RemoteSourcePullResult.model_config
RemoteSourcePullResult.pruned
RemoteSourcePullResult.skipped
RemoteSourcePullResult.updated
RetrievalResult
RetrievalResult.budget
RetrievalResult.configuration_id
RetrievalResult.evidence
RetrievalResult.generated_at
RetrievalResult.model_config
RetrievalResult.query_text
RetrievalResult.retriever_id
RetrievalResult.snapshot_id
RetrievalResult.stats
RetrievalSnapshot
RetrievalSnapshot.catalog_generated_at
RetrievalSnapshot.configuration
RetrievalSnapshot.corpus_uri
RetrievalSnapshot.created_at
RetrievalSnapshot.model_config
RetrievalSnapshot.snapshot_artifacts
RetrievalSnapshot.snapshot_id
RetrievalSnapshot.stats
parse_extraction_snapshot_reference()
apply_budget()
create_configuration_manifest()
create_snapshot_manifest()
hash_text()
CharacterBudget
CharacterBudget.max_characters
CharacterBudget.model_config
ContextPack
ContextPack.blocks
ContextPack.evidence_count
ContextPack.model_config
ContextPack.text
ContextPackBlock
ContextPackBlock.evidence_item_id
ContextPackBlock.metadata
ContextPackBlock.model_config
ContextPackBlock.text
ContextPackPolicy
ContextPackPolicy.include_metadata
ContextPackPolicy.join_with
ContextPackPolicy.metadata_fields
ContextPackPolicy.model_config
ContextPackPolicy.ordering
TokenBudget
TokenBudget.max_tokens
TokenBudget.model_config
TokenCounter
TokenCounter.model_config
TokenCounter.tokenizer_id
build_context_pack()
count_tokens()
fit_context_pack_to_character_budget()
fit_context_pack_to_token_budget()
BenchmarkConfig
BenchmarkConfig.aggregate_weights
BenchmarkConfig.benchmark_name
BenchmarkConfig.categories
BenchmarkConfig.load()
BenchmarkConfig.output_dir
BenchmarkConfig.pipelines
BenchmarkReport
BenchmarkReport.avg_bigram_overlap
BenchmarkReport.avg_f1
BenchmarkReport.avg_lcs_ratio
BenchmarkReport.avg_precision
BenchmarkReport.avg_recall
BenchmarkReport.avg_sequence_accuracy
BenchmarkReport.avg_trigram_overlap
BenchmarkReport.avg_word_error_rate
BenchmarkReport.corpus_path
BenchmarkReport.evaluation_timestamp
BenchmarkReport.max_f1
BenchmarkReport.median_f1
BenchmarkReport.median_lcs_ratio
BenchmarkReport.median_precision
BenchmarkReport.median_recall
BenchmarkReport.median_sequence_accuracy
BenchmarkReport.median_word_error_rate
BenchmarkReport.min_f1
BenchmarkReport.per_document_results
BenchmarkReport.pipeline_configuration
BenchmarkReport.print_summary()
BenchmarkReport.processing_time_seconds
BenchmarkReport.to_csv()
BenchmarkReport.to_json()
BenchmarkReport.total_documents
BenchmarkResult
BenchmarkResult.aggregate
BenchmarkResult.benchmark_name
BenchmarkResult.benchmark_version
BenchmarkResult.categories
BenchmarkResult.print_summary()
BenchmarkResult.recommendations
BenchmarkResult.timestamp
BenchmarkResult.to_json()
BenchmarkResult.to_markdown()
BenchmarkResult.total_documents
BenchmarkResult.total_processing_time_seconds
BenchmarkRunner
BenchmarkRunner.run_all()
BenchmarkRunner.run_category()
CategoryConfig
CategoryConfig.corpus_path
CategoryConfig.dataset
CategoryConfig.ground_truth_subdir
CategoryConfig.name
CategoryConfig.pipelines
CategoryConfig.primary_metric
CategoryConfig.subset_size
CategoryConfig.tags
CategoryResult
CategoryResult.best_pipeline
CategoryResult.best_score
CategoryResult.category_name
CategoryResult.dataset
CategoryResult.documents_evaluated
CategoryResult.pipelines
CategoryResult.primary_metric
CategoryResult.primary_score
CategoryResult.processing_time_seconds
OCRBenchmark
OCRBenchmark.evaluate_extraction()
OCREvaluationResult
OCREvaluationResult.bigram_overlap
OCREvaluationResult.character_accuracy
OCREvaluationResult.document_id
OCREvaluationResult.extracted_text
OCREvaluationResult.f1_score
OCREvaluationResult.false_negatives
OCREvaluationResult.false_positives
OCREvaluationResult.ground_truth_text
OCREvaluationResult.image_path
OCREvaluationResult.lcs_ratio
OCREvaluationResult.normalized_edit_distance
OCREvaluationResult.precision
OCREvaluationResult.print_summary()
OCREvaluationResult.recall
OCREvaluationResult.sequence_accuracy
OCREvaluationResult.to_dict()
OCREvaluationResult.trigram_overlap
OCREvaluationResult.true_positives
OCREvaluationResult.word_count_gt
OCREvaluationResult.word_count_ocr
OCREvaluationResult.word_error_rate
calculate_character_accuracy()
calculate_ngram_overlap()
calculate_word_metrics()
calculate_word_order_metrics()
evaluate_snapshot()
load_dataset()
ExtractionConfigurationManifest
ExtractionConfigurationManifest.configuration
ExtractionConfigurationManifest.configuration_id
ExtractionConfigurationManifest.created_at
ExtractionConfigurationManifest.extractor_id
ExtractionConfigurationManifest.model_config
ExtractionConfigurationManifest.name
ExtractionItemResult
ExtractionItemResult.error_message
ExtractionItemResult.error_type
ExtractionItemResult.final_metadata_relpath
ExtractionItemResult.final_producer_extractor_id
ExtractionItemResult.final_source_stage_index
ExtractionItemResult.final_stage_extractor_id
ExtractionItemResult.final_stage_index
ExtractionItemResult.final_text_relpath
ExtractionItemResult.item_id
ExtractionItemResult.model_config
ExtractionItemResult.stage_results
ExtractionItemResult.status
ExtractionSnapshotManifest
ExtractionSnapshotManifest.catalog_generated_at
ExtractionSnapshotManifest.configuration
ExtractionSnapshotManifest.corpus_uri
ExtractionSnapshotManifest.created_at
ExtractionSnapshotManifest.items
ExtractionSnapshotManifest.model_config
ExtractionSnapshotManifest.snapshot_id
ExtractionSnapshotManifest.stats
ExtractionStageResult
ExtractionStageResult.confidence
ExtractionStageResult.error_message
ExtractionStageResult.error_type
ExtractionStageResult.extractor_id
ExtractionStageResult.metadata_relpath
ExtractionStageResult.model_config
ExtractionStageResult.producer_extractor_id
ExtractionStageResult.source_stage_index
ExtractionStageResult.stage_index
ExtractionStageResult.status
ExtractionStageResult.text_characters
ExtractionStageResult.text_relpath
build_extraction_snapshot()
create_extraction_configuration_manifest()
create_extraction_snapshot_manifest()
load_or_build_extraction_snapshot()
write_extracted_metadata_artifact()
write_extracted_text_artifact()
write_extraction_latest_pointer()
write_extraction_snapshot_manifest()
write_pipeline_stage_metadata_artifact()
write_pipeline_stage_text_artifact()
get_extractor()
build_graph_snapshot()
create_graph_configuration_manifest()
create_graph_id()
create_graph_snapshot_manifest()
latest_graph_snapshot_reference()
list_graph_snapshots()
load_graph_snapshot_manifest()
resolve_graph_snapshot_reference()
write_graph_latest_pointer()
write_graph_snapshot_manifest()
GraphConfigurationManifest
GraphConfigurationManifest.configuration
GraphConfigurationManifest.configuration_id
GraphConfigurationManifest.created_at
GraphConfigurationManifest.extractor_id
GraphConfigurationManifest.model_config
GraphConfigurationManifest.name
GraphEdge
GraphEdge.dst
GraphEdge.edge_id
GraphEdge.edge_type
GraphEdge.model_config
GraphEdge.properties
GraphEdge.src
GraphEdge.weight
GraphExtractionItemSummary
GraphExtractionItemSummary.edge_count
GraphExtractionItemSummary.error_message
GraphExtractionItemSummary.item_id
GraphExtractionItemSummary.model_config
GraphExtractionItemSummary.node_count
GraphExtractionItemSummary.status
GraphExtractionResult
GraphExtractionResult.edges
GraphExtractionResult.item_id
GraphExtractionResult.metadata
GraphExtractionResult.model_config
GraphExtractionResult.nodes
GraphNode
GraphNode.label
GraphNode.model_config
GraphNode.node_id
GraphNode.node_type
GraphNode.properties
GraphSchemaModel
GraphSchemaModel.model_config
GraphSchemaModel.schema_version
GraphSnapshotListEntry
GraphSnapshotListEntry.catalog_generated_at
GraphSnapshotListEntry.configuration_id
GraphSnapshotListEntry.configuration_name
GraphSnapshotListEntry.created_at
GraphSnapshotListEntry.extractor_id
GraphSnapshotListEntry.graph_id
GraphSnapshotListEntry.model_config
GraphSnapshotListEntry.snapshot_id
GraphSnapshotListEntry.stats
GraphSnapshotManifest
GraphSnapshotManifest.catalog_generated_at
GraphSnapshotManifest.configuration
GraphSnapshotManifest.corpus_uri
GraphSnapshotManifest.created_at
GraphSnapshotManifest.extraction_snapshot
GraphSnapshotManifest.graph_id
GraphSnapshotManifest.model_config
GraphSnapshotManifest.snapshot_id
GraphSnapshotManifest.stats
GraphSnapshotReference
GraphSnapshotReference.as_string()
GraphSnapshotReference.extractor_id
GraphSnapshotReference.model_config
GraphSnapshotReference.snapshot_id
parse_graph_snapshot_reference()
Neo4jSettings
Neo4jSettings.auto_start
Neo4jSettings.bolt_port
Neo4jSettings.container_name
Neo4jSettings.database
Neo4jSettings.docker_image
Neo4jSettings.http_port
Neo4jSettings.password
Neo4jSettings.uri
Neo4jSettings.username
create_neo4j_driver()
ensure_neo4j_running()
resolve_neo4j_settings()
write_graph_records()
available_graph_extractors()
get_graph_extractor()