"""
Graph extraction models for Biblicus.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, ConfigDict, Field
from ..constants import GRAPH_SCHEMA_VERSION
[docs]
class GraphSchemaModel(BaseModel):
    """
    Shared base class for versioned graph extraction records.

    The model configuration forbids extra fields, so input carrying unknown keys
    fails validation instead of being silently accepted.

    :ivar schema_version: Graph schema version. Defaults to ``GRAPH_SCHEMA_VERSION``
        and must be greater than or equal to 1.
    :vartype schema_version: int
    """

    # Reject any field that is not declared on the model.
    model_config = ConfigDict(extra="forbid")

    schema_version: int = Field(GRAPH_SCHEMA_VERSION, ge=1)
[docs]
class GraphConfigurationManifest(BaseModel):
    """
    Configuration record that makes a graph extraction snapshot reproducible.

    All identifier and timestamp fields are required plain strings; this model does
    not check their format. Extra fields are rejected.

    :ivar configuration_id: Deterministic configuration identifier.
    :vartype configuration_id: str
    :ivar extractor_id: Graph extractor identifier.
    :vartype extractor_id: str
    :ivar name: Human-readable configuration name.
    :vartype name: str
    :ivar created_at: ISO 8601 timestamp, stored as a string.
    :vartype created_at: str
    :ivar configuration: Extractor-specific configuration values. Defaults to a new
        empty dictionary for each instance.
    :vartype configuration: dict[str, Any]
    """

    # Reject any field that is not declared on the model.
    model_config = ConfigDict(extra="forbid")

    # Identity of the configuration and of the extractor it belongs to.
    configuration_id: str
    extractor_id: str
    name: str

    # Creation time, kept as text.
    created_at: str

    # Free-form extractor settings.
    configuration: Dict[str, Any] = Field(default_factory=dict)
[docs]
class GraphNode(GraphSchemaModel):
    """
    A node produced by graph extraction from a corpus item.

    The three identifying strings are required and must contain at least one
    character each.

    :ivar node_id: Deterministic node identifier.
    :vartype node_id: str
    :ivar node_type: Node type identifier.
    :vartype node_type: str
    :ivar label: Human-readable label.
    :vartype label: str
    :ivar properties: Node-specific properties. Defaults to a new empty dictionary
        for each instance.
    :vartype properties: dict[str, Any]
    """

    # ``...`` marks each field as required; empty strings are rejected.
    node_id: str = Field(..., min_length=1)
    node_type: str = Field(..., min_length=1)
    label: str = Field(..., min_length=1)

    properties: Dict[str, Any] = Field(default_factory=dict)
[docs]
class GraphEdge(GraphSchemaModel):
    """
    A directed edge produced by graph extraction from a corpus item.

    The edge identifier, both endpoints and the edge type are required and must
    contain at least one character each.

    :ivar edge_id: Deterministic edge identifier.
    :vartype edge_id: str
    :ivar src: Source node identifier.
    :vartype src: str
    :ivar dst: Destination node identifier.
    :vartype dst: str
    :ivar edge_type: Edge type identifier.
    :vartype edge_type: str
    :ivar weight: Edge weight. Defaults to ``1.0``.
    :vartype weight: float
    :ivar properties: Edge-specific properties. Defaults to a new empty dictionary
        for each instance.
    :vartype properties: dict[str, Any]
    """

    # ``...`` marks each field as required; empty strings are rejected.
    edge_id: str = Field(..., min_length=1)
    src: str = Field(..., min_length=1)
    dst: str = Field(..., min_length=1)
    edge_type: str = Field(..., min_length=1)

    weight: float = Field(1.0)
    properties: Dict[str, Any] = Field(default_factory=dict)
[docs]
class GraphSnapshotManifest(BaseModel):
    """
    Manifest record describing one graph extraction snapshot.

    Every field except ``stats`` is required. String fields are stored as given;
    this model does not check their format. Extra fields are rejected.

    :ivar snapshot_id: Unique snapshot identifier.
    :vartype snapshot_id: str
    :ivar graph_id: Deterministic graph identifier.
    :vartype graph_id: str
    :ivar configuration: Configuration manifest for this snapshot.
    :vartype configuration: GraphConfigurationManifest
    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
    :vartype corpus_uri: str
    :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
    :vartype catalog_generated_at: str
    :ivar extraction_snapshot: Extraction snapshot reference.
    :vartype extraction_snapshot: str
    :ivar created_at: ISO 8601 timestamp for snapshot creation, stored as a string.
    :vartype created_at: str
    :ivar stats: Snapshot statistics. Defaults to a new empty dictionary for each
        instance.
    :vartype stats: dict[str, Any]
    """

    # Reject any field that is not declared on the model.
    model_config = ConfigDict(extra="forbid")

    # Identity of the snapshot and of the graph it contains.
    snapshot_id: str
    graph_id: str

    # Nested configuration manifest for this snapshot.
    configuration: GraphConfigurationManifest

    # Provenance: corpus root, catalog timestamp and extraction snapshot reference.
    corpus_uri: str
    catalog_generated_at: str
    extraction_snapshot: str

    created_at: str
    stats: Dict[str, Any] = Field(default_factory=dict)
[docs]
class GraphSnapshotReference(BaseModel):
    """
    Pointer to a graph extraction snapshot, identified by extractor and snapshot.

    Both parts are required and must contain at least one character. Extra fields
    are rejected.

    :ivar extractor_id: Graph extractor identifier.
    :vartype extractor_id: str
    :ivar snapshot_id: Graph snapshot identifier.
    :vartype snapshot_id: str
    """

    # Reject any field that is not declared on the model.
    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(..., min_length=1)
    snapshot_id: str = Field(..., min_length=1)

    def as_string(self) -> str:
        """
        Render the reference as one colon-separated string.

        :return: Reference in the form extractor_id:snapshot_id.
        :rtype: str
        """
        parts = (self.extractor_id, self.snapshot_id)
        return ":".join(parts)
[docs]
class GraphSnapshotListEntry(BaseModel):
    """
    Summary entry for a graph extraction snapshot stored in a corpus.

    Every string field is required and must contain at least one character. Extra
    fields are rejected.

    :ivar extractor_id: Graph extractor identifier.
    :vartype extractor_id: str
    :ivar snapshot_id: Graph snapshot identifier.
    :vartype snapshot_id: str
    :ivar graph_id: Deterministic graph identifier.
    :vartype graph_id: str
    :ivar configuration_id: Deterministic configuration identifier.
    :vartype configuration_id: str
    :ivar configuration_name: Human-readable configuration name.
    :vartype configuration_name: str
    :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
    :vartype catalog_generated_at: str
    :ivar created_at: ISO 8601 timestamp for snapshot creation, stored as a string.
    :vartype created_at: str
    :ivar stats: Snapshot statistics. Defaults to a new empty dictionary for each
        instance.
    :vartype stats: dict[str, Any]
    """

    # Reject any field that is not declared on the model.
    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    snapshot_id: str = Field(min_length=1)
    graph_id: str = Field(min_length=1)
    configuration_id: str = Field(min_length=1)
    configuration_name: str = Field(min_length=1)
    catalog_generated_at: str = Field(min_length=1)
    created_at: str = Field(min_length=1)

    # Annotated as ``Dict[str, Any]`` to agree with the documented type above and
    # with ``GraphSnapshotManifest.stats``.
    stats: Dict[str, Any] = Field(default_factory=dict)
[docs]
def parse_graph_snapshot_reference(value: str) -> GraphSnapshotReference:
    """
    Build a graph snapshot reference from ``extractor_id:snapshot_id`` text.

    Only the first colon separates the two parts, so any later colons stay in the
    snapshot identifier. Whitespace around each part is trimmed before the parts
    are checked.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed graph snapshot reference.
    :rtype: GraphSnapshotReference
    :raises ValueError: If the string has no colon, or if either part is empty
        after trimming.
    """
    head, separator, tail = value.partition(":")
    # ``partition`` returns an empty separator when no colon is present.
    if not separator:
        raise ValueError("Graph snapshot reference must be extractor_id:snapshot_id")

    extractor_id = head.strip()
    snapshot_id = tail.strip()
    if not (extractor_id and snapshot_id):
        raise ValueError(
            "Graph snapshot reference must be extractor_id:snapshot_id with non-empty parts"
        )

    return GraphSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)