Source code for biblicus.retrieval

"""
Shared retrieval helpers for Biblicus retrievers.
"""

from __future__ import annotations

import hashlib
import json
from typing import Any, Dict, Iterable, List, Optional

from .corpus import Corpus
from .models import (
    ConfigurationManifest,
    Evidence,
    QueryBudget,
    RetrievalSnapshot,
)
from .time import utc_now_iso


def create_configuration_manifest(
    *,
    retriever_id: str,
    name: str,
    configuration: Dict[str, Any],
    description: Optional[str] = None,
) -> ConfigurationManifest:
    """
    Build a configuration manifest whose identifier is derived from its content.

    The identifier is the SHA-256 hex digest of ``"<retriever_id>:<json>"``,
    where ``<json>`` is the configuration serialized with sorted keys and
    compact separators. ``name`` and ``description`` do not take part in the
    identifier, so the same retriever and configuration always yield the same
    ``configuration_id``.

    :param retriever_id: Retriever identifier for the configuration.
    :type retriever_id: str
    :param name: Human-readable configuration name.
    :type name: str
    :param configuration: Retriever-specific configuration values; must be
        serializable by :func:`json.dumps`.
    :type configuration: dict[str, Any]
    :param description: Optional configuration description.
    :type description: str or None
    :return: Configuration manifest with a content-derived identifier.
    :rtype: ConfigurationManifest
    """
    # Sorted keys and fixed separators keep the serialized form independent of
    # dictionary insertion order.
    canonical_configuration = json.dumps(
        configuration,
        sort_keys=True,
        separators=(",", ":"),
    )
    hasher = hashlib.sha256()
    hasher.update(f"{retriever_id}:{canonical_configuration}".encode("utf-8"))

    return ConfigurationManifest(
        configuration_id=hasher.hexdigest(),
        retriever_id=retriever_id,
        name=name,
        created_at=utc_now_iso(),
        configuration=configuration,
        description=description,
    )
def create_snapshot_manifest(
    corpus: Corpus,
    *,
    configuration: ConfigurationManifest,
    stats: Dict[str, Any],
    snapshot_artifacts: Optional[List[str]] = None,
) -> RetrievalSnapshot:
    """
    Build a retrieval snapshot manifest bound to the corpus catalog as loaded now.

    The snapshot identifier is the SHA-256 hex digest of
    ``"<configuration_id>:<created_at>"``, so it depends on both the
    configuration and the creation timestamp returned by ``utc_now_iso``.

    :param corpus: Corpus used to generate the snapshot; its catalog is loaded
        to record ``corpus_uri`` and ``generated_at``.
    :type corpus: Corpus
    :param configuration: Configuration manifest for the snapshot.
    :type configuration: ConfigurationManifest
    :param stats: Retriever-specific snapshot statistics.
    :type stats: dict[str, Any]
    :param snapshot_artifacts: Optional relative paths to materialized artifacts.
    :type snapshot_artifacts: list[str] or None
    :return: Snapshot manifest.
    :rtype: RetrievalSnapshot
    """
    catalog = corpus.load_catalog()
    created_at = utc_now_iso()

    snapshot_seed = f"{configuration.configuration_id}:{created_at}"
    hasher = hashlib.sha256()
    hasher.update(snapshot_seed.encode("utf-8"))

    # Always hand the manifest its own list so the caller's list is not shared.
    if snapshot_artifacts:
        artifact_paths = list(snapshot_artifacts)
    else:
        artifact_paths = []

    return RetrievalSnapshot(
        snapshot_id=hasher.hexdigest(),
        configuration=configuration,
        corpus_uri=catalog.corpus_uri,
        catalog_generated_at=catalog.generated_at,
        created_at=created_at,
        snapshot_artifacts=artifact_paths,
        stats=stats,
    )
def hash_text(text: str) -> str:
    """
    Compute a provenance hash for a text payload.

    The text is encoded as UTF-8 and hashed with SHA-256.

    :param text: Text to hash.
    :type text: str
    :return: Secure Hash Algorithm 256 hex digest (64 lowercase hex characters).
    :rtype: str
    """
    payload = text.encode("utf-8")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evidence]:
    """
    Trim a ranked evidence stream so that it fits a query budget.

    Candidates are visited in the order given. The first ``budget.offset``
    candidates are discarded before any other limit is considered. After that
    a candidate is kept unless its source already reached
    ``budget.max_items_per_source`` or its text would push the running
    character total past ``budget.maximum_total_characters``; such candidates
    are skipped and later ones are still considered. Iteration stops as soon
    as ``budget.max_total_items`` candidates have been kept.

    :param evidence: Ranked evidence iterable (highest score first).
    :type evidence: Iterable[Evidence]
    :param budget: Budget constraints to enforce.
    :type budget: QueryBudget
    :return: Copies of the kept evidence, re-ranked consecutively from 1.
    :rtype: list[Evidence]
    """
    per_source_cap = budget.max_items_per_source
    character_cap = budget.maximum_total_characters

    kept: List[Evidence] = []
    taken_per_source: Dict[str, int] = {}
    characters_used = 0

    for position, candidate in enumerate(evidence):
        # The offset counts raw candidates, before any budget filtering.
        if position < budget.offset:
            continue
        if len(kept) >= budget.max_total_items:
            break

        # Evidence without a source URI is grouped under its own item id.
        source_key = candidate.source_uri or candidate.item_id
        already_taken = taken_per_source.get(source_key, 0)
        if per_source_cap is not None and already_taken >= per_source_cap:
            continue

        candidate_length = len(candidate.text or "")
        if (
            character_cap is not None
            and characters_used + candidate_length > character_cap
        ):
            # Too long for the remaining budget; a later, shorter one may fit.
            continue

        kept.append(candidate)
        taken_per_source[source_key] = already_taken + 1
        characters_used += candidate_length

    # Return copies so the input evidence objects keep their original rank.
    reranked: List[Evidence] = []
    for new_rank, kept_item in enumerate(kept, start=1):
        reranked.append(kept_item.model_copy(update={"rank": new_rank}))
    return reranked