Source code for biblicus.knowledge_base

"""
High-level knowledge base workflow for turnkey usage.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Optional, Sequence

from pydantic import BaseModel, ConfigDict, Field

from .context import (
    ContextPack,
    ContextPackPolicy,
    TokenBudget,
    build_context_pack,
    fit_context_pack_to_token_budget,
)
from .corpus import Corpus
from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
from .retrievers import get_retriever


def _default_query_budget() -> QueryBudget:
    """
    Build the query budget used when no explicit budget is configured.

    :return: Budget of 5 total items and 2000 total characters, with no per-source item cap.
    :rtype: QueryBudget
    """
    return QueryBudget(
        max_total_items=5,
        maximum_total_characters=2000,
        max_items_per_source=None,
    )


class KnowledgeBaseDefaults(BaseModel):
    """
    Default settings applied by the knowledge base workflow.

    Unknown fields are rejected (``extra="forbid"``).

    :ivar retriever_id: Identifier of the retriever used for retrieval (non-empty, default ``"scan"``).
    :vartype retriever_id: str
    :ivar configuration_name: Human-readable name of the retrieval configuration (non-empty).
    :vartype configuration_name: str
    :ivar query_budget: Query budget applied to retrieval when no override is given.
    :vartype query_budget: QueryBudget
    :ivar tags: Tags applied when the folder is imported (empty by default).
    :vartype tags: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    retriever_id: str = Field(default="scan", min_length=1)
    configuration_name: str = Field(default="Knowledge base", min_length=1)
    # A factory is used so every instance gets its own budget object.
    query_budget: QueryBudget = Field(default_factory=_default_query_budget)
    tags: List[str] = Field(default_factory=list)
@dataclass
class KnowledgeBase:
    """
    Turnkey wrapper that bundles a corpus, a retriever, and a retrieval snapshot.

    :ivar corpus: Corpus instance holding the ingested items.
    :vartype corpus: Corpus
    :ivar retriever_id: Identifier of the retriever used for queries.
    :vartype retriever_id: str
    :ivar snapshot: Retrieval snapshot built for this knowledge base.
    :vartype snapshot: RetrievalSnapshot
    :ivar defaults: Effective default configuration for this knowledge base.
    :vartype defaults: KnowledgeBaseDefaults
    :ivar _temp_dir: Optional temporary directory handle; ``from_folder`` always stores ``None``.
    :vartype _temp_dir: TemporaryDirectory or None
    """

    corpus: Corpus
    retriever_id: str
    snapshot: RetrievalSnapshot
    defaults: KnowledgeBaseDefaults
    _temp_dir: Optional[TemporaryDirectory]

    @classmethod
    def from_folder(
        cls,
        folder: str | Path,
        *,
        retriever_id: Optional[str] = None,
        configuration_name: Optional[str] = None,
        query_budget: Optional[QueryBudget] = None,
        tags: Optional[Sequence[str]] = None,
        corpus_root: Optional[str | Path] = None,
    ) -> "KnowledgeBase":
        """
        Create a knowledge base by importing a folder of files and building a snapshot.

        When ``corpus_root`` is omitted, the resolved source folder itself is used as the
        corpus root. An existing corpus is opened if ``metadata/config.json`` is present
        under the corpus root; otherwise a new corpus is initialized there.

        :param folder: Folder containing source files.
        :type folder: str or Path
        :param retriever_id: Optional retriever identifier override.
        :type retriever_id: str or None
        :param configuration_name: Optional configuration name override.
        :type configuration_name: str or None
        :param query_budget: Optional query budget override.
        :type query_budget: QueryBudget or None
        :param tags: Optional tags to apply during import.
        :type tags: Sequence[str] or None
        :param corpus_root: Optional corpus root override. Must contain the source folder.
        :type corpus_root: str or Path or None
        :return: Knowledge base instance.
        :rtype: KnowledgeBase
        :raises FileNotFoundError: If the folder does not exist.
        :raises NotADirectoryError: If the folder is not a directory.
        :raises ValueError: If the folder is not located inside ``corpus_root``.
        """
        source_root = Path(folder).resolve()
        if not source_root.exists():
            raise FileNotFoundError(f"Knowledge base folder does not exist: {source_root}")
        if not source_root.is_dir():
            raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")

        # Falsy overrides (None or empty) fall back to the stock defaults.
        stock = KnowledgeBaseDefaults()
        chosen_retriever_id = retriever_id or stock.retriever_id
        chosen_configuration_name = configuration_name or stock.configuration_name
        chosen_budget = query_budget or stock.query_budget
        if tags is None:
            chosen_tags = stock.tags
        else:
            chosen_tags = list(tags)

        if corpus_root is not None:
            storage_root = Path(corpus_root).resolve()
            if not source_root.is_relative_to(storage_root):
                raise ValueError("Knowledge base source folder must live inside the corpus root.")
        else:
            # No explicit root: the corpus lives directly in the source folder.
            storage_root = source_root

        config_file = storage_root / "metadata" / "config.json"
        if config_file.is_file():
            corpus = Corpus.open(storage_root)
        else:
            corpus = Corpus.init(storage_root)
        corpus.import_tree(source_root, tags=chosen_tags)

        snapshot = get_retriever(chosen_retriever_id).build_snapshot(
            corpus,
            configuration_name=chosen_configuration_name,
            configuration={},
        )
        effective_defaults = KnowledgeBaseDefaults(
            retriever_id=chosen_retriever_id,
            configuration_name=chosen_configuration_name,
            query_budget=chosen_budget,
            tags=chosen_tags,
        )
        return cls(
            corpus=corpus,
            retriever_id=chosen_retriever_id,
            snapshot=snapshot,
            defaults=effective_defaults,
            # No temporary directory is created on this code path.
            _temp_dir=None,
        )

    def query(self, query_text: str, *, budget: Optional[QueryBudget] = None) -> RetrievalResult:
        """
        Run a query against the knowledge base and return the evidence found.

        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Optional budget override; the stored default budget is used otherwise.
        :type budget: QueryBudget or None
        :return: Retrieval result containing evidence.
        :rtype: RetrievalResult
        """
        retriever = get_retriever(self.retriever_id)
        if budget:
            active_budget = budget
        else:
            active_budget = self.defaults.query_budget
        return retriever.query(
            self.corpus,
            snapshot=self.snapshot,
            query_text=query_text,
            budget=active_budget,
        )

    def context_pack(
        self,
        result: RetrievalResult,
        *,
        join_with: str = "\n\n",
        max_tokens: Optional[int] = None,
    ) -> ContextPack:
        """
        Turn a retrieval result into a context pack, optionally fitted to a token budget.

        :param result: Retrieval result to convert into context.
        :type result: RetrievalResult
        :param join_with: Join string for evidence blocks.
        :type join_with: str
        :param max_tokens: Optional token budget for the context pack.
        :type max_tokens: int or None
        :return: Context pack text and metadata.
        :rtype: ContextPack
        """
        pack_policy = ContextPackPolicy(join_with=join_with)
        pack = build_context_pack(result, policy=pack_policy)
        if max_tokens is not None:
            # Only fit to a budget when the caller asked for one.
            return fit_context_pack_to_token_budget(
                pack,
                policy=pack_policy,
                token_budget=TokenBudget(max_tokens=max_tokens),
            )
        return pack