Source code for finamt.agents.config

"""
finamt.agents.config
~~~~~~~~~~~~~~~~~~~~~~~
All configuration for finamt in one place.

  Config           — general LLM, OCR, PDF and HTTP settings (env prefix: FINAMT_)
  AgentsConfig     — LLM settings for the 4-agent extraction pipeline (same FINAMT_ prefix)
  ModelConfig      — immutable snapshot returned by Config.get_model_config()
  AgentModelConfig — immutable snapshot returned by AgentsConfig.get_agent_config()

Override via environment variables or a .env file:
  FINAMT_AGENT_MODEL=mistral:7b

Recommended models (text-only, no vision required):
  mistral:7b  ← works well, recommended default
  qwen2.5:7b-instruct-q4_K_M

"""

from __future__ import annotations

import warnings
from dataclasses import dataclass

from pydantic import Field, field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

# ---------------------------------------------------------------------------
# Immutable config snapshots
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class ModelConfig:
    """Read-only bundle of the general LLM settings (used by OCR pipeline).

    Instances are frozen: fields cannot be reassigned after construction.

    Attributes:
        model: Name of the LLM model to use.
        temperature: Sampling temperature.
        top_p: Top-p sampling parameter.
        num_ctx: Context size passed to the model (presumably in tokens;
            confirm against the LLM client).
        max_retries: Number of retries.
        timeout: Request timeout (unit not stated here; presumably seconds).
    """

    model: str
    temperature: float
    top_p: float
    num_ctx: int
    max_retries: int
    timeout: int
@dataclass(frozen=True)
class AgentModelConfig:
    """Read-only bundle of the LLM settings used by the extraction agents.

    Instances are frozen: fields cannot be reassigned after construction.

    Attributes:
        model: Name of the LLM model to use.
        temperature: Sampling temperature.
        top_p: Top-p sampling parameter.
        num_ctx: Context size passed to the model (presumably in tokens;
            confirm against the LLM client).
        timeout: Per-call timeout (unit not stated here; presumably seconds).
        max_retries: Number of retries.
    """

    model: str
    temperature: float
    top_p: float
    num_ctx: int
    timeout: int
    max_retries: int
# --------------------------------------------------------------------------- # General config (OCR, PDF) # ---------------------------------------------------------------------------
class Config(BaseSettings):
    """General finamt settings: LLM sampling, OCR, PDF rendering, HTTP retry.

    Values come from ``FINAMT_``-prefixed environment variables or a ``.env``
    file; variable names are matched case-insensitively and unknown keys are
    ignored. Numeric fields are range-checked on construction.
    """

    model_config = SettingsConfigDict(
        env_prefix="FINAMT_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # LLM
    model: str = Field(default="mistral:7b")
    temperature: float = Field(default=0.1, ge=0.0, le=2.0)
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    num_ctx: int = Field(default=8192, ge=512)

    # OCR
    tesseract_cmd: str = Field(default="tesseract")
    ocr_language: str = Field(default="german")
    ocr_preprocess: bool = Field(default=True)
    ocr_timeout: int = Field(default=60, ge=5)  # PaddleOCR timeout in seconds

    # PDF rendering
    pdf_dpi: int = Field(default=150, ge=72, le=1200)

    # HTTP / retry
    max_retries: int = Field(default=3, ge=0, le=10)
    request_timeout: int = Field(default=30, ge=1)

    @field_validator("ocr_language")
    @classmethod
    def _validate_language(cls, v: str) -> str:
        """Normalise a ``+``-separated language list.

        Each entry is stripped of surrounding whitespace and blank entries
        are dropped; the survivors are re-joined with ``+``.

        Raises:
            ValueError: If no non-blank entry remains.
        """
        trimmed = (entry.strip() for entry in v.split("+"))
        languages = [entry for entry in trimmed if entry]
        if languages:
            return "+".join(languages)
        raise ValueError("ocr_language must not be empty.")

    @model_validator(mode="after")
    def _warn_temperature(self) -> Config:
        """Emit a ``UserWarning`` when ``temperature`` exceeds 0.5."""
        is_high = self.temperature > 0.5
        if is_high:
            message = f"temperature={self.temperature} is high for structured extraction."
            warnings.warn(message, UserWarning, stacklevel=2)
        return self

    def get_model_config(self) -> ModelConfig:
        """Return the current LLM-related values as a frozen ``ModelConfig``.

        ``request_timeout`` is exposed on the snapshot as ``timeout``.
        """
        snapshot = ModelConfig(
            model=self.model,
            temperature=self.temperature,
            top_p=self.top_p,
            num_ctx=self.num_ctx,
            max_retries=self.max_retries,
            timeout=self.request_timeout,
        )
        return snapshot

    # Backward-compatible uppercase aliases used by ocr_processor / cli.
    # Each one is a read-only view of the lowercase field of the same meaning.

    @property
    def DEFAULT_MODEL(self) -> str:
        """Alias for ``model``."""
        return self.model

    @property
    def TESSERACT_CMD(self) -> str:
        """Alias for ``tesseract_cmd``."""
        return self.tesseract_cmd

    @property
    def OCR_LANGUAGE(self) -> str:
        """Alias for ``ocr_language``."""
        return self.ocr_language

    @property
    def OCR_PREPROCESS(self) -> bool:
        """Alias for ``ocr_preprocess``."""
        return self.ocr_preprocess

    @property
    def PDF_DPI(self) -> int:
        """Alias for ``pdf_dpi``."""
        return self.pdf_dpi

    @property
    def MAX_RETRIES(self) -> int:
        """Alias for ``max_retries``."""
        return self.max_retries

    @property
    def REQUEST_TIMEOUT(self) -> int:
        """Alias for ``request_timeout``."""
        return self.request_timeout
# --------------------------------------------------------------------------- # Agent config (shared by all 4 extraction agents) # ---------------------------------------------------------------------------
class AgentsConfig(BaseSettings):
    """LLM settings for the 4-agent sequential extraction pipeline.

    All agents use the same model — override with FINAMT_AGENT_MODEL.
    Temperature defaults to 0.0 for deterministic JSON output.

    Values come from ``FINAMT_``-prefixed environment variables or a ``.env``
    file; variable names are matched case-insensitively and unknown keys are
    ignored. Numeric fields are range-checked on construction, using the same
    bounds ``Config`` applies to its corresponding fields, so an invalid
    value fails at load time instead of being passed on to the agents.
    """

    model_config = SettingsConfigDict(
        env_prefix="FINAMT_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    agent_model: str = Field(default="mistral:7b")
    agent_timeout: int = Field(default=60, ge=1)
    agent_num_ctx: int = Field(default=4096, ge=512)
    agent_max_retries: int = Field(default=2, ge=0, le=10)

    # NOTE(review): these two fields have no ``agent_`` prefix, so with
    # env_prefix="FINAMT_" they are populated from FINAMT_TEMPERATURE and
    # FINAMT_TOP_P — the same variables that feed Config.temperature and
    # Config.top_p. Setting either variable therefore changes both configs.
    # Field names are kept as-is to preserve the public interface.
    temperature: float = Field(default=0.0, ge=0.0, le=2.0)
    top_p: float = Field(default=1.0, ge=0.0, le=1.0)

    def get_agent_config(self) -> AgentModelConfig:
        """Return the current values as a frozen ``AgentModelConfig``.

        The ``agent_``-prefixed fields are exposed on the snapshot without
        the prefix (``agent_model`` -> ``model``, and so on).
        """
        return AgentModelConfig(
            model=self.agent_model,
            temperature=self.temperature,
            top_p=self.top_p,
            num_ctx=self.agent_num_ctx,
            timeout=self.agent_timeout,
            max_retries=self.agent_max_retries,
        )
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------

# Shared Config instance, built once when this module is imported.
cfg = Config()

# Names exported by ``from finamt.agents.config import *``.
__all__ = [
    "Config",
    "ModelConfig",
    "AgentsConfig",
    "AgentModelConfig",
    "cfg",
]