"""
finamt.agents.prompts
~~~~~~~~~~~~~~~~~~~~~~~~
Short, focused prompts for the 4-agent sequential extraction pipeline.
Pattern: instruction → schema → text → output reminder (sandwich).
"""
from __future__ import annotations
RECEIPT_CATEGORIES = [
"services",
"products",
"material",
"equipment",
"software",
"licensing",
"telecommunication",
"travel",
"car",
"education",
"utilities",
"insurance",
"financial",
"office",
"marketing",
"donations",
"public_fees",
"tax_settlement",
"capital_movement",
"other",
]
_CATS = "|".join(RECEIPT_CATEGORIES)
# ---------------------------------------------------------------------------
# Agent 1 — Metadata: receipt number, date, category
# ---------------------------------------------------------------------------
AGENT1_TEMPLATE = """\
Extract receipt number, date, and category from the text below.
Return only this JSON, no other text:
{{"receipt_number": null, "receipt_date": "YYYY-MM-DD", "category": "{cats}"}}
TEXT:
{text}
Return only JSON:"""
# ---------------------------------------------------------------------------
# Agent 2 — Counterparty
# ---------------------------------------------------------------------------
AGENT2_TEMPLATE = """\
Extract the {party} from the receipt text below.
{exclusion}
Rules: name = business|person, \
vat_id = USt-IdNr|UID, \
tax_number = Steuernummer, \
Return only this JSON, no other text:
{{"name": null, "vat_id": null, "tax_number": null, "street_and_number": null, \
"address_supplement": null, "postcode": null, "city": null, "state": null, "country": null}}
TEXT:
{text}
Return only JSON:"""
# ---------------------------------------------------------------------------
# Agent 3 — Amounts
# ---------------------------------------------------------------------------
AGENT3_TEMPLATE = """\
Extract the financial amounts from the receipt text below.
Return only this JSON, no other text:
{{"total_amount": null, "vat_percentage": null, "vat_amount": null, "currency": null}}
Rules: all numeric values are numbers, not strings. \
German number format "1.234,56" means 1234.56. \
vat_percentage is the rate e.g. 19.0 for 19%%. \
vat_amount is the absolute tax amount, not the rate. \
currency is the ISO 4217 code e.g. EUR, USD, GBP.
TEXT:
{text}
Return only JSON:"""
# ---------------------------------------------------------------------------
# Agent 4 — Line items
# ---------------------------------------------------------------------------
AGENT4_TEMPLATE = """\
Extract all line items from the receipt text below.
Return only this JSON, no other text:
{{"items": [{{"description": null, "vat_rate": null, "vat_amount": null, "total_price": null}}]}}
Rules: all numeric values are numbers, not strings. \
German number format "1.234,56" means 1234.56. \
vat_rate is the percentage e.g. 19.0. \
If no line items exist return {{"items": []}}.
TEXT:
{text}
Return only JSON:"""
# ---------------------------------------------------------------------------
# Builder functions
# ---------------------------------------------------------------------------
def _truncate(text: str, max_chars: int = 3000) -> str:
"""Keep prompts short for local models."""
if len(text) <= max_chars:
return text
return text[:max_chars] + "\n[... truncated ...]"
[docs]
def build_agent1_prompt(text: str) -> str:
return AGENT1_TEMPLATE.format(cats=_CATS, text=_truncate(text))
[docs]
def build_agent2_prompt(text: str, receipt_type: str, taxpayer_info: dict | None = None) -> str:
if receipt_type == "purchase":
party = "vendor/supplier"
else:
party = "client/customer"
exclusion = ""
if taxpayer_info:
parts: list[str] = []
if taxpayer_info.get("name"):
parts.append(f"Name: {taxpayer_info['name']}")
if taxpayer_info.get("vat_id"):
parts.append(f"VAT ID: {taxpayer_info['vat_id']}")
if taxpayer_info.get("tax_number"):
parts.append(f"Tax Number: {taxpayer_info['tax_number']}")
if taxpayer_info.get("address"):
parts.append(f"Address: {taxpayer_info['address']}")
if parts:
exclusion = (
f"IMPORTANT: The following data belong to the USER not {party} "
f"— do NOT extract it: "
+ "; ".join(parts)
+ ". Instead, find other suitable data for these fields."
)
return AGENT2_TEMPLATE.format(party=party, exclusion=exclusion, text=_truncate(text))
[docs]
def build_agent3_prompt(text: str) -> str:
return AGENT3_TEMPLATE.format(text=_truncate(text))
[docs]
def build_agent4_prompt(text: str) -> str:
return AGENT4_TEMPLATE.format(text=_truncate(text))
__all__ = [
"RECEIPT_CATEGORIES",
"build_agent1_prompt",
"build_agent2_prompt",
"build_agent3_prompt",
"build_agent4_prompt",
]