# DemozPay Prometheus alert rules.
#
# Phase D / GL-08-foundation: covers every metric the platform exposes
# today (API + gateway + ledger + recon-runner). The rules sit as code
# until the alerting backbone is deployed (Grafana + Alertmanager + paging
# provider — separate GL-08 deliverable). They are review-able now;
# loading them is a config swap.
#
# Severity guidance:
#   critical — page on-call immediately. Money is at risk OR a core RPC is
#              down.
#   warning  — investigate during business hours. SLO budget is consumed
#              faster than expected; not a fire.
#   info     — observational. No paging; surface in dashboards.
#
# Every alert MUST link a runbook. New alerts without a runbook get
# rejected at review.
groups:
  # ─── Bank webhook signature surface ─────────────────────────────────
  - name: demozpay-bank-webhooks
    interval: 30s
    rules:
      # Per-partner share of bank callbacks counted with
      # result="signature-rejected". Numerator and denominator are both 5m
      # rates of demozpay_bank_webhook_requests_total summed by partner; the
      # ratio has to stay above 0.05 for a further 5m before the alert fires.
      - alert: DemozpayBankWebhookSignatureFailureRate
        expr: |
          (
            sum by (partner) (
              rate(demozpay_bank_webhook_requests_total{result="signature-rejected"}[5m])
            )
            /
            sum by (partner) (
              rate(demozpay_bank_webhook_requests_total[5m])
            )
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Webhook signature failures > 5% on partner {{ $labels.partner }}"
          description: |
            Sustained signature-rejection rate on bank-callback for partner {{ $labels.partner }}.
            Likely causes: clock skew, key rotation, proxy mangling, attack.
          runbook: "docs/runbooks/webhook-failure.md"

      # Absolute (not relative) threshold: fires for any series whose
      # result="bad-request" callbacks exceed 0.2 per second, measured over a
      # 10m rate window and sustained for a further 10m. No aggregation is
      # applied, so each matching series is evaluated on its own.
      - alert: DemozpayBankWebhookMissingHeaders
        expr: |
          rate(demozpay_bank_webhook_requests_total{result="bad-request"}[10m])
            > 0.2
        for: 10m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "Bank-callback bad-request rate elevated on {{ $labels.partner }}"
          runbook: "docs/runbooks/webhook-failure.md"

  # ─── Settlement poller liveness + lag ───────────────────────────────
  - name: demozpay-settlement-poller
    interval: 30s
    rules:
      # Fires for any poller series whose outcome="errored" ticks exceed
      # 0.5 per second (5m rate window, sustained for a further 5m).
      - alert: DemozpaySettlementPollerErrorRate
        expr: |
          rate(demozpay_settlement_poller_ticks_total{outcome="errored"}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "Settlement poller errors > 0.5/sec"
          description: |
            The settlement poller is failing more often than expected. In-flight
            settlements may not be confirmed/rejected promptly. Check ledger
            connectivity, gateway connectivity, DB locks.
          runbook: "docs/runbooks/gateway-down.md"

      # Liveness check: pages when the poller records no ticks of any outcome.
      #
      # demozpay_settlement_poller_ticks_total is a tick counter, not a
      # timestamp, so subtracting it from time() is meaningless (the result
      # is always far above any threshold). Liveness is judged by growth
      # instead: the 5m increase, summed over every outcome, must be zero.
      # Outcomes are deliberately NOT kept separate; a healthy poller that
      # simply never errors must not look stalled.
      #
      # absent() covers the case where the series has disappeared altogether
      # (e.g. the poller is no longer scraped), in which increase() returns
      # nothing and the first clause alone could never fire.
      - alert: DemozpaySettlementPollerStalled
        expr: |
          (sum(increase(demozpay_settlement_poller_ticks_total[5m])) == 0)
            or
          absent(demozpay_settlement_poller_ticks_total)
        for: 5m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Settlement poller has not ticked in > 5 minutes"
          description: "If the poller stops, in-flight settlements are not reconciled until restart."
          runbook: "docs/runbooks/gateway-down.md"

  # ─── Outbox publisher ───────────────────────────────────────────────
  - name: demozpay-outbox
    interval: 30s
    rules:
      # Age-based check: fires once
      # demozpay_outbox_oldest_unpublished_age_seconds has stayed above 300
      # (the 5 minutes quoted in the summary) for a further 5m.
      - alert: DemozpayOutboxStale
        expr: demozpay_outbox_oldest_unpublished_age_seconds > 300
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Outbox publisher behind by > 5 minutes"
          description: |
            Oldest unpublished outbox row is over 5 minutes old. Downstream
            consumers (notifications, reconciliation triggers) are running stale.
          runbook: "docs/runbooks/outbox-publisher-stale.md" # PLANNED runbook

      # Volume-based check: fires once demozpay_outbox_unpublished_total has
      # stayed above 10000 for 10m. The expr compares the raw value rather
      # than a rate(), i.e. it treats the metric as a current backlog size.
      # NOTE(review): the `_total` suffix usually marks a counter; confirm
      # this one is exported as a gauge.
      - alert: DemozpayOutboxBacklog
        expr: demozpay_outbox_unpublished_total > 10000
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Outbox backlog > 10k rows"
          runbook: "docs/runbooks/outbox-publisher-stale.md" # PLANNED

  # ─── Dependency health ─────────────────────────────────────────────
  - name: demozpay-dependencies
    interval: 15s
    rules:
      # Pages when any demozpay_dependency_up series reads exactly 0 and
      # stays there for 1m. The summary names the failing dependency via the
      # series' `dependency` label. A series that disappears entirely does
      # not match `== 0`, so this rule alone does not cover a missing metric.
      - alert: DemozpayDependencyDown
        expr: demozpay_dependency_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Dependency {{ $labels.dependency }} is DOWN"
          runbook: "docs/runbooks/gateway-down.md"

  # ─── Account verification (Phase C) ─────────────────────────────────
  - name: demozpay-lookup-account
    interval: 30s
    rules:
      # Fires for any series whose LookupAccount failures with
      # reason="PARTNER_UNAVAILABLE" exceed 1 per second (5m rate window,
      # sustained for a further 5m).
      - alert: DemozpayLookupPartnerUnavailable
        expr: |
          rate(demozpay_integration_gateway_lookup_failure_total{reason="PARTNER_UNAVAILABLE"}[5m]) > 1
        for: 5m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Partner {{ $labels.partner }} returning PARTNER_UNAVAILABLE on LookupAccount"
          description: |
            Sustained PARTNER_UNAVAILABLE means we're failing-closed on disbursement.
            Likely partner outage. Disbursement throughput is throttled until the
            partner recovers.
          runbook: "docs/architecture/PHASE_C_LOOKUP_ACCOUNT.md"

      # Same shape as the rule above, for reason="PARTNER_TIMEOUT"; lower
      # severity.
      - alert: DemozpayLookupPartnerTimeout
        expr: |
          rate(demozpay_integration_gateway_lookup_failure_total{reason="PARTNER_TIMEOUT"}[5m]) > 1
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "Partner {{ $labels.partner }} hitting LookupAccount timeouts"
          runbook: "docs/architecture/PHASE_C_LOOKUP_ACCOUNT.md"

      # P95 LookupAccount latency per partner, above 1s for 10m.
      #
      # The bucket rates are summed by (partner, le) before the quantile is
      # taken. Without that aggregation histogram_quantile() yields one
      # quantile per full label set (e.g. per scraped instance) instead of
      # one per partner, which is what the summary reports. `le` must be
      # kept in the grouping or the quantile cannot be computed.
      - alert: DemozpayLookupP95Latency
        expr: |
          histogram_quantile(
            0.95,
            sum by (partner, le) (
              rate(demozpay_integration_gateway_lookup_latency_seconds_bucket[5m])
            )
          ) > 1.0
        for: 10m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "LookupAccount P95 latency > 1s on {{ $labels.partner }}"
          runbook: "docs/architecture/PHASE_C_LOOKUP_ACCOUNT.md"

  # ─── Ledger RPC surface (Phase D) ──────────────────────────────────
  - name: demozpay-ledger
    interval: 30s
    rules:
      # Per-RPC error ratio: requests whose outcome matches
      # internal|failed_precondition, divided by all requests, both as 5m
      # rates summed by rpc. Fires when the ratio stays above 0.02 for 5m.
      - alert: DemozpayLedgerErrorRate
        expr: |
          (
            sum by (rpc) (
              rate(demozpay_ledger_rpc_requests_total{outcome=~"internal|failed_precondition"}[5m])
            )
            /
            sum by (rpc) (
              rate(demozpay_ledger_rpc_requests_total[5m])
            )
          ) > 0.02
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Ledger RPC error rate > 2% on {{ $labels.rpc }}"
          description: |
            Ledger is the sole source of money truth. Sustained errors mean
            disburses are failing AND/OR settlement confirmations are failing.
            Investigate connectivity, DB locks, and the RPC's own logs.
          runbook: "docs/runbooks/ledger-error-rate.md" # PLANNED

      # Per-RPC P99 latency from the latency histogram's 5m bucket rates,
      # aggregated by (rpc, le). Fires when it stays above 0.5 (seconds, per
      # the metric name) for 10m.
      - alert: DemozpayLedgerP99Latency
        expr: |
          histogram_quantile(
            0.99,
            sum by (rpc, le) (
              rate(demozpay_ledger_rpc_latency_seconds_bucket[5m])
            )
          ) > 0.5
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Ledger RPC P99 latency > 500ms on {{ $labels.rpc }}"
          runbook: "docs/runbooks/ledger-error-rate.md" # PLANNED

      # Fires when the reported drift is non-zero in either direction
      # (abs() makes the sign irrelevant) and stays non-zero for 1m.
      - alert: DemozpayLedgerBankDriftNonZero
        expr: |
          abs(demozpay_ledger_reconcile_drift_santim) > 0
        for: 1m
        labels:
          severity: critical
          team: finance
        annotations:
          summary: "Ledger/bank drift on partner {{ $labels.partner }}"
          description: |
            ReconcileWithBank reported non-zero drift. Money correctness is at
            risk OR there's a recon timing issue. Open the runbook IMMEDIATELY.
          runbook: "docs/runbooks/drift-detected.md"

  # ─── Reconciliation runner heartbeat (Phase D) ─────────────────────
  - name: demozpay-reconciliation
    interval: 1m
    rules:
      - alert: DemozpayReconciliationRunnerMissedDay
        # Phase D / D2 — the recon-runner exposes its last-success via a
        # pushgateway counter. Until pushgateway lands, use the run-summary
        # JSON's emit timestamp (operator-side workflow). Rule scaffolded
        # here for when pushgateway ships.
        #
        # Takes the newest last-success timestamp per (tenant_id, partner)
        # and fires once it is more than 86400s (24h) behind time(),
        # sustained for 1h. A pair with no series at all produces no result,
        # so it is not covered by this rule.
        expr: |
          (
            time()
              - max by (tenant_id, partner) (demozpay_reconciliation_last_success_timestamp_seconds)
          ) > 86400
        for: 1h
        labels:
          severity: critical
          team: finance
        annotations:
          summary: "Daily reconciliation has not run successfully for > 24h"
          description: |
            (tenant={{ $labels.tenant_id }}, partner={{ $labels.partner }}).
            Drift detection is blind for this pair. Investigate the cron / job
            runner.
          runbook: "docs/runbooks/drift-detected.md"

      - alert: DemozpayReconciliationFlaggedLineSpike
        # Once recon-runner pushes counters: count of FLAGGED outcomes.
        # Placeholder rule — wire metric in D2 follow-up.
        #
        # increase(), not rate(): rate() is per-second, which would put the
        # threshold at 5 lines/second (18000/hour) instead of the
        # 5 lines/hour stated in the summary. increase() over [1h] is the
        # number of lines flagged within the last hour.
        expr: |
          increase(demozpay_reconciliation_flagged_lines_total[1h]) > 5
        for: 1h
        labels:
          severity: warning
          team: finance
        annotations:
          summary: "Reconciliation flagging > 5 lines/hour on {{ $labels.partner }}"
          runbook: "docs/runbooks/drift-detected.md"
