Update Joanna dispatch and infra health checks

3 weeks ago · 539ede8586
parent cb42a22f43
commit 539ede8586
11 changed files with 275 additions and 99 deletions
--- a/config/.HA_VERSION
+++ b/config/.HA_VERSION
@@ -1 +1 @@
-2026.5.0
+2026.5.1
--- a/config/.cache/brands/integrations/unifi/icon.png
+++ b/config/.cache/brands/integrations/unifi/icon.png
--- a/config/packages/README.md
+++ b/config/packages/README.md
@@ -48,9 +48,9 @@ Live collection of plug-and-play Home Assistant packages. Each YAML file in this
 | [mariadb_monitoring.yaml](mariadb_monitoring.yaml) | MariaDB health sensors and Lovelace dashboard snippet for recorder stats. | `sensor.mariadb_status`, `sensor.database_size` |
 | [docker_infrastructure.yaml](docker_infrastructure.yaml) | Docker host patching telemetry, container/stack Repairs automation, 20-minute Joanna escalation for persistent container outages using stable configured monitor membership, and weekly scheduled prune actions across docker_10/14/17/69. | `sensor.docker_*_apt_status`, `binary_sensor.*_stack_status`, `sensor.docker_stacks_down_count`, `repairs.create`, `script.joanna_dispatch` |
 | [proxmox.yaml](proxmox.yaml) | Proxmox runtime and disk pressure monitoring with Repairs + Joanna dispatch for sustained node degradations, plus nightly Frigate reboot. | `binary_sensor.proxmox*_runtime_healthy`, `sensor.proxmox*_disk_used_percentage`, `repairs.create`, `script.joanna_dispatch`, `button.qemu_docker2_101_reboot` |
-| [synology_dsm.yaml](synology_dsm.yaml) | Synology DSM integration health normalization for Carlo-NAS01 and Carlo-NVR, with Repairs + Joanna dispatch on sustained integration, security, or storage problems. | `binary_sensor.carlo_*_synology_problem`, `sensor.carlo_*_synology_problem_summary`, `repairs.create`, `script.joanna_dispatch` |
+| [synology_dsm.yaml](synology_dsm.yaml) | Synology DSM integration health normalization for Carlo-NAS01 and Carlo-NVR, with outage-aware Joanna-first handling for lone post-outage volume warnings and Repairs escalation for persistent or non-outage problems. | `binary_sensor.carlo_*_synology_problem`, `sensor.carlo_*_synology_problem_summary`, `binary_sensor.powerwall_grid_status`, `repairs.create`, `script.joanna_dispatch` |
-| [infrastructure.yaml](infrastructure.yaml) | Normalized WAN/DNS/backup/domain/cert health, Glances-backed Docker host disk pressure with bounded safe Joanna cleanup, and website uptime/latency SLO signals for Infrastructure dashboards, plus nightly backup verification and monthly Joanna HA log hygiene review with GitHub issue follow-up. | `sensor.docker_*_disk_used_percentage`, `automation.docker_host_disk_pressure_monitor`, `binary_sensor.infra_website_uptime_slo_breach`, `binary_sensor.infra_website_latency_degraded`, `automation.infra_backup_nightly_verification`, `script.joanna_dispatch` |
+| [infrastructure.yaml](infrastructure.yaml) | Normalized WAN/DNS/backup/domain/cert health, Glances-backed Docker host disk pressure with Joanna-only warning cleanup and critical Repairs, and website uptime/latency SLO signals for Infrastructure dashboards, plus nightly backup verification and monthly Joanna HA log hygiene review with GitHub issue follow-up. | `sensor.docker_*_disk_used_percentage`, `automation.docker_host_disk_pressure_monitor`, `binary_sensor.infra_website_uptime_slo_breach`, `binary_sensor.infra_website_latency_degraded`, `automation.infra_backup_nightly_verification`, `script.joanna_dispatch` |
-| [onenote_indexer.yaml](onenote_indexer.yaml) | OneNote indexer health/status monitoring for Joanna, failure-repair automation, and a daily duplicate-delete maintenance request. | `sensor.onenote_indexer_last_job_status`, `binary_sensor.onenote_indexer_last_job_successful` |
+| [onenote_indexer.yaml](onenote_indexer.yaml) | OneNote indexer health/status monitoring for Joanna, explicit index-health confirmation, failure-repair automation, and a daily duplicate-delete maintenance request. | `sensor.onenote_indexer_last_job_status`, `binary_sensor.onenote_indexer_last_job_successful`, `binary_sensor.onenote_indexer_index_healthy` |
 | [mqtt_status.yaml](mqtt_status.yaml) | Command-line MQTT broker reachability probe with Spook Repairs escalation and Joanna troubleshooting dispatch on outage. | `binary_sensor.mqtt_status_raw`, `binary_sensor.mqtt_broker_problem`, `repairs.create`, `rest_command.bearclaw_command` |
 | [mariadb.yaml](mariadb.yaml) | MariaDB recorder health and capacity snapshots with hourly live metrics, weekly admin/recorder polling, and stats-ready numeric sensors. | `sensor.mariadb_status`, `sensor.database_size` |
 | [processmonitor.yaml](processmonitor.yaml) | Root filesystem disk-pressure monitoring with immediate digest/logbook notes at 80%, Joanna review after 10 minutes above 80%, and delayed phone alerts only if the issue stays unresolved after dispatch. | `sensor.disk_use_percent`, `repairs.create`, `script.joanna_dispatch`, `tts.clear_cache` |
--- a/config/packages/bearclaw.yaml
+++ b/config/packages/bearclaw.yaml
@@ -16,6 +16,7 @@
 # Notes: v2 intake is the primary HA contract; legacy command/ingest routes remain appliance-side shims.
 # Notes: Command payload supports async_only for automation-first queueing when immediate inline handling is not required.
 # Notes: Command payload supports optional metadata for HA dispatch context snapshots.
 # Notes: HA automation dispatches default to BearClaw's ops domain so wording like NAS "health" cannot route to the health coach.
 # Notes: Blog: https://www.vcloudinfo.com/2026/03/joanna-dispatch-telemetry-home-assistant-infrastructure-dashboard/
 ######################################################################
@@ -44,6 +45,10 @@ rest_command:
          "context": {{ context | default(none) | tojson }},
          "callback": {{ callback | default(none) | tojson }}
        },
+        "routing": {
+          "domainHint": {{ domain_hint | default('ops', true) | tojson }},
+          "laneHint": {{ lane_hint | default('joanna.ops', true) | tojson }}
+        },
        "replyTargets": [
          {
            "type": "ha",
--- a/config/packages/docker_infrastructure.yaml
+++ b/config/packages/docker_infrastructure.yaml
@@ -16,6 +16,7 @@
 # Notes: Outage escalation keys off the configured monitored group so host-wide telemetry drops do not fall out of scope before the delayed Joanna dispatch runs.
 # Notes: Weekly reconcile should replace retired container-name switches with the current container-ID-prefixed discovery set.
 # Notes: Tapple is now served by `games_hub` on `/tapple/`; do not keep a standalone `tapple` container switch in the monitored group.
 # Notes: Teslamate and crystalsoftwashsolutions are live services and should remain in the monitored group when their discovery switches are present.
 # Notes: Infra Info was removed; BearClaw Admin is the planning snapshot surface.
 ######################################################################
@@ -86,6 +87,7 @@ switch:
      - switch.college_budget_app_container_2
      - switch.cruise_tracker_container
      - switch.cruise_tracker_container_2
+      - switch.crystalsoftwashsolutions_container
      - switch.dashy_container
      - switch.dashy_container_2
      - switch.docker_socket_proxy_container
@@ -149,6 +151,10 @@ switch:
      - switch.redis_webhooks_engine_container_2
      - switch.rvtools_ppt_web_container
      - switch.rvtools_ppt_web_container_2
+      - switch.teslamate_backup_container
+      - switch.teslamate_container
+      - switch.teslamate_database_container
+      - switch.teslamate_grafana_container
      - switch.tugtainer_agent_container
      - switch.tugtainer_agent_container_2
      - switch.tugtainer_container
--- a/config/packages/infrastructure.yaml
+++ b/config/packages/infrastructure.yaml
@@ -11,10 +11,12 @@
 # Notes: Domain warning threshold is <30 days; critical threshold is <14 days.
 # Notes: Nightly Duplicati verification runs at 08:00 after the 05:30 Duplicati job and docker_14 reboot window.
 # Notes: Duplicati transport/API errors are logged only; repairs are reserved for proven failed or stale backups.
 # Notes: Duplicati failure Repairs enable a recovery poll that clears the Repair after a later successful run.
 # Notes: Monthly HA log hygiene review requests Telegram + GitHub issue follow-up only; Joanna must wait for approval before any changes.
 # Notes: Numeric WAN telemetry exposes state_class so recorder can keep long-term statistics.
 # Notes: Docker host root disk usage uses Glances-backed normalized sensors; raw Glances sensors are recorder/logbook-filtered.
 # Notes: Disk-pressure dispatch allows bounded safe cleanup of disposable caches and old generated backup artifacts, but not live data or restarts.
 # Notes: Warning-level Docker host disk pressure is Joanna-only; Repairs are reserved for critical pressure.
 ######################################################################
 input_text:
@@ -28,6 +30,10 @@ input_text:
    name: "docker_69 disk pressure band"
    max: 20
+input_boolean:
+  infra_duplicati_backup_repair_active:
+    name: "Duplicati backup repair active"
 command_line:
  - sensor:
      name: Infra WAN Packet Loss
@@ -481,15 +487,10 @@ automation:
                        value: "critical"
                - conditions: "{{ current_band == 'warning' and previous_band not in ['warning', 'critical'] }}"
                  sequence:
-                    - service: repairs.create
+                    - service: repairs.remove
+                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
-                        severity: warning
-                        persistent: true
-                        title: "{{ host_name }} disk pressure warning ({{ disk_pct | round(1) }}%)"
-                        description: >-
-                          {{ host_name }} root disk usage is elevated.
-                          Plan cleanup before capacity reaches critical levels.
                    - service: script.joanna_dispatch
                      data:
                        trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Warning)"
@@ -520,7 +521,7 @@ automation:
                        topic: "DOCKER"
                        message: >-
                          {{ host_name }} disk usage warning at {{ disk_pct | round(1) }}%.
-                          Repair {{ issue_id }} opened and Joanna investigation requested.
+                          Joanna investigation requested without opening a warning Repair.
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
@@ -528,19 +529,14 @@ automation:
                        value: "warning"
                - conditions: "{{ current_band == 'warning' and previous_band == 'critical' }}"
                  sequence:
-                    - service: repairs.create
+                    - service: repairs.remove
+                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
-                        severity: warning
-                        persistent: true
-                        title: "{{ host_name }} disk pressure warning ({{ disk_pct | round(1) }}%)"
-                        description: >-
-                          {{ host_name }} root disk usage is elevated but no longer critical.
-                          Continue cleanup before capacity reaches critical levels again.
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
-                        message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%."
+                        message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%. Critical Repair cleared; Joanna continues handling warning-level cleanup."
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
@@ -580,13 +576,27 @@ automation:
    trigger:
      - platform: time
        at: "08:00:00"
+        id: nightly
+      - platform: time_pattern
+        minutes: "15"
+        id: recovery_poll
+      - platform: time_pattern
+        minutes: "45"
+        id: recovery_poll
+    condition:
+      - condition: template
+        value_template: >-
+          {{ trigger is not defined or trigger.id != 'recovery_poll'
+             or is_state('input_boolean.infra_duplicati_backup_repair_active', 'on') }}
    action:
      - variables:
+          trigger_source: "{{ trigger.id if trigger is defined and trigger.id is defined else 'manual' }}"
+          verifier_reason: "{{ 'ha_failure_followup' if trigger_source == 'recovery_poll' else 'ha_nightly' }}"
          trigger_context: "HA automation infra_backup_nightly_verification (Infrastructure - Backup Nightly Verification)"
          duplicati_state: "{{ states('switch.duplicati_container') }}"
      - action: rest_command.bearclaw_duplicati_verify
        data:
-          reason: "ha_nightly"
+          reason: "{{ verifier_reason }}"
        response_variable: duplicati_verify
      - service: script.send_to_logbook
        data:
@@ -618,6 +628,9 @@ automation:
                continue_on_error: true
                data:
                  issue_id: user_infra_duplicati_backup_failure
+              - service: input_boolean.turn_off
+                target:
+                  entity_id: input_boolean.infra_duplicati_backup_repair_active
          - conditions: "{{ verify_transport_issue }}"
            sequence:
              - service: script.send_to_logbook
@@ -628,46 +641,60 @@ automation:
                    status {{ verify_status }} with issue {{ verify_issue }}. No repair card was opened because
                    this is verifier transport state, not a confirmed backup failure.
        default:
-          - service: repairs.create
-            data:
-              issue_id: infra_duplicati_backup_failure
-              title: "Duplicati nightly backup verification failed"
-              description: >-
-                {{ verify_summary }}
-                Backup={{ verify_backup_name }};
-                status={{ verify_status }};
-                last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
-                last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
-              severity: error
-              persistent: true
-          - service: script.joanna_dispatch
-            data:
-              trigger_context: "{{ trigger_context }}"
-              source: "home_assistant_automation.infra_backup_nightly_verification"
-              summary: "Nightly Duplicati backup verification failed"
-              entity_ids:
-                - "switch.duplicati_container"
-              diagnostics: >-
-                scheduled_time=08:00:00,
-                duplicati_container={{ duplicati_state }},
-                verifier_http_status={{ verify_http_status }},
-                verifier_status={{ verify_status }},
-                verifier_issue={{ verify_issue }},
-                backup_name={{ verify_backup_name }},
-                latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
-                last_success={{ verify_last_success.get('endedAt', 'n/a') }}
-              request: >-
-                Investigate the Duplicati backup job {{ verify_backup_name }}.
-                The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
-                Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
-                Reply with explicit status fields:
-                resolved=true/false,
-                backup_status,
-                last_success_time,
-                root_cause,
-                action_taken,
-                verification,
-                next_action_required=true/false.
+          - service: input_boolean.turn_on
+            target:
+              entity_id: input_boolean.infra_duplicati_backup_repair_active
+          - choose:
+              - conditions: "{{ trigger_source != 'recovery_poll' }}"
+                sequence:
+                  - service: repairs.create
+                    data:
+                      issue_id: infra_duplicati_backup_failure
+                      title: "Duplicati nightly backup verification failed"
+                      description: >-
+                        {{ verify_summary }}
+                        Backup={{ verify_backup_name }};
+                        status={{ verify_status }};
+                        last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
+                        last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
+                      severity: error
+                      persistent: true
+                  - service: script.joanna_dispatch
+                    data:
+                      trigger_context: "{{ trigger_context }}"
+                      source: "home_assistant_automation.infra_backup_nightly_verification"
+                      summary: "Nightly Duplicati backup verification failed"
+                      entity_ids:
+                        - "switch.duplicati_container"
+                      diagnostics: >-
+                        scheduled_time=08:00:00,
+                        duplicati_container={{ duplicati_state }},
+                        verifier_http_status={{ verify_http_status }},
+                        verifier_status={{ verify_status }},
+                        verifier_issue={{ verify_issue }},
+                        backup_name={{ verify_backup_name }},
+                        latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
+                        last_success={{ verify_last_success.get('endedAt', 'n/a') }}
+                      request: >-
+                        Investigate the Duplicati backup job {{ verify_backup_name }}.
+                        The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
+                        Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
+                        Home Assistant will re-check this verifier every 30 minutes after dispatch and clear the Repair automatically once the backup is healthy.
+                        Reply with explicit status fields:
+                        resolved=true/false,
+                        backup_status,
+                        last_success_time,
+                        root_cause,
+                        action_taken,
+                        verification,
+                        next_action_required=true/false.
+            default:
+              - service: script.send_to_logbook
+                data:
+                  topic: "BACKUP"
+                  message: >-
+                    Duplicati recovery follow-up still reports {{ verify_status }} for {{ verify_backup_name }}:
+                    {{ verify_issue }}. Existing Repair remains open; Joanna was not dispatched again.
  - alias: "Infrastructure - Monthly HA Log Hygiene Review"
    id: infra_monthly_log_hygiene_review
--- a/config/packages/onenote_indexer.yaml
+++ b/config/packages/onenote_indexer.yaml
@@ -7,10 +7,12 @@
 #  Polls codex_appliance OneNote status and exposes trigger-ready health entities.
 # -------------------------------------------------------------------
 # Notes: Keep onenote indexer monitoring in this package (separate from bearclaw transport).
-# Notes: last_status='never' is treated as success to avoid false alerts after restarts.
+# Notes: last_status='never' is treated as success only when index health is confirmed.
 # Notes: Only explicit last_status='error' is treated as failure; unknown/unavailable are neutral.
 # Notes: HA->Joanna request includes trigger context so Telegram progress messages can identify origin.
 # Notes: Creates/clears a Spook Repair issue and requests Joanna remediation on failures.
 # Notes: Index health requires pages, chunks, no pending embeddings, and a healthy embedding worker.
 # Notes: Recovery clear is polled so stale Repairs do not linger after the indexer recovers.
 # Notes: Daily Joanna recap should be plain-English; only surface detailed index metrics when something materially changes or fails.
 ######################################################################
@@ -43,13 +45,23 @@ template:
        state: >-
          {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
          {% set sync = payload.get('sync', {}) if payload is mapping else {} %}
+          {% set index = payload.get('index', {}) if payload is mapping else {} %}
+          {% set worker = state_attr('sensor.onenote_indexer_status_payload', 'embeddingWorker') or {} %}
          {% set raw = (sync.get('last_status', '') | string | lower) %}
-          {% if raw in ['ok', 'success', 'never'] %}
+          {% set pages = index.get('pages') | int(0) %}
+          {% set chunks = index.get('chunks') | int(0) %}
+          {% set pending = index.get('pending_embeddings') | int(999999) %}
+          {% set worker_status = worker.get('lastStatus', '') | string | lower %}
+          {% set worker_running = worker.get('running', false) | bool %}
+          {% set index_healthy = pages > 0 and chunks > 0 and pending == 0 and worker_status == 'ok' and not worker_running %}
+          {% if raw in ['ok', 'success'] or (raw == 'never' and index_healthy) %}
            success
          {% elif raw == 'running' %}
            running
          {% elif raw == 'error' %}
            error
+          {% elif raw == 'never' %}
+            unknown
          {% else %}
            unknown
          {% endif %}
@@ -86,6 +98,12 @@ template:
            {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
            {% set index = payload.get('index', {}) if payload is mapping else {} %}
            {{ index.get('chunks') }}
+          embedding_worker_status: >-
+            {% set worker = state_attr('sensor.onenote_indexer_status_payload', 'embeddingWorker') or {} %}
+            {{ worker.get('lastStatus') }}
+          embedding_worker_last_run_at: >-
+            {% set worker = state_attr('sensor.onenote_indexer_status_payload', 'embeddingWorker') or {} %}
+            {{ worker.get('lastRunAt') }}
          last_metrics: >-
            {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
            {% set sync = payload.get('sync', {}) if payload is mapping else {} %}
@@ -103,6 +121,44 @@ template:
            mdi:alert-circle
          {% endif %}
      - name: OneNote Indexer Index Healthy
        unique_id: onenote_indexer_index_healthy
        state: >-
          {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
          {% set index = payload.get('index', {}) if payload is mapping else {} %}
          {% set worker = state_attr('sensor.onenote_indexer_status_payload', 'embeddingWorker') or {} %}
          {% set pages = index.get('pages') | int(0) %}
          {% set chunks = index.get('chunks') | int(0) %}
          {% set pending = index.get('pending_embeddings') | int(999999) %}
          {% set worker_status = worker.get('lastStatus', '') | string | lower %}
          {% set worker_running = worker.get('running', false) | bool %}
          {{ pages > 0 and chunks > 0 and pending == 0 and worker_status == 'ok' and not worker_running }}
        icon: >-
          {% if is_state('binary_sensor.onenote_indexer_index_healthy', 'on') %}
            mdi:notebook-check
          {% else %}
            mdi:notebook-remove
          {% endif %}
        attributes:
          pages: >-
            {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
            {% set index = payload.get('index', {}) if payload is mapping else {} %}
            {{ index.get('pages') }}
          chunks: >-
            {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
            {% set index = payload.get('index', {}) if payload is mapping else {} %}
            {{ index.get('chunks') }}
          pending_embeddings: >-
            {% set payload = state_attr('sensor.onenote_indexer_status_payload', 'indexer') or {} %}
            {% set index = payload.get('index', {}) if payload is mapping else {} %}
            {{ index.get('pending_embeddings') }}
          embedding_worker_status: >-
            {% set worker = state_attr('sensor.onenote_indexer_status_payload', 'embeddingWorker') or {} %}
            {{ worker.get('lastStatus') }}
          embedding_worker_last_run_at: >-
            {% set worker = state_attr('sensor.onenote_indexer_status_payload', 'embeddingWorker') or {} %}
            {{ worker.get('lastRunAt') }}
      - name: OneNote Indexer Job Failed
        unique_id: onenote_indexer_job_failed
        device_class: problem
@@ -199,13 +255,28 @@ automation:
  - id: onenote_indexer_failure_clear_repair
    alias: OneNote Indexer - Clear Repair On Recovery
-    description: Clear the Spook Repair issue when OneNote indexer is healthy again.
+    description: Clear the Spook Repair issue when OneNote indexer and index health are confirmed healthy again.
    mode: single
    trigger:
      - platform: state
        entity_id: binary_sensor.onenote_indexer_job_failed
        to: "off"
        for: "00:02:00"
      - platform: state
        entity_id: binary_sensor.onenote_indexer_index_healthy
        to: "on"
        for: "00:02:00"
      - platform: time_pattern
        minutes: "20"
      - platform: time_pattern
        minutes: "50"
    condition:
      - condition: state
        entity_id: binary_sensor.onenote_indexer_job_failed
        state: "off"
      - condition: state
        entity_id: binary_sensor.onenote_indexer_index_healthy
        state: "on"
    action:
      - service: repairs.remove
        continue_on_error: true
@@ -214,4 +285,4 @@ automation:
      - service: script.send_to_logbook
        data:
          topic: "ONENOTE"
-          message: "OneNote indexer recovered. Spook repair cleared."
+          message: "OneNote indexer and index health are confirmed healthy. Spook repair cleared."
--- a/config/packages/synology_dsm.yaml
+++ b/config/packages/synology_dsm.yaml
@@ -9,6 +9,7 @@
 # Notes: Uses native `synology_dsm` entities for Carlo-NAS01 and Carlo-NVR.
 # Notes: Joanna dispatches are reserved for integration/security/storage problems, not routine reboot/shutdown controls.
 # Notes: DSM update availability stays diagnostic context only; it does not trigger remediation by itself.
 # Notes: Recent Powerwall outages route lone volume warnings to Joanna first; Repairs open after the recovery grace window if still active.
 ######################################################################
 template:
@@ -281,16 +282,47 @@ template:
 automation:
  - id: synology_dsm_open_repair_and_dispatch
    alias: "Synology DSM - Open Repair And Dispatch"
-    description: "Open a Repairs issue and dispatch Joanna when a Synology problem stays active."
+    description: "Dispatch Joanna when a Synology problem stays active, and open Repairs after outage-aware grace checks."
    mode: queued
    trigger:
-      - platform: state
+      - id: initial_dispatch
        platform: state
        entity_id:
          - binary_sensor.carlo_nas01_synology_problem
          - binary_sensor.carlo_nvr_synology_problem
        to: "on"
        for: "00:10:00"
      - id: repair_escalation
        platform: state
        entity_id:
          - binary_sensor.carlo_nas01_synology_problem
          - binary_sensor.carlo_nvr_synology_problem
        to: "on"
        for: "01:00:00"
    variables:
      outage_grace_minutes: 60
      trigger_phase: "{{ trigger.id | default('initial_dispatch', true) }}"
      is_repair_escalation: "{{ trigger_phase == 'repair_escalation' }}"
      grid_state: "{{ states('binary_sensor.powerwall_grid_status') }}"
      grid_changed_minutes: >-
        {% if states.binary_sensor.powerwall_grid_status is defined %}
          {{ ((as_timestamp(now(), 0) - as_timestamp(states.binary_sensor.powerwall_grid_status.last_changed, 0)) / 60) | round(1) }}
        {% else %}
          9999
        {% endif %}
      outage_grace_active: >-
        {{ grid_state == 'off' or
           (grid_state == 'on' and (grid_changed_minutes | float(9999)) <= outage_grace_minutes) }}
      outage_context: >-
        {% if states.binary_sensor.powerwall_grid_status is not defined %}
          Powerwall grid status entity is unavailable to this automation.
        {% elif grid_state == 'off' %}
          Powerwall grid is currently down; outage began {{ states.binary_sensor.powerwall_grid_status.last_changed }}.
        {% elif (grid_changed_minutes | float(9999)) <= outage_grace_minutes %}
          Powerwall grid recovered {{ grid_changed_minutes }} minutes ago.
        {% else %}
          No recent Powerwall outage recovery within {{ outage_grace_minutes }} minutes.
        {% endif %}
      host_name: >-
        {% if trigger.entity_id == 'binary_sensor.carlo_nas01_synology_problem' %}
          Carlo-NAS01
@@ -402,32 +434,49 @@ automation:
      volume_status: "{{ states(volume_status_entity) }}"
      volume_used: "{{ states(volume_used_entity) }}"
      dsm_update_state: "{{ states(update_entity) }}"
      lone_volume_warning: >-
        {{ problem_summary | lower | trim == 'volume status=warning' and
           volume_status | lower | trim == 'warning' and
           security_state == 'off' }}
      joanna_only_outage_grace: >-
        {{ not (is_repair_escalation | bool(false)) and
           (outage_grace_active | bool(false)) and
           (lone_volume_warning | bool(false)) }}
      should_create_repair: "{{ not (joanna_only_outage_grace | bool(false)) }}"
      trigger_context: "HA automation synology_dsm_open_repair_and_dispatch (Synology DSM - Open Repair And Dispatch)"
    action:
-      - service: repairs.create
-        data:
-          issue_id: "{{ issue_id }}"
-          title: "{{ host_name }} Synology health issue"
-          severity: "{{ 'error' if problem_severity == 'error' else 'warning' }}"
-          persistent: true
-          description: >-
-            Home Assistant detected a sustained Synology DSM issue for {{ host_name }}.
-            summary: {{ problem_summary }}
-            security_state: {{ security_state }}
-            volume_status: {{ volume_status }}
-            volume_used: {{ volume_used }}
-            dsm_update: {{ dsm_update_state }}
-            ssh_alias: {{ ssh_alias }}
-            dsm_url: {{ dsm_url }}
+      - choose:
+          - conditions:
+              - condition: template
+                value_template: "{{ should_create_repair | bool(false) }}"
+            sequence:
+              - service: repairs.create
+                data:
+                  issue_id: "{{ issue_id }}"
+                  title: "{{ host_name }} Synology health issue"
+                  severity: "{{ 'error' if problem_severity == 'error' else 'warning' }}"
+                  persistent: true
+                  description: >-
+                    Home Assistant detected a sustained Synology DSM issue for {{ host_name }}.
+                    summary: {{ problem_summary }}
+                    security_state: {{ security_state }}
+                    volume_status: {{ volume_status }}
+                    volume_used: {{ volume_used }}
+                    dsm_update: {{ dsm_update_state }}
+                    outage_context: {{ outage_context }}
+                    ssh_alias: {{ ssh_alias }}
+                    dsm_url: {{ dsm_url }}
      - service: script.joanna_dispatch
        data:
          trigger_context: "{{ trigger_context }}"
          source: "{{ source }}"
-          summary: "{{ host_name }} Synology DSM problem detected"
+          summary: >-
            {{ host_name }} Synology DSM problem detected{{ ' after recent Powerwall outage' if joanna_only_outage_grace | bool(false) else '' }}
          entity_ids: "{{ entity_ids }}"
          diagnostics: >-
            issue_id={{ issue_id }},
            trigger_phase={{ trigger_phase }},
            severity={{ problem_severity }},
            problem_sensor={{ trigger.entity_id }},
            problem_summary={{ problem_summary }},
@@ -435,18 +484,27 @@ automation:
            volume_status={{ volume_status }},
            volume_used={{ volume_used }},
            dsm_update={{ dsm_update_state }},
+            outage_grace_active={{ outage_grace_active }},
+            outage_context={{ outage_context }},
+            joanna_only_outage_grace={{ joanna_only_outage_grace }},
+            repair_created={{ should_create_repair }},
            ssh_alias={{ ssh_alias }},
            dsm_url={{ dsm_url }}
          request: >-
            Investigate {{ host_name }} using the Home Assistant Synology DSM entities first, then DSM or SSH if needed.
-            Review security status, drive health, volume health, and integration availability.
+            Review security state, drive condition, volume condition, and integration availability.
+            If this is a recent Powerwall outage and the only symptom is a volume warning, treat it as post-outage recovery first and monitor before escalating.
            Do not reboot or shut down the NAS unless explicitly requested.
      - service: script.send_to_logbook
        data:
          topic: "SYNOLOGY"
          message: >-
            {{ host_name }} reported a Synology DSM problem for 10 minutes.
-            Repair {{ issue_id }} opened and Joanna investigation requested.
+            {% if should_create_repair | bool(false) %}
+              Repair {{ issue_id }} opened and Joanna investigation requested.
+            {% else %}
+              Joanna investigation requested without opening a Repair during the post-outage recovery grace window.
+            {% endif %}
            Summary: {{ problem_summary }}.
  - id: synology_dsm_clear_repair_on_recovery
--- a/config/script/README.md
+++ b/config/script/README.md
@@ -39,7 +39,7 @@ Reusable scripts that other automations call for notifications, lighting, safety
 `script.joanna_dispatch` is the shared handoff contract from Home Assistant automations into Joanna/BearClaw when Home Assistant detects something worth investigating or fixing.
 Why we use it:
- Keeps one message schema for remediation context (`trigger_context`, `source`, `summary`, `entity_ids`, `diagnostics`, `request`).
+- Keeps one message schema for remediation context (`trigger_context`, `source`, `summary`, `entity_ids`, `diagnostics`, `request`, plus optional routing hints).
 - Avoids repeating direct `rest_command.bearclaw_command` payload formatting in multiple packages.
 - Lets Home Assistant stay focused on detection, timing, and routing while Joanna acts as the AGENT engineer for infrastructure triage and recommended remediation.
 - Makes resolution-trigger automations easier to review, update, and audit.
@@ -49,6 +49,7 @@ What the helper normalizes before the BearClaw intake call:
 - `entity_ids` from either a YAML list or a comma-delimited string.
 - `diagnostics` from either free text or structured mappings/sequences.
 - `request` guardrails so Joanna defaults to investigation/recommendation, not blind resets or power-cycles.
+- `domain_hint`/`lane_hint` default to BearClaw ops routing so HA infrastructure text does not drift into another domain parser.
 Current automations that kick off automated resolutions (via `script.joanna_dispatch`):
 | Automation ID | Alias | File |
--- a/config/script/joanna_dispatch.yaml
+++ b/config/script/joanna_dispatch.yaml
@@ -9,6 +9,7 @@
 # Notes: Keep this helper generic so package automations can reuse one schema.
 # Notes: Source defaults to home_assistant_automation.unknown when omitted.
 # Notes: Automation dispatches are async_only by default so HA calls return quickly while BearClaw works in queue.
+# Notes: Automation dispatches default to domain_hint=ops and lane_hint=joanna.ops.
 # Notes: HA is a dispatcher/integration here; Telegram transport ownership lives in docker_17/codex_appliance.
 ######################################################################
@@ -31,6 +32,10 @@ joanna_dispatch:
      description: Extra troubleshooting context.
    user:
      description: BearClaw user identity.
+    domain_hint:
+      description: BearClaw domain hint.
+    lane_hint:
+      description: BearClaw lane hint.
  sequence:
    - variables:
        normalized_context: "{{ trigger_context | default('HA automation', true) }}"
@@ -39,6 +44,8 @@ joanna_dispatch:
        normalized_request: >-
          {{ request | default('Investigate and recommend remediation. Do not run automated resets or power-cycles unless explicitly requested.', true) }}
        normalized_user: "{{ user | default('carlo', true) }}"
+        normalized_domain_hint: "{{ domain_hint | default('ops', true) }}"
+        normalized_lane_hint: "{{ lane_hint | default('joanna.ops', true) }}"
        normalized_entity_ids: >-
          {% if entity_ids is sequence and entity_ids is not string %}
            {{ entity_ids | map('string') | join(', ') }}
@@ -66,4 +73,6 @@ joanna_dispatch:
        user: "{{ normalized_user }}"
        source: "{{ normalized_source }}"
        context: "{{ normalized_context }}"
+        domain_hint: "{{ normalized_domain_hint }}"
+        lane_hint: "{{ normalized_lane_hint }}"
        async_only: true
--- a/config/templates/speech/briefing.yaml
+++ b/config/templates/speech/briefing.yaml
@@ -9,6 +9,8 @@
 #  Weather, responsibilities, holidays, air quality, and fact prompts parsed by speech_processing/speech_engine.
 # Notes: Dorm zones are away from Bear Stone; only person state `home`
 #  means someone is physically home at this house.
+# Notes: Previous broadcast text is stale context only; current sensor data
+#  stays authoritative for entry point and action wording.
 ######################################################################
@@ -88,19 +90,15 @@
  {%- endmacro -%}
  {%- macro window_check() -%}
-  {% if states.group.entry_points.state != 'off' -%}
+    {% set open_entries = states.binary_sensor
-      {% set comma = joiner(', ') %}
+      | selectattr('state', 'eq', 'on')
-      The
+      | selectattr('attributes.device_class', 'eq', 'opening')
-      {% for state in states.binary_sensor if state.state == 'on' and state.attributes.device_class == 'opening' -%}
+      | map(attribute='attributes.friendly_name')
-      {%- endfor %}
+      | list %}
-      {% for group in states.binary_sensor|groupby('state') -%}
+    {% set entry_count = open_entries | length %}
-      {%- for entity in group.list  if entity.state == 'on' and entity.attributes.device_class == 'opening'  -%}
+    {% if entry_count > 0 -%}
-          {{ ' and' if loop.last and not loop.first else comma() }}
+      [Current entry point state: {{ open_entries | join(', ') }} {{ 'is' if entry_count == 1 else 'are' }} still open and {{ 'needs' if entry_count == 1 else 'need' }} to be closed manually. Do not say any physical window or door was closed unless current sensor data says it is closed.]
-          {{ entity.attributes.friendly_name }}
+    {%- endif -%}
-      {%- endfor -%}
-      {% endfor %}
-      need to be closed.
-  {%- endif -%}
  {%- endmacro -%}
  {%- macro lock_check() -%}
@@ -336,6 +334,7 @@
    {# call a Random fact about the house or inspiration quote #}
    {{ ([moon, holiday, days_until ]|random)() }}
    ]
+    [Previous broadcast rule: The previous broadcast is stale context only and must not override current Sensor Data. Use it only to avoid repetitive wording. Do not repeat prior claims that a window, door, lock, garage door, or light was changed unless current Sensor Data supports that claim. If current Sensor Data says an entry point is still open and needs closure, say it is still open or needs attention, not that it is closed.]
    [Previous broadcast for context: "{{ state_attr('sensor.openai_response', 'response') }}" ]
  {%- endmacro -%}