# Home-AssistantConfig/config/packages/infrastructure.yaml

######################################################################
# @CCOSTAN - Follow Me on X
# For more info visit https://www.vcloudinfo.com/click-here
# Original Repo : https://github.com/CCOSTAN/Home-AssistantConfig
# -------------------------------------------------------------------
# Infrastructure - Observability, disk pressure, and Joanna review workflows
#  WAN/DNS/website/domain/cert/Docker host state normalized for dashboards, plus scheduled infrastructure reviews.
# -------------------------------------------------------------------
# Related Issue: 1584
# Notes: Home dashboard consumes `infra_*` entities for exceptions-only alerts.
# Notes: Domain warning threshold is <30 days; critical threshold is <14 days.
# Notes: Nightly Duplicati verification runs at 08:00 after the 05:30 Duplicati job and docker_14 reboot window.
# Notes: Duplicati transport/API errors are logged only; repairs are reserved for proven failed or stale backups.
# Notes: Duplicati failure Repairs enable a recovery poll that clears the Repair after a later successful run.
# Notes: Monthly HA log hygiene review requests Telegram + GitHub issue follow-up only; Joanna must wait for approval before any changes.
# Notes: Numeric WAN telemetry exposes state_class so recorder can keep long-term statistics.
# Notes: Docker host root disk usage uses Glances-backed normalized sensors; raw Glances sensors are recorder/logbook-filtered.
# Notes: Disk-pressure dispatch allows bounded safe cleanup of disposable caches and old generated backup artifacts, but not live data or restarts.
# Notes: Warning-level Docker host disk pressure is Joanna-only; Repairs are reserved for critical pressure.
# Notes: Nebula Sync DNS consistency compares primary/backup Pi-hole answers and dispatches Joanna on sustained drift or container loss.
# Notes: Promoted IoT DNS consistency compares primary/backup Pi-hole answers for reserved IoT host records.
######################################################################

# Text helpers (max 20 characters each) that remember a health band per check.
# The Nebula Sync and Pi-hole IoT DNS dispatch automations in this package
# write `warning` / `normal` into their helpers to avoid repeat dispatches.
input_text:
  docker_17_disk_pressure_band:
    name: docker_17 disk pressure band
    max: 20
  docker_14_disk_pressure_band:
    name: docker_14 disk pressure band
    max: 20
  docker_69_disk_pressure_band:
    name: docker_69 disk pressure band
    max: 20
  infra_nebula_sync_health_band:
    name: Nebula Sync health band
    max: 20
  infra_pihole_iot_dns_health_band:
    name: Pi-hole IoT DNS health band
    max: 20

# Flag for an open Duplicati backup Repair.
# NOTE(review): presumably gates the recovery poll described in the header
# notes; its readers/writers are not visible in this part of the file.
input_boolean:
  infra_duplicati_backup_repair_active:
    name: Duplicati backup repair active

command_line:
  # WAN packet loss (%) from ten ICMP probes to 1.1.1.1, polled every 5 minutes.
  # The awk program selects whichever comma-separated summary field contains
  # "packet loss" instead of a fixed position: iputils ping inserts an extra
  # "+N errors" field ahead of it when ICMP errors are received, which would
  # otherwise be reported as the loss percentage.
  # Prints "unknown" when ping produces no summary line.
  - sensor:
      name: Infra WAN Packet Loss
      unique_id: infra_wan_packet_loss
      command: >-
        ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
        awk -F',' '/packet loss/ {for (i = 1; i <= NF; i++) if ($i ~ /packet loss/)
        {gsub(/[^0-9.]/, "", $i); print $i; found=1; exit} }
        END {if (!found) print "unknown"}'
      scan_interval: 300
      unit_of_measurement: "%"
      state_class: measurement
      # Strip anything that is not a digit or dot; an empty result becomes 'unknown'.
      value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"

  # WAN round-trip latency (ms): average RTT over ten ICMP probes to 1.1.1.1,
  # polled every 5 minutes.
  # The summary line is split on " = " and the value list on "/", so the second
  # value (avg) is taken no matter how many statistics the ping implementation
  # prints. Splitting the whole line on "/" shifts the fields between iputils
  # (min/avg/max/mdev) and BusyBox (min/avg/max) and would return max on the
  # latter.
  # Prints "unknown" when ping produces no summary line.
  - sensor:
      name: Infra WAN Latency Ms
      unique_id: infra_wan_latency_ms
      command: >-
        ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
        awk '/^rtt|^round-trip/ {split($0, parts, " = "); split(parts[2], rtt, "/");
        gsub(/[^0-9.]/, "", rtt[2]); print rtt[2]; found=1}
        END {if (!found) print "unknown"}'
      scan_interval: 300
      unit_of_measurement: "ms"
      state_class: measurement
      # Strip anything that is not a digit or dot; an empty result becomes 'unknown'.
      value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"

  # Fallback public-IP lookup against api.ipify.org, polled every 15 minutes.
  # Emits the literal text "unknown" whenever curl exits non-zero.
  - sensor:
      unique_id: infra_external_ip_fallback
      name: Infra External IP Fallback
      scan_interval: 900
      command: 'curl -fsS https://api.ipify.org || echo unknown'

  # Nebula Sync DNS consistency probe, polled every 5 minutes.
  # The shell one-liner asks the primary (192.168.10.10) and secondary
  # (192.168.10.14) resolvers via dig for:
  #   - the A record of the short host name and of the FQDN, and
  #   - the reverse (PTR) lookup of the expected IP.
  # Multiple answers are sorted and comma-joined. Status is "ok" only when all
  # four forward answers equal the expected IP and the two reverse answers are
  # non-empty and identical; otherwise it is "mismatch".
  # The result is printed as a single JSON object: `status` becomes the sensor
  # state and the remaining keys are exposed as attributes.
  - sensor:
      name: Infra Nebula Sync DNS Consistency
      unique_id: infra_nebula_sync_dns_consistency
      command: >-
                /bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14; host=GTG-PF45FK6F; fqdn=GTG-PF45FK6F.fordst.com; ip=192.168.10.117; q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null | tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; }; r(){ dig +time=2 +tries=1 +short @"$1" -x "$2" 2>/dev/null | tr -d "\r" | sed "s/\.$//" | sort | tr "\n" "," | sed "s/,$//"; }; p_short=$(q "$primary" "$host"); s_short=$(q "$secondary" "$host"); p_fqdn=$(q "$primary" "$fqdn"); s_fqdn=$(q "$secondary" "$fqdn"); p_rev=$(r "$primary" "$ip"); s_rev=$(r "$secondary" "$ip"); status=mismatch; if [ "$p_short" = "$ip" ] && [ "$s_short" = "$ip" ] && [ "$p_fqdn" = "$ip" ] && [ "$s_fqdn" = "$ip" ] && [ -n "$p_rev" ] && [ "$p_rev" = "$s_rev" ]; then status=ok; fi; printf "{\"status\":\"%s\",\"host\":\"%s\",\"expected_ip\":\"%s\",\"primary_short\":\"%s\",\"secondary_short\":\"%s\",\"primary_fqdn\":\"%s\",\"secondary_fqdn\":\"%s\",\"primary_reverse\":\"%s\",\"secondary_reverse\":\"%s\"}\n" "$status" "$host" "$ip" "$p_short" "$s_short" "$p_fqdn" "$s_fqdn" "$p_rev" "$s_rev"'
      scan_interval: 300
      # State is the JSON `status` field; falls back to 'unknown' if it is absent.
      value_template: "{{ value_json.status | default('unknown') }}"
      # Raw resolver answers kept as attributes for diagnostics.
      json_attributes:
        - host
        - expected_ip
        - primary_short
        - secondary_short
        - primary_fqdn
        - secondary_fqdn
        - primary_reverse
        - secondary_reverse

  # Promoted IoT DNS consistency probe, polled every 5 minutes.
  # For each `fqdn=ip` pair in `records`, the shell one-liner queries the A
  # record on the primary (192.168.10.10) and secondary (192.168.10.14)
  # resolvers via dig. A record counts as a mismatch when either answer differs
  # from the expected IP; each mismatch is appended to `mismatches` as
  # `host:expected=...,primary=...,secondary=...;` (empty answers shown as
  # "none"). Status is "ok" when no record mismatched, otherwise "mismatch".
  # The result is printed as a single JSON object: `status` becomes the sensor
  # state and the remaining keys are exposed as attributes.
  - sensor:
      name: Infra Pihole IoT DNS Consistency
      unique_id: infra_pihole_iot_dns_consistency
      command: >-
                /bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14; records="rachio.fordst.com=192.168.10.73 econet.fordst.com=192.168.10.92 dreame-vacuum.fordst.com=192.168.10.93 carlo-bed.fordst.com=192.168.10.95 lg-smart-fridge.fordst.com=192.168.10.96 tesla-blackbox-gw.fordst.com=192.168.10.97 bgw210.fordst.com=192.168.10.98"; q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null | tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; }; status=ok; checked=0; mismatch_count=0; mismatches=""; for record in $records; do host=${record%%=*}; ip=${record#*=}; p=$(q "$primary" "$host"); s=$(q "$secondary" "$host"); checked=$((checked+1)); if [ "$p" != "$ip" ] || [ "$s" != "$ip" ]; then status=mismatch; mismatch_count=$((mismatch_count+1)); mismatches="${mismatches}${host}:expected=${ip},primary=${p:-none},secondary=${s:-none};"; fi; done; if [ -z "$mismatches" ]; then mismatches=none; fi; printf "{\"status\":\"%s\",\"checked_records\":%s,\"mismatch_count\":%s,\"mismatches\":\"%s\",\"primary_dns\":\"%s\",\"backup_dns\":\"%s\"}\n" "$status" "$checked" "$mismatch_count" "$mismatches" "$primary" "$secondary"'
      scan_interval: 300
      # State is the JSON `status` field; falls back to 'unknown' if it is absent.
      value_template: "{{ value_json.status | default('unknown') }}"
      # Counters and the mismatch summary kept as attributes for diagnostics.
      json_attributes:
        - checked_records
        - mismatch_count
        - mismatches
        - primary_dns
        - backup_dns

template:
  - sensor:
      # Public IP for dashboards: uses sensor.external_ip when it has a usable
      # value, otherwise the command_line ipify fallback sensor.
      - name: "Infra External IP"
        unique_id: infra_external_ip
        state: >-
          {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
          {% set preferred = states('sensor.external_ip') | trim %}
          {% set backup = states('sensor.infra_external_ip_fallback') | trim %}
          {% if preferred in unusable %}
            {{ backup }}
          {% else %}
            {{ preferred }}
          {% endif %}

      # docker_17 disk usage (%), normalized from sensor.192_168_10_17_disk_usage
      # and rounded to one decimal. Unavailable while the source has no usable state.
      - name: "docker_17 Disk Used Percentage"
        unique_id: docker_17_disk_used_percentage
        icon: mdi:harddisk
        state_class: measurement
        unit_of_measurement: "%"
        availability: >-
          {{ states('sensor.192_168_10_17_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}
        state: >-
          {{ states('sensor.192_168_10_17_disk_usage') | float(0) | round(1) }}

      # docker_14 disk usage (%), normalized from sensor.docker14_disk_usage
      # and rounded to one decimal. Unavailable while the source has no usable state.
      - name: "docker_14 Disk Used Percentage"
        unique_id: docker_14_disk_used_percentage
        icon: mdi:harddisk
        state_class: measurement
        unit_of_measurement: "%"
        availability: >-
          {{ states('sensor.docker14_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}
        state: >-
          {{ states('sensor.docker14_disk_usage') | float(0) | round(1) }}

      # docker_69 disk usage (%), normalized from sensor.docker69_disk_usage
      # and rounded to one decimal. Unavailable while the source has no usable state.
      - name: "docker_69 Disk Used Percentage"
        unique_id: docker_69_disk_used_percentage
        icon: mdi:harddisk
        state_class: measurement
        unit_of_measurement: "%"
        availability: >-
          {{ states('sensor.docker69_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}
        state: >-
          {{ states('sensor.docker69_disk_usage') | float(0) | round(1) }}

      # Smallest days-until-expiration across the monitored domains.
      # Entities that do not exist, have no usable state, or report a
      # non-numeric state are skipped. 9999 is the parse-failure sentinel of
      # float(); it is never counted as a real value, so a malformed state
      # cannot publish a bogus 9999-day minimum. Renders none when no domain
      # sensor contributed a number.
      - name: "Infra Domain Expiry Min Days"
        unique_id: infra_domain_expiry_min_days
        unit_of_measurement: "d"
        state: >-
          {% set ids = [
            'sensor.vcloudinfo_com_days_until_expiration',
            'sensor.ipmer_com_days_until_expiration',
            'sensor.fordst_com_days_until_expiration',
            'sensor.kingcrafthomes_com_days_until_expiration'
          ] %}
          {% set ns = namespace(min=9999, any=false) %}
          {% for id in ids %}
            {% if expand(id) | count > 0 %}
              {% set raw = states(id) %}
              {% if raw not in ['unknown', 'unavailable', 'none', ''] %}
                {% set val = raw | float(9999) %}
                {% if val != 9999 %}
                  {% set ns.any = true %}
                  {% if val < ns.min %}
                    {% set ns.min = val %}
                  {% endif %}
                {% endif %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {% if ns.any %}
            {{ ns.min | round(0) }}
          {% else %}
            {{ none }}
          {% endif %}

      # Smallest numeric state among sensors whose entity_id names a monitored
      # domain followed by cert/ssl/tls. States that are unusable or fail to
      # parse (float() sentinel 9999) are ignored; renders none when nothing
      # qualified.
      - name: "Infra Cert Expiry Min Days"
        unique_id: infra_cert_expiry_min_days
        unit_of_measurement: "d"
        state: >-
          {% set pattern = '(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)' %}
          {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
          {% set best = namespace(days=9999, found=false) %}
          {% for entity in states.sensor %}
            {% if entity.entity_id is search(pattern) and entity.state not in unusable %}
              {% set days = entity.state | float(9999) %}
              {% if days != 9999 %}
                {% set best.found = true %}
                {% if days < best.days %}
                  {% set best.days = days %}
                {% endif %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {% if best.found %}
            {{ best.days | round(0) }}
          {% else %}
            {{ none }}
          {% endif %}

      # Number of sensor entities whose entity_id names a monitored domain
      # followed by cert/ssl/tls, regardless of their current state.
      - name: "Infra Cert Telemetry Count"
        unique_id: infra_cert_telemetry_count
        icon: mdi:counter
        state: >-
          {{ states.sensor
             | selectattr('entity_id', 'search', '(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)')
             | list
             | count }}

      # Number of monitored website binary sensors that exist and are currently
      # off, unknown, or unavailable. Missing entities are not counted.
      - name: "Infra Website Down Count"
        unique_id: infra_website_down_count
        icon: mdi:counter
        state: >-
          {% set monitors = [
            'binary_sensor.vcloudinfo_com',
            'binary_sensor.ipmer_com',
            'binary_sensor.fordst_com',
            'binary_sensor.www_kingcrafthomes_com'
          ] %}
          {% set down = namespace(total=0) %}
          {% for monitor in monitors %}
            {% if expand(monitor) | count > 0 and states(monitor) in ['off', 'unknown', 'unavailable'] %}
              {% set down.total = down.total + 1 %}
            {% endif %}
          {% endfor %}
          {{ down.total }}

  - binary_sensor:
      # Problem when either WAN telemetry sensor has no usable value, packet
      # loss exceeds 5 %, or latency exceeds 80 ms.
      - name: "Infra WAN Quality Degraded"
        unique_id: infra_wan_quality_degraded
        device_class: problem
        state: >-
          {% set missing = ['unknown', 'unavailable', 'none', ''] %}
          {% set loss_state = states('sensor.infra_wan_packet_loss') %}
          {% set latency_state = states('sensor.infra_wan_latency_ms') %}
          {% if loss_state in missing or latency_state in missing %}
            {{ true }}
          {% else %}
            {{ (loss_state | float(0)) > 5 or (latency_state | float(0)) > 80 }}
          {% endif %}

      # Problem when switch.pi_hole is anything other than on, or the Pi-hole
      # status binary sensor is off, unavailable, or unknown.
      - name: "Infra DNS Pihole Degraded"
        unique_id: infra_dns_pihole_degraded
        device_class: problem
        state: >-
          {% set blocking_off = not is_state('switch.pi_hole', 'on') %}
          {% set service_bad = states('binary_sensor.pihole_status') in ['off', 'unavailable', 'unknown'] %}
          {{ blocking_off or service_bad }}

      # Problem when the Nebula Sync DNS consistency sensor is not "ok", or when
      # at least one Nebula Sync container entity exists but none of them
      # reports on/running. Attributes mirror the probe details and the raw
      # container / secondary Pi-hole states for diagnostics.
      - name: "Infra Nebula Sync Degraded"
        unique_id: infra_nebula_sync_degraded
        device_class: problem
        state: >-
          {% set dns_ok = (states('sensor.infra_nebula_sync_dns_consistency') | lower) == 'ok' %}
          {% set telemetry_present =
               (expand('binary_sensor.nebula_sync_status') | count > 0)
            or (expand('binary_sensor.nebula_sync_status_2') | count > 0)
            or (expand('sensor.nebula_sync_state') | count > 0)
            or (expand('sensor.nebula_sync_state_2') | count > 0)
            or (expand('switch.nebula_sync_container') | count > 0)
            or (expand('switch.nebula_sync_container_2') | count > 0) %}
          {% set container_up =
               is_state('binary_sensor.nebula_sync_status', 'on')
            or is_state('binary_sensor.nebula_sync_status_2', 'on')
            or (states('sensor.nebula_sync_state') | lower) == 'running'
            or (states('sensor.nebula_sync_state_2') | lower) == 'running'
            or is_state('switch.nebula_sync_container', 'on')
            or is_state('switch.nebula_sync_container_2', 'on') %}
          {{ not dns_ok or (telemetry_present and not container_up) }}
        attributes:
          dns_consistency: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
          host: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }}"
          expected_ip: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }}"
          primary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }}"
          secondary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }}"
          primary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }}"
          secondary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }}"
          primary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }}"
          secondary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }}"
          nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
          nebula_status_2: "{{ states('binary_sensor.nebula_sync_status_2') }}"
          nebula_state: "{{ states('sensor.nebula_sync_state') }}"
          nebula_state_2: "{{ states('sensor.nebula_sync_state_2') }}"
          pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
          pihole_secondary_status_2: "{{ states('binary_sensor.pihole_secondary_status_2') }}"

      # Problem whenever the IoT DNS consistency sensor is anything other than
      # "ok" (case-insensitive). Attributes mirror the probe's counters and
      # mismatch summary.
      - name: "Infra Pihole IoT DNS Degraded"
        unique_id: infra_pihole_iot_dns_degraded
        device_class: problem
        state: "{{ states('sensor.infra_pihole_iot_dns_consistency') | lower != 'ok' }}"
        attributes:
          dns_consistency: "{{ states('sensor.infra_pihole_iot_dns_consistency') }}"
          checked_records: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'checked_records') }}"
          mismatch_count: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatch_count') }}"
          mismatches: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatches') }}"
          primary_dns: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'primary_dns') }}"
          backup_dns: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'backup_dns') }}"

      # Problem when the garage UPS status string contains "OB"
      # (compared case-insensitively).
      - name: "Infra UPS On Battery"
        unique_id: infra_ups_on_battery
        device_class: problem
        state: "{{ 'OB' in (states('sensor.garage_ups_status') | upper) }}"

      # Problem when the website down count is above zero.
      - name: "Infra Website Degraded"
        unique_id: infra_website_degraded
        device_class: problem
        state: "{{ states('sensor.infra_website_down_count') | int(0) > 0 }}"

      # Problem when any matching `*uptime_1d` sensor with a usable state
      # reports less than 99. Sensors without a usable state are ignored, so the
      # result is off when no telemetry is available.
      - name: "Infra Website Uptime SLO Breach"
        unique_id: infra_website_uptime_slo_breach
        device_class: problem
        state: >-
          {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
          {% set slo = namespace(breached=false) %}
          {% for entity in states.sensor %}
            {% if entity.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*uptime_1d$')
                  and entity.state not in unusable
                  and (entity.state | float(100)) < 99 %}
              {% set slo.breached = true %}
            {% endif %}
          {% endfor %}
          {{ slo.breached }}

      # Problem when any matching `*avg_response_time_1d` sensor with a usable
      # state reports more than 1.2. Sensors without a usable state are ignored,
      # so the result is off when no telemetry is available.
      - name: "Infra Website Latency Degraded"
        unique_id: infra_website_latency_degraded
        device_class: problem
        state: >-
          {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
          {% set latency = namespace(breached=false) %}
          {% for entity in states.sensor %}
            {% if entity.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*avg_response_time_1d$')
                  and entity.state not in unusable
                  and (entity.state | float(0)) > 1.2 %}
              {% set latency.breached = true %}
            {% endif %}
          {% endfor %}
          {{ latency.breached }}

      # Problem when the nearest domain expiry is fewer than 14 days away.
      # Stays off while the min-days sensor has no usable value.
      - name: "Infra Domain Expiry Critical"
        unique_id: infra_domain_expiry_critical
        device_class: problem
        state: >-
          {% set remaining = states('sensor.infra_domain_expiry_min_days') %}
          {{ remaining not in ['unknown', 'unavailable', 'none', '']
             and (remaining | float(9999)) < 14 }}

      # Problem when the nearest domain expiry is at least 14 but fewer than
      # 30 days away. Stays off while the min-days sensor has no usable value.
      - name: "Infra Domain Expiry Warning"
        unique_id: infra_domain_expiry_warning
        device_class: problem
        state: >-
          {% set remaining = states('sensor.infra_domain_expiry_min_days') %}
          {% set days = remaining | float(9999) %}
          {{ remaining not in ['unknown', 'unavailable', 'none', '']
             and days >= 14 and days < 30 }}

      # Problem when the nearest certificate expiry is fewer than 14 days away.
      # Stays off while the min-days sensor has no usable value.
      - name: "Infra Cert Expiry Critical"
        unique_id: infra_cert_expiry_critical
        device_class: problem
        state: >-
          {% set remaining = states('sensor.infra_cert_expiry_min_days') %}
          {{ remaining not in ['unknown', 'unavailable', 'none', '']
             and (remaining | float(9999)) < 14 }}

      # Problem when the nearest certificate expiry is at least 14 but fewer
      # than 30 days away. Stays off while the min-days sensor has no usable value.
      - name: "Infra Cert Expiry Warning"
        unique_id: infra_cert_expiry_warning
        device_class: problem
        state: >-
          {% set remaining = states('sensor.infra_cert_expiry_min_days') %}
          {% set days = remaining | float(9999) %}
          {{ remaining not in ['unknown', 'unavailable', 'none', '']
             and days >= 14 and days < 30 }}

automation:
  # Writes a NETWORK logbook entry whenever sensor.infra_external_ip moves from
  # one usable value to a different usable value. Transitions that start or end
  # in unknown/unavailable/none/empty are ignored.
  - id: infra_external_ip_change_logbook
    alias: "Infrastructure - External IP Change Logbook"
    description: "Log external IP changes into the Activity feed."
    mode: queued
    trigger:
      - platform: state
        entity_id: sensor.infra_external_ip
    condition:
      - condition: template
        value_template: >-
          {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
          {{ trigger.from_state is not none
             and trigger.from_state.state not in unusable
             and trigger.to_state.state not in unusable
             and trigger.from_state.state != trigger.to_state.state }}
    action:
      - service: script.send_to_logbook
        data:
          topic: "NETWORK"
          message: "External IP changed from {{ trigger.from_state.state }} to {{ trigger.to_state.state }}."

  # Mirrors binary_sensor.infra_website_uptime_slo_breach into a Repairs issue:
  # the issue is created when the sensor turns on and removed on any other
  # state change (removal errors are tolerated).
  - id: infra_website_uptime_slo_repair
    alias: "Infrastructure - Website Uptime SLO Repair"
    description: "Create/clear Repairs issue when website 1-day uptime breaches SLO."
    mode: queued
    trigger:
      - platform: state
        entity_id: binary_sensor.infra_website_uptime_slo_breach
    action:
      - if:
          - condition: template
            value_template: "{{ trigger.to_state.state == 'on' }}"
        then:
          - service: repairs.create
            data:
              issue_id: infra_website_uptime_slo_breach
              title: "Website uptime SLO breached"
              description: >
                At least one monitored website has uptime_1d below 99%.
                Review Uptime Kuma entities on the Website Health dashboard.
              severity: warning
              persistent: true
        else:
          - service: repairs.remove
            continue_on_error: true
            data:
              issue_id: infra_website_uptime_slo_breach

  # Mirrors binary_sensor.infra_website_latency_degraded into a Repairs issue:
  # the issue is created when the sensor turns on and removed on any other
  # state change (removal errors are tolerated).
  - id: infra_website_latency_repair
    alias: "Infrastructure - Website Latency Repair"
    description: "Create/clear Repairs issue when website response times degrade."
    mode: queued
    trigger:
      - platform: state
        entity_id: binary_sensor.infra_website_latency_degraded
    action:
      - if:
          - condition: template
            value_template: "{{ trigger.to_state.state == 'on' }}"
        then:
          - service: repairs.create
            data:
              issue_id: infra_website_latency_degraded
              title: "Website latency degraded"
              description: >
                At least one monitored website reports avg_response_time_1d above 1.2s.
                Review Uptime Kuma response-time entities on Website Health.
              severity: warning
              persistent: true
        else:
          - service: repairs.remove
            continue_on_error: true
            data:
              issue_id: infra_website_latency_degraded

  # Nebula Sync health dispatch.
  # Runs when binary_sensor.infra_nebula_sync_degraded has been on for 10
  # minutes or off for 2 minutes, on Home Assistant start, and every 30 minutes.
  # The action does not branch on the trigger id; every run re-evaluates the
  # current degraded flag against the band stored in
  # input_text.infra_nebula_sync_health_band, so a dispatch or recovery message
  # is only produced when the band actually changes.
  - alias: "Infrastructure - Nebula Sync Health Dispatch"
    id: infra_nebula_sync_health_dispatch
    description: "Dispatch Joanna when Nebula Sync DNS consistency or container telemetry stays degraded."
    mode: queued
    trigger:
      # Sustained degradation.
      - platform: state
        entity_id: binary_sensor.infra_nebula_sync_degraded
        to: "on"
        for: "00:10:00"
        id: degraded
      # Sustained recovery.
      - platform: state
        entity_id: binary_sensor.infra_nebula_sync_degraded
        to: "off"
        for: "00:02:00"
        id: recovered
      # Re-check after a restart and periodically.
      - platform: homeassistant
        event: start
        id: reconcile
      - platform: time_pattern
        minutes: "/30"
        id: reconcile
    # Snapshot of the relevant states, taken once per run.
    variables:
      issue_id: infra_nebula_sync_degraded
      dns_state: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
      previous_band: "{{ states('input_text.infra_nebula_sync_health_band') | lower }}"
      degraded: "{{ is_state('binary_sensor.infra_nebula_sync_degraded', 'on') }}"
      nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
      nebula_status_alt: "{{ states('binary_sensor.nebula_sync_status_2') }}"
      nebula_state: "{{ states('sensor.nebula_sync_state') }}"
      nebula_state_alt: "{{ states('sensor.nebula_sync_state_2') }}"
      pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
      pihole_secondary_status_alt: "{{ states('binary_sensor.pihole_secondary_status_2') }}"
    action:
      - choose:
          # Entering the warning band: clear any Repair with this issue id,
          # dispatch Joanna with diagnostics, log it, and record the band.
          - conditions: "{{ degraded and previous_band != 'warning' }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: "{{ issue_id }}"
              - service: script.joanna_dispatch
                data:
                  trigger_context: "HA automation infra_nebula_sync_health_dispatch (Infrastructure - Nebula Sync Health Dispatch)"
                  source: "home_assistant_automation.infra_nebula_sync_health_dispatch.warning"
                  summary: "Nebula Sync DNS consistency or container health is degraded"
                  entity_ids:
                    - sensor.infra_nebula_sync_dns_consistency
                    - binary_sensor.infra_nebula_sync_degraded
                    - binary_sensor.nebula_sync_status
                    - binary_sensor.nebula_sync_status_2
                    - sensor.nebula_sync_state
                    - sensor.nebula_sync_state_2
                    - binary_sensor.pihole_secondary_status
                    - binary_sensor.pihole_secondary_status_2
                  diagnostics: >-
                    issue_id={{ issue_id }},
                    dns_consistency={{ dns_state }},
                    host={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }},
                    expected_ip={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }},
                    primary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }},
                    secondary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }},
                    primary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }},
                    secondary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }},
                    primary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }},
                    secondary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }},
                    nebula_status={{ nebula_status }},
                    nebula_status_2={{ nebula_status_alt }},
                    nebula_state={{ nebula_state }},
                    nebula_state_2={{ nebula_state_alt }},
                    pihole_secondary_status={{ pihole_secondary_status }},
                    pihole_secondary_status_2={{ pihole_secondary_status_alt }},
                    primary_dns=192.168.10.10,
                    backup_dns=192.168.10.14
                  request: >-
                    Investigate Nebula Sync on docker_14 and the backup Pi-hole sync path.
                    Verify both Pi-holes answer the GTG-PF45FK6F short name, FQDN, and reverse lookup consistently.
                    Check nebula_sync container status, Docker health, recent sync logs, and primary/replica Pi-hole API reachability.
                    If confidence is high, perform safe remediation such as a one-time Nebula Sync run or restarting only the nebula_sync container.
                    Do not restart Pi-hole or change DHCP/custom DNS records unless diagnostics prove data drift and the action is safe.
                    Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
                  domain_hint: ops
                  lane_hint: joanna.ops
              - service: script.send_to_logbook
                data:
                  topic: "DNS"
                  message: >-
                                        Nebula Sync DNS consistency is degraded ({{ dns_state }}); Joanna investigation requested without opening a Repair.
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_nebula_sync_health_band
                data:
                  value: warning
          # Leaving the warning band (or the helper was unavailable): clear any
          # Repair, log the recovery, and record the band as normal.
          - conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: "{{ issue_id }}"
              - service: script.send_to_logbook
                data:
                  topic: "DNS"
                  message: "Nebula Sync DNS consistency recovered; Joanna-only warning state cleared."
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_nebula_sync_health_band
                data:
                  value: normal
          # Healthy but the helper holds some other value (for example it was
          # never set): initialize it to normal without logging.
          - conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
            sequence:
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_nebula_sync_health_band
                data:
                  value: normal

  # Promoted IoT Pi-hole DNS drift dispatch.
  # Runs when binary_sensor.infra_pihole_iot_dns_degraded has been on for 10
  # minutes or off for 2 minutes, on Home Assistant start, and at minute 7 of
  # every hour. The action does not branch on the trigger id; every run
  # re-evaluates the current degraded flag against the band stored in
  # input_text.infra_pihole_iot_dns_health_band, so a dispatch or recovery
  # message is only produced when the band actually changes.
  - alias: "Infrastructure - Pi-hole IoT DNS Drift Dispatch"
    id: infra_pihole_iot_dns_drift_dispatch
    description: "Dispatch Joanna when promoted IoT Pi-hole DNS records drift across primary and backup resolvers."
    mode: queued
    trigger:
      # Sustained degradation.
      - platform: state
        entity_id: binary_sensor.infra_pihole_iot_dns_degraded
        to: "on"
        for: "00:10:00"
        id: degraded
      # Sustained recovery.
      - platform: state
        entity_id: binary_sensor.infra_pihole_iot_dns_degraded
        to: "off"
        for: "00:02:00"
        id: recovered
      # Startup reconcile. The dedicated homeassistant trigger is required here:
      # automation triggers are attached while the `homeassistant_started` event
      # is being handled, so an event trigger on that event never fires.
      - platform: homeassistant
        event: start
        id: reconcile
      # Hourly backstop at minute 7.
      # NOTE(review): the Nebula Sync dispatch automation reconciles every 30
      # minutes ("/30"); confirm the hourly cadence is intended here.
      - platform: time_pattern
        minutes: "7"
        id: reconcile
    # Snapshot of the relevant states, taken once per run.
    variables:
      issue_id: infra_pihole_iot_dns_degraded
      dns_state: "{{ states('sensor.infra_pihole_iot_dns_consistency') }}"
      previous_band: "{{ states('input_text.infra_pihole_iot_dns_health_band') | lower }}"
      degraded: "{{ is_state('binary_sensor.infra_pihole_iot_dns_degraded', 'on') }}"
    action:
      - choose:
          # Entering the warning band: clear any Repair with this issue id,
          # dispatch Joanna with diagnostics, log it, and record the band.
          - conditions: "{{ degraded and previous_band != 'warning' }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: "{{ issue_id }}"
              - service: script.joanna_dispatch
                data:
                  trigger_context: "HA automation infra_pihole_iot_dns_drift_dispatch (Infrastructure - Pi-hole IoT DNS Drift Dispatch)"
                  source: "home_assistant_automation.infra_pihole_iot_dns_drift_dispatch.warning"
                  summary: "Promoted IoT Pi-hole DNS records drifted across primary and backup resolvers"
                  entity_ids:
                    - sensor.infra_pihole_iot_dns_consistency
                    - binary_sensor.infra_pihole_iot_dns_degraded
                  diagnostics: >-
                    issue_id={{ issue_id }},
                    dns_consistency={{ dns_state }},
                    checked_records={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'checked_records') }},
                    mismatch_count={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatch_count') }},
                    mismatches={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatches') }},
                    primary_dns={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'primary_dns') }},
                    backup_dns={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'backup_dns') }}
                  request: >-
                    Investigate primary/backup Pi-hole DNS drift for promoted IoT reservations.
                    Verify both Pi-holes answer rachio, econet, dreame-vacuum, carlo-bed, lg-smart-fridge, tesla-blackbox-gw, and bgw210 FQDNs with the expected reserved IPs.
                    Check primary and backup pihole.toml local DNS host records, Nebula Sync behavior, and generated custom.list files.
                    Do not change DHCP/custom DNS records unless diagnostics prove drift and the action is safe.
                    Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
                  domain_hint: ops
                  lane_hint: joanna.ops
              - service: script.send_to_logbook
                data:
                  topic: "DNS"
                  message: >-
                    Promoted IoT Pi-hole DNS consistency is degraded ({{ dns_state }}); Joanna investigation requested without opening a Repair.
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_pihole_iot_dns_health_band
                data:
                  value: warning
          # Leaving the warning band (or the helper was unavailable): clear any
          # Repair, log the recovery, and record the band as normal.
          - conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: "{{ issue_id }}"
              - service: script.send_to_logbook
                data:
                  topic: "DNS"
                  message: "Promoted IoT Pi-hole DNS consistency recovered; Joanna-only warning state cleared."
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_pihole_iot_dns_health_band
                data:
                  value: normal
          # Healthy but the helper holds some other value (for example it was
          # never set): initialize it to normal without logging.
          - conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
            sequence:
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_pihole_iot_dns_health_band
                data:
                  value: normal

  # Docker Host Disk Pressure Monitor
  # Classifies each Docker host's normalized root-disk percentage into a band
  # (normal / warning / critical) and acts only when that band differs from the
  # band remembered in the host's input_text helper.
  #   critical (>= 90): create a Repair, dispatch Joanna, log, store "critical".
  #   warning  (>= 80): dispatch Joanna and log only; any Repair for the host is removed.
  #   normal          : remove the Repair and store "normal".
  # A non-numeric reading is classified "unavailable"; no branch below matches it,
  # so the remembered band (and any open Repair) is left untouched.
  - alias: "Docker Host Disk Pressure Monitor"
    id: docker_host_disk_pressure_monitor
    description: "Track Docker host root disk pressure from normalized Glances sensors and dispatch Joanna on band changes."
    mode: queued
    trigger:
      # Periodic re-evaluation every 15 minutes.
      - platform: time_pattern
        minutes: "/15"
      # Immediate re-evaluation when any normalized disk sensor changes.
      - platform: state
        entity_id:
          - sensor.docker_17_disk_used_percentage
          - sensor.docker_14_disk_used_percentage
          - sensor.docker_69_disk_used_percentage
    variables:
      # One entry per monitored host: the normalized percentage sensor, the raw
      # usage/free/used sensors passed along as diagnostics, the helper that
      # stores the last band, and the Repair issue id used for that host.
      host_configs:
        - host_id: docker_17
          host_name: docker_17
          disk_entity: sensor.docker_17_disk_used_percentage
          raw_entity: sensor.192_168_10_17_disk_usage
          free_entity: sensor.192_168_10_17_disk_free
          used_entity: sensor.192_168_10_17_disk_used
          band_entity: input_text.docker_17_disk_pressure_band
          issue_id: docker_host_docker_17_disk_pressure
        - host_id: docker_14
          host_name: docker_14
          disk_entity: sensor.docker_14_disk_used_percentage
          raw_entity: sensor.docker14_disk_usage
          free_entity: sensor.docker14_disk_free
          used_entity: sensor.docker14_disk_used
          band_entity: input_text.docker_14_disk_pressure_band
          issue_id: docker_host_docker_14_disk_pressure
        - host_id: docker_69
          host_name: docker_69
          disk_entity: sensor.docker_69_disk_used_percentage
          raw_entity: sensor.docker69_disk_usage
          free_entity: sensor.docker69_disk_free
          used_entity: sensor.docker69_disk_used
          band_entity: input_text.docker_69_disk_pressure_band
          issue_id: docker_host_docker_69_disk_pressure
    action:
      # Every trigger re-evaluates all hosts, not just the one that changed.
      - repeat:
          for_each: "{{ host_configs }}"
          sequence:
            - variables:
                host_id: "{{ repeat.item.host_id }}"
                host_name: "{{ repeat.item.host_name }}"
                disk_entity: "{{ repeat.item.disk_entity }}"
                raw_entity: "{{ repeat.item.raw_entity }}"
                free_entity: "{{ repeat.item.free_entity }}"
                used_entity: "{{ repeat.item.used_entity }}"
                band_entity: "{{ repeat.item.band_entity }}"
                issue_id: "{{ repeat.item.issue_id }}"
                disk_state: "{{ states(disk_entity) }}"
                disk_pct: "{{ disk_state | float(0) }}"
                previous_band: "{{ states(band_entity) | lower }}"
                # is_number() rejects every non-numeric state (unknown, unavailable,
                # empty, None, nan, ...). Without it such a state would fall through
                # float(0) as 0 and be misread as "normal", clearing a live Repair.
                current_band: >-
                  {{ 'unavailable' if not is_number(disk_state)
                     else 'critical' if disk_pct >= 90
                     else 'warning' if disk_pct >= 80
                     else 'normal' }}
            - choose:
                # Entering critical from any other band: Repair + Joanna + logbook.
                - conditions: "{{ current_band == 'critical' and previous_band != 'critical' }}"
                  sequence:
                    - service: repairs.create
                      data:
                        issue_id: "{{ issue_id }}"
                        severity: error
                        persistent: true
                        title: "{{ host_name }} disk pressure critical ({{ disk_pct | round(1) }}%)"
                        description: >-
                          {{ host_name }} root disk usage is critically high.
                          Free space or expand the host filesystem before Docker workloads fail.
                    - service: script.joanna_dispatch
                      data:
                        trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Critical)"
                        source: "home_assistant_automation.docker_host_disk_pressure_monitor.critical"
                        summary: "{{ host_name }} root disk pressure is critical at {{ disk_pct | round(1) }}%"
                        entity_ids:
                          - "{{ disk_entity }}"
                          - "{{ raw_entity }}"
                          - "{{ free_entity }}"
                          - "{{ used_entity }}"
                        diagnostics: >-
                          issue_id={{ issue_id }},
                          host_id={{ host_id }},
                          disk_entity={{ disk_entity }},
                          raw_entity={{ raw_entity }},
                          disk_pct={{ disk_pct | round(1) }},
                          disk_free={{ states(free_entity) }},
                          disk_used={{ states(used_entity) }},
                          threshold=90
                        request: >-
                          Investigate critical disk pressure on {{ host_name }} and perform safe remediation when confidence is high.
                          Check Docker build cache, image/container volumes, logs, backups, and large files first.
                          Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
                          Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
                          Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: >-
                          {{ host_name }} disk usage is critical at {{ disk_pct | round(1) }}%.
                          Repair {{ issue_id }} opened and Joanna investigation requested.
                    # The band is stored last, after the Repair, dispatch and logbook steps.
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "critical"
                # Entering warning from normal/unknown: Joanna only, no Repair is opened.
                - conditions: "{{ current_band == 'warning' and previous_band not in ['warning', 'critical'] }}"
                  sequence:
                    # Best-effort removal of any Repair for this host; errors are ignored.
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: script.joanna_dispatch
                      data:
                        trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Warning)"
                        source: "home_assistant_automation.docker_host_disk_pressure_monitor.warning"
                        summary: "{{ host_name }} root disk pressure warning at {{ disk_pct | round(1) }}%"
                        entity_ids:
                          - "{{ disk_entity }}"
                          - "{{ raw_entity }}"
                          - "{{ free_entity }}"
                          - "{{ used_entity }}"
                        diagnostics: >-
                          issue_id={{ issue_id }},
                          host_id={{ host_id }},
                          disk_entity={{ disk_entity }},
                          raw_entity={{ raw_entity }},
                          disk_pct={{ disk_pct | round(1) }},
                          disk_free={{ states(free_entity) }},
                          disk_used={{ states(used_entity) }},
                          threshold=80
                        request: >-
                          Investigate elevated disk usage on {{ host_name }} and perform safe low-risk cleanup before it becomes critical when confidence is high.
                          Check Docker build cache, image/container volumes, logs, backups, and large files first.
                          Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
                          Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
                          Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: >-
                          {{ host_name }} disk usage warning at {{ disk_pct | round(1) }}%.
                          Joanna investigation requested without opening a warning Repair.
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "warning"
                # Easing from critical to warning: clear the Repair and log; no new dispatch.
                - conditions: "{{ current_band == 'warning' and previous_band == 'critical' }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%. Critical Repair cleared; Joanna continues handling warning-level cleanup."
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "warning"
                # Recovery to normal from warning or critical: clear the Repair and log.
                - conditions: "{{ current_band == 'normal' and previous_band in ['warning', 'critical'] }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: "{{ host_name }} disk usage recovered to {{ disk_pct | round(1) }}%. Repair {{ issue_id }} cleared."
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "normal"
                # Remembered band is not a recognised value (e.g. empty or unknown helper):
                # silently initialise it to normal, with no logbook entry.
                - conditions: "{{ current_band == 'normal' and previous_band not in ['normal', 'warning', 'critical'] }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "normal"

  # Infrastructure - Backup Nightly Verification
  # Calls the Duplicati verifier REST command and branches on its answer:
  #   healthy                        -> remove both Repair ids, switch the recovery-poll helper off.
  #   status api_error / unknown     -> logbook entry only (verifier could not prove anything).
  #   anything else (backup failure) -> switch the recovery-poll helper on; on the first
  #                                     (non-poll) run also create the Repair and dispatch Joanna.
  # Recovery polls fire at minute 15 and 45 of each hour and only pass the condition
  # while input_boolean.infra_duplicati_backup_repair_active is on.
  - alias: "Infrastructure - Backup Nightly Verification"
    id: infra_backup_nightly_verification
    description: "Use codex_appliance to verify the latest Duplicati run and dispatch Joanna only on failure."
    mode: single
    trigger:
      - platform: time
        at: "08:00:00"
        id: nightly
      # Both time_pattern triggers share one id so the condition and actions can
      # treat them as a single "recovery_poll" source.
      - platform: time_pattern
        minutes: "15"
        id: recovery_poll
      - platform: time_pattern
        minutes: "45"
        id: recovery_poll
    condition:
      # Non-poll triggers always pass; recovery polls pass only while the
      # repair-active helper is on.
      - condition: template
        value_template: >-
          {{ trigger is not defined or trigger.id != 'recovery_poll'
             or is_state('input_boolean.infra_duplicati_backup_repair_active', 'on') }}
    action:
      - variables:
          # Falls back to 'manual' when no trigger object/id is available.
          trigger_source: "{{ trigger.id if trigger is defined and trigger.id is defined else 'manual' }}"
          verifier_reason: "{{ 'ha_failure_followup' if trigger_source == 'recovery_poll' else 'ha_nightly' }}"
          trigger_context: "HA automation infra_backup_nightly_verification (Infrastructure - Backup Nightly Verification)"
          duplicati_state: "{{ states('switch.duplicati_container') }}"
      # continue_on_error keeps the run alive when the REST call itself fails
      # (timeout, connection error). The templates below already fall back to empty
      # payloads when duplicati_verify is not a mapping, which yields status
      # 'unknown' and routes to the log-only transport branch instead of aborting.
      - action: rest_command.bearclaw_duplicati_verify
        continue_on_error: true
        data:
          reason: "{{ verifier_reason }}"
        response_variable: duplicati_verify
      # Always log the verifier's own summary (or a generic line if none was returned).
      - service: script.send_to_logbook
        data:
          topic: "BACKUP"
          message: >-
            {% set payload = duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} %}
            {% set detail = payload['detail'] if payload is mapping and payload['detail'] is mapping else {} %}
            {{ detail.get('summary', 'Nightly Duplicati verification completed.') }}
      - variables:
          # Each level is only unwrapped when it is a mapping, otherwise {} is used.
          verify_payload: "{{ duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} }}"
          verify_detail: "{{ verify_payload['detail'] if verify_payload is mapping and verify_payload['detail'] is mapping else {} }}"
          verify_http_status: "{{ duplicati_verify['status'] | int(0) if duplicati_verify is mapping else 0 }}"
          # Healthy requires both the top-level 'ok' flag and detail.healthy.
          verify_healthy: "{{ verify_payload.get('ok', false) and verify_detail.get('healthy', false) }}"
          verify_status: "{{ verify_detail.get('status', 'unknown') }}"
          verify_summary: "{{ verify_detail.get('summary', 'Duplicati verification did not return a summary.') }}"
          verify_issue: "{{ verify_detail.get('issue', verify_payload.get('error', 'duplicati_verify_failed')) }}"
          verify_backup_name: "{{ verify_detail.get('backupName', 'Docker_Configs') }}"
          # Guard on "is mapping" rather than a .get() default: a key that is present
          # but null would otherwise become None and break the .get('endedAt') lookups
          # used by the Repair description and Joanna diagnostics below.
          verify_latest_result: "{{ verify_detail.get('latestResult') if verify_detail is mapping and verify_detail.get('latestResult') is mapping else {} }}"
          verify_last_success: "{{ verify_detail.get('lastSuccessfulRun') if verify_detail is mapping and verify_detail.get('lastSuccessfulRun') is mapping else {} }}"
          verify_transport_issue: "{{ verify_status in ['api_error', 'unknown'] }}"
      - choose:
          # Healthy: clear both Repair ids (plain and user_-prefixed) and stop recovery polling.
          - conditions: "{{ verify_healthy }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: infra_duplicati_backup_failure
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: user_infra_duplicati_backup_failure
              - service: input_boolean.turn_off
                target:
                  entity_id: input_boolean.infra_duplicati_backup_repair_active
          # Verifier transport/API problem: log only, leave Repairs and the helper as they are.
          - conditions: "{{ verify_transport_issue }}"
            sequence:
              - service: script.send_to_logbook
                data:
                  topic: "BACKUP"
                  message: >-
                    Duplicati verifier could not prove backup health because the verification service returned
                    status {{ verify_status }} with issue {{ verify_issue }}. No repair card was opened because
                    this is verifier transport state, not a confirmed backup failure.
        # Confirmed failure: enable recovery polling, then escalate once.
        default:
          - service: input_boolean.turn_on
            target:
              entity_id: input_boolean.infra_duplicati_backup_repair_active
          - choose:
              # Nightly or manual run: open the Repair and dispatch Joanna.
              - conditions: "{{ trigger_source != 'recovery_poll' }}"
                sequence:
                  - service: repairs.create
                    data:
                      issue_id: infra_duplicati_backup_failure
                      title: "Duplicati nightly backup verification failed"
                      description: >-
                        {{ verify_summary }}
                        Backup={{ verify_backup_name }};
                        status={{ verify_status }};
                        last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
                        last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
                      severity: error
                      persistent: true
                  - service: script.joanna_dispatch
                    data:
                      trigger_context: "{{ trigger_context }}"
                      source: "home_assistant_automation.infra_backup_nightly_verification"
                      summary: "Nightly Duplicati backup verification failed"
                      entity_ids:
                        - "switch.duplicati_container"
                      diagnostics: >-
                        scheduled_time=08:00:00,
                        duplicati_container={{ duplicati_state }},
                        verifier_http_status={{ verify_http_status }},
                        verifier_status={{ verify_status }},
                        verifier_issue={{ verify_issue }},
                        backup_name={{ verify_backup_name }},
                        latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
                        last_success={{ verify_last_success.get('endedAt', 'n/a') }}
                      request: >-
                        Investigate the Duplicati backup job {{ verify_backup_name }}.
                        The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
                        Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
                        Home Assistant will re-check this verifier every 30 minutes after dispatch and clear the Repair automatically once the backup is healthy.
                        Reply with explicit status fields:
                        resolved=true/false,
                        backup_status,
                        last_success_time,
                        root_cause,
                        action_taken,
                        verification,
                        next_action_required=true/false.
            # Recovery poll that still sees a failure: log it, do not re-dispatch.
            default:
              - service: script.send_to_logbook
                data:
                  topic: "BACKUP"
                  message: >-
                    Duplicati recovery follow-up still reports {{ verify_status }} for {{ verify_backup_name }}:
                    {{ verify_issue }}. Existing Repair remains open; Joanna was not dispatched again.

  # Infrastructure - Monthly HA Log Hygiene Review
  # Dispatches a recommendations-only log review request to Joanna and records
  # the dispatch in the logbook. The request text asks for a GitHub issue plus a
  # Telegram summary and tells Joanna not to change anything without approval.
  - alias: "Infrastructure - Monthly HA Log Hygiene Review"
    id: infra_monthly_log_hygiene_review
    description: "Ask Joanna monthly to review Home Assistant logs, create a GitHub issue with noisy entries, and send Telegram recommendations only."
    mode: single
    trigger:
      # Fires every day at 03:20; the condition below narrows it to once a month.
      - platform: time
        at: "03:20:00"
    condition:
      # Only proceed on the first day of the month.
      - condition: template
        value_template: "{{ now().day == 1 }}"
    variables:
      # Passed through to the dispatch script as its trigger_context field.
      trigger_context: "HA automation infra_monthly_log_hygiene_review (Infrastructure - Monthly HA Log Hygiene Review)"
    action:
      # Hand the review request to Joanna; diagnostics restate the schedule,
      # scope, expected outputs, target repo, and the approval requirement.
      - service: script.joanna_dispatch
        data:
          trigger_context: "{{ trigger_context }}"
          source: "home_assistant_automation.infra_monthly_log_hygiene_review"
          summary: "Monthly Home Assistant log hygiene review with GitHub issue and Telegram follow-up"
          diagnostics: >-
            schedule=day_1@03:20:00,
            review_scope=available_home_assistant_logs,
            desired_outputs=telegram_follow_up+github_issue,
            github_repo=CCOSTAN/Home-AssistantConfig,
            approval_required_before_changes=true
          request: >-
            Review the available Home Assistant log files from the last month and identify noisy,
            low-value entries that could be safely suppressed, filtered, slowed, deduplicated, or
            retired. Focus on practical Home Assistant-side changes such as recorder exclusions,
            logger filtering, scan-interval reductions, entity retirement, or automation de-noising.
            Create or refresh a GitHub issue in CCOSTAN/Home-AssistantConfig that captures the noisy
            entries, estimated frequency, why each candidate is low-value, and the exact repo files
            or integrations likely to change. Then send Carlo a concise Telegram summary with the top
            recommendations and the GitHub issue number or link. Do not make any changes from this
            review. Wait for explicit follow-up approval first.
      # Record in the logbook that the review request was sent.
      - service: script.send_to_logbook
        data:
          topic: "HOME ASSISTANT"
          message: "Joanna monthly Home Assistant log hygiene review dispatched; Telegram summary and GitHub issue requested."