######################################################################
# @CCOSTAN - Follow Me on X
# For more info visit https://www.vcloudinfo.com/click-here
# Original Repo : https://github.com/CCOSTAN/Home-AssistantConfig
# -------------------------------------------------------------------
# Infrastructure - Observability, disk pressure, and Joanna review workflows
# WAN/DNS/website/domain/cert/Docker host state normalized for dashboards, plus scheduled infrastructure reviews.
# -------------------------------------------------------------------
# Related Issue: 1584
# Notes: Home dashboard consumes `infra_*` entities for exceptions-only alerts.
# Notes: Domain warning threshold is <30 days; critical threshold is <14 days.
# Notes: Nightly Duplicati verification runs at 08:00 after the 05:30 Duplicati job and docker_14 reboot window.
# Notes: Duplicati transport/API errors are logged only; repairs are reserved for proven failed or stale backups.
# Notes: Duplicati failure Repairs enable a recovery poll that clears the Repair after a later successful run.
# Notes: Monthly HA log hygiene review requests Telegram + GitHub issue follow-up only; Joanna must wait for approval before any changes.
# Notes: Numeric WAN telemetry exposes state_class so recorder can keep long-term statistics.
# Notes: Docker host root disk usage uses Glances-backed normalized sensors; raw Glances sensors are recorder/logbook-filtered.
# Notes: Disk-pressure dispatch allows bounded safe cleanup of disposable caches and old generated backup artifacts, but not live data or restarts.
# Notes: Warning-level Docker host disk pressure is Joanna-only; Repairs are reserved for critical pressure.
# Notes: Nebula Sync DNS consistency compares primary/backup Pi-hole answers and dispatches Joanna on sustained drift or container loss.
######################################################################

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
# Each input_text stores the last band an automation below acted on, so
# the automations only react when the band changes.
input_text:
  docker_17_disk_pressure_band:
    name: "docker_17 disk pressure band"
    max: 20
  docker_14_disk_pressure_band:
    name: "docker_14 disk pressure band"
    max: 20
  docker_69_disk_pressure_band:
    name: "docker_69 disk pressure band"
    max: 20
  infra_nebula_sync_health_band:
    name: "Nebula Sync health band"
    max: 20

# Turned on when the backup verification reports a failure; while it is on,
# the recovery_poll triggers of the backup automation are allowed to run.
input_boolean:
  infra_duplicati_backup_repair_active:
    name: "Duplicati backup repair active"

# ---------------------------------------------------------------------
# Command-line sensors
# ---------------------------------------------------------------------
# NOTE: every folded (`>-`) command below keeps all continuation lines at
# the same indentation so the lines fold into a single-line command.
command_line:
  # Packet loss: 10 pings to 1.1.1.1; awk prints the digits of the third
  # comma-separated field of the "packet loss" summary line, or "unknown"
  # when that line is missing.
  - sensor:
      name: Infra WAN Packet Loss
      unique_id: infra_wan_packet_loss
      command: >-
        ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
        awk -F',' '/packet loss/ {gsub(/[^0-9.]/, "", $3); print $3; found=1}
        END {if (!found) print "unknown"}'
      scan_interval: 300
      unit_of_measurement: "%"
      state_class: measurement
      value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"

  # Latency: awk prints the digits of the fifth "/"-separated field of the
  # rtt/round-trip summary line, or "unknown" when that line is missing.
  - sensor:
      name: Infra WAN Latency Ms
      unique_id: infra_wan_latency_ms
      command: >-
        ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
        awk -F'/' '/^rtt|^round-trip/ {gsub(/[^0-9.]/, "", $5); print $5; found=1}
        END {if (!found) print "unknown"}'
      scan_interval: 300
      unit_of_measurement: "ms"
      state_class: measurement
      value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"

  # Fallback source for the "Infra External IP" template sensor.
  - sensor:
      name: Infra External IP Fallback
      unique_id: infra_external_ip_fallback
      command: "curl -fsS https://api.ipify.org || echo unknown"
      scan_interval: 900

  # Queries both DNS servers for the short name, FQDN and reverse record of
  # one reference host and prints a JSON object. status is "ok" only when all
  # four forward answers equal the expected IP and both reverse answers are
  # non-empty and identical; otherwise it is "mismatch".
  - sensor:
      name: Infra Nebula Sync DNS Consistency
      unique_id: infra_nebula_sync_dns_consistency
      command: >-
        /bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14;
        host=GTG-PF45FK6F; fqdn=GTG-PF45FK6F.fordst.com; ip=192.168.10.117;
        q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null
        | tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; };
        r(){ dig +time=2 +tries=1 +short @"$1" -x "$2" 2>/dev/null
        | tr -d "\r" | sed "s/\.$//" | sort | tr "\n" "," | sed "s/,$//"; };
        p_short=$(q "$primary" "$host");
        s_short=$(q "$secondary" "$host");
        p_fqdn=$(q "$primary" "$fqdn");
        s_fqdn=$(q "$secondary" "$fqdn");
        p_rev=$(r "$primary" "$ip");
        s_rev=$(r "$secondary" "$ip");
        status=mismatch;
        if [ "$p_short" = "$ip" ] && [ "$s_short" = "$ip" ]
        && [ "$p_fqdn" = "$ip" ] && [ "$s_fqdn" = "$ip" ]
        && [ -n "$p_rev" ] && [ "$p_rev" = "$s_rev" ];
        then status=ok; fi;
        printf
        "{\"status\":\"%s\",\"host\":\"%s\",\"expected_ip\":\"%s\",\"primary_short\":\"%s\",\"secondary_short\":\"%s\",\"primary_fqdn\":\"%s\",\"secondary_fqdn\":\"%s\",\"primary_reverse\":\"%s\",\"secondary_reverse\":\"%s\"}\n"
        "$status" "$host" "$ip" "$p_short" "$s_short" "$p_fqdn" "$s_fqdn"
        "$p_rev" "$s_rev"'
      scan_interval: 300
      value_template: "{{ value_json.status | default('unknown') }}"
      json_attributes:
        - host
        - expected_ip
        - primary_short
        - secondary_short
        - primary_fqdn
        - secondary_fqdn
        - primary_reverse
        - secondary_reverse

# ---------------------------------------------------------------------
# Template entities
# ---------------------------------------------------------------------
template:
  - sensor:
      # Uses sensor.external_ip when it has a usable state, otherwise the
      # command-line fallback sensor.
      - name: "Infra External IP"
        unique_id: infra_external_ip
        state: >-
          {% set primary = states('sensor.external_ip') | trim %}
          {% set fallback = states('sensor.infra_external_ip_fallback') | trim %}
          {% if primary not in ['unknown', 'unavailable', 'none', ''] %}
            {{ primary }}
          {% else %}
            {{ fallback }}
          {% endif %}

      # Normalized per-host disk usage. `availability` marks the sensor
      # unavailable when the raw sensor has no usable state, so the float(0)
      # default is not published as a real 0% reading in that case.
      - name: "docker_17 Disk Used Percentage"
        unique_id: docker_17_disk_used_percentage
        unit_of_measurement: "%"
        state_class: measurement
        icon: mdi:harddisk
        availability: "{{ states('sensor.192_168_10_17_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
        state: "{{ states('sensor.192_168_10_17_disk_usage') | float(0) | round(1) }}"

      - name: "docker_14 Disk Used Percentage"
        unique_id: docker_14_disk_used_percentage
        unit_of_measurement: "%"
        state_class: measurement
        icon: mdi:harddisk
        availability: "{{ states('sensor.docker14_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
        state: "{{ states('sensor.docker14_disk_usage') | float(0) | round(1) }}"

      - name: "docker_69 Disk Used Percentage"
        unique_id: docker_69_disk_used_percentage
        unit_of_measurement: "%"
        state_class: measurement
        icon: mdi:harddisk
        availability: "{{ states('sensor.docker69_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
        state: "{{ states('sensor.docker69_disk_usage') | float(0) | round(1) }}"

      # Smallest days-until-expiration across the listed domain sensors.
      # 9999 is the "could not parse" sentinel of float(9999): a state that
      # is not numeric is skipped instead of being counted as a reading, the
      # same guard the cert sensor below uses. Renders none when no listed
      # sensor yields a numeric value.
      - name: "Infra Domain Expiry Min Days"
        unique_id: infra_domain_expiry_min_days
        unit_of_measurement: "d"
        state: >-
          {% set ids = [
            'sensor.vcloudinfo_com_days_until_expiration',
            'sensor.ipmer_com_days_until_expiration',
            'sensor.fordst_com_days_until_expiration',
            'sensor.kingcrafthomes_com_days_until_expiration'
          ] %}
          {% set ns = namespace(min=9999, any=false) %}
          {% for id in ids %}
            {% if expand(id) | count > 0 %}
              {% set raw = states(id) %}
              {% if raw not in ['unknown', 'unavailable', 'none', ''] %}
                {% set val = raw | float(9999) %}
                {% if val != 9999 %}
                  {% set ns.any = true %}
                  {% if val < ns.min %}
                    {% set ns.min = val %}
                  {% endif %}
                {% endif %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {% if ns.any %}
            {{ ns.min | round(0) }}
          {% else %}
            {{ none }}
          {% endif %}

      # Smallest numeric state across every sensor whose entity_id matches
      # the domain + cert/ssl/tls pattern. Renders none when nothing matched
      # with a numeric state.
      - name: "Infra Cert Expiry Min Days"
        unique_id: infra_cert_expiry_min_days
        unit_of_measurement: "d"
        state: >-
          {% set ns = namespace(min=9999, any=false) %}
          {% for item in states.sensor %}
            {% if item.entity_id is search('(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)') %}
              {% set raw = item.state %}
              {% if raw not in ['unknown', 'unavailable', 'none', ''] %}
                {% set value = raw | float(9999) %}
                {% if value != 9999 %}
                  {% set ns.any = true %}
                  {% if value < ns.min %}
                    {% set ns.min = value %}
                  {% endif %}
                {% endif %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {% if ns.any %}
            {{ ns.min | round(0) }}
          {% else %}
            {{ none }}
          {% endif %}

      # Number of sensors matching the same pattern as the cert sensor
      # above, regardless of their state.
      - name: "Infra Cert Telemetry Count"
        unique_id: infra_cert_telemetry_count
        icon: mdi:counter
        state: >-
          {% set ns = namespace(count=0) %}
          {% for item in states.sensor %}
            {% if item.entity_id is search('(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)') %}
              {% set ns.count = ns.count + 1 %}
            {% endif %}
          {% endfor %}
          {{ ns.count }}

      # Counts listed website binary sensors that exist and are off, unknown
      # or unavailable. Entities that do not exist are not counted.
      - name: "Infra Website Down Count"
        unique_id: infra_website_down_count
        icon: mdi:counter
        state: >-
          {% set ids = [
            'binary_sensor.vcloudinfo_com',
            'binary_sensor.ipmer_com',
            'binary_sensor.fordst_com',
            'binary_sensor.www_kingcrafthomes_com'
          ] %}
          {% set ns = namespace(count=0) %}
          {% for id in ids %}
            {% if expand(id) | count > 0 %}
              {% set st = states(id) %}
              {% if st in ['off', 'unknown', 'unavailable'] %}
                {% set ns.count = ns.count + 1 %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {{ ns.count }}

  - binary_sensor:
      # On when either WAN sensor has no usable state, packet loss is above
      # 5, or latency is above 80.
      - name: "Infra WAN Quality Degraded"
        unique_id: infra_wan_quality_degraded
        device_class: problem
        state: >-
          {% set loss_raw = states('sensor.infra_wan_packet_loss') %}
          {% set lat_raw = states('sensor.infra_wan_latency_ms') %}
          {% set invalid = loss_raw in ['unknown', 'unavailable', 'none', '']
             or lat_raw in ['unknown', 'unavailable', 'none', ''] %}
          {% set loss = loss_raw | float(0) %}
          {% set lat = lat_raw | float(0) %}
          {{ invalid or loss > 5 or lat > 80 }}

      # On when the Pi-hole switch is not 'on' or the status sensor is off,
      # unavailable or unknown.
      - name: "Infra DNS Pihole Degraded"
        unique_id: infra_dns_pihole_degraded
        device_class: problem
        state: >-
          {% set switch_state = states('switch.pi_hole') %}
          {% set service_state = states('binary_sensor.pihole_status') %}
          {{ switch_state != 'on' or service_state in ['off', 'unavailable', 'unknown'] }}

      # On when the DNS consistency sensor is not 'ok', or when at least one
      # of the listed container entities exists (portainer_known) but none
      # of them reports on/running (portainer_ok).
      - name: "Infra Nebula Sync Degraded"
        unique_id: infra_nebula_sync_degraded
        device_class: problem
        state: >-
          {% set dns_state = states('sensor.infra_nebula_sync_dns_consistency') | lower %}
          {% set portainer_known = [
            expand('binary_sensor.nebula_sync_status') | count > 0,
            expand('binary_sensor.nebula_sync_status_2') | count > 0,
            expand('sensor.nebula_sync_state') | count > 0,
            expand('sensor.nebula_sync_state_2') | count > 0,
            expand('switch.nebula_sync_container') | count > 0,
            expand('switch.nebula_sync_container_2') | count > 0
          ] | select('equalto', true) | list | count > 0 %}
          {% set portainer_ok = [
            is_state('binary_sensor.nebula_sync_status', 'on'),
            is_state('binary_sensor.nebula_sync_status_2', 'on'),
            (states('sensor.nebula_sync_state') | lower) == 'running',
            (states('sensor.nebula_sync_state_2') | lower) == 'running',
            is_state('switch.nebula_sync_container', 'on'),
            is_state('switch.nebula_sync_container_2', 'on')
          ] | select('equalto', true) | list | count > 0 %}
          {{ dns_state != 'ok' or (portainer_known and not portainer_ok) }}
        attributes:
          dns_consistency: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
          host: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }}"
          expected_ip: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }}"
          primary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }}"
          secondary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }}"
          primary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }}"
          secondary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }}"
          primary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }}"
          secondary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }}"
          nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
          nebula_status_2: "{{ states('binary_sensor.nebula_sync_status_2') }}"
          nebula_state: "{{ states('sensor.nebula_sync_state') }}"
          nebula_state_2: "{{ states('sensor.nebula_sync_state_2') }}"
          pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
          pihole_secondary_status_2: "{{ states('binary_sensor.pihole_secondary_status_2') }}"

      # On when the upper-cased UPS status string contains "OB".
      - name: "Infra UPS On Battery"
        unique_id: infra_ups_on_battery
        device_class: problem
        state: >-
          {% set status = states('sensor.garage_ups_status') | upper %}
          {{ 'OB' in status }}

      - name: "Infra Website Degraded"
        unique_id: infra_website_degraded
        device_class: problem
        state: >-
          {{ states('sensor.infra_website_down_count') | int(0) > 0 }}

      # On when any matching uptime_1d sensor with a usable state is below 99.
      - name: "Infra Website Uptime SLO Breach"
        unique_id: infra_website_uptime_slo_breach
        device_class: problem
        state: >-
          {% set ns = namespace(seen=false, breach=false) %}
          {% for item in states.sensor %}
            {% if item.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*uptime_1d$') %}
              {% if item.state not in ['unknown', 'unavailable', 'none', ''] %}
                {% set ns.seen = true %}
                {% if (item.state | float(100)) < 99 %}
                  {% set ns.breach = true %}
                {% endif %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {{ ns.seen and ns.breach }}

      # On when any matching avg_response_time_1d sensor with a usable state
      # is above 1.2.
      - name: "Infra Website Latency Degraded"
        unique_id: infra_website_latency_degraded
        device_class: problem
        state: >-
          {% set ns = namespace(seen=false, breach=false) %}
          {% for item in states.sensor %}
            {% if item.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*avg_response_time_1d$') %}
              {% if item.state not in ['unknown', 'unavailable', 'none', ''] %}
                {% set ns.seen = true %}
                {% if (item.state | float(0)) > 1.2 %}
                  {% set ns.breach = true %}
                {% endif %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {{ ns.seen and ns.breach }}

      # Expiry bands: critical is < 14 days, warning is 14 <= days < 30.
      # All four render false while the min-days sensor has no usable state.
      - name: "Infra Domain Expiry Critical"
        unique_id: infra_domain_expiry_critical
        device_class: problem
        state: >-
          {% set d = states('sensor.infra_domain_expiry_min_days') %}
          {% if d in ['unknown', 'unavailable', 'none', ''] %}
            false
          {% else %}
            {{ d | float(9999) < 14 }}
          {% endif %}

      - name: "Infra Domain Expiry Warning"
        unique_id: infra_domain_expiry_warning
        device_class: problem
        state: >-
          {% set d = states('sensor.infra_domain_expiry_min_days') %}
          {% if d in ['unknown', 'unavailable', 'none', ''] %}
            false
          {% else %}
            {% set days = d | float(9999) %}
            {{ days < 30 and days >= 14 }}
          {% endif %}

      - name: "Infra Cert Expiry Critical"
        unique_id: infra_cert_expiry_critical
        device_class: problem
        state: >-
          {% set d = states('sensor.infra_cert_expiry_min_days') %}
          {% if d in ['unknown', 'unavailable', 'none', ''] %}
            false
          {% else %}
            {{ d | float(9999) < 14 }}
          {% endif %}

      - name: "Infra Cert Expiry Warning"
        unique_id: infra_cert_expiry_warning
        device_class: problem
        state: >-
          {% set d = states('sensor.infra_cert_expiry_min_days') %}
          {% if d in ['unknown', 'unavailable', 'none', ''] %}
            false
          {% else %}
            {% set days = d | float(9999) %}
            {{ days < 30 and days >= 14 }}
          {% endif %}

# ---------------------------------------------------------------------
# Automations
# ---------------------------------------------------------------------
# NOTE: message/diagnostics/request folded scalars keep every continuation
# line at the same indentation so they fold into single-line text.
automation:
  # Logs only real changes: both the old and new state must be usable and
  # different from each other.
  - alias: "Infrastructure - External IP Change Logbook"
    id: infra_external_ip_change_logbook
    description: "Log external IP changes into the Activity feed."
    mode: queued
    trigger:
      - platform: state
        entity_id: sensor.infra_external_ip
    condition:
      - condition: template
        value_template: "{{ trigger.from_state is not none }}"
      - condition: template
        value_template: >-
          {{ trigger.from_state.state not in ['unknown', 'unavailable', 'none', '']
          and trigger.to_state.state not in ['unknown', 'unavailable', 'none', '']
          and trigger.from_state.state != trigger.to_state.state }}
    action:
      - service: script.send_to_logbook
        data:
          topic: "NETWORK"
          message: >-
            External IP changed from {{ trigger.from_state.state }}
            to {{ trigger.to_state.state }}.

  # Opens the Repair when the breach sensor turns on; any other new state
  # removes it.
  - alias: "Infrastructure - Website Uptime SLO Repair"
    id: infra_website_uptime_slo_repair
    description: "Create/clear Repairs issue when website 1-day uptime breaches SLO."
    mode: queued
    trigger:
      - platform: state
        entity_id: binary_sensor.infra_website_uptime_slo_breach
    action:
      - choose:
          - conditions: "{{ trigger.to_state.state == 'on' }}"
            sequence:
              - service: repairs.create
                data:
                  issue_id: infra_website_uptime_slo_breach
                  title: "Website uptime SLO breached"
                  description: >
                    At least one monitored website has uptime_1d below 99%.
                    Review Uptime Kuma entities on the Website Health dashboard.
                  severity: warning
                  persistent: true
        default:
          - service: repairs.remove
            continue_on_error: true
            data:
              issue_id: infra_website_uptime_slo_breach

  # Same create/remove pattern as the uptime Repair, driven by the latency
  # sensor.
  - alias: "Infrastructure - Website Latency Repair"
    id: infra_website_latency_repair
    description: "Create/clear Repairs issue when website response times degrade."
    mode: queued
    trigger:
      - platform: state
        entity_id: binary_sensor.infra_website_latency_degraded
    action:
      - choose:
          - conditions: "{{ trigger.to_state.state == 'on' }}"
            sequence:
              - service: repairs.create
                data:
                  issue_id: infra_website_latency_degraded
                  title: "Website latency degraded"
                  description: >
                    At least one monitored website reports avg_response_time_1d above 1.2s.
                    Review Uptime Kuma response-time entities on Website Health.
                  severity: warning
                  persistent: true
        default:
          - service: repairs.remove
            continue_on_error: true
            data:
              issue_id: infra_website_latency_degraded

  # Band state machine stored in input_text.infra_nebula_sync_health_band.
  # Joanna is dispatched only when the band was not already 'warning'.
  # The startup and /30 triggers share the id 'reconcile'.
  - alias: "Infrastructure - Nebula Sync Health Dispatch"
    id: infra_nebula_sync_health_dispatch
    description: "Dispatch Joanna when Nebula Sync DNS consistency or container telemetry stays degraded."
    mode: queued
    trigger:
      - platform: state
        entity_id: binary_sensor.infra_nebula_sync_degraded
        to: "on"
        for: "00:10:00"
        id: degraded
      - platform: state
        entity_id: binary_sensor.infra_nebula_sync_degraded
        to: "off"
        for: "00:02:00"
        id: recovered
      - platform: homeassistant
        event: start
        id: reconcile
      - platform: time_pattern
        minutes: "/30"
        id: reconcile
    variables:
      issue_id: infra_nebula_sync_degraded
      dns_state: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
      previous_band: "{{ states('input_text.infra_nebula_sync_health_band') | lower }}"
      degraded: "{{ is_state('binary_sensor.infra_nebula_sync_degraded', 'on') }}"
      nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
      nebula_status_alt: "{{ states('binary_sensor.nebula_sync_status_2') }}"
      nebula_state: "{{ states('sensor.nebula_sync_state') }}"
      nebula_state_alt: "{{ states('sensor.nebula_sync_state_2') }}"
      pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
      pihole_secondary_status_alt: "{{ states('binary_sensor.pihole_secondary_status_2') }}"
    action:
      - choose:
          # Newly degraded: clear any Repair, dispatch Joanna, record band.
          - conditions: "{{ degraded and previous_band != 'warning' }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: "{{ issue_id }}"
              - service: script.joanna_dispatch
                data:
                  trigger_context: "HA automation infra_nebula_sync_health_dispatch (Infrastructure - Nebula Sync Health Dispatch)"
                  source: "home_assistant_automation.infra_nebula_sync_health_dispatch.warning"
                  summary: "Nebula Sync DNS consistency or container health is degraded"
                  entity_ids:
                    - sensor.infra_nebula_sync_dns_consistency
                    - binary_sensor.infra_nebula_sync_degraded
                    - binary_sensor.nebula_sync_status
                    - binary_sensor.nebula_sync_status_2
                    - sensor.nebula_sync_state
                    - sensor.nebula_sync_state_2
                    - binary_sensor.pihole_secondary_status
                    - binary_sensor.pihole_secondary_status_2
                  diagnostics: >-
                    issue_id={{ issue_id }},
                    dns_consistency={{ dns_state }},
                    host={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }},
                    expected_ip={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }},
                    primary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }},
                    secondary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }},
                    primary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }},
                    secondary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }},
                    primary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }},
                    secondary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }},
                    nebula_status={{ nebula_status }},
                    nebula_status_2={{ nebula_status_alt }},
                    nebula_state={{ nebula_state }},
                    nebula_state_2={{ nebula_state_alt }},
                    pihole_secondary_status={{ pihole_secondary_status }},
                    pihole_secondary_status_2={{ pihole_secondary_status_alt }},
                    primary_dns=192.168.10.10,
                    backup_dns=192.168.10.14
                  request: >-
                    Investigate Nebula Sync on docker_14 and the backup Pi-hole sync path.
                    Verify both Pi-holes answer the GTG-PF45FK6F short name, FQDN, and reverse lookup consistently.
                    Check nebula_sync container status, Docker health, recent sync logs, and primary/replica Pi-hole API reachability.
                    If confidence is high, perform safe remediation such as a one-time Nebula Sync run or restarting only the nebula_sync container.
                    Do not restart Pi-hole or change DHCP/custom DNS records unless diagnostics prove data drift and the action is safe.
                    Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
                  domain_hint: ops
                  lane_hint: joanna.ops
              - service: script.send_to_logbook
                data:
                  topic: "DNS"
                  message: >-
                    Nebula Sync DNS consistency is degraded ({{ dns_state }});
                    Joanna investigation requested without opening a Repair.
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_nebula_sync_health_band
                data:
                  value: warning
          # Recovered from a recorded warning/unavailable band.
          - conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: "{{ issue_id }}"
              - service: script.send_to_logbook
                data:
                  topic: "DNS"
                  message: "Nebula Sync DNS consistency recovered; Joanna-only warning state cleared."
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_nebula_sync_health_band
                data:
                  value: normal
          # Healthy with any other recorded band: just store 'normal'.
          - conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
            sequence:
              - service: input_text.set_value
                target:
                  entity_id: input_text.infra_nebula_sync_health_band
                data:
                  value: normal

  # Walks host_configs on every run and compares the current band
  # (>= 90 critical, >= 80 warning, otherwise normal; 'unavailable' when the
  # normalized sensor has no usable state) with the stored band. No branch
  # matches the 'unavailable' band, so the stored band is left untouched.
  - alias: "Docker Host Disk Pressure Monitor"
    id: docker_host_disk_pressure_monitor
    description: "Track Docker host root disk pressure from normalized Glances sensors and dispatch Joanna on band changes."
    mode: queued
    trigger:
      - platform: time_pattern
        minutes: "/15"
      - platform: state
        entity_id:
          - sensor.docker_17_disk_used_percentage
          - sensor.docker_14_disk_used_percentage
          - sensor.docker_69_disk_used_percentage
    variables:
      host_configs:
        - host_id: docker_17
          host_name: docker_17
          disk_entity: sensor.docker_17_disk_used_percentage
          raw_entity: sensor.192_168_10_17_disk_usage
          free_entity: sensor.192_168_10_17_disk_free
          used_entity: sensor.192_168_10_17_disk_used
          band_entity: input_text.docker_17_disk_pressure_band
          issue_id: docker_host_docker_17_disk_pressure
        - host_id: docker_14
          host_name: docker_14
          disk_entity: sensor.docker_14_disk_used_percentage
          raw_entity: sensor.docker14_disk_usage
          free_entity: sensor.docker14_disk_free
          used_entity: sensor.docker14_disk_used
          band_entity: input_text.docker_14_disk_pressure_band
          issue_id: docker_host_docker_14_disk_pressure
        - host_id: docker_69
          host_name: docker_69
          disk_entity: sensor.docker_69_disk_used_percentage
          raw_entity: sensor.docker69_disk_usage
          free_entity: sensor.docker69_disk_free
          used_entity: sensor.docker69_disk_used
          band_entity: input_text.docker_69_disk_pressure_band
          issue_id: docker_host_docker_69_disk_pressure
    action:
      - repeat:
          for_each: "{{ host_configs }}"
          sequence:
            - variables:
                host_id: "{{ repeat.item.host_id }}"
                host_name: "{{ repeat.item.host_name }}"
                disk_entity: "{{ repeat.item.disk_entity }}"
                raw_entity: "{{ repeat.item.raw_entity }}"
                free_entity: "{{ repeat.item.free_entity }}"
                used_entity: "{{ repeat.item.used_entity }}"
                band_entity: "{{ repeat.item.band_entity }}"
                issue_id: "{{ repeat.item.issue_id }}"
                disk_state: "{{ states(disk_entity) }}"
                disk_pct: "{{ disk_state | float(0) }}"
                previous_band: "{{ states(band_entity) | lower }}"
                current_band: >-
                  {{
                  'unavailable' if disk_state in ['unknown', 'unavailable', 'none', '']
                  else 'critical' if disk_pct >= 90
                  else 'warning' if disk_pct >= 80
                  else 'normal'
                  }}
            - choose:
                # Entering critical: open Repair, dispatch Joanna, record band.
                - conditions: "{{ current_band == 'critical' and previous_band != 'critical' }}"
                  sequence:
                    - service: repairs.create
                      data:
                        issue_id: "{{ issue_id }}"
                        severity: error
                        persistent: true
                        title: "{{ host_name }} disk pressure critical ({{ disk_pct | round(1) }}%)"
                        description: >-
                          {{ host_name }} root disk usage is critically high.
                          Free space or expand the host filesystem before Docker workloads fail.
                    - service: script.joanna_dispatch
                      data:
                        trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Critical)"
                        source: "home_assistant_automation.docker_host_disk_pressure_monitor.critical"
                        summary: "{{ host_name }} root disk pressure is critical at {{ disk_pct | round(1) }}%"
                        entity_ids:
                          - "{{ disk_entity }}"
                          - "{{ raw_entity }}"
                          - "{{ free_entity }}"
                          - "{{ used_entity }}"
                        diagnostics: >-
                          issue_id={{ issue_id }},
                          host_id={{ host_id }},
                          disk_entity={{ disk_entity }},
                          raw_entity={{ raw_entity }},
                          disk_pct={{ disk_pct | round(1) }},
                          disk_free={{ states(free_entity) }},
                          disk_used={{ states(used_entity) }},
                          threshold=90
                        request: >-
                          Investigate critical disk pressure on {{ host_name }} and perform safe remediation when confidence is high.
                          Check Docker build cache, image/container volumes, logs, backups, and large files first.
                          Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
                          Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
                          Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: >-
                          {{ host_name }} disk usage is critical at {{ disk_pct | round(1) }}%.
                          Repair {{ issue_id }} opened and Joanna investigation requested.
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "critical"
                # Entering warning from a non-warning, non-critical band:
                # Joanna only, no Repair.
                - conditions: "{{ current_band == 'warning' and previous_band not in ['warning', 'critical'] }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: script.joanna_dispatch
                      data:
                        trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Warning)"
                        source: "home_assistant_automation.docker_host_disk_pressure_monitor.warning"
                        summary: "{{ host_name }} root disk pressure warning at {{ disk_pct | round(1) }}%"
                        entity_ids:
                          - "{{ disk_entity }}"
                          - "{{ raw_entity }}"
                          - "{{ free_entity }}"
                          - "{{ used_entity }}"
                        diagnostics: >-
                          issue_id={{ issue_id }},
                          host_id={{ host_id }},
                          disk_entity={{ disk_entity }},
                          raw_entity={{ raw_entity }},
                          disk_pct={{ disk_pct | round(1) }},
                          disk_free={{ states(free_entity) }},
                          disk_used={{ states(used_entity) }},
                          threshold=80
                        request: >-
                          Investigate elevated disk usage on {{ host_name }} and perform safe low-risk cleanup before it becomes critical when confidence is high.
                          Check Docker build cache, image/container volumes, logs, backups, and large files first.
                          Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
                          Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
                          Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: >-
                          {{ host_name }} disk usage warning at {{ disk_pct | round(1) }}%.
                          Joanna investigation requested without opening a warning Repair.
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "warning"
                # Critical -> warning: clear the Repair, no new dispatch.
                - conditions: "{{ current_band == 'warning' and previous_band == 'critical' }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%. Critical Repair cleared; Joanna continues handling warning-level cleanup."
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "warning"
                # Warning/critical -> normal: clear the Repair and log it.
                - conditions: "{{ current_band == 'normal' and previous_band in ['warning', 'critical'] }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: script.send_to_logbook
                      data:
                        topic: "DOCKER"
                        message: "{{ host_name }} disk usage recovered to {{ disk_pct | round(1) }}%. Repair {{ issue_id }} cleared."
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "normal"
                # Normal with any other stored band: silently store 'normal'.
                - conditions: "{{ current_band == 'normal' and previous_band not in ['normal', 'warning', 'critical'] }}"
                  sequence:
                    - service: repairs.remove
                      continue_on_error: true
                      data:
                        issue_id: "{{ issue_id }}"
                    - service: input_text.set_value
                      target:
                        entity_id: "{{ band_entity }}"
                      data:
                        value: "normal"

  # Runs at 08:00 (id nightly) and at minutes 15 and 45 of every hour
  # (id recovery_poll). The condition lets recovery_poll runs through only
  # while input_boolean.infra_duplicati_backup_repair_active is on.
  - alias: "Infrastructure - Backup Nightly Verification"
    id: infra_backup_nightly_verification
    description: "Use codex_appliance to verify the latest Duplicati run and dispatch Joanna only on failure."
    mode: single
    trigger:
      - platform: time
        at: "08:00:00"
        id: nightly
      - platform: time_pattern
        minutes: "15"
        id: recovery_poll
      - platform: time_pattern
        minutes: "45"
        id: recovery_poll
    condition:
      - condition: template
        value_template: >-
          {{ trigger is not defined
          or trigger.id != 'recovery_poll'
          or is_state('input_boolean.infra_duplicati_backup_repair_active', 'on') }}
    action:
      - variables:
          trigger_source: "{{ trigger.id if trigger is defined and trigger.id is defined else 'manual' }}"
          verifier_reason: "{{ 'ha_failure_followup' if trigger_source == 'recovery_poll' else 'ha_nightly' }}"
          trigger_context: "HA automation infra_backup_nightly_verification (Infrastructure - Backup Nightly Verification)"
          duplicati_state: "{{ states('switch.duplicati_container') }}"
      - action: rest_command.bearclaw_duplicati_verify
        data:
          reason: "{{ verifier_reason }}"
        response_variable: duplicati_verify
      # Logs the verifier summary, or a default sentence when the response
      # has no content.detail mapping.
      - service: script.send_to_logbook
        data:
          topic: "BACKUP"
          message: >-
            {% set payload = duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} %}
            {% set detail = payload['detail'] if payload is mapping and payload['detail'] is mapping else {} %}
            {{ detail.get('summary', 'Nightly Duplicati verification completed.') }}
      # Every lookup falls back to a default, so a response without the
      # expected mappings yields status 'unknown' (treated as a transport
      # issue below).
      - variables:
          verify_payload: "{{ duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} }}"
          verify_detail: "{{ verify_payload['detail'] if verify_payload is mapping and verify_payload['detail'] is mapping else {} }}"
          verify_http_status: "{{ duplicati_verify['status'] | int(0) if duplicati_verify is mapping else 0 }}"
          verify_healthy: "{{ verify_payload.get('ok', false) and verify_detail.get('healthy', false) }}"
          verify_status: "{{ verify_detail.get('status', 'unknown') }}"
          verify_summary: "{{ verify_detail.get('summary', 'Duplicati verification did not return a summary.') }}"
          verify_issue: "{{ verify_detail.get('issue', verify_payload.get('error', 'duplicati_verify_failed')) }}"
          verify_backup_name: "{{ verify_detail.get('backupName', 'Docker_Configs') }}"
          verify_latest_result: "{{ verify_detail.get('latestResult', {}) if verify_detail is mapping else {} }}"
          verify_last_success: "{{ verify_detail.get('lastSuccessfulRun', {}) if verify_detail is mapping else {} }}"
          verify_transport_issue: "{{ verify_status in ['api_error', 'unknown'] }}"
      - choose:
          # Healthy: clear both Repair ids and stop the recovery poll.
          - conditions: "{{ verify_healthy }}"
            sequence:
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: infra_duplicati_backup_failure
              - service: repairs.remove
                continue_on_error: true
                data:
                  issue_id: user_infra_duplicati_backup_failure
              - service: input_boolean.turn_off
                target:
                  entity_id: input_boolean.infra_duplicati_backup_repair_active
          # Verifier could not answer: log only, no Repair.
          - conditions: "{{ verify_transport_issue }}"
            sequence:
              - service: script.send_to_logbook
                data:
                  topic: "BACKUP"
                  message: >-
                    Duplicati verifier could not prove backup health because the verification service returned status {{ verify_status }} with issue {{ verify_issue }}.
                    No repair card was opened because this is verifier transport state, not a confirmed backup failure.
        # Confirmed failure: enable the recovery poll; open the Repair and
        # dispatch Joanna only when this run is not itself a recovery poll.
        default:
          - service: input_boolean.turn_on
            target:
              entity_id: input_boolean.infra_duplicati_backup_repair_active
          - choose:
              - conditions: "{{ trigger_source != 'recovery_poll' }}"
                sequence:
                  - service: repairs.create
                    data:
                      issue_id: infra_duplicati_backup_failure
                      title: "Duplicati nightly backup verification failed"
                      description: >-
                        {{ verify_summary }}
                        Backup={{ verify_backup_name }};
                        status={{ verify_status }};
                        last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
                        last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
                      severity: error
                      persistent: true
                  - service: script.joanna_dispatch
                    data:
                      trigger_context: "{{ trigger_context }}"
                      source: "home_assistant_automation.infra_backup_nightly_verification"
                      summary: "Nightly Duplicati backup verification failed"
                      entity_ids:
                        - "switch.duplicati_container"
                      diagnostics: >-
                        scheduled_time=08:00:00,
                        duplicati_container={{ duplicati_state }},
                        verifier_http_status={{ verify_http_status }},
                        verifier_status={{ verify_status }},
                        verifier_issue={{ verify_issue }},
                        backup_name={{ verify_backup_name }},
                        latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
                        last_success={{ verify_last_success.get('endedAt', 'n/a') }}
                      request: >-
                        Investigate the Duplicati backup job {{ verify_backup_name }}.
                        The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
                        Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
                        Home Assistant will re-check this verifier every 30 minutes after dispatch and clear the Repair automatically once the backup is healthy.
                        Reply with explicit status fields: resolved=true/false, backup_status, last_success_time, root_cause, action_taken, verification, next_action_required=true/false.
            default:
              - service: script.send_to_logbook
                data:
                  topic: "BACKUP"
                  message: >-
                    Duplicati recovery follow-up still reports {{ verify_status }} for {{ verify_backup_name }}: {{ verify_issue }}.
                    Existing Repair remains open; Joanna was not dispatched again.

  # Fires daily at 03:20; the condition restricts it to day 1 of the month.
  - alias: "Infrastructure - Monthly HA Log Hygiene Review"
    id: infra_monthly_log_hygiene_review
    description: "Ask Joanna monthly to review Home Assistant logs, create a GitHub issue with noisy entries, and send Telegram recommendations only."
    mode: single
    trigger:
      - platform: time
        at: "03:20:00"
    condition:
      - condition: template
        value_template: "{{ now().day == 1 }}"
    variables:
      trigger_context: "HA automation infra_monthly_log_hygiene_review (Infrastructure - Monthly HA Log Hygiene Review)"
    action:
      - service: script.joanna_dispatch
        data:
          trigger_context: "{{ trigger_context }}"
          source: "home_assistant_automation.infra_monthly_log_hygiene_review"
          summary: "Monthly Home Assistant log hygiene review with GitHub issue and Telegram follow-up"
          diagnostics: >-
            schedule=day_1@03:20:00,
            review_scope=available_home_assistant_logs,
            desired_outputs=telegram_follow_up+github_issue,
            github_repo=CCOSTAN/Home-AssistantConfig,
            approval_required_before_changes=true
          request: >-
            Review the available Home Assistant log files from the last month and identify noisy, low-value entries that could be safely suppressed, filtered, slowed, deduplicated, or retired.
            Focus on practical Home Assistant-side changes such as recorder exclusions, logger filtering, scan-interval reductions, entity retirement, or automation de-noising.
            Create or refresh a GitHub issue in CCOSTAN/Home-AssistantConfig that captures the noisy entries, estimated frequency, why each candidate is low-value, and the exact repo files or integrations likely to change.
            Then send Carlo a concise Telegram summary with the top recommendations and the GitHub issue number or link.
            Do not make any changes from this review. Wait for explicit follow-up approval first.
      - service: script.send_to_logbook
        data:
          topic: "HOME ASSISTANT"
          message: "Joanna monthly Home Assistant log hygiene review dispatched; Telegram summary and GitHub issue requested."