# You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 1034 lines
# 52 KiB

######################################################################
# @CCOSTAN - Follow Me on X
# For more info visit https://www.vcloudinfo.com/click-here
# Original Repo : https://github.com/CCOSTAN/Home-AssistantConfig
# -------------------------------------------------------------------
# Infrastructure - Observability, disk pressure, and Joanna review workflows
# WAN/DNS/website/domain/cert/Docker host state normalized for dashboards, plus scheduled infrastructure reviews.
# -------------------------------------------------------------------
# Related Issue: 1584
# Notes: Home dashboard consumes `infra_*` entities for exceptions-only alerts.
# Notes: Domain warning threshold is <30 days; critical threshold is <14 days.
# Notes: Nightly Duplicati verification runs at 08:00 after the 05:30 Duplicati job and docker_14 reboot window.
# Notes: Duplicati transport/API errors are logged only; repairs are reserved for proven failed or stale backups.
# Notes: Duplicati failure Repairs enable a recovery poll that clears the Repair after a later successful run.
# Notes: Monthly HA log hygiene review requests Telegram + GitHub issue follow-up only; Joanna must wait for approval before any changes.
# Notes: Numeric WAN telemetry exposes state_class so recorder can keep long-term statistics.
# Notes: Docker host root disk usage uses Glances-backed normalized sensors; raw Glances sensors are recorder/logbook-filtered.
# Notes: Disk-pressure dispatch allows bounded safe cleanup of disposable caches and old generated backup artifacts, but not live data or restarts.
# Notes: Warning-level Docker host disk pressure is Joanna-only; Repairs are reserved for critical pressure.
# Notes: Nebula Sync DNS consistency compares primary/backup Pi-hole answers and dispatches Joanna on sustained drift or container loss.
# Notes: Promoted IoT DNS consistency compares primary/backup Pi-hole answers for reserved IoT host records.
######################################################################
input_text:
docker_17_disk_pressure_band:
name: "docker_17 disk pressure band"
max: 20
docker_14_disk_pressure_band:
name: "docker_14 disk pressure band"
max: 20
docker_69_disk_pressure_band:
name: "docker_69 disk pressure band"
max: 20
infra_nebula_sync_health_band:
name: "Nebula Sync health band"
max: 20
infra_pihole_iot_dns_health_band:
name: "Pi-hole IoT DNS health band"
max: 20
input_boolean:
infra_duplicati_backup_repair_active:
name: "Duplicati backup repair active"
command_line:
- sensor:
name: Infra WAN Packet Loss
unique_id: infra_wan_packet_loss
command: >-
ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
awk -F',' '/packet loss/ {gsub(/[^0-9.]/, "", $3); print $3; found=1}
END {if (!found) print "unknown"}'
scan_interval: 300
unit_of_measurement: "%"
state_class: measurement
value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"
- sensor:
name: Infra WAN Latency Ms
unique_id: infra_wan_latency_ms
command: >-
ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
awk -F'/' '/^rtt|^round-trip/ {gsub(/[^0-9.]/, "", $5); print $5; found=1}
END {if (!found) print "unknown"}'
scan_interval: 300
unit_of_measurement: "ms"
state_class: measurement
value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"
- sensor:
name: Infra External IP Fallback
unique_id: infra_external_ip_fallback
command: "curl -fsS https://api.ipify.org || echo unknown"
scan_interval: 900
- sensor:
name: Infra Nebula Sync DNS Consistency
unique_id: infra_nebula_sync_dns_consistency
command: >-
/bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14; host=GTG-PF45FK6F; fqdn=GTG-PF45FK6F.fordst.com; ip=192.168.10.117; q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null | tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; }; r(){ dig +time=2 +tries=1 +short @"$1" -x "$2" 2>/dev/null | tr -d "\r" | sed "s/\.$//" | sort | tr "\n" "," | sed "s/,$//"; }; p_short=$(q "$primary" "$host"); s_short=$(q "$secondary" "$host"); p_fqdn=$(q "$primary" "$fqdn"); s_fqdn=$(q "$secondary" "$fqdn"); p_rev=$(r "$primary" "$ip"); s_rev=$(r "$secondary" "$ip"); status=mismatch; if [ "$p_short" = "$ip" ] && [ "$s_short" = "$ip" ] && [ "$p_fqdn" = "$ip" ] && [ "$s_fqdn" = "$ip" ] && [ -n "$p_rev" ] && [ "$p_rev" = "$s_rev" ]; then status=ok; fi; printf "{\"status\":\"%s\",\"host\":\"%s\",\"expected_ip\":\"%s\",\"primary_short\":\"%s\",\"secondary_short\":\"%s\",\"primary_fqdn\":\"%s\",\"secondary_fqdn\":\"%s\",\"primary_reverse\":\"%s\",\"secondary_reverse\":\"%s\"}\n" "$status" "$host" "$ip" "$p_short" "$s_short" "$p_fqdn" "$s_fqdn" "$p_rev" "$s_rev"'
scan_interval: 300
value_template: "{{ value_json.status | default('unknown') }}"
json_attributes:
- host
- expected_ip
- primary_short
- secondary_short
- primary_fqdn
- secondary_fqdn
- primary_reverse
- secondary_reverse
- sensor:
name: Infra Pihole IoT DNS Consistency
unique_id: infra_pihole_iot_dns_consistency
command: >-
/bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14; records="rachio.fordst.com=192.168.10.73 econet.fordst.com=192.168.10.92 dreame-vacuum.fordst.com=192.168.10.93 carlo-bed.fordst.com=192.168.10.95 lg-smart-fridge.fordst.com=192.168.10.96 tesla-blackbox-gw.fordst.com=192.168.10.97 bgw210.fordst.com=192.168.10.98"; q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null | tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; }; status=ok; checked=0; mismatch_count=0; mismatches=""; for record in $records; do host=${record%%=*}; ip=${record#*=}; p=$(q "$primary" "$host"); s=$(q "$secondary" "$host"); checked=$((checked+1)); if [ "$p" != "$ip" ] || [ "$s" != "$ip" ]; then status=mismatch; mismatch_count=$((mismatch_count+1)); mismatches="${mismatches}${host}:expected=${ip},primary=${p:-none},secondary=${s:-none};"; fi; done; if [ -z "$mismatches" ]; then mismatches=none; fi; printf "{\"status\":\"%s\",\"checked_records\":%s,\"mismatch_count\":%s,\"mismatches\":\"%s\",\"primary_dns\":\"%s\",\"backup_dns\":\"%s\"}\n" "$status" "$checked" "$mismatch_count" "$mismatches" "$primary" "$secondary"'
scan_interval: 300
value_template: "{{ value_json.status | default('unknown') }}"
json_attributes:
- checked_records
- mismatch_count
- mismatches
- primary_dns
- backup_dns
template:
- sensor:
- name: "Infra External IP"
unique_id: infra_external_ip
state: >-
{% set primary = states('sensor.external_ip') | trim %}
{% set fallback = states('sensor.infra_external_ip_fallback') | trim %}
{% if primary not in ['unknown', 'unavailable', 'none', ''] %}
{{ primary }}
{% else %}
{{ fallback }}
{% endif %}
- name: "docker_17 Disk Used Percentage"
unique_id: docker_17_disk_used_percentage
unit_of_measurement: "%"
state_class: measurement
icon: mdi:harddisk
availability: "{{ states('sensor.192_168_10_17_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
state: "{{ states('sensor.192_168_10_17_disk_usage') | float(0) | round(1) }}"
- name: "docker_14 Disk Used Percentage"
unique_id: docker_14_disk_used_percentage
unit_of_measurement: "%"
state_class: measurement
icon: mdi:harddisk
availability: "{{ states('sensor.docker14_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
state: "{{ states('sensor.docker14_disk_usage') | float(0) | round(1) }}"
- name: "docker_69 Disk Used Percentage"
unique_id: docker_69_disk_used_percentage
unit_of_measurement: "%"
state_class: measurement
icon: mdi:harddisk
availability: "{{ states('sensor.docker69_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
state: "{{ states('sensor.docker69_disk_usage') | float(0) | round(1) }}"
- name: "Infra Domain Expiry Min Days"
unique_id: infra_domain_expiry_min_days
unit_of_measurement: "d"
state: >-
{% set ids = [
'sensor.vcloudinfo_com_days_until_expiration',
'sensor.ipmer_com_days_until_expiration',
'sensor.fordst_com_days_until_expiration',
'sensor.kingcrafthomes_com_days_until_expiration'
] %}
{% set ns = namespace(min=9999, any=false) %}
{% for id in ids %}
{% if expand(id) | count > 0 %}
{% set raw = states(id) %}
{% if raw not in ['unknown', 'unavailable', 'none', ''] %}
{% set ns.any = true %}
{% set val = raw | float(9999) %}
{% if val < ns.min %}
{% set ns.min = val %}
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{% if ns.any %}
{{ ns.min | round(0) }}
{% else %}
{{ none }}
{% endif %}
- name: "Infra Cert Expiry Min Days"
unique_id: infra_cert_expiry_min_days
unit_of_measurement: "d"
state: >-
{% set ns = namespace(min=9999, any=false) %}
{% for item in states.sensor %}
{% if item.entity_id is search('(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)') %}
{% set raw = item.state %}
{% if raw not in ['unknown', 'unavailable', 'none', ''] %}
{% set value = raw | float(9999) %}
{% if value != 9999 %}
{% set ns.any = true %}
{% if value < ns.min %}
{% set ns.min = value %}
{% endif %}
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{% if ns.any %}
{{ ns.min | round(0) }}
{% else %}
{{ none }}
{% endif %}
- name: "Infra Cert Telemetry Count"
unique_id: infra_cert_telemetry_count
icon: mdi:counter
state: >-
{% set ns = namespace(count=0) %}
{% for item in states.sensor %}
{% if item.entity_id is search('(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)') %}
{% set ns.count = ns.count + 1 %}
{% endif %}
{% endfor %}
{{ ns.count }}
- name: "Infra Website Down Count"
unique_id: infra_website_down_count
icon: mdi:counter
state: >-
{% set ids = [
'binary_sensor.vcloudinfo_com',
'binary_sensor.ipmer_com',
'binary_sensor.fordst_com',
'binary_sensor.www_kingcrafthomes_com'
] %}
{% set ns = namespace(count=0) %}
{% for id in ids %}
{% if expand(id) | count > 0 %}
{% set st = states(id) %}
{% if st in ['off', 'unknown', 'unavailable'] %}
{% set ns.count = ns.count + 1 %}
{% endif %}
{% endif %}
{% endfor %}
{{ ns.count }}
- binary_sensor:
- name: "Infra WAN Quality Degraded"
unique_id: infra_wan_quality_degraded
device_class: problem
state: >-
{% set loss_raw = states('sensor.infra_wan_packet_loss') %}
{% set lat_raw = states('sensor.infra_wan_latency_ms') %}
{% set invalid = loss_raw in ['unknown', 'unavailable', 'none', ''] or
lat_raw in ['unknown', 'unavailable', 'none', ''] %}
{% set loss = loss_raw | float(0) %}
{% set lat = lat_raw | float(0) %}
{{ invalid or loss > 5 or lat > 80 }}
- name: "Infra DNS Pihole Degraded"
unique_id: infra_dns_pihole_degraded
device_class: problem
state: >-
{% set switch_state = states('switch.pi_hole') %}
{% set service_state = states('binary_sensor.pihole_status') %}
{{ switch_state != 'on' or service_state in ['off', 'unavailable', 'unknown'] }}
- name: "Infra Nebula Sync Degraded"
unique_id: infra_nebula_sync_degraded
device_class: problem
state: >-
{% set dns_state = states('sensor.infra_nebula_sync_dns_consistency') | lower %}
{% set portainer_known = [
expand('binary_sensor.nebula_sync_status') | count > 0,
expand('binary_sensor.nebula_sync_status_2') | count > 0,
expand('sensor.nebula_sync_state') | count > 0,
expand('sensor.nebula_sync_state_2') | count > 0,
expand('switch.nebula_sync_container') | count > 0,
expand('switch.nebula_sync_container_2') | count > 0
] | select('equalto', true) | list | count > 0 %}
{% set portainer_ok = [
is_state('binary_sensor.nebula_sync_status', 'on'),
is_state('binary_sensor.nebula_sync_status_2', 'on'),
(states('sensor.nebula_sync_state') | lower) == 'running',
(states('sensor.nebula_sync_state_2') | lower) == 'running',
is_state('switch.nebula_sync_container', 'on'),
is_state('switch.nebula_sync_container_2', 'on')
] | select('equalto', true) | list | count > 0 %}
{{ dns_state != 'ok' or (portainer_known and not portainer_ok) }}
attributes:
dns_consistency: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
host: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }}"
expected_ip: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }}"
primary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }}"
secondary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }}"
primary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }}"
secondary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }}"
primary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }}"
secondary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }}"
nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
nebula_status_2: "{{ states('binary_sensor.nebula_sync_status_2') }}"
nebula_state: "{{ states('sensor.nebula_sync_state') }}"
nebula_state_2: "{{ states('sensor.nebula_sync_state_2') }}"
pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
pihole_secondary_status_2: "{{ states('binary_sensor.pihole_secondary_status_2') }}"
- name: "Infra Pihole IoT DNS Degraded"
unique_id: infra_pihole_iot_dns_degraded
device_class: problem
state: >-
{{ states('sensor.infra_pihole_iot_dns_consistency') | lower != 'ok' }}
attributes:
dns_consistency: "{{ states('sensor.infra_pihole_iot_dns_consistency') }}"
checked_records: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'checked_records') }}"
mismatch_count: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatch_count') }}"
mismatches: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatches') }}"
primary_dns: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'primary_dns') }}"
backup_dns: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'backup_dns') }}"
- name: "Infra UPS On Battery"
unique_id: infra_ups_on_battery
device_class: problem
state: >-
{% set status = states('sensor.garage_ups_status') | upper %}
{{ 'OB' in status }}
- name: "Infra Website Degraded"
unique_id: infra_website_degraded
device_class: problem
state: >-
{{ states('sensor.infra_website_down_count') | int(0) > 0 }}
- name: "Infra Website Uptime SLO Breach"
unique_id: infra_website_uptime_slo_breach
device_class: problem
state: >-
{% set ns = namespace(seen=false, breach=false) %}
{% for item in states.sensor %}
{% if item.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*uptime_1d$') %}
{% if item.state not in ['unknown', 'unavailable', 'none', ''] %}
{% set ns.seen = true %}
{% if (item.state | float(100)) < 99 %}
{% set ns.breach = true %}
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{{ ns.seen and ns.breach }}
- name: "Infra Website Latency Degraded"
unique_id: infra_website_latency_degraded
device_class: problem
state: >-
{% set ns = namespace(seen=false, breach=false) %}
{% for item in states.sensor %}
{% if item.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*avg_response_time_1d$') %}
{% if item.state not in ['unknown', 'unavailable', 'none', ''] %}
{% set ns.seen = true %}
{% if (item.state | float(0)) > 1.2 %}
{% set ns.breach = true %}
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{{ ns.seen and ns.breach }}
- name: "Infra Domain Expiry Critical"
unique_id: infra_domain_expiry_critical
device_class: problem
state: >-
{% set d = states('sensor.infra_domain_expiry_min_days') %}
{% if d in ['unknown', 'unavailable', 'none', ''] %}
false
{% else %}
{{ d | float(9999) < 14 }}
{% endif %}
- name: "Infra Domain Expiry Warning"
unique_id: infra_domain_expiry_warning
device_class: problem
state: >-
{% set d = states('sensor.infra_domain_expiry_min_days') %}
{% if d in ['unknown', 'unavailable', 'none', ''] %}
false
{% else %}
{% set days = d | float(9999) %}
{{ days < 30 and days >= 14 }}
{% endif %}
- name: "Infra Cert Expiry Critical"
unique_id: infra_cert_expiry_critical
device_class: problem
state: >-
{% set d = states('sensor.infra_cert_expiry_min_days') %}
{% if d in ['unknown', 'unavailable', 'none', ''] %}
false
{% else %}
{{ d | float(9999) < 14 }}
{% endif %}
- name: "Infra Cert Expiry Warning"
unique_id: infra_cert_expiry_warning
device_class: problem
state: >-
{% set d = states('sensor.infra_cert_expiry_min_days') %}
{% if d in ['unknown', 'unavailable', 'none', ''] %}
false
{% else %}
{% set days = d | float(9999) %}
{{ days < 30 and days >= 14 }}
{% endif %}
automation:
- alias: "Infrastructure - External IP Change Logbook"
id: infra_external_ip_change_logbook
description: "Log external IP changes into the Activity feed."
mode: queued
trigger:
- platform: state
entity_id: sensor.infra_external_ip
condition:
- condition: template
value_template: "{{ trigger.from_state is not none }}"
- condition: template
value_template: >-
{{ trigger.from_state.state not in ['unknown', 'unavailable', 'none', ''] and
trigger.to_state.state not in ['unknown', 'unavailable', 'none', ''] and
trigger.from_state.state != trigger.to_state.state }}
action:
- service: script.send_to_logbook
data:
topic: "NETWORK"
message: >-
External IP changed from {{ trigger.from_state.state }} to {{ trigger.to_state.state }}.
- alias: "Infrastructure - Website Uptime SLO Repair"
id: infra_website_uptime_slo_repair
description: "Create/clear Repairs issue when website 1-day uptime breaches SLO."
mode: queued
trigger:
- platform: state
entity_id: binary_sensor.infra_website_uptime_slo_breach
action:
- choose:
- conditions: "{{ trigger.to_state.state == 'on' }}"
sequence:
- service: repairs.create
data:
issue_id: infra_website_uptime_slo_breach
title: "Website uptime SLO breached"
description: >
At least one monitored website has uptime_1d below 99%.
Review Uptime Kuma entities on the Website Health dashboard.
severity: warning
persistent: true
default:
- service: repairs.remove
continue_on_error: true
data:
issue_id: infra_website_uptime_slo_breach
- alias: "Infrastructure - Website Latency Repair"
id: infra_website_latency_repair
description: "Create/clear Repairs issue when website response times degrade."
mode: queued
trigger:
- platform: state
entity_id: binary_sensor.infra_website_latency_degraded
action:
- choose:
- conditions: "{{ trigger.to_state.state == 'on' }}"
sequence:
- service: repairs.create
data:
issue_id: infra_website_latency_degraded
title: "Website latency degraded"
description: >
At least one monitored website reports avg_response_time_1d above 1.2s.
Review Uptime Kuma response-time entities on Website Health.
severity: warning
persistent: true
default:
- service: repairs.remove
continue_on_error: true
data:
issue_id: infra_website_latency_degraded
- alias: "Infrastructure - Nebula Sync Health Dispatch"
id: infra_nebula_sync_health_dispatch
description: "Dispatch Joanna when Nebula Sync DNS consistency or container telemetry stays degraded."
mode: queued
trigger:
- platform: state
entity_id: binary_sensor.infra_nebula_sync_degraded
to: "on"
for: "00:10:00"
id: degraded
- platform: state
entity_id: binary_sensor.infra_nebula_sync_degraded
to: "off"
for: "00:02:00"
id: recovered
- platform: homeassistant
event: start
id: reconcile
- platform: time_pattern
minutes: "/30"
id: reconcile
variables:
issue_id: infra_nebula_sync_degraded
dns_state: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
previous_band: "{{ states('input_text.infra_nebula_sync_health_band') | lower }}"
degraded: "{{ is_state('binary_sensor.infra_nebula_sync_degraded', 'on') }}"
nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
nebula_status_alt: "{{ states('binary_sensor.nebula_sync_status_2') }}"
nebula_state: "{{ states('sensor.nebula_sync_state') }}"
nebula_state_alt: "{{ states('sensor.nebula_sync_state_2') }}"
pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
pihole_secondary_status_alt: "{{ states('binary_sensor.pihole_secondary_status_2') }}"
action:
- choose:
- conditions: "{{ degraded and previous_band != 'warning' }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.joanna_dispatch
data:
trigger_context: "HA automation infra_nebula_sync_health_dispatch (Infrastructure - Nebula Sync Health Dispatch)"
source: "home_assistant_automation.infra_nebula_sync_health_dispatch.warning"
summary: "Nebula Sync DNS consistency or container health is degraded"
entity_ids:
- sensor.infra_nebula_sync_dns_consistency
- binary_sensor.infra_nebula_sync_degraded
- binary_sensor.nebula_sync_status
- binary_sensor.nebula_sync_status_2
- sensor.nebula_sync_state
- sensor.nebula_sync_state_2
- binary_sensor.pihole_secondary_status
- binary_sensor.pihole_secondary_status_2
diagnostics: >-
issue_id={{ issue_id }},
dns_consistency={{ dns_state }},
host={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }},
expected_ip={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }},
primary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }},
secondary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }},
primary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }},
secondary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }},
primary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }},
secondary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }},
nebula_status={{ nebula_status }},
nebula_status_2={{ nebula_status_alt }},
nebula_state={{ nebula_state }},
nebula_state_2={{ nebula_state_alt }},
pihole_secondary_status={{ pihole_secondary_status }},
pihole_secondary_status_2={{ pihole_secondary_status_alt }},
primary_dns=192.168.10.10,
backup_dns=192.168.10.14
request: >-
Investigate Nebula Sync on docker_14 and the backup Pi-hole sync path.
Verify both Pi-holes answer the GTG-PF45FK6F short name, FQDN, and reverse lookup consistently.
Check nebula_sync container status, Docker health, recent sync logs, and primary/replica Pi-hole API reachability.
If confidence is high, perform safe remediation such as a one-time Nebula Sync run or restarting only the nebula_sync container.
Do not restart Pi-hole or change DHCP/custom DNS records unless diagnostics prove data drift and the action is safe.
Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
domain_hint: ops
lane_hint: joanna.ops
- service: script.send_to_logbook
data:
topic: "DNS"
message: >-
Nebula Sync DNS consistency is degraded ({{ dns_state }}); Joanna investigation requested without opening a Repair.
- service: input_text.set_value
target:
entity_id: input_text.infra_nebula_sync_health_band
data:
value: warning
- conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.send_to_logbook
data:
topic: "DNS"
message: "Nebula Sync DNS consistency recovered; Joanna-only warning state cleared."
- service: input_text.set_value
target:
entity_id: input_text.infra_nebula_sync_health_band
data:
value: normal
- conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
sequence:
- service: input_text.set_value
target:
entity_id: input_text.infra_nebula_sync_health_band
data:
value: normal
- alias: "Infrastructure - Pi-hole IoT DNS Drift Dispatch"
id: infra_pihole_iot_dns_drift_dispatch
description: "Dispatch Joanna when promoted IoT Pi-hole DNS records drift across primary and backup resolvers."
mode: queued
trigger:
- platform: state
entity_id: binary_sensor.infra_pihole_iot_dns_degraded
to: "on"
for: "00:10:00"
id: degraded
- platform: state
entity_id: binary_sensor.infra_pihole_iot_dns_degraded
to: "off"
for: "00:02:00"
id: recovered
- platform: event
event_type: homeassistant_started
id: reconcile
- platform: time_pattern
minutes: "7"
id: reconcile
variables:
issue_id: infra_pihole_iot_dns_degraded
dns_state: "{{ states('sensor.infra_pihole_iot_dns_consistency') }}"
previous_band: "{{ states('input_text.infra_pihole_iot_dns_health_band') | lower }}"
degraded: "{{ is_state('binary_sensor.infra_pihole_iot_dns_degraded', 'on') }}"
action:
- choose:
- conditions: "{{ degraded and previous_band != 'warning' }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.joanna_dispatch
data:
trigger_context: "HA automation infra_pihole_iot_dns_drift_dispatch (Infrastructure - Pi-hole IoT DNS Drift Dispatch)"
source: "home_assistant_automation.infra_pihole_iot_dns_drift_dispatch.warning"
summary: "Promoted IoT Pi-hole DNS records drifted across primary and backup resolvers"
entity_ids:
- sensor.infra_pihole_iot_dns_consistency
- binary_sensor.infra_pihole_iot_dns_degraded
diagnostics: >-
issue_id={{ issue_id }},
dns_consistency={{ dns_state }},
checked_records={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'checked_records') }},
mismatch_count={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatch_count') }},
mismatches={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatches') }},
primary_dns={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'primary_dns') }},
backup_dns={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'backup_dns') }}
request: >-
Investigate primary/backup Pi-hole DNS drift for promoted IoT reservations.
Verify both Pi-holes answer rachio, econet, dreame-vacuum, carlo-bed, lg-smart-fridge, tesla-blackbox-gw, and bgw210 FQDNs with the expected reserved IPs.
Check primary and backup pihole.toml local DNS host records, Nebula Sync behavior, and generated custom.list files.
Do not change DHCP/custom DNS records unless diagnostics prove drift and the action is safe.
Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
domain_hint: ops
lane_hint: joanna.ops
- service: script.send_to_logbook
data:
topic: "DNS"
message: >-
Promoted IoT Pi-hole DNS consistency is degraded ({{ dns_state }}); Joanna investigation requested without opening a Repair.
- service: input_text.set_value
target:
entity_id: input_text.infra_pihole_iot_dns_health_band
data:
value: warning
- conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.send_to_logbook
data:
topic: "DNS"
message: "Promoted IoT Pi-hole DNS consistency recovered; Joanna-only warning state cleared."
- service: input_text.set_value
target:
entity_id: input_text.infra_pihole_iot_dns_health_band
data:
value: normal
- conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
sequence:
- service: input_text.set_value
target:
entity_id: input_text.infra_pihole_iot_dns_health_band
data:
value: normal
- alias: "Docker Host Disk Pressure Monitor"
id: docker_host_disk_pressure_monitor
description: "Track Docker host root disk pressure from normalized Glances sensors and dispatch Joanna on band changes."
mode: queued
trigger:
- platform: time_pattern
minutes: "/15"
- platform: state
entity_id:
- sensor.docker_17_disk_used_percentage
- sensor.docker_14_disk_used_percentage
- sensor.docker_69_disk_used_percentage
variables:
host_configs:
- host_id: docker_17
host_name: docker_17
disk_entity: sensor.docker_17_disk_used_percentage
raw_entity: sensor.192_168_10_17_disk_usage
free_entity: sensor.192_168_10_17_disk_free
used_entity: sensor.192_168_10_17_disk_used
band_entity: input_text.docker_17_disk_pressure_band
issue_id: docker_host_docker_17_disk_pressure
- host_id: docker_14
host_name: docker_14
disk_entity: sensor.docker_14_disk_used_percentage
raw_entity: sensor.docker14_disk_usage
free_entity: sensor.docker14_disk_free
used_entity: sensor.docker14_disk_used
band_entity: input_text.docker_14_disk_pressure_band
issue_id: docker_host_docker_14_disk_pressure
- host_id: docker_69
host_name: docker_69
disk_entity: sensor.docker_69_disk_used_percentage
raw_entity: sensor.docker69_disk_usage
free_entity: sensor.docker69_disk_free
used_entity: sensor.docker69_disk_used
band_entity: input_text.docker_69_disk_pressure_band
issue_id: docker_host_docker_69_disk_pressure
action:
- repeat:
for_each: "{{ host_configs }}"
sequence:
- variables:
host_id: "{{ repeat.item.host_id }}"
host_name: "{{ repeat.item.host_name }}"
disk_entity: "{{ repeat.item.disk_entity }}"
raw_entity: "{{ repeat.item.raw_entity }}"
free_entity: "{{ repeat.item.free_entity }}"
used_entity: "{{ repeat.item.used_entity }}"
band_entity: "{{ repeat.item.band_entity }}"
issue_id: "{{ repeat.item.issue_id }}"
disk_state: "{{ states(disk_entity) }}"
disk_pct: "{{ disk_state | float(0) }}"
previous_band: "{{ states(band_entity) | lower }}"
current_band: >-
{{ 'unavailable' if disk_state in ['unknown', 'unavailable', 'none', '']
else 'critical' if disk_pct >= 90
else 'warning' if disk_pct >= 80
else 'normal' }}
- choose:
- conditions: "{{ current_band == 'critical' and previous_band != 'critical' }}"
sequence:
- service: repairs.create
data:
issue_id: "{{ issue_id }}"
severity: error
persistent: true
title: "{{ host_name }} disk pressure critical ({{ disk_pct | round(1) }}%)"
description: >-
{{ host_name }} root disk usage is critically high.
Free space or expand the host filesystem before Docker workloads fail.
- service: script.joanna_dispatch
data:
trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Critical)"
source: "home_assistant_automation.docker_host_disk_pressure_monitor.critical"
summary: "{{ host_name }} root disk pressure is critical at {{ disk_pct | round(1) }}%"
entity_ids:
- "{{ disk_entity }}"
- "{{ raw_entity }}"
- "{{ free_entity }}"
- "{{ used_entity }}"
diagnostics: >-
issue_id={{ issue_id }},
host_id={{ host_id }},
disk_entity={{ disk_entity }},
raw_entity={{ raw_entity }},
disk_pct={{ disk_pct | round(1) }},
disk_free={{ states(free_entity) }},
disk_used={{ states(used_entity) }},
threshold=90
request: >-
Investigate critical disk pressure on {{ host_name }} and perform safe remediation when confidence is high.
Check Docker build cache, image/container volumes, logs, backups, and large files first.
Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
- service: script.send_to_logbook
data:
topic: "DOCKER"
message: >-
{{ host_name }} disk usage is critical at {{ disk_pct | round(1) }}%.
Repair {{ issue_id }} opened and Joanna investigation requested.
- service: input_text.set_value
target:
entity_id: "{{ band_entity }}"
data:
value: "critical"
- conditions: "{{ current_band == 'warning' and previous_band not in ['warning', 'critical'] }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.joanna_dispatch
data:
trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Warning)"
source: "home_assistant_automation.docker_host_disk_pressure_monitor.warning"
summary: "{{ host_name }} root disk pressure warning at {{ disk_pct | round(1) }}%"
entity_ids:
- "{{ disk_entity }}"
- "{{ raw_entity }}"
- "{{ free_entity }}"
- "{{ used_entity }}"
diagnostics: >-
issue_id={{ issue_id }},
host_id={{ host_id }},
disk_entity={{ disk_entity }},
raw_entity={{ raw_entity }},
disk_pct={{ disk_pct | round(1) }},
disk_free={{ states(free_entity) }},
disk_used={{ states(used_entity) }},
threshold=80
request: >-
Investigate elevated disk usage on {{ host_name }} and perform safe low-risk cleanup before it becomes critical when confidence is high.
Check Docker build cache, image/container volumes, logs, backups, and large files first.
Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
- service: script.send_to_logbook
data:
topic: "DOCKER"
message: >-
{{ host_name }} disk usage warning at {{ disk_pct | round(1) }}%.
Joanna investigation requested without opening a warning Repair.
- service: input_text.set_value
target:
entity_id: "{{ band_entity }}"
data:
value: "warning"
- conditions: "{{ current_band == 'warning' and previous_band == 'critical' }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.send_to_logbook
data:
topic: "DOCKER"
message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%. Critical Repair cleared; Joanna continues handling warning-level cleanup."
- service: input_text.set_value
target:
entity_id: "{{ band_entity }}"
data:
value: "warning"
- conditions: "{{ current_band == 'normal' and previous_band in ['warning', 'critical'] }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: script.send_to_logbook
data:
topic: "DOCKER"
message: "{{ host_name }} disk usage recovered to {{ disk_pct | round(1) }}%. Repair {{ issue_id }} cleared."
- service: input_text.set_value
target:
entity_id: "{{ band_entity }}"
data:
value: "normal"
- conditions: "{{ current_band == 'normal' and previous_band not in ['normal', 'warning', 'critical'] }}"
sequence:
- service: repairs.remove
continue_on_error: true
data:
issue_id: "{{ issue_id }}"
- service: input_text.set_value
target:
entity_id: "{{ band_entity }}"
data:
value: "normal"
# Nightly Duplicati verification with a recovery poll.
# Flow: call the verifier REST command, log its summary, then either
#   - clear Repairs and the recovery flag when the backup is healthy,
#   - log only when the verifier could not prove health (transport/API state),
#   - or open a Repair and dispatch Joanna on a confirmed failure.
- alias: "Infrastructure - Backup Nightly Verification"
  id: infra_backup_nightly_verification
  description: "Use codex_appliance to verify the latest Duplicati run and dispatch Joanna only on failure."
  mode: single
  trigger:
    # Scheduled nightly verification.
    - platform: time
      at: "08:00:00"
      id: nightly
    # Recovery polls at :15 and :45 of every hour (gated by the condition below).
    - platform: time_pattern
      minutes: "15"
      id: recovery_poll
    - platform: time_pattern
      minutes: "45"
      id: recovery_poll
  condition:
    # Nightly and manual runs always pass; recovery polls only run while the
    # repair-active helper is on.
    - condition: template
      value_template: >-
        {{ trigger is not defined or trigger.id != 'recovery_poll'
        or is_state('input_boolean.infra_duplicati_backup_repair_active', 'on') }}
  action:
    - variables:
        trigger_source: "{{ trigger.id if trigger is defined and trigger.id is defined else 'manual' }}"
        verifier_reason: "{{ 'ha_failure_followup' if trigger_source == 'recovery_poll' else 'ha_nightly' }}"
        trigger_context: "HA automation infra_backup_nightly_verification (Infrastructure - Backup Nightly Verification)"
        duplicati_state: "{{ states('switch.duplicati_container') }}"
    # continue_on_error keeps the run alive if the REST call itself errors, so the
    # transport branch below can log it instead of the automation aborting silently.
    # In that case duplicati_verify may be undefined; every template below guards for it.
    - action: rest_command.bearclaw_duplicati_verify
      continue_on_error: true
      data:
        reason: "{{ verifier_reason }}"
      response_variable: duplicati_verify
    - service: script.send_to_logbook
      data:
        topic: "BACKUP"
        message: >-
          {% set payload = duplicati_verify['content'] if duplicati_verify is defined and duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} %}
          {% set detail = payload['detail'] if payload is mapping and payload['detail'] is mapping else {} %}
          {{ detail.get('summary', 'Nightly Duplicati verification completed.') }}
    # Normalize the verifier response; every field falls back to a safe default
    # so a missing or malformed response resolves to status 'unknown'.
    - variables:
        verify_payload: "{{ duplicati_verify['content'] if duplicati_verify is defined and duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} }}"
        verify_detail: "{{ verify_payload['detail'] if verify_payload is mapping and verify_payload['detail'] is mapping else {} }}"
        verify_http_status: "{{ duplicati_verify['status'] | int(0) if duplicati_verify is defined and duplicati_verify is mapping else 0 }}"
        verify_healthy: "{{ verify_payload.get('ok', false) and verify_detail.get('healthy', false) }}"
        verify_status: "{{ verify_detail.get('status', 'unknown') }}"
        verify_summary: "{{ verify_detail.get('summary', 'Duplicati verification did not return a summary.') }}"
        verify_issue: "{{ verify_detail.get('issue', verify_payload.get('error', 'duplicati_verify_failed')) }}"
        verify_backup_name: "{{ verify_detail.get('backupName', 'Docker_Configs') }}"
        verify_latest_result: "{{ verify_detail.get('latestResult', {}) if verify_detail is mapping else {} }}"
        verify_last_success: "{{ verify_detail.get('lastSuccessfulRun', {}) if verify_detail is mapping else {} }}"
        verify_transport_issue: "{{ verify_status in ['api_error', 'unknown'] }}"
    - choose:
        # Healthy: clear both Repair ids and stop the recovery poll.
        - conditions: "{{ verify_healthy }}"
          sequence:
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: infra_duplicati_backup_failure
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: user_infra_duplicati_backup_failure
            - service: input_boolean.turn_off
              target:
                entity_id: input_boolean.infra_duplicati_backup_repair_active
        # Verifier transport/API state: log only, never open a Repair.
        - conditions: "{{ verify_transport_issue }}"
          sequence:
            - service: script.send_to_logbook
              data:
                topic: "BACKUP"
                message: >-
                  Duplicati verifier could not prove backup health because the verification service returned
                  status {{ verify_status }} with issue {{ verify_issue }}. No repair card was opened because
                  this is verifier transport state, not a confirmed backup failure.
      # Confirmed failure: enable the recovery poll, then open a Repair and dispatch
      # Joanna once (nightly/manual runs); recovery polls only log.
      default:
        - service: input_boolean.turn_on
          target:
            entity_id: input_boolean.infra_duplicati_backup_repair_active
        - choose:
            - conditions: "{{ trigger_source != 'recovery_poll' }}"
              sequence:
                - service: repairs.create
                  data:
                    issue_id: infra_duplicati_backup_failure
                    title: "Duplicati nightly backup verification failed"
                    description: >-
                      {{ verify_summary }}
                      Backup={{ verify_backup_name }};
                      status={{ verify_status }};
                      last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
                      last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
                    severity: error
                    persistent: true
                - service: script.joanna_dispatch
                  data:
                    trigger_context: "{{ trigger_context }}"
                    source: "home_assistant_automation.infra_backup_nightly_verification"
                    summary: "Nightly Duplicati backup verification failed"
                    entity_ids:
                      - "switch.duplicati_container"
                    diagnostics: >-
                      scheduled_time=08:00:00,
                      duplicati_container={{ duplicati_state }},
                      verifier_http_status={{ verify_http_status }},
                      verifier_status={{ verify_status }},
                      verifier_issue={{ verify_issue }},
                      backup_name={{ verify_backup_name }},
                      latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
                      last_success={{ verify_last_success.get('endedAt', 'n/a') }}
                    request: >-
                      Investigate the Duplicati backup job {{ verify_backup_name }}.
                      The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
                      Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
                      Home Assistant will re-check this verifier every 30 minutes after dispatch and clear the Repair automatically once the backup is healthy.
                      Reply with explicit status fields:
                      resolved=true/false,
                      backup_status,
                      last_success_time,
                      root_cause,
                      action_taken,
                      verification,
                      next_action_required=true/false.
          default:
            - service: script.send_to_logbook
              data:
                topic: "BACKUP"
                message: >-
                  Duplicati recovery follow-up still reports {{ verify_status }} for {{ verify_backup_name }}:
                  {{ verify_issue }}. Existing Repair remains open; Joanna was not dispatched again.
# Monthly log hygiene review: fires daily at 03:20 but only proceeds on day 1
# of the month, dispatches a recommendations-only review request to Joanna,
# then records the dispatch in the logbook.
- id: infra_monthly_log_hygiene_review
  alias: 'Infrastructure - Monthly HA Log Hygiene Review'
  description: >-
    Ask Joanna monthly to review Home Assistant logs, create a GitHub issue with noisy entries,
    and send Telegram recommendations only.
  mode: single
  variables:
    trigger_context: 'HA automation infra_monthly_log_hygiene_review (Infrastructure - Monthly HA Log Hygiene Review)'
  trigger:
    - platform: time
      at: '03:20:00'
  condition:
    # Shorthand template condition: true only on the first day of the month.
    - '{{ now().day == 1 }}'
  action:
    - service: script.joanna_dispatch
      data:
        source: 'home_assistant_automation.infra_monthly_log_hygiene_review'
        trigger_context: '{{ trigger_context }}'
        summary: 'Monthly Home Assistant log hygiene review with GitHub issue and Telegram follow-up'
        diagnostics: >-
          schedule=day_1@03:20:00, review_scope=available_home_assistant_logs,
          desired_outputs=telegram_follow_up+github_issue, github_repo=CCOSTAN/Home-AssistantConfig,
          approval_required_before_changes=true
        request: >-
          Review the available Home Assistant log files from the last month and identify
          noisy, low-value entries that could be safely suppressed, filtered, slowed, deduplicated,
          or retired.
          Focus on practical Home Assistant-side changes such as recorder exclusions, logger filtering,
          scan-interval reductions, entity retirement, or automation de-noising.
          Create or refresh a GitHub issue in CCOSTAN/Home-AssistantConfig that captures
          the noisy entries, estimated frequency, why each candidate is low-value,
          and the exact repo files or integrations likely to change.
          Then send Carlo a concise Telegram summary with the top recommendations
          and the GitHub issue number or link.
          Do not make any changes from this review.
          Wait for explicit follow-up approval first.
    - service: script.send_to_logbook
      data:
        topic: 'HOME ASSISTANT'
        message: >-
          Joanna monthly Home Assistant log hygiene review dispatched;
          Telegram summary and GitHub issue requested.

Powered by TurnKey Linux.