######################################################################
# @CCOSTAN - Follow Me on X
# For more info visit https://www.vcloudinfo.com/click-here
# Original Repo : https://github.com/CCOSTAN/Home-AssistantConfig
# -------------------------------------------------------------------
# Infrastructure - Observability, disk pressure, and Joanna review workflows
# WAN/DNS/website/domain/cert/Docker host state normalized for dashboards, plus scheduled infrastructure reviews.
# -------------------------------------------------------------------
# Related Issue: 1584
# Notes: Home dashboard consumes `infra_*` entities for exceptions-only alerts.
# Notes: Domain warning threshold is <30 days; critical threshold is <14 days.
# Notes: Nightly Duplicati verification runs at 08:00 after the 05:30 Duplicati job and docker_14 reboot window.
# Notes: Duplicati transport/API errors are logged only; repairs are reserved for proven failed or stale backups.
# Notes: Duplicati failure Repairs enable a recovery poll that clears the Repair after a later successful run.
# Notes: Monthly HA log hygiene review requests Telegram + GitHub issue follow-up only; Joanna must wait for approval before any changes.
# Notes: Numeric WAN telemetry exposes state_class so recorder can keep long-term statistics.
# Notes: Docker host root disk usage uses Glances-backed normalized sensors; raw Glances sensors are recorder/logbook-filtered.
# Notes: Disk-pressure dispatch allows bounded safe cleanup of disposable caches and old generated backup artifacts, but not live data or restarts.
# Notes: Warning-level Docker host disk pressure is Joanna-only; Repairs are reserved for critical pressure.
# Notes: Nebula Sync DNS consistency compares primary/backup Pi-hole answers and dispatches Joanna on sustained drift or container loss.
# Notes: Promoted IoT DNS consistency compares primary/backup Pi-hole answers for reserved IoT host records.
######################################################################
|
|
|
|
# Text helpers (20-character cap) that hold a "band" label per monitored
# target. The automations in this package read them as the previous band and
# write values such as `warning` / `normal` back via input_text.set_value.
input_text:
  # Docker host root-disk pressure bands.
  docker_17_disk_pressure_band:
    max: 20
    name: 'docker_17 disk pressure band'
  docker_14_disk_pressure_band:
    max: 20
    name: 'docker_14 disk pressure band'
  docker_69_disk_pressure_band:
    max: 20
    name: 'docker_69 disk pressure band'
  # DNS health bands.
  infra_nebula_sync_health_band:
    max: 20
    name: 'Nebula Sync health band'
  infra_pihole_iot_dns_health_band:
    max: 20
    name: 'Pi-hole IoT DNS health band'
|
|
|
|
# Boolean helper for the Duplicati backup repair workflow.
# NOTE(review): nothing in the visible part of this file reads or writes it;
# presumably it is toggled by the Duplicati automations further down.
input_boolean:
  infra_duplicati_backup_repair_active:
    name: 'Duplicati backup repair active'
|
|
|
|
command_line:
|
|
# WAN packet-loss percentage taken from the summary line of `ping -c 10`
# against 1.1.1.1, polled every 300 seconds.
- sensor:
    name: Infra WAN Packet Loss
    unique_id: infra_wan_packet_loss
    # The summary line is comma-separated. Pick the field that actually
    # mentions "packet loss" rather than assuming it is always the third one:
    # ping inserts a "+N errors" field ahead of it when ICMP errors come back,
    # which would otherwise be reported as the loss percentage.
    # Prints "unknown" when no summary line is produced at all.
    command: >-
      ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
      awk -F',' '/packet loss/ {for (i = 1; i <= NF; i++) if ($i ~ /packet loss/) {gsub(/[^0-9.]/, "", $i); print $i; found=1}}
      END {if (!found) print "unknown"}'
    scan_interval: 300
    unit_of_measurement: "%"
    state_class: measurement
    # Keep digits and dots only; an empty result falls back to 'unknown'.
    value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"
|
|
|
|
# Average WAN round-trip time in milliseconds from `ping -c 10` against
# 1.1.1.1, polled every 300 seconds.
- sensor:
    name: Infra WAN Latency Ms
    unique_id: infra_wan_latency_ms
    # Split the summary on " = " and take the second "/"-separated value.
    # That is the average for both the four-value form
    # ("rtt min/avg/max/mdev = a/b/c/d ms") and the three-value form
    # ("round-trip min/avg/max = a/b/c ms"); counting slash-separated fields
    # across the whole line lands on max for the three-value form.
    # Prints "unknown" when there is no rtt/round-trip line (e.g. total loss).
    command: >-
      ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
      awk '/^rtt|^round-trip/ {split($0, parts, " = "); split(parts[2], rtt, "/"); gsub(/[^0-9.]/, "", rtt[2]); print rtt[2]; found=1}
      END {if (!found) print "unknown"}'
    scan_interval: 300
    unit_of_measurement: "ms"
    state_class: measurement
    # Keep digits and dots only; an empty result falls back to 'unknown'.
    value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"
|
|
|
|
# Public IP as reported by api.ipify.org, refreshed every 900 seconds.
# When curl fails the command echoes the literal string "unknown" instead.
- sensor:
    unique_id: infra_external_ip_fallback
    name: Infra External IP Fallback
    scan_interval: 900
    command: 'curl -fsS https://api.ipify.org || echo unknown'
|
|
|
|
# Queries both Pi-hole resolvers (192.168.10.10 and 192.168.10.14) with dig
# for one reference host: short name, FQDN and reverse lookup. State is "ok"
# only when all four forward answers equal the expected IP and the two reverse
# answers are non-empty and identical; otherwise "mismatch". The raw answers
# are exposed as attributes.
- sensor:
    unique_id: infra_nebula_sync_dns_consistency
    name: Infra Nebula Sync DNS Consistency
    scan_interval: 300
    # Folded scalar: the line breaks below collapse to single spaces, so the
    # shell receives the script as one line. q() = forward A lookup,
    # r() = reverse lookup; both return a sorted, comma-joined answer list.
    command: >-
      /bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14;
      host=GTG-PF45FK6F; fqdn=GTG-PF45FK6F.fordst.com; ip=192.168.10.117;
      q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null |
      tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; };
      r(){ dig +time=2 +tries=1 +short @"$1" -x "$2" 2>/dev/null |
      tr -d "\r" | sed "s/\.$//" | sort | tr "\n" "," | sed "s/,$//"; };
      p_short=$(q "$primary" "$host"); s_short=$(q "$secondary" "$host");
      p_fqdn=$(q "$primary" "$fqdn"); s_fqdn=$(q "$secondary" "$fqdn");
      p_rev=$(r "$primary" "$ip"); s_rev=$(r "$secondary" "$ip");
      status=mismatch;
      if [ "$p_short" = "$ip" ] && [ "$s_short" = "$ip" ] &&
      [ "$p_fqdn" = "$ip" ] && [ "$s_fqdn" = "$ip" ] &&
      [ -n "$p_rev" ] && [ "$p_rev" = "$s_rev" ]; then status=ok; fi;
      printf "{\"status\":\"%s\",\"host\":\"%s\",\"expected_ip\":\"%s\",\"primary_short\":\"%s\",\"secondary_short\":\"%s\",\"primary_fqdn\":\"%s\",\"secondary_fqdn\":\"%s\",\"primary_reverse\":\"%s\",\"secondary_reverse\":\"%s\"}\n"
      "$status" "$host" "$ip" "$p_short" "$s_short" "$p_fqdn" "$s_fqdn" "$p_rev" "$s_rev"'
    # The command prints a single JSON object; its `status` field is the state.
    value_template: "{{ value_json.status | default('unknown') }}"
    json_attributes:
      - host
      - expected_ip
      - primary_short
      - secondary_short
      - primary_fqdn
      - secondary_fqdn
      - primary_reverse
      - secondary_reverse
|
|
|
|
# Checks a fixed list of `fqdn=ip` IoT records against both Pi-hole resolvers.
# State is "ok" when every record resolves to its expected IP on both
# resolvers, otherwise "mismatch"; per-record details are joined into the
# `mismatches` attribute ("none" when there are no mismatches).
- sensor:
    unique_id: infra_pihole_iot_dns_consistency
    name: Infra Pihole IoT DNS Consistency
    scan_interval: 300
    # Folded scalar: the line breaks below collapse to single spaces, so the
    # shell receives the script as one line. q() = forward A lookup returning
    # a sorted, comma-joined answer list.
    command: >-
      /bin/bash -c 'primary=192.168.10.10; secondary=192.168.10.14;
      records="rachio.fordst.com=192.168.10.73
      econet.fordst.com=192.168.10.92
      dreame-vacuum.fordst.com=192.168.10.93
      carlo-bed.fordst.com=192.168.10.95
      lg-smart-fridge.fordst.com=192.168.10.96
      tesla-blackbox-gw.fordst.com=192.168.10.97
      bgw210.fordst.com=192.168.10.98";
      q(){ dig +time=2 +tries=1 +short @"$1" "$2" A 2>/dev/null |
      tr -d "\r" | sort | tr "\n" "," | sed "s/,$//"; };
      status=ok; checked=0; mismatch_count=0; mismatches="";
      for record in $records; do
      host=${record%%=*}; ip=${record#*=};
      p=$(q "$primary" "$host"); s=$(q "$secondary" "$host");
      checked=$((checked+1));
      if [ "$p" != "$ip" ] || [ "$s" != "$ip" ]; then
      status=mismatch; mismatch_count=$((mismatch_count+1));
      mismatches="${mismatches}${host}:expected=${ip},primary=${p:-none},secondary=${s:-none};";
      fi; done;
      if [ -z "$mismatches" ]; then mismatches=none; fi;
      printf "{\"status\":\"%s\",\"checked_records\":%s,\"mismatch_count\":%s,\"mismatches\":\"%s\",\"primary_dns\":\"%s\",\"backup_dns\":\"%s\"}\n"
      "$status" "$checked" "$mismatch_count" "$mismatches" "$primary" "$secondary"'
    # The command prints a single JSON object; its `status` field is the state.
    value_template: "{{ value_json.status | default('unknown') }}"
    json_attributes:
      - checked_records
      - mismatch_count
      - mismatches
      - primary_dns
      - backup_dns
|
|
|
|
template:
|
|
- sensor:
|
|
# Normalized external IP: sensor.external_ip when it has a usable value,
# otherwise whatever sensor.infra_external_ip_fallback currently reports.
- name: 'Infra External IP'
  unique_id: infra_external_ip
  state: >-
    {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
    {% set primary = states('sensor.external_ip') | trim %}
    {{ primary if primary not in unusable else (states('sensor.infra_external_ip_fallback') | trim) }}
|
|
|
|
# docker_17 disk usage, copied from sensor.192_168_10_17_disk_usage and
# rounded to one decimal; unavailable whenever the source has no usable state.
- name: 'docker_17 Disk Used Percentage'
  unique_id: docker_17_disk_used_percentage
  icon: mdi:harddisk
  state_class: measurement
  unit_of_measurement: '%'
  state: >-
    {{ states('sensor.192_168_10_17_disk_usage') | float(0) | round(1) }}
  availability: >-
    {{ states('sensor.192_168_10_17_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}
|
|
|
|
# docker_14 disk usage, copied from sensor.docker14_disk_usage and rounded to
# one decimal; unavailable whenever the source has no usable state.
- name: 'docker_14 Disk Used Percentage'
  unique_id: docker_14_disk_used_percentage
  icon: mdi:harddisk
  state_class: measurement
  unit_of_measurement: '%'
  state: >-
    {{ states('sensor.docker14_disk_usage') | float(0) | round(1) }}
  availability: >-
    {{ states('sensor.docker14_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}
|
|
|
|
# docker_69 disk usage, copied from sensor.docker69_disk_usage and rounded to
# one decimal; unavailable whenever the source has no usable state.
- name: 'docker_69 Disk Used Percentage'
  unique_id: docker_69_disk_used_percentage
  icon: mdi:harddisk
  state_class: measurement
  unit_of_measurement: '%'
  state: >-
    {{ states('sensor.docker69_disk_usage') | float(0) | round(1) }}
  availability: >-
    {{ states('sensor.docker69_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}
|
|
|
|
# Smallest "days until expiration" across the listed domain sensors.
# Entities that do not exist are skipped; the state renders as none when no
# listed sensor currently has a numeric value.
- name: "Infra Domain Expiry Min Days"
  unique_id: infra_domain_expiry_min_days
  unit_of_measurement: "d"
  state: >-
    {% set ids = [
      'sensor.vcloudinfo_com_days_until_expiration',
      'sensor.ipmer_com_days_until_expiration',
      'sensor.fordst_com_days_until_expiration',
      'sensor.kingcrafthomes_com_days_until_expiration'
    ] %}
    {% set ns = namespace(min=9999, any=false) %}
    {% for id in ids if expand(id) | count > 0 %}
    {# float(none) yields none for unknown/unavailable/empty and any other
       non-numeric state, so only real numbers mark the result as valid.
       Flagging `any` before parsing would publish the 9999 placeholder. #}
    {% set val = states(id) | float(none) %}
    {% if val is not none %}
    {% set ns.any = true %}
    {% if val < ns.min %}
    {% set ns.min = val %}
    {% endif %}
    {% endif %}
    {% endfor %}
    {% if ns.any %}
    {{ ns.min | round(0) }}
    {% else %}
    {{ none }}
    {% endif %}
|
|
|
|
# Smallest numeric state among every sensor whose entity_id matches one of the
# monitored domains followed by cert/ssl/tls. Non-numeric states are ignored
# (9999 is the parse-failure marker); renders none when nothing qualifies.
- name: 'Infra Cert Expiry Min Days'
  unique_id: infra_cert_expiry_min_days
  unit_of_measurement: 'd'
  state: >-
    {% set pattern = '(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)' %}
    {% set skip = ['unknown', 'unavailable', 'none', ''] %}
    {% set acc = namespace(lowest=9999, found=false) %}
    {% for entity in states.sensor | selectattr('entity_id', 'search', pattern) %}
    {% if entity.state not in skip %}
    {% set days = entity.state | float(9999) %}
    {% if days != 9999 %}
    {% set acc.found = true %}
    {% if days < acc.lowest %}
    {% set acc.lowest = days %}
    {% endif %}
    {% endif %}
    {% endif %}
    {% endfor %}
    {% if acc.found %}
    {{ acc.lowest | round(0) }}
    {% else %}
    {{ none }}
    {% endif %}
|
|
|
|
# Number of sensor entities whose entity_id matches a monitored domain
# followed by cert/ssl/tls, regardless of their current state.
- name: 'Infra Cert Telemetry Count'
  unique_id: infra_cert_telemetry_count
  icon: mdi:counter
  state: >-
    {{ states.sensor
    | selectattr('entity_id', 'search', '(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)')
    | list | count }}
|
|
|
|
# Counts the listed website binary sensors that exist and are currently
# off, unknown or unavailable. Entities that do not exist are not counted.
- name: 'Infra Website Down Count'
  unique_id: infra_website_down_count
  icon: mdi:counter
  state: >-
    {% set sites = [
      'binary_sensor.vcloudinfo_com',
      'binary_sensor.ipmer_com',
      'binary_sensor.fordst_com',
      'binary_sensor.www_kingcrafthomes_com'
    ] %}
    {% set tally = namespace(down=0) %}
    {% for site in sites if expand(site) | count > 0 and states(site) in ['off', 'unknown', 'unavailable'] %}
    {% set tally.down = tally.down + 1 %}
    {% endfor %}
    {{ tally.down }}
|
|
|
|
- binary_sensor:
|
|
# Problem when either WAN probe has no usable reading, packet loss is above
# 5 %, or latency is above 80 ms.
- name: 'Infra WAN Quality Degraded'
  unique_id: infra_wan_quality_degraded
  device_class: problem
  state: >-
    {% set unusable = ['unknown', 'unavailable', 'none', ''] %}
    {% set loss_raw = states('sensor.infra_wan_packet_loss') %}
    {% set lat_raw = states('sensor.infra_wan_latency_ms') %}
    {{ loss_raw in unusable or lat_raw in unusable or (loss_raw | float(0)) > 5 or (lat_raw | float(0)) > 80 }}
|
|
|
|
# Problem when switch.pi_hole is anything other than on, or when
# binary_sensor.pihole_status is off / unavailable / unknown.
- name: 'Infra DNS Pihole Degraded'
  unique_id: infra_dns_pihole_degraded
  device_class: problem
  state: >-
    {{ not is_state('switch.pi_hole', 'on')
    or states('binary_sensor.pihole_status') in ['off', 'unavailable', 'unknown'] }}
|
|
|
|
# Problem when the DNS consistency sensor is not "ok", or when at least one
# of the Nebula Sync container entities exists but none of them reports a
# healthy value (on / running). Attributes mirror the inputs for dashboards.
- name: 'Infra Nebula Sync Degraded'
  unique_id: infra_nebula_sync_degraded
  device_class: problem
  state: >-
    {% set dns_state = states('sensor.infra_nebula_sync_dns_consistency') | lower %}
    {% set container_entities = [
      'binary_sensor.nebula_sync_status',
      'binary_sensor.nebula_sync_status_2',
      'sensor.nebula_sync_state',
      'sensor.nebula_sync_state_2',
      'switch.nebula_sync_container',
      'switch.nebula_sync_container_2'
    ] %}
    {# Container telemetry only counts once at least one entity exists. #}
    {% set portainer_known = expand(container_entities) | count > 0 %}
    {% set portainer_ok =
      is_state('binary_sensor.nebula_sync_status', 'on')
      or is_state('binary_sensor.nebula_sync_status_2', 'on')
      or (states('sensor.nebula_sync_state') | lower) == 'running'
      or (states('sensor.nebula_sync_state_2') | lower) == 'running'
      or is_state('switch.nebula_sync_container', 'on')
      or is_state('switch.nebula_sync_container_2', 'on') %}
    {{ dns_state != 'ok' or (portainer_known and not portainer_ok) }}
  attributes:
    # Values copied from the DNS consistency command_line sensor.
    dns_consistency: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
    host: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }}"
    expected_ip: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }}"
    primary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }}"
    secondary_short: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }}"
    primary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }}"
    secondary_fqdn: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }}"
    primary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }}"
    secondary_reverse: "{{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }}"
    # Raw container and secondary Pi-hole states.
    nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
    nebula_status_2: "{{ states('binary_sensor.nebula_sync_status_2') }}"
    nebula_state: "{{ states('sensor.nebula_sync_state') }}"
    nebula_state_2: "{{ states('sensor.nebula_sync_state_2') }}"
    pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
    pihole_secondary_status_2: "{{ states('binary_sensor.pihole_secondary_status_2') }}"
|
|
|
|
# Problem whenever the IoT DNS consistency sensor reports anything other than
# "ok" (case-insensitive). Attributes mirror that sensor's JSON attributes.
- name: 'Infra Pihole IoT DNS Degraded'
  unique_id: infra_pihole_iot_dns_degraded
  device_class: problem
  state: >-
    {{ (states('sensor.infra_pihole_iot_dns_consistency') | lower) != 'ok' }}
  attributes:
    dns_consistency: "{{ states('sensor.infra_pihole_iot_dns_consistency') }}"
    checked_records: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'checked_records') }}"
    mismatch_count: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatch_count') }}"
    mismatches: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatches') }}"
    primary_dns: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'primary_dns') }}"
    backup_dns: "{{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'backup_dns') }}"
|
|
|
|
# Problem when the upper-cased garage UPS status string contains "OB".
# NOTE(review): presumably the NUT "on battery" flag - confirm against the UPS
# integration's status values.
- name: 'Infra UPS On Battery'
  unique_id: infra_ups_on_battery
  device_class: problem
  state: >-
    {{ 'OB' in (states('sensor.garage_ups_status') | upper) }}
|
|
|
|
# Problem when sensor.infra_website_down_count is greater than zero
# (a non-numeric count is treated as 0).
- name: 'Infra Website Degraded'
  unique_id: infra_website_degraded
  device_class: problem
  state: >-
    {{ (states('sensor.infra_website_down_count') | int(0)) > 0 }}
|
|
|
|
# Problem when any matching `*uptime_1d` sensor with a usable state reports
# less than 99. Sensors without a usable state are ignored.
- name: 'Infra Website Uptime SLO Breach'
  unique_id: infra_website_uptime_slo_breach
  device_class: problem
  state: >-
    {% set flags = namespace(seen=false, breach=false) %}
    {% for entity in states.sensor
    | selectattr('entity_id', 'search', 'sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*uptime_1d$')
    | rejectattr('state', 'in', ['unknown', 'unavailable', 'none', '']) %}
    {% set flags.seen = true %}
    {% if (entity.state | float(100)) < 99 %}
    {% set flags.breach = true %}
    {% endif %}
    {% endfor %}
    {{ flags.seen and flags.breach }}
|
|
|
|
# Problem when any matching `*avg_response_time_1d` sensor with a usable state
# reports more than 1.2. Sensors without a usable state are ignored.
- name: 'Infra Website Latency Degraded'
  unique_id: infra_website_latency_degraded
  device_class: problem
  state: >-
    {% set flags = namespace(seen=false, breach=false) %}
    {% for entity in states.sensor
    | selectattr('entity_id', 'search', 'sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*avg_response_time_1d$')
    | rejectattr('state', 'in', ['unknown', 'unavailable', 'none', '']) %}
    {% set flags.seen = true %}
    {% if (entity.state | float(0)) > 1.2 %}
    {% set flags.breach = true %}
    {% endif %}
    {% endfor %}
    {{ flags.seen and flags.breach }}
|
|
|
|
# Problem when the minimum domain expiry is known and below 14 days.
- name: 'Infra Domain Expiry Critical'
  unique_id: infra_domain_expiry_critical
  device_class: problem
  state: >-
    {% set d = states('sensor.infra_domain_expiry_min_days') %}
    {{ d not in ['unknown', 'unavailable', 'none', ''] and (d | float(9999)) < 14 }}
|
|
|
|
# Problem when the minimum domain expiry is known and in the range
# 14 <= days < 30 (below 14 is handled by the critical sensor).
- name: 'Infra Domain Expiry Warning'
  unique_id: infra_domain_expiry_warning
  device_class: problem
  state: >-
    {% set d = states('sensor.infra_domain_expiry_min_days') %}
    {% set days = d | float(9999) %}
    {{ d not in ['unknown', 'unavailable', 'none', ''] and days < 30 and days >= 14 }}
|
|
|
|
# Problem when the minimum certificate expiry is known and below 14 days.
- name: 'Infra Cert Expiry Critical'
  unique_id: infra_cert_expiry_critical
  device_class: problem
  state: >-
    {% set d = states('sensor.infra_cert_expiry_min_days') %}
    {{ d not in ['unknown', 'unavailable', 'none', ''] and (d | float(9999)) < 14 }}
|
|
|
|
# Problem when the minimum certificate expiry is known and in the range
# 14 <= days < 30 (below 14 is handled by the critical sensor).
- name: 'Infra Cert Expiry Warning'
  unique_id: infra_cert_expiry_warning
  device_class: problem
  state: >-
    {% set d = states('sensor.infra_cert_expiry_min_days') %}
    {% set days = d | float(9999) %}
    {{ d not in ['unknown', 'unavailable', 'none', ''] and days < 30 and days >= 14 }}
|
|
|
|
automation:
|
|
# Writes a NETWORK logbook entry whenever sensor.infra_external_ip moves from
# one usable value to a different usable value.
- alias: 'Infrastructure - External IP Change Logbook'
  id: infra_external_ip_change_logbook
  description: 'Log external IP changes into the Activity feed.'
  mode: queued
  trigger:
    - platform: state
      entity_id: sensor.infra_external_ip
  condition:
    # Requires a previous state, usable old and new values, and a real change.
    - condition: template
      value_template: >-
        {{ trigger.from_state is not none and
        trigger.from_state.state not in ['unknown', 'unavailable', 'none', ''] and
        trigger.to_state.state not in ['unknown', 'unavailable', 'none', ''] and
        trigger.from_state.state != trigger.to_state.state }}
  action:
    - service: script.send_to_logbook
      data:
        topic: 'NETWORK'
        message: >-
          External IP changed from {{ trigger.from_state.state }} to {{ trigger.to_state.state }}.
|
|
|
|
# Mirrors binary_sensor.infra_website_uptime_slo_breach into a Repairs issue:
# created when the sensor turns on, removed on any other state.
- alias: 'Infrastructure - Website Uptime SLO Repair'
  id: infra_website_uptime_slo_repair
  description: 'Create/clear Repairs issue when website 1-day uptime breaches SLO.'
  mode: queued
  trigger:
    - platform: state
      entity_id: binary_sensor.infra_website_uptime_slo_breach
  action:
    - if:
        - condition: template
          value_template: "{{ trigger.to_state.state == 'on' }}"
      then:
        - service: repairs.create
          data:
            issue_id: infra_website_uptime_slo_breach
            severity: warning
            persistent: true
            title: 'Website uptime SLO breached'
            description: >
              At least one monitored website has uptime_1d below 99%.
              Review Uptime Kuma entities on the Website Health dashboard.
      else:
        # Removal is best-effort; the issue may not exist.
        - service: repairs.remove
          continue_on_error: true
          data:
            issue_id: infra_website_uptime_slo_breach
|
|
|
|
# Mirrors binary_sensor.infra_website_latency_degraded into a Repairs issue:
# created when the sensor turns on, removed on any other state.
- alias: 'Infrastructure - Website Latency Repair'
  id: infra_website_latency_repair
  description: 'Create/clear Repairs issue when website response times degrade.'
  mode: queued
  trigger:
    - platform: state
      entity_id: binary_sensor.infra_website_latency_degraded
  action:
    - if:
        - condition: template
          value_template: "{{ trigger.to_state.state == 'on' }}"
      then:
        - service: repairs.create
          data:
            issue_id: infra_website_latency_degraded
            severity: warning
            persistent: true
            title: 'Website latency degraded'
            description: >
              At least one monitored website reports avg_response_time_1d above 1.2s.
              Review Uptime Kuma response-time entities on Website Health.
      else:
        # Removal is best-effort; the issue may not exist.
        - service: repairs.remove
          continue_on_error: true
          data:
            issue_id: infra_website_latency_degraded
|
|
|
|
# Joanna-only escalation for Nebula Sync health.
# - Degraded for 10 minutes: clear any Repair with this issue_id, dispatch
#   Joanna, log to the DNS topic and set the band helper to `warning`.
# - Recovered: clear the Repair, log the recovery and set the band to `normal`.
# - Band helper holding any other value while healthy: reset it to `normal`.
# The band helper prevents repeat dispatches while the warning is active.
- alias: "Infrastructure - Nebula Sync Health Dispatch"
  id: infra_nebula_sync_health_dispatch
  description: "Dispatch Joanna when Nebula Sync DNS consistency or container telemetry stays degraded."
  mode: queued
  trigger:
    - platform: state
      entity_id: binary_sensor.infra_nebula_sync_degraded
      to: "on"
      for: "00:10:00"
      id: degraded
    - platform: state
      entity_id: binary_sensor.infra_nebula_sync_degraded
      to: "off"
      for: "00:02:00"
      id: recovered
    # Startup and every-30-minute passes re-run the band comparison below.
    - platform: homeassistant
      event: start
      id: reconcile
    - platform: time_pattern
      minutes: "/30"
      id: reconcile
  variables:
    issue_id: infra_nebula_sync_degraded
    dns_state: "{{ states('sensor.infra_nebula_sync_dns_consistency') }}"
    previous_band: "{{ states('input_text.infra_nebula_sync_health_band') | lower }}"
    degraded: "{{ is_state('binary_sensor.infra_nebula_sync_degraded', 'on') }}"
    # Reconcile triggers carry no `for:` hold, so without this gate a single
    # failed poll that overlaps a startup or 30-minute tick would dispatch
    # immediately. Require the same 10 minutes of continuous `on` that the
    # `degraded` state trigger enforces.
    sustained: >-
      {{ trigger.id != 'reconcile' or
      (is_state('binary_sensor.infra_nebula_sync_degraded', 'on') and
      (now() - states.binary_sensor.infra_nebula_sync_degraded.last_changed) >= timedelta(minutes=10)) }}
    nebula_status: "{{ states('binary_sensor.nebula_sync_status') }}"
    nebula_status_alt: "{{ states('binary_sensor.nebula_sync_status_2') }}"
    nebula_state: "{{ states('sensor.nebula_sync_state') }}"
    nebula_state_alt: "{{ states('sensor.nebula_sync_state_2') }}"
    pihole_secondary_status: "{{ states('binary_sensor.pihole_secondary_status') }}"
    pihole_secondary_status_alt: "{{ states('binary_sensor.pihole_secondary_status_2') }}"
  action:
    - choose:
        # Sustained degradation that has not been announced yet.
        - conditions: "{{ degraded and sustained and previous_band != 'warning' }}"
          sequence:
            # Warning level is Joanna-only: make sure no Repair is left open.
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: "{{ issue_id }}"
            - service: script.joanna_dispatch
              data:
                trigger_context: "HA automation infra_nebula_sync_health_dispatch (Infrastructure - Nebula Sync Health Dispatch)"
                source: "home_assistant_automation.infra_nebula_sync_health_dispatch.warning"
                summary: "Nebula Sync DNS consistency or container health is degraded"
                entity_ids:
                  - sensor.infra_nebula_sync_dns_consistency
                  - binary_sensor.infra_nebula_sync_degraded
                  - binary_sensor.nebula_sync_status
                  - binary_sensor.nebula_sync_status_2
                  - sensor.nebula_sync_state
                  - sensor.nebula_sync_state_2
                  - binary_sensor.pihole_secondary_status
                  - binary_sensor.pihole_secondary_status_2
                diagnostics: >-
                  issue_id={{ issue_id }},
                  dns_consistency={{ dns_state }},
                  host={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'host') }},
                  expected_ip={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'expected_ip') }},
                  primary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_short') }},
                  secondary_short={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_short') }},
                  primary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_fqdn') }},
                  secondary_fqdn={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_fqdn') }},
                  primary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'primary_reverse') }},
                  secondary_reverse={{ state_attr('sensor.infra_nebula_sync_dns_consistency', 'secondary_reverse') }},
                  nebula_status={{ nebula_status }},
                  nebula_status_2={{ nebula_status_alt }},
                  nebula_state={{ nebula_state }},
                  nebula_state_2={{ nebula_state_alt }},
                  pihole_secondary_status={{ pihole_secondary_status }},
                  pihole_secondary_status_2={{ pihole_secondary_status_alt }},
                  primary_dns=192.168.10.10,
                  backup_dns=192.168.10.14
                request: >-
                  Investigate Nebula Sync on docker_14 and the backup Pi-hole sync path.
                  Verify both Pi-holes answer the GTG-PF45FK6F short name, FQDN, and reverse lookup consistently.
                  Check nebula_sync container status, Docker health, recent sync logs, and primary/replica Pi-hole API reachability.
                  If confidence is high, perform safe remediation such as a one-time Nebula Sync run or restarting only the nebula_sync container.
                  Do not restart Pi-hole or change DHCP/custom DNS records unless diagnostics prove data drift and the action is safe.
                  Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
                domain_hint: ops
                lane_hint: joanna.ops
            - service: script.send_to_logbook
              data:
                topic: "DNS"
                message: >-
                  Nebula Sync DNS consistency is degraded ({{ dns_state }}); Joanna investigation requested without opening a Repair.
            - service: input_text.set_value
              target:
                entity_id: input_text.infra_nebula_sync_health_band
              data:
                value: warning
        # Healthy again after an announced warning (or an unavailable band).
        - conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
          sequence:
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: "{{ issue_id }}"
            - service: script.send_to_logbook
              data:
                topic: "DNS"
                message: "Nebula Sync DNS consistency recovered; Joanna-only warning state cleared."
            - service: input_text.set_value
              target:
                entity_id: input_text.infra_nebula_sync_health_band
              data:
                value: normal
        # Healthy with an uninitialised or unexpected band value: normalise it.
        - conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
          sequence:
            - service: input_text.set_value
              target:
                entity_id: input_text.infra_nebula_sync_health_band
              data:
                value: normal
|
|
|
|
# Joanna-only escalation for promoted IoT DNS record drift.
# - Degraded for 10 minutes: clear any Repair with this issue_id, dispatch
#   Joanna, log to the DNS topic and set the band helper to `warning`.
# - Recovered: clear the Repair, log the recovery and set the band to `normal`.
# - Band helper holding any other value while healthy: reset it to `normal`.
# The band helper prevents repeat dispatches while the warning is active.
- alias: "Infrastructure - Pi-hole IoT DNS Drift Dispatch"
  id: infra_pihole_iot_dns_drift_dispatch
  description: "Dispatch Joanna when promoted IoT Pi-hole DNS records drift across primary and backup resolvers."
  mode: queued
  trigger:
    - platform: state
      entity_id: binary_sensor.infra_pihole_iot_dns_degraded
      to: "on"
      for: "00:10:00"
      id: degraded
    - platform: state
      entity_id: binary_sensor.infra_pihole_iot_dns_degraded
      to: "off"
      for: "00:02:00"
      id: recovered
    # Startup and hourly (minute 7) passes re-run the band comparison below.
    - platform: event
      event_type: homeassistant_started
      id: reconcile
    - platform: time_pattern
      minutes: "7"
      id: reconcile
  variables:
    issue_id: infra_pihole_iot_dns_degraded
    dns_state: "{{ states('sensor.infra_pihole_iot_dns_consistency') }}"
    previous_band: "{{ states('input_text.infra_pihole_iot_dns_health_band') | lower }}"
    degraded: "{{ is_state('binary_sensor.infra_pihole_iot_dns_degraded', 'on') }}"
    # Reconcile triggers carry no `for:` hold, so without this gate a single
    # failed poll that overlaps a startup or hourly tick would dispatch
    # immediately. Require the same 10 minutes of continuous `on` that the
    # `degraded` state trigger enforces.
    sustained: >-
      {{ trigger.id != 'reconcile' or
      (is_state('binary_sensor.infra_pihole_iot_dns_degraded', 'on') and
      (now() - states.binary_sensor.infra_pihole_iot_dns_degraded.last_changed) >= timedelta(minutes=10)) }}
  action:
    - choose:
        # Sustained drift that has not been announced yet.
        - conditions: "{{ degraded and sustained and previous_band != 'warning' }}"
          sequence:
            # Warning level is Joanna-only: make sure no Repair is left open.
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: "{{ issue_id }}"
            - service: script.joanna_dispatch
              data:
                trigger_context: "HA automation infra_pihole_iot_dns_drift_dispatch (Infrastructure - Pi-hole IoT DNS Drift Dispatch)"
                source: "home_assistant_automation.infra_pihole_iot_dns_drift_dispatch.warning"
                summary: "Promoted IoT Pi-hole DNS records drifted across primary and backup resolvers"
                entity_ids:
                  - sensor.infra_pihole_iot_dns_consistency
                  - binary_sensor.infra_pihole_iot_dns_degraded
                diagnostics: >-
                  issue_id={{ issue_id }},
                  dns_consistency={{ dns_state }},
                  checked_records={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'checked_records') }},
                  mismatch_count={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatch_count') }},
                  mismatches={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'mismatches') }},
                  primary_dns={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'primary_dns') }},
                  backup_dns={{ state_attr('sensor.infra_pihole_iot_dns_consistency', 'backup_dns') }}
                request: >-
                  Investigate primary/backup Pi-hole DNS drift for promoted IoT reservations.
                  Verify both Pi-holes answer rachio, econet, dreame-vacuum, carlo-bed, lg-smart-fridge, tesla-blackbox-gw, and bgw210 FQDNs with the expected reserved IPs.
                  Check primary and backup pihole.toml local DNS host records, Nebula Sync behavior, and generated custom.list files.
                  Do not change DHCP/custom DNS records unless diagnostics prove drift and the action is safe.
                  Reply with resolved=true/false, root_cause, action_taken, verification, and next_action_required=true/false.
                domain_hint: ops
                lane_hint: joanna.ops
            - service: script.send_to_logbook
              data:
                topic: "DNS"
                message: >-
                  Promoted IoT Pi-hole DNS consistency is degraded ({{ dns_state }}); Joanna investigation requested without opening a Repair.
            - service: input_text.set_value
              target:
                entity_id: input_text.infra_pihole_iot_dns_health_band
              data:
                value: warning
        # Healthy again after an announced warning (or an unavailable band).
        - conditions: "{{ not degraded and previous_band in ['warning', 'unavailable'] }}"
          sequence:
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: "{{ issue_id }}"
            - service: script.send_to_logbook
              data:
                topic: "DNS"
                message: "Promoted IoT Pi-hole DNS consistency recovered; Joanna-only warning state cleared."
            - service: input_text.set_value
              target:
                entity_id: input_text.infra_pihole_iot_dns_health_band
              data:
                value: normal
        # Healthy with an uninitialised or unexpected band value: normalise it.
        - conditions: "{{ not degraded and previous_band not in ['normal', 'warning', 'unavailable'] }}"
          sequence:
            - service: input_text.set_value
              target:
                entity_id: input_text.infra_pihole_iot_dns_health_band
              data:
                value: normal
|
|
|
|
- alias: "Docker Host Disk Pressure Monitor"
|
|
id: docker_host_disk_pressure_monitor
|
|
description: "Track Docker host root disk pressure from normalized Glances sensors and dispatch Joanna on band changes."
|
|
mode: queued
|
|
trigger:
|
|
- platform: time_pattern
|
|
minutes: "/15"
|
|
- platform: state
|
|
entity_id:
|
|
- sensor.docker_17_disk_used_percentage
|
|
- sensor.docker_14_disk_used_percentage
|
|
- sensor.docker_69_disk_used_percentage
|
|
variables:
|
|
host_configs:
|
|
- host_id: docker_17
|
|
host_name: docker_17
|
|
disk_entity: sensor.docker_17_disk_used_percentage
|
|
raw_entity: sensor.192_168_10_17_disk_usage
|
|
free_entity: sensor.192_168_10_17_disk_free
|
|
used_entity: sensor.192_168_10_17_disk_used
|
|
band_entity: input_text.docker_17_disk_pressure_band
|
|
issue_id: docker_host_docker_17_disk_pressure
|
|
- host_id: docker_14
|
|
host_name: docker_14
|
|
disk_entity: sensor.docker_14_disk_used_percentage
|
|
raw_entity: sensor.docker14_disk_usage
|
|
free_entity: sensor.docker14_disk_free
|
|
used_entity: sensor.docker14_disk_used
|
|
band_entity: input_text.docker_14_disk_pressure_band
|
|
issue_id: docker_host_docker_14_disk_pressure
|
|
- host_id: docker_69
|
|
host_name: docker_69
|
|
disk_entity: sensor.docker_69_disk_used_percentage
|
|
raw_entity: sensor.docker69_disk_usage
|
|
free_entity: sensor.docker69_disk_free
|
|
used_entity: sensor.docker69_disk_used
|
|
band_entity: input_text.docker_69_disk_pressure_band
|
|
issue_id: docker_host_docker_69_disk_pressure
|
|
action:
|
|
- repeat:
|
|
for_each: "{{ host_configs }}"
|
|
sequence:
|
|
- variables:
|
|
host_id: "{{ repeat.item.host_id }}"
|
|
host_name: "{{ repeat.item.host_name }}"
|
|
disk_entity: "{{ repeat.item.disk_entity }}"
|
|
raw_entity: "{{ repeat.item.raw_entity }}"
|
|
free_entity: "{{ repeat.item.free_entity }}"
|
|
used_entity: "{{ repeat.item.used_entity }}"
|
|
band_entity: "{{ repeat.item.band_entity }}"
|
|
issue_id: "{{ repeat.item.issue_id }}"
|
|
disk_state: "{{ states(disk_entity) }}"
|
|
disk_pct: "{{ disk_state | float(0) }}"
|
|
previous_band: "{{ states(band_entity) | lower }}"
|
|
current_band: >-
|
|
{{ 'unavailable' if disk_state in ['unknown', 'unavailable', 'none', '']
|
|
else 'critical' if disk_pct >= 90
|
|
else 'warning' if disk_pct >= 80
|
|
else 'normal' }}
|
|
- choose:
|
|
- conditions: "{{ current_band == 'critical' and previous_band != 'critical' }}"
|
|
sequence:
|
|
- service: repairs.create
|
|
data:
|
|
issue_id: "{{ issue_id }}"
|
|
severity: error
|
|
persistent: true
|
|
title: "{{ host_name }} disk pressure critical ({{ disk_pct | round(1) }}%)"
|
|
description: >-
|
|
{{ host_name }} root disk usage is critically high.
|
|
Free space or expand the host filesystem before Docker workloads fail.
|
|
- service: script.joanna_dispatch
|
|
data:
|
|
trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Critical)"
|
|
source: "home_assistant_automation.docker_host_disk_pressure_monitor.critical"
|
|
summary: "{{ host_name }} root disk pressure is critical at {{ disk_pct | round(1) }}%"
|
|
entity_ids:
|
|
- "{{ disk_entity }}"
|
|
- "{{ raw_entity }}"
|
|
- "{{ free_entity }}"
|
|
- "{{ used_entity }}"
|
|
diagnostics: >-
|
|
issue_id={{ issue_id }},
|
|
host_id={{ host_id }},
|
|
disk_entity={{ disk_entity }},
|
|
raw_entity={{ raw_entity }},
|
|
disk_pct={{ disk_pct | round(1) }},
|
|
disk_free={{ states(free_entity) }},
|
|
disk_used={{ states(used_entity) }},
|
|
threshold=90
|
|
request: >-
|
|
Investigate critical disk pressure on {{ host_name }} and perform safe remediation when confidence is high.
|
|
Check Docker build cache, image/container volumes, logs, backups, and large files first.
|
|
Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
|
|
Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
|
|
Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
|
|
- service: script.send_to_logbook
|
|
data:
|
|
topic: "DOCKER"
|
|
message: >-
|
|
{{ host_name }} disk usage is critical at {{ disk_pct | round(1) }}%.
|
|
Repair {{ issue_id }} opened and Joanna investigation requested.
|
|
- service: input_text.set_value
|
|
target:
|
|
entity_id: "{{ band_entity }}"
|
|
data:
|
|
value: "critical"
|
|
- conditions: "{{ current_band == 'warning' and previous_band not in ['warning', 'critical'] }}"
|
|
sequence:
|
|
- service: repairs.remove
|
|
continue_on_error: true
|
|
data:
|
|
issue_id: "{{ issue_id }}"
|
|
- service: script.joanna_dispatch
|
|
data:
|
|
trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Warning)"
|
|
source: "home_assistant_automation.docker_host_disk_pressure_monitor.warning"
|
|
summary: "{{ host_name }} root disk pressure warning at {{ disk_pct | round(1) }}%"
|
|
entity_ids:
|
|
- "{{ disk_entity }}"
|
|
- "{{ raw_entity }}"
|
|
- "{{ free_entity }}"
|
|
- "{{ used_entity }}"
|
|
diagnostics: >-
|
|
issue_id={{ issue_id }},
|
|
host_id={{ host_id }},
|
|
disk_entity={{ disk_entity }},
|
|
raw_entity={{ raw_entity }},
|
|
disk_pct={{ disk_pct | round(1) }},
|
|
disk_free={{ states(free_entity) }},
|
|
disk_used={{ states(used_entity) }},
|
|
threshold=80
|
|
request: >-
|
|
Investigate elevated disk usage on {{ host_name }} and perform safe low-risk cleanup before it becomes critical when confidence is high.
|
|
Check Docker build cache, image/container volumes, logs, backups, and large files first.
|
|
Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
|
|
Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
|
|
Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
|
|
- service: script.send_to_logbook
|
|
data:
|
|
topic: "DOCKER"
|
|
message: >-
|
|
{{ host_name }} disk usage warning at {{ disk_pct | round(1) }}%.
|
|
Joanna investigation requested without opening a warning Repair.
|
|
- service: input_text.set_value
|
|
target:
|
|
entity_id: "{{ band_entity }}"
|
|
data:
|
|
value: "warning"
|
|
- conditions: "{{ current_band == 'warning' and previous_band == 'critical' }}"
|
|
sequence:
|
|
- service: repairs.remove
|
|
continue_on_error: true
|
|
data:
|
|
issue_id: "{{ issue_id }}"
|
|
- service: script.send_to_logbook
|
|
data:
|
|
topic: "DOCKER"
|
|
message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%. Critical Repair cleared; Joanna continues handling warning-level cleanup."
|
|
- service: input_text.set_value
|
|
target:
|
|
entity_id: "{{ band_entity }}"
|
|
data:
|
|
value: "warning"
|
|
- conditions: "{{ current_band == 'normal' and previous_band in ['warning', 'critical'] }}"
|
|
sequence:
|
|
- service: repairs.remove
|
|
continue_on_error: true
|
|
data:
|
|
issue_id: "{{ issue_id }}"
|
|
- service: script.send_to_logbook
|
|
data:
|
|
topic: "DOCKER"
|
|
message: "{{ host_name }} disk usage recovered to {{ disk_pct | round(1) }}%. Repair {{ issue_id }} cleared."
|
|
- service: input_text.set_value
|
|
target:
|
|
entity_id: "{{ band_entity }}"
|
|
data:
|
|
value: "normal"
|
|
- conditions: "{{ current_band == 'normal' and previous_band not in ['normal', 'warning', 'critical'] }}"
|
|
sequence:
|
|
- service: repairs.remove
|
|
continue_on_error: true
|
|
data:
|
|
issue_id: "{{ issue_id }}"
|
|
- service: input_text.set_value
|
|
target:
|
|
entity_id: "{{ band_entity }}"
|
|
data:
|
|
value: "normal"
|
|
|
|
# Infrastructure - Backup Nightly Verification
# Calls rest_command.bearclaw_duplicati_verify, logs the returned summary, and
# then branches three ways:
#   * healthy          -> remove both Duplicati Repairs and turn the recovery
#                         poll flag off
#   * transport issue  -> logbook entry only (status 'api_error' or 'unknown')
#   * anything else    -> turn the recovery poll flag on; open a Repair and
#                         dispatch Joanna unless this run is a recovery poll
- alias: "Infrastructure - Backup Nightly Verification"
  id: infra_backup_nightly_verification
  description: "Use codex_appliance to verify the latest Duplicati run and dispatch Joanna only on failure."
  mode: single
  trigger:
    - platform: time
      at: "08:00:00"
      id: nightly
    # Two time_pattern triggers share one id so the follow-up poll fires at
    # minute 15 and minute 45.
    - platform: time_pattern
      minutes: "15"
      id: recovery_poll
    - platform: time_pattern
      minutes: "45"
      id: recovery_poll
  condition:
    # Recovery polls only proceed while the repair-active flag is on; the
    # nightly trigger and runs without a trigger object always proceed.
    - condition: template
      value_template: >-
        {{ trigger is not defined or trigger.id != 'recovery_poll'
        or is_state('input_boolean.infra_duplicati_backup_repair_active', 'on') }}
  action:
    - variables:
        trigger_source: "{{ trigger.id if trigger is defined and trigger.id is defined else 'manual' }}"
        verifier_reason: "{{ 'ha_failure_followup' if trigger_source == 'recovery_poll' else 'ha_nightly' }}"
        trigger_context: "HA automation infra_backup_nightly_verification (Infrastructure - Backup Nightly Verification)"
        duplicati_state: "{{ states('switch.duplicati_container') }}"
    # If the verifier call itself errors, keep running instead of aborting.
    # Every template below already tolerates a missing duplicati_verify
    # response and falls back to status 'unknown', which the choose block
    # treats as a transport issue and only logs.
    - action: rest_command.bearclaw_duplicati_verify
      continue_on_error: true
      data:
        reason: "{{ verifier_reason }}"
      response_variable: duplicati_verify
    - service: script.send_to_logbook
      data:
        topic: "BACKUP"
        message: >-
          {% set payload = duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} %}
          {% set detail = payload['detail'] if payload is mapping and payload['detail'] is mapping else {} %}
          {{ detail.get('summary', 'Nightly Duplicati verification completed.') }}
    # Normalize the verifier response; every lookup has a default so a
    # missing or malformed response yields status 'unknown'.
    - variables:
        verify_payload: "{{ duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} }}"
        verify_detail: "{{ verify_payload['detail'] if verify_payload is mapping and verify_payload['detail'] is mapping else {} }}"
        verify_http_status: "{{ duplicati_verify['status'] | int(0) if duplicati_verify is mapping else 0 }}"
        verify_healthy: "{{ verify_payload.get('ok', false) and verify_detail.get('healthy', false) }}"
        verify_status: "{{ verify_detail.get('status', 'unknown') }}"
        verify_summary: "{{ verify_detail.get('summary', 'Duplicati verification did not return a summary.') }}"
        verify_issue: "{{ verify_detail.get('issue', verify_payload.get('error', 'duplicati_verify_failed')) }}"
        verify_backup_name: "{{ verify_detail.get('backupName', 'Docker_Configs') }}"
        verify_latest_result: "{{ verify_detail.get('latestResult', {}) if verify_detail is mapping else {} }}"
        verify_last_success: "{{ verify_detail.get('lastSuccessfulRun', {}) if verify_detail is mapping else {} }}"
        verify_transport_issue: "{{ verify_status in ['api_error', 'unknown'] }}"
    - choose:
        # Healthy: clear both Repair ids and stop the recovery poll.
        - conditions: "{{ verify_healthy }}"
          sequence:
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: infra_duplicati_backup_failure
            - service: repairs.remove
              continue_on_error: true
              data:
                issue_id: user_infra_duplicati_backup_failure
            - service: input_boolean.turn_off
              target:
                entity_id: input_boolean.infra_duplicati_backup_repair_active
        # Verifier could not prove anything either way: log only.
        - conditions: "{{ verify_transport_issue }}"
          sequence:
            - service: script.send_to_logbook
              data:
                topic: "BACKUP"
                message: >-
                  Duplicati verifier could not prove backup health because the verification service returned
                  status {{ verify_status }} with issue {{ verify_issue }}. No repair card was opened because
                  this is verifier transport state, not a confirmed backup failure.
      # Failure reported by the verifier: enable the recovery poll, then either
      # open the Repair and dispatch Joanna (nightly/manual run) or just log
      # (recovery poll).
      default:
        - service: input_boolean.turn_on
          target:
            entity_id: input_boolean.infra_duplicati_backup_repair_active
        - choose:
            - conditions: "{{ trigger_source != 'recovery_poll' }}"
              sequence:
                - service: repairs.create
                  data:
                    issue_id: infra_duplicati_backup_failure
                    title: "Duplicati nightly backup verification failed"
                    description: >-
                      {{ verify_summary }}
                      Backup={{ verify_backup_name }};
                      status={{ verify_status }};
                      last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
                      last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
                    severity: error
                    persistent: true
                - service: script.joanna_dispatch
                  data:
                    trigger_context: "{{ trigger_context }}"
                    source: "home_assistant_automation.infra_backup_nightly_verification"
                    summary: "Nightly Duplicati backup verification failed"
                    entity_ids:
                      - "switch.duplicati_container"
                    diagnostics: >-
                      scheduled_time=08:00:00,
                      duplicati_container={{ duplicati_state }},
                      verifier_http_status={{ verify_http_status }},
                      verifier_status={{ verify_status }},
                      verifier_issue={{ verify_issue }},
                      backup_name={{ verify_backup_name }},
                      latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
                      last_success={{ verify_last_success.get('endedAt', 'n/a') }}
                    request: >-
                      Investigate the Duplicati backup job {{ verify_backup_name }}.
                      The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
                      Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
                      Home Assistant will re-check this verifier every 30 minutes after dispatch and clear the Repair automatically once the backup is healthy.
                      Reply with explicit status fields:
                      resolved=true/false,
                      backup_status,
                      last_success_time,
                      root_cause,
                      action_taken,
                      verification,
                      next_action_required=true/false.
          default:
            - service: script.send_to_logbook
              data:
                topic: "BACKUP"
                message: >-
                  Duplicati recovery follow-up still reports {{ verify_status }} for {{ verify_backup_name }}:
                  {{ verify_issue }}. Existing Repair remains open; Joanna was not dispatched again.
|
|
|
|
# Infrastructure - Monthly HA Log Hygiene Review
# Fires daily at 03:20 but only proceeds on day 1 of the month. Sends one
# Joanna dispatch asking for a log review (GitHub issue + Telegram summary,
# no changes without approval) and records the dispatch in the logbook.
- id: infra_monthly_log_hygiene_review
  alias: 'Infrastructure - Monthly HA Log Hygiene Review'
  description: >-
    Ask Joanna monthly to review Home Assistant logs, create a GitHub issue
    with noisy entries, and send Telegram recommendations only.
  mode: single
  variables:
    trigger_context: >-
      HA automation infra_monthly_log_hygiene_review
      (Infrastructure - Monthly HA Log Hygiene Review)
  trigger:
    - platform: time
      at: '03:20:00'
  condition:
    # The time trigger runs every day; this gate limits it to the 1st.
    - condition: template
      value_template: '{{ now().day == 1 }}'
  action:
    - service: script.joanna_dispatch
      data:
        source: 'home_assistant_automation.infra_monthly_log_hygiene_review'
        trigger_context: '{{ trigger_context }}'
        summary: 'Monthly Home Assistant log hygiene review with GitHub issue and Telegram follow-up'
        diagnostics: >-
          schedule=day_1@03:20:00, review_scope=available_home_assistant_logs,
          desired_outputs=telegram_follow_up+github_issue,
          github_repo=CCOSTAN/Home-AssistantConfig,
          approval_required_before_changes=true
        request: >-
          Review the available Home Assistant log files from the last month
          and identify noisy, low-value entries that could be safely
          suppressed, filtered, slowed, deduplicated, or retired.
          Focus on practical Home Assistant-side changes such as recorder
          exclusions, logger filtering, scan-interval reductions, entity
          retirement, or automation de-noising.
          Create or refresh a GitHub issue in CCOSTAN/Home-AssistantConfig
          that captures the noisy entries, estimated frequency, why each
          candidate is low-value, and the exact repo files or integrations
          likely to change.
          Then send Carlo a concise Telegram summary with the top
          recommendations and the GitHub issue number or link.
          Do not make any changes from this review.
          Wait for explicit follow-up approval first.
    # Record that the review request went out.
    - service: script.send_to_logbook
      data:
        topic: 'HOME ASSISTANT'
        message: 'Joanna monthly Home Assistant log hygiene review dispatched; Telegram summary and GitHub issue requested.'
|