You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
695 lines
32 KiB
695 lines
32 KiB
######################################################################
|
|
# @CCOSTAN - Follow Me on X
|
|
# For more info visit https://www.vcloudinfo.com/click-here
|
|
# Original Repo : https://github.com/CCOSTAN/Home-AssistantConfig
|
|
# -------------------------------------------------------------------
|
|
# Infrastructure - Observability, disk pressure, and Joanna review workflows
|
|
# WAN/DNS/website/domain/cert/Docker host state normalized for dashboards, plus scheduled infrastructure reviews.
|
|
# -------------------------------------------------------------------
|
|
# Related Issue: 1584
|
|
# Notes: Home dashboard consumes `infra_*` entities for exceptions-only alerts.
|
|
# Notes: Domain warning threshold is <30 days; critical threshold is <14 days.
|
|
# Notes: Nightly Duplicati verification is performed by codex_appliance after the Duplicati retry window because HA backup entities are not available.
|
|
# Notes: Monthly HA log hygiene review requests Telegram + GitHub issue follow-up only; Joanna must wait for approval before any changes.
|
|
# Notes: Numeric WAN telemetry exposes state_class so recorder can keep long-term statistics.
|
|
# Notes: Docker host root disk usage uses Glances-backed normalized sensors; raw Glances sensors are recorder/logbook-filtered.
|
|
# Notes: Disk-pressure dispatch allows bounded safe cleanup of disposable caches and old generated backup artifacts, but not live data or restarts.
|
|
######################################################################
|
|
|
|
# Text helpers holding the last disk-pressure band recorded per Docker host.
# The "Docker Host Disk Pressure Monitor" automation reads each helper as the
# previous band and writes "critical", "warning", or "normal" back to it.
input_text:
  docker_17_disk_pressure_band:
    name: "docker_17 disk pressure band"
    max: 20

  docker_14_disk_pressure_band:
    name: "docker_14 disk pressure band"
    max: 20

  docker_69_disk_pressure_band:
    name: "docker_69 disk pressure band"
    max: 20
|
|
|
|
# Shell-backed probes for WAN quality and the public IP address.
command_line:
# Packet loss: sends 10 pings to 1.1.1.1 every 300 s and keeps only the digits
# and dots from the third comma-separated field of the "packet loss" summary
# line. Prints "unknown" when no such line is produced.
- sensor:
    name: Infra WAN Packet Loss
    unique_id: infra_wan_packet_loss
    command: >-
      ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
      awk -F',' '/packet loss/ {gsub(/[^0-9.]/, "", $3); print $3; found=1}
      END {if (!found) print "unknown"}'
    scan_interval: 300
    unit_of_measurement: "%"
    state_class: measurement
    # Strip anything non-numeric; an empty result falls back to 'unknown'.
    value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"

# Latency: same ping run, but reads the fifth slash-separated field of the
# "rtt"/"round-trip" summary line (presumably the average RTT - confirm
# against the ping implementation on the host).
- sensor:
    name: Infra WAN Latency Ms
    unique_id: infra_wan_latency_ms
    command: >-
      ping -q -c 10 -W 1 1.1.1.1 2>/dev/null |
      awk -F'/' '/^rtt|^round-trip/ {gsub(/[^0-9.]/, "", $5); print $5; found=1}
      END {if (!found) print "unknown"}'
    scan_interval: 300
    unit_of_measurement: "ms"
    state_class: measurement
    # Strip anything non-numeric; an empty result falls back to 'unknown'.
    value_template: "{{ (value | regex_replace('[^0-9.]', '')) or 'unknown' }}"

# Fallback public IP lookup via api.ipify.org every 900 s; prints "unknown"
# when curl fails.
- sensor:
    name: Infra External IP Fallback
    unique_id: infra_external_ip_fallback
    command: "curl -fsS https://api.ipify.org || echo unknown"
    scan_interval: 900
|
|
|
|
# Normalized infrastructure sensors consumed by dashboards and by the
# binary sensors / automations defined elsewhere in this package.
template:
- sensor:
  # Public IP: prefer sensor.external_ip, fall back to the command-line
  # ipify sensor when the primary has no usable state.
  - name: "Infra External IP"
    unique_id: infra_external_ip
    state: >-
      {% set primary = states('sensor.external_ip') | trim %}
      {% set fallback = states('sensor.infra_external_ip_fallback') | trim %}
      {% if primary not in ['unknown', 'unavailable', 'none', ''] %}
        {{ primary }}
      {% else %}
        {{ fallback }}
      {% endif %}

  # Per-host disk usage rounded to one decimal. Each sensor is marked
  # unavailable whenever its raw source sensor has no usable state, so the
  # float(0) default never surfaces as a fake 0 % reading.
  - name: "docker_17 Disk Used Percentage"
    unique_id: docker_17_disk_used_percentage
    unit_of_measurement: "%"
    state_class: measurement
    icon: mdi:harddisk
    availability: "{{ states('sensor.192_168_10_17_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
    state: "{{ states('sensor.192_168_10_17_disk_usage') | float(0) | round(1) }}"

  - name: "docker_14 Disk Used Percentage"
    unique_id: docker_14_disk_used_percentage
    unit_of_measurement: "%"
    state_class: measurement
    icon: mdi:harddisk
    availability: "{{ states('sensor.docker14_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
    state: "{{ states('sensor.docker14_disk_usage') | float(0) | round(1) }}"

  - name: "docker_69 Disk Used Percentage"
    unique_id: docker_69_disk_used_percentage
    unit_of_measurement: "%"
    state_class: measurement
    icon: mdi:harddisk
    availability: "{{ states('sensor.docker69_disk_usage') not in ['unknown', 'unavailable', 'none', ''] }}"
    state: "{{ states('sensor.docker69_disk_usage') | float(0) | round(1) }}"

  # Smallest "days until expiration" across the listed domain sensors.
  # Entities that do not exist, have no usable state, or hold a non-numeric
  # state are skipped; the result is none when no numeric value was found.
  - name: "Infra Domain Expiry Min Days"
    unique_id: infra_domain_expiry_min_days
    unit_of_measurement: "d"
    state: >-
      {% set ids = [
        'sensor.vcloudinfo_com_days_until_expiration',
        'sensor.ipmer_com_days_until_expiration',
        'sensor.fordst_com_days_until_expiration',
        'sensor.kingcrafthomes_com_days_until_expiration'
      ] %}
      {% set ns = namespace(min=9999, any=false) %}
      {% for id in ids %}
        {% if expand(id) | count > 0 %}
          {% set raw = states(id) %}
          {% if raw not in ['unknown', 'unavailable', 'none', ''] %}
            {% set val = raw | float(9999) %}
            {# 9999 is the parse-failure sentinel: only count values that
               really parsed, matching the cert expiry sensor in this list. #}
            {% if val != 9999 %}
              {% set ns.any = true %}
              {% if val < ns.min %}
                {% set ns.min = val %}
              {% endif %}
            {% endif %}
          {% endif %}
        {% endif %}
      {% endfor %}
      {% if ns.any %}
        {{ ns.min | round(0) }}
      {% else %}
        {{ none }}
      {% endif %}

  # Smallest numeric state among sensors whose entity_id matches one of the
  # monitored domains followed by cert/ssl/tls. Non-numeric states are
  # detected through the 9999 parse sentinel and ignored.
  - name: "Infra Cert Expiry Min Days"
    unique_id: infra_cert_expiry_min_days
    unit_of_measurement: "d"
    state: >-
      {% set ns = namespace(min=9999, any=false) %}
      {% for item in states.sensor %}
        {% if item.entity_id is search('(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)') %}
          {% set raw = item.state %}
          {% if raw not in ['unknown', 'unavailable', 'none', ''] %}
            {% set value = raw | float(9999) %}
            {% if value != 9999 %}
              {% set ns.any = true %}
              {% if value < ns.min %}
                {% set ns.min = value %}
              {% endif %}
            {% endif %}
          {% endif %}
        {% endif %}
      {% endfor %}
      {% if ns.any %}
        {{ ns.min | round(0) }}
      {% else %}
        {{ none }}
      {% endif %}

  # Number of sensor entities matching the cert/ssl/tls pattern above,
  # regardless of their state.
  - name: "Infra Cert Telemetry Count"
    unique_id: infra_cert_telemetry_count
    icon: mdi:counter
    state: >-
      {% set ns = namespace(count=0) %}
      {% for item in states.sensor %}
        {% if item.entity_id is search('(vcloudinfo|ipmer|fordst|kingcrafthomes).*(cert|ssl|tls)') %}
          {% set ns.count = ns.count + 1 %}
        {% endif %}
      {% endfor %}
      {{ ns.count }}

  # Number of listed website binary sensors that exist and are currently
  # off, unknown, or unavailable.
  - name: "Infra Website Down Count"
    unique_id: infra_website_down_count
    icon: mdi:counter
    state: >-
      {% set ids = [
        'binary_sensor.vcloudinfo_com',
        'binary_sensor.ipmer_com',
        'binary_sensor.fordst_com',
        'binary_sensor.www_kingcrafthomes_com'
      ] %}
      {% set ns = namespace(count=0) %}
      {% for id in ids %}
        {% if expand(id) | count > 0 %}
          {% set st = states(id) %}
          {% if st in ['off', 'unknown', 'unavailable'] %}
            {% set ns.count = ns.count + 1 %}
          {% endif %}
        {% endif %}
      {% endfor %}
      {{ ns.count }}
|
|
|
|
# Problem-class binary sensors derived from the normalized infra sensors.
- binary_sensor:
  # On when either WAN probe has no usable state, packet loss exceeds 5,
  # or latency exceeds 80.
  - name: "Infra WAN Quality Degraded"
    unique_id: infra_wan_quality_degraded
    device_class: problem
    state: >-
      {% set loss_raw = states('sensor.infra_wan_packet_loss') %}
      {% set lat_raw = states('sensor.infra_wan_latency_ms') %}
      {% set invalid = loss_raw in ['unknown', 'unavailable', 'none', ''] or
                       lat_raw in ['unknown', 'unavailable', 'none', ''] %}
      {% set loss = loss_raw | float(0) %}
      {% set lat = lat_raw | float(0) %}
      {{ invalid or loss > 5 or lat > 80 }}

  # On when the Pi-hole switch is anything other than 'on', or the status
  # binary sensor is off / unavailable / unknown.
  - name: "Infra DNS Pihole Degraded"
    unique_id: infra_dns_pihole_degraded
    device_class: problem
    state: >-
      {% set switch_state = states('switch.pi_hole') %}
      {% set service_state = states('binary_sensor.pihole_status') %}
      {{ switch_state != 'on' or service_state in ['off', 'unavailable', 'unknown'] }}

  # On when the upper-cased UPS status string contains "OB".
  - name: "Infra UPS On Battery"
    unique_id: infra_ups_on_battery
    device_class: problem
    state: >-
      {% set status = states('sensor.garage_ups_status') | upper %}
      {{ 'OB' in status }}

  # On when the website down count is greater than zero.
  - name: "Infra Website Degraded"
    unique_id: infra_website_degraded
    device_class: problem
    state: >-
      {{ states('sensor.infra_website_down_count') | int(0) > 0 }}

  # On when any matching *uptime_1d sensor with a usable state reads
  # below 99.
  - name: "Infra Website Uptime SLO Breach"
    unique_id: infra_website_uptime_slo_breach
    device_class: problem
    state: >-
      {% set ns = namespace(seen=false, breach=false) %}
      {% for item in states.sensor %}
        {% if item.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*uptime_1d$') %}
          {% if item.state not in ['unknown', 'unavailable', 'none', ''] %}
            {% set ns.seen = true %}
            {% if (item.state | float(100)) < 99 %}
              {% set ns.breach = true %}
            {% endif %}
          {% endif %}
        {% endif %}
      {% endfor %}
      {{ ns.seen and ns.breach }}

  # On when any matching *avg_response_time_1d sensor with a usable state
  # reads above 1.2.
  - name: "Infra Website Latency Degraded"
    unique_id: infra_website_latency_degraded
    device_class: problem
    state: >-
      {% set ns = namespace(seen=false, breach=false) %}
      {% for item in states.sensor %}
        {% if item.entity_id is search('sensor\\.(vcloudinfo_com|kingcrafthomes_com|www_kingcrafthomes_com).*avg_response_time_1d$') %}
          {% if item.state not in ['unknown', 'unavailable', 'none', ''] %}
            {% set ns.seen = true %}
            {% if (item.state | float(0)) > 1.2 %}
              {% set ns.breach = true %}
            {% endif %}
          {% endif %}
        {% endif %}
      {% endfor %}
      {{ ns.seen and ns.breach }}

  # Domain expiry bands: critical below 14 days, warning from 14 up to (but
  # not including) 30 days. Both stay off while the source has no usable state.
  - name: "Infra Domain Expiry Critical"
    unique_id: infra_domain_expiry_critical
    device_class: problem
    state: >-
      {% set d = states('sensor.infra_domain_expiry_min_days') %}
      {% if d in ['unknown', 'unavailable', 'none', ''] %}
        false
      {% else %}
        {{ d | float(9999) < 14 }}
      {% endif %}

  - name: "Infra Domain Expiry Warning"
    unique_id: infra_domain_expiry_warning
    device_class: problem
    state: >-
      {% set d = states('sensor.infra_domain_expiry_min_days') %}
      {% if d in ['unknown', 'unavailable', 'none', ''] %}
        false
      {% else %}
        {% set days = d | float(9999) %}
        {{ days < 30 and days >= 14 }}
      {% endif %}

  # Certificate expiry bands, using the same 14 / 30 day thresholds.
  - name: "Infra Cert Expiry Critical"
    unique_id: infra_cert_expiry_critical
    device_class: problem
    state: >-
      {% set d = states('sensor.infra_cert_expiry_min_days') %}
      {% if d in ['unknown', 'unavailable', 'none', ''] %}
        false
      {% else %}
        {{ d | float(9999) < 14 }}
      {% endif %}

  - name: "Infra Cert Expiry Warning"
    unique_id: infra_cert_expiry_warning
    device_class: problem
    state: >-
      {% set d = states('sensor.infra_cert_expiry_min_days') %}
      {% if d in ['unknown', 'unavailable', 'none', ''] %}
        false
      {% else %}
        {% set days = d | float(9999) %}
        {{ days < 30 and days >= 14 }}
      {% endif %}
|
|
|
|
automation:
# Writes a NETWORK logbook entry whenever the normalized external IP moves
# from one usable value to a different usable value.
- alias: "Infrastructure - External IP Change Logbook"
  id: infra_external_ip_change_logbook
  description: "Log external IP changes into the Activity feed."
  mode: queued
  trigger:
  - platform: state
    entity_id: sensor.infra_external_ip
  condition:
  # Checked first so the next condition can safely read from_state.state.
  - condition: template
    value_template: "{{ trigger.from_state is not none }}"
  # Both ends of the transition must be usable and actually differ.
  - condition: template
    value_template: >-
      {{ trigger.from_state.state not in ['unknown', 'unavailable', 'none', ''] and
         trigger.to_state.state not in ['unknown', 'unavailable', 'none', ''] and
         trigger.from_state.state != trigger.to_state.state }}
  action:
  - service: script.send_to_logbook
    data:
      topic: "NETWORK"
      message: >-
        External IP changed from {{ trigger.from_state.state }} to {{ trigger.to_state.state }}.
|
|
|
|
# Mirrors binary_sensor.infra_website_uptime_slo_breach into a Repairs issue:
# created when the sensor turns on, removed on any other state change.
- alias: "Infrastructure - Website Uptime SLO Repair"
  id: infra_website_uptime_slo_repair
  description: "Create/clear Repairs issue when website 1-day uptime breaches SLO."
  mode: queued
  trigger:
  - platform: state
    entity_id: binary_sensor.infra_website_uptime_slo_breach
  action:
  - choose:
    - conditions: "{{ trigger.to_state.state == 'on' }}"
      sequence:
      - service: repairs.create
        data:
          issue_id: infra_website_uptime_slo_breach
          title: "Website uptime SLO breached"
          description: >
            At least one monitored website has uptime_1d below 99%.
            Review Uptime Kuma entities on the Website Health dashboard.
          severity: warning
          persistent: true
    default:
    # Removal is best-effort: the issue may not exist.
    - service: repairs.remove
      continue_on_error: true
      data:
        issue_id: infra_website_uptime_slo_breach
|
|
|
|
# Mirrors binary_sensor.infra_website_latency_degraded into a Repairs issue:
# created when the sensor turns on, removed on any other state change.
- alias: "Infrastructure - Website Latency Repair"
  id: infra_website_latency_repair
  description: "Create/clear Repairs issue when website response times degrade."
  mode: queued
  trigger:
  - platform: state
    entity_id: binary_sensor.infra_website_latency_degraded
  action:
  - choose:
    - conditions: "{{ trigger.to_state.state == 'on' }}"
      sequence:
      - service: repairs.create
        data:
          issue_id: infra_website_latency_degraded
          title: "Website latency degraded"
          description: >
            At least one monitored website reports avg_response_time_1d above 1.2s.
            Review Uptime Kuma response-time entities on Website Health.
          severity: warning
          persistent: true
    default:
    # Removal is best-effort: the issue may not exist.
    - service: repairs.remove
      continue_on_error: true
      data:
        issue_id: infra_website_latency_degraded
|
|
|
|
# Classifies each Docker host's disk usage into a band (normal < 80 <=
# warning < 90 <= critical), compares it with the band stored in the host's
# input_text helper, and acts only on band transitions. A host whose disk
# sensor has no usable state resolves to 'unavailable', which matches no
# branch, so the stored band is left untouched.
- alias: "Docker Host Disk Pressure Monitor"
  id: docker_host_disk_pressure_monitor
  description: "Track Docker host root disk pressure from normalized Glances sensors and dispatch Joanna on band changes."
  mode: queued
  trigger:
  - platform: time_pattern
    minutes: "/15"
  - platform: state
    entity_id:
    - sensor.docker_17_disk_used_percentage
    - sensor.docker_14_disk_used_percentage
    - sensor.docker_69_disk_used_percentage
  variables:
    # One entry per monitored host; every run walks the whole list.
    host_configs:
    - host_id: docker_17
      host_name: docker_17
      disk_entity: sensor.docker_17_disk_used_percentage
      raw_entity: sensor.192_168_10_17_disk_usage
      free_entity: sensor.192_168_10_17_disk_free
      used_entity: sensor.192_168_10_17_disk_used
      band_entity: input_text.docker_17_disk_pressure_band
      issue_id: docker_host_docker_17_disk_pressure
    - host_id: docker_14
      host_name: docker_14
      disk_entity: sensor.docker_14_disk_used_percentage
      raw_entity: sensor.docker14_disk_usage
      free_entity: sensor.docker14_disk_free
      used_entity: sensor.docker14_disk_used
      band_entity: input_text.docker_14_disk_pressure_band
      issue_id: docker_host_docker_14_disk_pressure
    - host_id: docker_69
      host_name: docker_69
      disk_entity: sensor.docker_69_disk_used_percentage
      raw_entity: sensor.docker69_disk_usage
      free_entity: sensor.docker69_disk_free
      used_entity: sensor.docker69_disk_used
      band_entity: input_text.docker_69_disk_pressure_band
      issue_id: docker_host_docker_69_disk_pressure
  action:
  - repeat:
      for_each: "{{ host_configs }}"
      sequence:
      # Unpack the host entry, then derive the current and previous band.
      - variables:
          host_id: "{{ repeat.item.host_id }}"
          host_name: "{{ repeat.item.host_name }}"
          disk_entity: "{{ repeat.item.disk_entity }}"
          raw_entity: "{{ repeat.item.raw_entity }}"
          free_entity: "{{ repeat.item.free_entity }}"
          used_entity: "{{ repeat.item.used_entity }}"
          band_entity: "{{ repeat.item.band_entity }}"
          issue_id: "{{ repeat.item.issue_id }}"
          disk_state: "{{ states(disk_entity) }}"
          disk_pct: "{{ disk_state | float(0) }}"
          previous_band: "{{ states(band_entity) | lower }}"
          current_band: >-
            {{ 'unavailable' if disk_state in ['unknown', 'unavailable', 'none', '']
               else 'critical' if disk_pct >= 90
               else 'warning' if disk_pct >= 80
               else 'normal' }}
      - choose:
        # Entering critical from any other band: open an error-severity
        # repair, dispatch Joanna, log it, and store the new band.
        - conditions: "{{ current_band == 'critical' and previous_band != 'critical' }}"
          sequence:
          - service: repairs.create
            data:
              issue_id: "{{ issue_id }}"
              severity: error
              persistent: true
              title: "{{ host_name }} disk pressure critical ({{ disk_pct | round(1) }}%)"
              description: >-
                {{ host_name }} root disk usage is critically high.
                Free space or expand the host filesystem before Docker workloads fail.
          - service: script.joanna_dispatch
            data:
              trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Critical)"
              source: "home_assistant_automation.docker_host_disk_pressure_monitor.critical"
              summary: "{{ host_name }} root disk pressure is critical at {{ disk_pct | round(1) }}%"
              entity_ids:
              - "{{ disk_entity }}"
              - "{{ raw_entity }}"
              - "{{ free_entity }}"
              - "{{ used_entity }}"
              diagnostics: >-
                issue_id={{ issue_id }},
                host_id={{ host_id }},
                disk_entity={{ disk_entity }},
                raw_entity={{ raw_entity }},
                disk_pct={{ disk_pct | round(1) }},
                disk_free={{ states(free_entity) }},
                disk_used={{ states(used_entity) }},
                threshold=90
              request: >-
                Investigate critical disk pressure on {{ host_name }} and perform safe remediation when confidence is high.
                Check Docker build cache, image/container volumes, logs, backups, and large files first.
                Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
                Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
                Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
          - service: script.send_to_logbook
            data:
              topic: "DOCKER"
              message: >-
                {{ host_name }} disk usage is critical at {{ disk_pct | round(1) }}%.
                Repair {{ issue_id }} opened and Joanna investigation requested.
          - service: input_text.set_value
            target:
              entity_id: "{{ band_entity }}"
            data:
              value: "critical"

        # Entering warning from below (previous band neither warning nor
        # critical): open a warning repair and dispatch Joanna.
        - conditions: "{{ current_band == 'warning' and previous_band not in ['warning', 'critical'] }}"
          sequence:
          - service: repairs.create
            data:
              issue_id: "{{ issue_id }}"
              severity: warning
              persistent: true
              title: "{{ host_name }} disk pressure warning ({{ disk_pct | round(1) }}%)"
              description: >-
                {{ host_name }} root disk usage is elevated.
                Plan cleanup before capacity reaches critical levels.
          - service: script.joanna_dispatch
            data:
              trigger_context: "HA automation docker_host_disk_pressure_monitor (Docker Host Disk Pressure Monitor - Warning)"
              source: "home_assistant_automation.docker_host_disk_pressure_monitor.warning"
              summary: "{{ host_name }} root disk pressure warning at {{ disk_pct | round(1) }}%"
              entity_ids:
              - "{{ disk_entity }}"
              - "{{ raw_entity }}"
              - "{{ free_entity }}"
              - "{{ used_entity }}"
              diagnostics: >-
                issue_id={{ issue_id }},
                host_id={{ host_id }},
                disk_entity={{ disk_entity }},
                raw_entity={{ raw_entity }},
                disk_pct={{ disk_pct | round(1) }},
                disk_free={{ states(free_entity) }},
                disk_used={{ states(used_entity) }},
                threshold=80
              request: >-
                Investigate elevated disk usage on {{ host_name }} and perform safe low-risk cleanup before it becomes critical when confidence is high.
                Check Docker build cache, image/container volumes, logs, backups, and large files first.
                Allowed without confirmation: clear disposable caches, remove unused build cache, and rotate or delete old generated backup artifacts when newer retained copies exist.
                Do not delete live application data, remove the only copy of a backup, prune active or in-use Docker resources, stop critical services, or reboot the host without explicit approval.
                Reply with resolved=true/false, action_taken, verification, and next_action_required=true/false.
          - service: script.send_to_logbook
            data:
              topic: "DOCKER"
              message: >-
                {{ host_name }} disk usage warning at {{ disk_pct | round(1) }}%.
                Repair {{ issue_id }} opened and Joanna investigation requested.
          - service: input_text.set_value
            target:
              entity_id: "{{ band_entity }}"
            data:
              value: "warning"

        # Dropping from critical to warning: re-issue the repair at warning
        # severity and log it; no new Joanna dispatch.
        - conditions: "{{ current_band == 'warning' and previous_band == 'critical' }}"
          sequence:
          - service: repairs.create
            data:
              issue_id: "{{ issue_id }}"
              severity: warning
              persistent: true
              title: "{{ host_name }} disk pressure warning ({{ disk_pct | round(1) }}%)"
              description: >-
                {{ host_name }} root disk usage is elevated but no longer critical.
                Continue cleanup before capacity reaches critical levels again.
          - service: script.send_to_logbook
            data:
              topic: "DOCKER"
              message: "{{ host_name }} disk usage dropped from critical to warning at {{ disk_pct | round(1) }}%."
          - service: input_text.set_value
            target:
              entity_id: "{{ band_entity }}"
            data:
              value: "warning"

        # Recovering to normal from warning or critical: clear the repair
        # (best-effort) and log the recovery.
        - conditions: "{{ current_band == 'normal' and previous_band in ['warning', 'critical'] }}"
          sequence:
          - service: repairs.remove
            continue_on_error: true
            data:
              issue_id: "{{ issue_id }}"
          - service: script.send_to_logbook
            data:
              topic: "DOCKER"
              message: "{{ host_name }} disk usage recovered to {{ disk_pct | round(1) }}%. Repair {{ issue_id }} cleared."
          - service: input_text.set_value
            target:
              entity_id: "{{ band_entity }}"
            data:
              value: "normal"

        # Normal with no recognised stored band (e.g. helper never written):
        # silently clear any stale repair and seed the helper with "normal".
        - conditions: "{{ current_band == 'normal' and previous_band not in ['normal', 'warning', 'critical'] }}"
          sequence:
          - service: repairs.remove
            continue_on_error: true
            data:
              issue_id: "{{ issue_id }}"
          - service: input_text.set_value
            target:
              entity_id: "{{ band_entity }}"
            data:
              value: "normal"
|
|
|
|
# Runs daily at 06:45: calls the Duplicati verifier REST command, logs its
# summary, then either clears the backup-failure repair (healthy) or opens it
# and dispatches Joanna (anything else, including an unreachable verifier).
- alias: "Infrastructure - Backup Nightly Verification"
  id: infra_backup_nightly_verification
  description: "Use codex_appliance to verify the latest Duplicati run and dispatch Joanna only on failure."
  mode: single
  trigger:
  - platform: time
    at: "06:45:00"
  action:
  - variables:
      trigger_context: "HA automation infra_backup_nightly_verification (Infrastructure - Backup Nightly Verification)"
      duplicati_state: "{{ states('switch.duplicati_container') }}"
  # If the verifier call itself fails (timeout, connection error), keep
  # going: duplicati_verify is then not a mapping, every guard below falls
  # back to its default, verify_healthy is false, and the failure branch
  # still raises the repair and dispatches Joanna instead of the run
  # aborting silently.
  - action: rest_command.bearclaw_duplicati_verify
    continue_on_error: true
    data:
      reason: "ha_nightly"
    response_variable: duplicati_verify
  - service: script.send_to_logbook
    data:
      topic: "BACKUP"
      message: >-
        {% set payload = duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} %}
        {% set detail = payload['detail'] if payload is mapping and payload['detail'] is mapping else {} %}
        {{ detail.get('summary', 'Nightly Duplicati verification completed.') }}
  # Flatten the response into plain variables with safe defaults so the
  # templates below never dereference a missing key.
  - variables:
      verify_payload: "{{ duplicati_verify['content'] if duplicati_verify is mapping and duplicati_verify['content'] is mapping else {} }}"
      verify_detail: "{{ verify_payload['detail'] if verify_payload is mapping and verify_payload['detail'] is mapping else {} }}"
      verify_http_status: "{{ duplicati_verify['status'] | int(0) if duplicati_verify is mapping else 0 }}"
      verify_healthy: "{{ verify_payload.get('ok', false) and verify_detail.get('healthy', false) }}"
      verify_status: "{{ verify_detail.get('status', 'unknown') }}"
      verify_summary: "{{ verify_detail.get('summary', 'Duplicati verification did not return a summary.') }}"
      verify_issue: "{{ verify_detail.get('issue', verify_payload.get('error', 'duplicati_verify_failed')) }}"
      verify_backup_name: "{{ verify_detail.get('backupName', 'Docker_Configs') }}"
      verify_latest_result: "{{ verify_detail.get('latestResult', {}) if verify_detail is mapping else {} }}"
      verify_last_success: "{{ verify_detail.get('lastSuccessfulRun', {}) if verify_detail is mapping else {} }}"
  - choose:
    # Healthy: clear any earlier failure repair (best-effort).
    - conditions: "{{ verify_healthy }}"
      sequence:
      - service: repairs.remove
        continue_on_error: true
        data:
          issue_id: infra_duplicati_backup_failure
    # Not healthy: raise the repair and hand the investigation to Joanna.
    default:
    - service: repairs.create
      data:
        issue_id: infra_duplicati_backup_failure
        title: "Duplicati nightly backup verification failed"
        description: >-
          {{ verify_summary }}
          Backup={{ verify_backup_name }};
          status={{ verify_status }};
          last_result={{ verify_latest_result.get('endedAt', 'n/a') }};
          last_success={{ verify_last_success.get('endedAt', 'n/a') }}.
        severity: error
        persistent: true
    - service: script.joanna_dispatch
      data:
        trigger_context: "{{ trigger_context }}"
        source: "home_assistant_automation.infra_backup_nightly_verification"
        summary: "Nightly Duplicati backup verification failed"
        entity_ids:
        - "switch.duplicati_container"
        diagnostics: >-
          scheduled_time=06:45:00,
          duplicati_container={{ duplicati_state }},
          verifier_http_status={{ verify_http_status }},
          verifier_status={{ verify_status }},
          verifier_issue={{ verify_issue }},
          backup_name={{ verify_backup_name }},
          latest_result={{ verify_latest_result.get('endedAt', 'n/a') }},
          last_success={{ verify_last_success.get('endedAt', 'n/a') }}
        request: >-
          Investigate the Duplicati backup job {{ verify_backup_name }}.
          The codex_appliance verifier reported status {{ verify_status }} with issue {{ verify_issue }}.
          Use the Duplicati API or UI directly, resolve the failure if possible, and verify a successful run before closing out.
          Reply with explicit status fields:
          resolved=true/false,
          backup_status,
          last_success_time,
          root_cause,
          action_taken,
          verification,
          next_action_required=true/false.
|
|
|
|
# Fires every day at 03:20 but only proceeds on the 1st of the month; asks
# Joanna for a recommendations-only log review and records the dispatch in
# the logbook.
- alias: "Infrastructure - Monthly HA Log Hygiene Review"
  id: infra_monthly_log_hygiene_review
  description: "Ask Joanna monthly to review Home Assistant logs, create a GitHub issue with noisy entries, and send Telegram recommendations only."
  mode: single
  trigger:
  - platform: time
    at: "03:20:00"
  condition:
  # Gate the daily trigger down to the first day of each month.
  - condition: template
    value_template: "{{ now().day == 1 }}"
  variables:
    trigger_context: "HA automation infra_monthly_log_hygiene_review (Infrastructure - Monthly HA Log Hygiene Review)"
  action:
  - service: script.joanna_dispatch
    data:
      trigger_context: "{{ trigger_context }}"
      source: "home_assistant_automation.infra_monthly_log_hygiene_review"
      summary: "Monthly Home Assistant log hygiene review with GitHub issue and Telegram follow-up"
      diagnostics: >-
        schedule=day_1@03:20:00,
        review_scope=available_home_assistant_logs,
        desired_outputs=telegram_follow_up+github_issue,
        github_repo=CCOSTAN/Home-AssistantConfig,
        approval_required_before_changes=true
      request: >-
        Review the available Home Assistant log files from the last month and identify noisy,
        low-value entries that could be safely suppressed, filtered, slowed, deduplicated, or
        retired. Focus on practical Home Assistant-side changes such as recorder exclusions,
        logger filtering, scan-interval reductions, entity retirement, or automation de-noising.
        Create or refresh a GitHub issue in CCOSTAN/Home-AssistantConfig that captures the noisy
        entries, estimated frequency, why each candidate is low-value, and the exact repo files
        or integrations likely to change. Then send Carlo a concise Telegram summary with the top
        recommendations and the GitHub issue number or link. Do not make any changes from this
        review. Wait for explicit follow-up approval first.
  - service: script.send_to_logbook
    data:
      topic: "HOME ASSISTANT"
      message: "Joanna monthly Home Assistant log hygiene review dispatched; Telegram summary and GitHub issue requested."
|