Add Docker telemetry monitoring and degraded state handling

- Introduced a new binary sensor for Docker container telemetry degradation.
- Updated dashboard configurations to display telemetry status and alerts.
- Enhanced button card templates to reflect telemetry state in UI.
- Modified Docker infrastructure package to include new telemetry sensors and logic for handling degraded telemetry scenarios.
- Updated README to describe new features and improvements related to Docker monitoring.
pull/1590/head
Carlo Costanzo 3 weeks ago
parent b527dce495
commit 025b91fec1

@ -8,6 +8,7 @@
# Sections layout for the Docker containers view.
# -------------------------------------------------------------------
# Notes: Auto-discovers Portainer container entities from `switch.*_container`.
# Notes: Keeps cards visible when Portainer telemetry is unavailable (degraded mode).
######################################################################
- type: grid
@ -80,6 +81,20 @@
card_mod:
style: !include /config/dashboards/infrastructure/card_mod/infra_panel.yaml
cards:
# Alert row for the Docker view: surfaces the telemetry-degraded binary sensor
# through the shared `bearstone_infra_alert_row` button-card template.
# NOTE(review): nesting was reconstructed from a flattened view of this file;
# confirm that `state_display` is a card-level key and not a template variable.
- type: custom:button-card
  template: bearstone_infra_alert_row
  entity: binary_sensor.docker_container_telemetry_degraded
  name: Docker telemetry degraded
  icon: mdi:lan-disconnect
  variables:
    alert_kind: binary_on
  # Renders "<unavailable>/<total> unavailable". Both counts are coerced to
  # integers so that a missing entity, or a count sensor that is itself
  # `unknown`/`unavailable`, shows 0 instead of leaking the raw state string
  # into the label (e.g. "unavailable/unknown unavailable").
  state_display: >-
    [[[
      const toCount = (entityId) => {
        const parsed = Number.parseInt(states[entityId]?.state, 10);
        return Number.isFinite(parsed) ? parsed : 0;
      };
      const unavailable = toCount('sensor.docker_monitored_unavailable_count');
      const total = toCount('sensor.docker_monitored_container_count');
      return `${unavailable}/${total} unavailable`;
    ]]]
- type: custom:auto-entities
show_empty: true
grid_options:
@ -104,7 +119,5 @@
type: custom:button-card
template: bearstone_infra_container_row
icon: mdi:docker
exclude:
- state: unavailable
sort:
method: name

@ -400,6 +400,23 @@
tap_action:
action: none
# Alert row surfacing the Docker telemetry-degraded binary sensor through the
# shared `bearstone_infra_alert_row` button-card template; tapping it navigates
# to the Docker infrastructure view.
# NOTE(review): nesting was reconstructed from a flattened view of this file;
# confirm that `tap_action` and `state_display` are card-level keys.
- type: custom:button-card
  template: bearstone_infra_alert_row
  entity: binary_sensor.docker_container_telemetry_degraded
  name: Docker telemetry degraded
  icon: mdi:lan-disconnect
  variables:
    alert_kind: binary_on
  tap_action:
    action: navigate
    navigation_path: /dashboard-infrastructure/docker
  # Renders "<unavailable>/<total> unavailable". Both counts are coerced to
  # integers so that a missing entity, or a count sensor that is itself
  # `unknown`/`unavailable`, shows 0 instead of leaking the raw state string
  # into the label (e.g. "unavailable/unknown unavailable").
  state_display: >-
    [[[
      const toCount = (entityId) => {
        const parsed = Number.parseInt(states[entityId]?.state, 10);
        return Number.isFinite(parsed) ? parsed : 0;
      };
      const unavailable = toCount('sensor.docker_monitored_unavailable_count');
      const total = toCount('sensor.docker_monitored_container_count');
      return `${unavailable}/${total} unavailable`;
    ]]]
- type: custom:auto-entities
show_empty: false
card:
@ -424,6 +441,8 @@
icon: mdi:docker
exclude:
- state: 'on'
- state: unavailable
- state: unknown
- type: custom:vertical-stack-in-card
grid_options:

@ -216,6 +216,8 @@ bearstone_infra_container_row:
image: >
[[[
const ent = (entity && entity.entity_id) ? String(entity.entity_id) : '';
const stateNow = String(entity && entity.state !== undefined ? entity.state : '').toLowerCase();
const telemetryDegraded = states['binary_sensor.docker_container_telemetry_degraded']?.state === 'on';
let key = '';
if (ent.startsWith('binary_sensor.') && ent.endsWith('_status')) {
key = ent.replace('binary_sensor.', '').replace(/_status$/, '');
@ -227,6 +229,9 @@ bearstone_infra_container_row:
: (key ? `sensor.${key}_image` : '');
const imageValue = states[imageEntity]?.state;
if (!imageValue || ['unknown', 'unavailable', 'none', ''].includes(String(imageValue).toLowerCase())) {
if (telemetryDegraded && ['unknown', 'unavailable', ''].includes(stateNow)) {
return 'telemetry: delayed';
}
return 'image: n/a';
}
return imageValue;
@ -234,10 +239,11 @@ bearstone_infra_container_row:
status: >
[[[
const s = String(entity.state || '').toLowerCase();
const telemetryDegraded = states['binary_sensor.docker_container_telemetry_degraded']?.state === 'on';
if (s === 'on' || s === 'running') return 'RUNNING';
if (s === 'off' || s === 'stopped') return 'STOPPED';
if (s === 'unavailable') return 'OFFLINE';
if (s === 'unknown' || s === '') return 'UNKNOWN';
if (s === 'unavailable') return telemetryDegraded ? 'STALE' : 'OFFLINE';
if (s === 'unknown' || s === '') return telemetryDegraded ? 'STALE' : 'UNKNOWN';
return String(entity.state).toUpperCase();
]]]
styles:
@ -359,14 +365,25 @@ bearstone_infra_container_row:
- value: unavailable
styles:
card:
- border-color: rgba(229,57,53,0.35)
- background: rgba(255,235,238,0.85)
- border-color: rgba(245,124,0,0.35)
- background: rgba(255,243,224,0.85)
icon:
- color: rgba(198,40,40,1)
- color: rgba(230,81,0,1)
custom_fields:
status:
- background: rgba(198,40,40,0.10)
- color: rgba(198,40,40,1)
- background: rgba(230,81,0,0.12)
- color: rgba(230,81,0,1)
- value: unknown
styles:
card:
- border-color: rgba(245,124,0,0.35)
- background: rgba(255,243,224,0.85)
icon:
- color: rgba(230,81,0,1)
custom_fields:
status:
- background: rgba(230,81,0,0.12)
- color: rgba(230,81,0,1)
bearstone_infra_panel_header:
show_icon: false

@ -45,7 +45,7 @@ Live collection of plug-and-play Home Assistant packages. Each YAML file in this
| [lightning.yaml](lightning.yaml) | Blitzortung lightning counter monitoring with snoozeable push actions. | `sensor.blitzortung_lightning_counter`, `input_boolean.snooze_lightning`, notify engine actions |
| [logbook_activity_feed.yaml](logbook_activity_feed.yaml) | Dummy `sensor.activity_feed` + helper to write clean Activity entries (Issue #1550). | `sensor.activity_feed`, `script.send_to_logbook` |
| [mariadb_monitoring.yaml](mariadb_monitoring.yaml) | MariaDB health sensors and Lovelace dashboard snippet for recorder stats. | `sensor.mariadb_status`, `sensor.database_size` |
| [docker_infrastructure.yaml](docker_infrastructure.yaml) | Docker host patching telemetry (docker_10/14/17/69) + host-side auto-reboots + container-down Repairs alerts. | `sensor.docker_*_apt_status`, `repairs.create`, `repairs.remove` |
| [docker_infrastructure.yaml](docker_infrastructure.yaml) | Docker host patching telemetry (docker_10/14/17/69) + host-side auto-reboots + container-down Repairs alerts, with degraded-telemetry guardrails when Portainer data drops. | `sensor.docker_*_apt_status`, `binary_sensor.docker_container_telemetry_degraded`, `repairs.create`, `repairs.remove` |
| [infrastructure_observability.yaml](infrastructure_observability.yaml) | Normalized WAN/DNS/backup/domain/cert health sensors used by the Infrastructure Home + Website Health dashboards. | `binary_sensor.infra_*`, `sensor.infra_*`, `script.send_to_logbook` |
| [onenote_indexer.yaml](onenote_indexer.yaml) | OneNote indexer health/status monitoring for Joanna, failure-repair automation, and a daily duplicate-delete maintenance request. | `sensor.onenote_indexer_last_job_status`, `binary_sensor.onenote_indexer_last_job_successful` |
| [mariadb.yaml](mariadb.yaml) | MariaDB recorder health and capacity SQL sensors. | `sensor.mariadb_status`, `sensor.database_size` |

@ -200,12 +200,47 @@ template:
{% endif %}
- sensor:
# Number of entity ids listed on `group.docker_monitored_containers`.
# Evaluates to 0 when the group or its `entity_id` attribute is missing/empty,
# because the falsy attribute value is replaced by an empty list.
- name: "Docker Monitored Container Count"
  unique_id: docker_monitored_container_count
  icon: mdi:format-list-numbered
  state: >-
    {% set members = state_attr('group.docker_monitored_containers', 'entity_id') | default([], true) %}
    {{ members | length }}
# Counts monitored containers whose effective entity reports `unavailable`.
# Each member of `group.docker_monitored_containers` is reduced to a container
# key (strip the `switch.` prefix and the `_container` suffix); every key is
# evaluated once. The key's `binary_sensor.<key>_status` entity is preferred,
# then `switch.<key>_container`; if neither exists the state is treated as
# `unknown`, which is NOT counted here.
- name: "Docker Monitored Unavailable Count"
  unique_id: docker_monitored_unavailable_count
  icon: mdi:lan-disconnect
  state: >-
    {% set tally = namespace(seen=[], unavailable=0) %}
    {% set members = state_attr('group.docker_monitored_containers', 'entity_id') | default([], true) %}
    {% for member in members %}
      {% set slug = member | replace('switch.', '') | regex_replace('_container$', '') %}
      {% if slug not in tally.seen %}
        {% set tally.seen = tally.seen + [slug] %}
        {% set status_id = 'binary_sensor.' ~ slug ~ '_status' %}
        {% set switch_id = 'switch.' ~ slug ~ '_container' %}
        {% set current = 'unknown' %}
        {% if expand(status_id) | count > 0 %}
          {% set current = states(status_id) | lower %}
        {% elif expand(switch_id) | count > 0 %}
          {% set current = states(switch_id) | lower %}
        {% endif %}
        {% if current == 'unavailable' %}
          {% set tally.unavailable = tally.unavailable + 1 %}
        {% endif %}
      {% endif %}
    {% endfor %}
    {{ tally.unavailable }}
- name: "Docker Containers Down List"
unique_id: docker_containers_down_list
icon: mdi:docker
state: >-
{% set ns = namespace(keys=[], down=[]) %}
{% set monitored = state_attr('group.docker_monitored_containers', 'entity_id') | default([], true) %}
{% set telemetry_degraded = is_state('binary_sensor.docker_container_telemetry_degraded', 'on') %}
{% for switch_entity in monitored %}
{% set key = switch_entity | replace('switch.', '') | regex_replace('_container$', '') %}
{% if key not in ns.keys %}
@ -222,7 +257,9 @@ template:
{% else %}
{% set effective_state = 'unknown' %}
{% endif %}
{% if effective_state in ['off', 'unknown', 'unavailable'] %}
{% if effective_state in ['off', 'stopped'] %}
{% set ns.down = ns.down + [key] %}
{% elif not telemetry_degraded and effective_state in ['unknown', 'unavailable'] %}
{% set ns.down = ns.down + [key] %}
{% endif %}
{% endfor %}
@ -241,6 +278,16 @@ template:
{% endif %}
- binary_sensor:
# Problem sensor that turns on when a large share of monitored containers is
# unavailable at once. The trip point is the larger of 3 and 60% of the
# monitored count (rounded up); with no monitored containers it stays off.
# Non-numeric states of either count sensor are read as 0.
- name: "Docker Container Telemetry Degraded"
  unique_id: docker_container_telemetry_degraded
  device_class: problem
  icon: mdi:lan-disconnect
  state: >-
    {% set monitored = states('sensor.docker_monitored_container_count') | int(0) %}
    {% set missing = states('sensor.docker_monitored_unavailable_count') | int(0) %}
    {% set sixty_percent = (monitored * 0.6) | round(0, 'ceil') | int(0) %}
    {% set minimum = 3 %}
    {% set threshold = sixty_percent if sixty_percent > minimum else minimum %}
    {{ monitored > 0 and missing >= threshold }}
- name: "Docker Container Alerts Snoozed"
unique_id: docker_container_alerts_snoozed
device_class: problem
@ -266,7 +313,7 @@ script:
example: 5
sequence:
- variables:
down_states: ['off', 'unknown', 'unavailable']
down_states: ['off', 'stopped', 'unknown', 'unavailable']
src_entity: "{{ entity_id | default('', true) }}"
op: "{{ operation | default('create', true) | lower }}"
wait_minutes: "{{ delay_minutes | default(0) | int(0) }}"
@ -304,9 +351,12 @@ script:
minutes: "{{ wait_minutes }}"
- variables:
effective_state: "{{ states(effective_entity) | lower }}"
telemetry_degraded: "{{ is_state('binary_sensor.docker_container_telemetry_degraded', 'on') }}"
container_name: "{{ state_attr(effective_entity, 'friendly_name') | default(container_key, true) }}"
- condition: template
value_template: "{{ effective_state in down_states }}"
value_template: >-
{{ effective_state in down_states and
not (telemetry_degraded and effective_state in ['unknown', 'unavailable']) }}
- condition: state
entity_id: binary_sensor.docker_container_alerts_snoozed
state: "off"
@ -453,12 +503,15 @@ automation:
value_template: "{{ trigger.event.data.old_state.state != trigger.event.data.new_state.state }}"
action:
- variables:
down_states: ['off', 'unknown', 'unavailable']
down_states: ['off', 'stopped', 'unknown', 'unavailable']
entity_id: "{{ trigger.event.data.entity_id }}"
old_state: "{{ trigger.event.data.old_state.state | lower }}"
new_state: "{{ trigger.event.data.new_state.state | lower }}"
- choose:
- conditions: "{{ new_state in down_states and old_state not in down_states }}"
- conditions: >-
{{ new_state in down_states and old_state not in down_states and
not (is_state('binary_sensor.docker_container_telemetry_degraded', 'on') and
new_state in ['unknown', 'unavailable']) }}
sequence:
- service: script.docker_container_repairs_sync
data:

Loading…
Cancel
Save

Powered by TurnKey Linux.