Add Docker telemetry monitoring and degraded state handling

- Introduced a new binary sensor for Docker container telemetry degradation.
- Updated dashboard configurations to display telemetry status and alerts.
- Enhanced button card templates to reflect telemetry state in UI.
- Modified Docker infrastructure package to include new telemetry sensors and logic for handling degraded telemetry scenarios.
- Updated README to describe new features and improvements related to Docker monitoring.
3 months ago · 025b91fec1
parent b527dce495
commit 025b91fec1
5 changed files with 117 additions and 15 deletions
--- a/config/dashboards/infrastructure/partials/docker_containers_sections.yaml
+++ b/config/dashboards/infrastructure/partials/docker_containers_sections.yaml
@@ -8,6 +8,7 @@
 #  Sections layout for the Docker containers view.
 # -------------------------------------------------------------------
 # Notes: Auto-discovers Portainer container entities from `switch.*_container`.
+# Notes: Keeps cards visible when Portainer telemetry is unavailable (degraded mode).
 ######################################################################

 - type: grid
@@ -80,6 +81,20 @@
    card_mod:
      style: !include /config/dashboards/infrastructure/card_mod/infra_panel.yaml
    cards:
+    - type: custom:button-card
+      template: bearstone_infra_alert_row
+      entity: binary_sensor.docker_container_telemetry_degraded
+      name: Docker telemetry degraded
+      icon: mdi:lan-disconnect
+      variables:
+        alert_kind: binary_on
+      state_display: >-
+        [[[
+          const unavailable = states['sensor.docker_monitored_unavailable_count']?.state ?? '0';
+          const total = states['sensor.docker_monitored_container_count']?.state ?? '0';
+          return `${unavailable}/${total} unavailable`;
+        ]]]
+
    - type: custom:auto-entities
      show_empty: true
      grid_options:
@@ -104,7 +119,5 @@
            type: custom:button-card
            template: bearstone_infra_container_row
            icon: mdi:docker
-        exclude:
-        - state: unavailable
      sort:
        method: name
--- a/config/dashboards/infrastructure/partials/home_sections.yaml
+++ b/config/dashboards/infrastructure/partials/home_sections.yaml
@@ -400,6 +400,23 @@
        tap_action:
          action: none

+      - type: custom:button-card
+        template: bearstone_infra_alert_row
+        entity: binary_sensor.docker_container_telemetry_degraded
+        name: Docker telemetry degraded
+        icon: mdi:lan-disconnect
+        variables:
+          alert_kind: binary_on
+        tap_action:
+          action: navigate
+          navigation_path: /dashboard-infrastructure/docker
+        state_display: >-
+          [[[
+            const unavailable = states['sensor.docker_monitored_unavailable_count']?.state ?? '0';
+            const total = states['sensor.docker_monitored_container_count']?.state ?? '0';
+            return `${unavailable}/${total} unavailable`;
+          ]]]
+
      - type: custom:auto-entities
        show_empty: false
        card:
@@ -424,6 +441,8 @@
              icon: mdi:docker
          exclude:
          - state: 'on'
+          - state: unavailable
+          - state: unknown

    - type: custom:vertical-stack-in-card
      grid_options:
--- a/config/dashboards/infrastructure/templates/button_card_templates.yaml
+++ b/config/dashboards/infrastructure/templates/button_card_templates.yaml
@@ -216,6 +216,8 @@ bearstone_infra_container_row:
    image: >
      [[[
        const ent = (entity && entity.entity_id) ? String(entity.entity_id) : '';
+        const stateNow = String(entity && entity.state !== undefined ? entity.state : '').toLowerCase();
+        const telemetryDegraded = states['binary_sensor.docker_container_telemetry_degraded']?.state === 'on';
        let key = '';
        if (ent.startsWith('binary_sensor.') && ent.endsWith('_status')) {
          key = ent.replace('binary_sensor.', '').replace(/_status$/, '');
@@ -227,6 +229,9 @@ bearstone_infra_container_row:
          : (key ? `sensor.${key}_image` : '');
        const imageValue = states[imageEntity]?.state;
        if (!imageValue || ['unknown', 'unavailable', 'none', ''].includes(String(imageValue).toLowerCase())) {
+          if (telemetryDegraded && ['unknown', 'unavailable', ''].includes(stateNow)) {
+            return 'telemetry: delayed';
+          }
          return 'image: n/a';
        }
        return imageValue;
@@ -234,10 +239,11 @@ bearstone_infra_container_row:
    status: >
      [[[
        const s = String(entity.state || '').toLowerCase();
+        const telemetryDegraded = states['binary_sensor.docker_container_telemetry_degraded']?.state === 'on';
        if (s === 'on' || s === 'running') return 'RUNNING';
        if (s === 'off' || s === 'stopped') return 'STOPPED';
-        if (s === 'unavailable') return 'OFFLINE';
-        if (s === 'unknown' || s === '') return 'UNKNOWN';
+        if (s === 'unavailable') return telemetryDegraded ? 'STALE' : 'OFFLINE';
+        if (s === 'unknown' || s === '') return telemetryDegraded ? 'STALE' : 'UNKNOWN';
        return String(entity.state).toUpperCase();
      ]]]
  styles:
@@ -359,14 +365,25 @@ bearstone_infra_container_row:
  - value: unavailable
    styles:
      card:
-      - border-color: rgba(229,57,53,0.35)
-      - background: rgba(255,235,238,0.85)
+      - border-color: rgba(245,124,0,0.35)
+      - background: rgba(255,243,224,0.85)
      icon:
-      - color: rgba(198,40,40,1)
+      - color: rgba(230,81,0,1)
      custom_fields:
        status:
-        - background: rgba(198,40,40,0.10)
-        - color: rgba(198,40,40,1)
+        - background: rgba(230,81,0,0.12)
+        - color: rgba(230,81,0,1)
+  - value: unknown
+    styles:
+      card:
+      - border-color: rgba(245,124,0,0.35)
+      - background: rgba(255,243,224,0.85)
+      icon:
+      - color: rgba(230,81,0,1)
+      custom_fields:
+        status:
+        - background: rgba(230,81,0,0.12)
+        - color: rgba(230,81,0,1)

 bearstone_infra_panel_header:
  show_icon: false
--- a/config/packages/README.md
+++ b/config/packages/README.md
@@ -45,7 +45,7 @@ Live collection of plug-and-play Home Assistant packages. Each YAML file in this
 | [lightning.yaml](lightning.yaml) | Blitzortung lightning counter monitoring with snoozeable push actions. | `sensor.blitzortung_lightning_counter`, `input_boolean.snooze_lightning`, notify engine actions |
 | [logbook_activity_feed.yaml](logbook_activity_feed.yaml) | Dummy `sensor.activity_feed` + helper to write clean Activity entries (Issue #1550). | `sensor.activity_feed`, `script.send_to_logbook` |
 | [mariadb_monitoring.yaml](mariadb_monitoring.yaml) | MariaDB health sensors and Lovelace dashboard snippet for recorder stats. | `sensor.mariadb_status`, `sensor.database_size` |
-| [docker_infrastructure.yaml](docker_infrastructure.yaml) | Docker host patching telemetry (docker_10/14/17/69) + host-side auto-reboots + container-down Repairs alerts. | `sensor.docker_*_apt_status`, `repairs.create`, `repairs.remove` |
+| [docker_infrastructure.yaml](docker_infrastructure.yaml) | Docker host patching telemetry (docker_10/14/17/69) + host-side auto-reboots + container-down Repairs alerts, with degraded-telemetry guardrails when Portainer data drops. | `sensor.docker_*_apt_status`, `binary_sensor.docker_container_telemetry_degraded`, `repairs.create`, `repairs.remove` |
 | [infrastructure_observability.yaml](infrastructure_observability.yaml) | Normalized WAN/DNS/backup/domain/cert health sensors used by the Infrastructure Home + Website Health dashboards. | `binary_sensor.infra_*`, `sensor.infra_*`, `script.send_to_logbook` |
 | [onenote_indexer.yaml](onenote_indexer.yaml) | OneNote indexer health/status monitoring for Joanna, failure-repair automation, and a daily duplicate-delete maintenance request. | `sensor.onenote_indexer_last_job_status`, `binary_sensor.onenote_indexer_last_job_successful` |
 | [mariadb.yaml](mariadb.yaml) | MariaDB recorder health and capacity SQL sensors. | `sensor.mariadb_status`, `sensor.database_size` |
--- a/config/packages/docker_infrastructure.yaml
+++ b/config/packages/docker_infrastructure.yaml
@@ -200,12 +200,47 @@ template:
          {% endif %}

  - sensor:
+      - name: "Docker Monitored Container Count"
+        unique_id: docker_monitored_container_count
+        icon: mdi:format-list-numbered
+        state: >-
+          {{ state_attr('group.docker_monitored_containers', 'entity_id') | default([], true) | count }}
+
+      - name: "Docker Monitored Unavailable Count"
+        unique_id: docker_monitored_unavailable_count
+        icon: mdi:lan-disconnect
+        state: >-
+          {% set ns = namespace(keys=[], unavailable=0) %}
+          {% set monitored = state_attr('group.docker_monitored_containers', 'entity_id') | default([], true) %}
+          {% for switch_entity in monitored %}
+            {% set key = switch_entity | replace('switch.', '') | regex_replace('_container$', '') %}
+            {% if key not in ns.keys %}
+              {% set ns.keys = ns.keys + [key] %}
+            {% endif %}
+          {% endfor %}
+          {% for key in ns.keys %}
+            {% set status_entity = 'binary_sensor.' ~ key ~ '_status' %}
+            {% set switch_entity = 'switch.' ~ key ~ '_container' %}
+            {% if expand(status_entity) | count > 0 %}
+              {% set effective_state = states(status_entity) | lower %}
+            {% elif expand(switch_entity) | count > 0 %}
+              {% set effective_state = states(switch_entity) | lower %}
+            {% else %}
+              {% set effective_state = 'unknown' %}
+            {% endif %}
+            {% if effective_state == 'unavailable' %}
+              {% set ns.unavailable = ns.unavailable + 1 %}
+            {% endif %}
+          {% endfor %}
+          {{ ns.unavailable }}
+
      - name: "Docker Containers Down List"
        unique_id: docker_containers_down_list
        icon: mdi:docker
        state: >-
          {% set ns = namespace(keys=[], down=[]) %}
          {% set monitored = state_attr('group.docker_monitored_containers', 'entity_id') | default([], true) %}
+          {% set telemetry_degraded = is_state('binary_sensor.docker_container_telemetry_degraded', 'on') %}
          {% for switch_entity in monitored %}
            {% set key = switch_entity | replace('switch.', '') | regex_replace('_container$', '') %}
            {% if key not in ns.keys %}
@@ -222,7 +257,9 @@ template:
            {% else %}
              {% set effective_state = 'unknown' %}
            {% endif %}
-            {% if effective_state in ['off', 'unknown', 'unavailable'] %}
+            {% if effective_state in ['off', 'stopped'] %}
+              {% set ns.down = ns.down + [key] %}
+            {% elif not telemetry_degraded and effective_state in ['unknown', 'unavailable'] %}
              {% set ns.down = ns.down + [key] %}
            {% endif %}
          {% endfor %}
@@ -241,6 +278,16 @@ template:
          {% endif %}

  - binary_sensor:
+      - name: "Docker Container Telemetry Degraded"
+        unique_id: docker_container_telemetry_degraded
+        device_class: problem
+        icon: mdi:lan-disconnect
+        state: >-
+          {% set total = states('sensor.docker_monitored_container_count') | int(0) %}
+          {% set unavailable = states('sensor.docker_monitored_unavailable_count') | int(0) %}
+          {% set threshold = [3, ((total * 0.6) | round(0, 'ceil') | int(0))] | max %}
+          {{ total > 0 and unavailable >= threshold }}
+
      - name: "Docker Container Alerts Snoozed"
        unique_id: docker_container_alerts_snoozed
        device_class: problem
@@ -266,7 +313,7 @@ script:
        example: 5
    sequence:
      - variables:
-          down_states: ['off', 'unknown', 'unavailable']
+          down_states: ['off', 'stopped', 'unknown', 'unavailable']
          src_entity: "{{ entity_id | default('', true) }}"
          op: "{{ operation | default('create', true) | lower }}"
          wait_minutes: "{{ delay_minutes | default(0) | int(0) }}"
@@ -304,9 +351,12 @@ script:
                          minutes: "{{ wait_minutes }}"
              - variables:
                  effective_state: "{{ states(effective_entity) | lower }}"
+                  telemetry_degraded: "{{ is_state('binary_sensor.docker_container_telemetry_degraded', 'on') }}"
                  container_name: "{{ state_attr(effective_entity, 'friendly_name') | default(container_key, true) }}"
              - condition: template
-                value_template: "{{ effective_state in down_states }}"
+                value_template: >-
+                  {{ effective_state in down_states and
+                     not (telemetry_degraded and effective_state in ['unknown', 'unavailable']) }}
              - condition: state
                entity_id: binary_sensor.docker_container_alerts_snoozed
                state: "off"
@@ -453,12 +503,15 @@ automation:
        value_template: "{{ trigger.event.data.old_state.state != trigger.event.data.new_state.state }}"
    action:
      - variables:
-          down_states: ['off', 'unknown', 'unavailable']
+          down_states: ['off', 'stopped', 'unknown', 'unavailable']
          entity_id: "{{ trigger.event.data.entity_id }}"
          old_state: "{{ trigger.event.data.old_state.state | lower }}"
          new_state: "{{ trigger.event.data.new_state.state | lower }}"
      - choose:
-          - conditions: "{{ new_state in down_states and old_state not in down_states }}"
+          - conditions: >-
+              {{ new_state in down_states and old_state not in down_states and
+                 not (is_state('binary_sensor.docker_container_telemetry_degraded', 'on') and
+                      new_state in ['unknown', 'unavailable']) }}
            sequence:
              - service: script.docker_container_repairs_sync
                data: