From ddf7dd93b5affe90a61974e6a05383862f34ad0d Mon Sep 17 00:00:00 2001 From: Michael Skrynski Date: Thu, 18 Dec 2025 21:17:17 +0100 Subject: [PATCH] adding metrics to influx via telegraf --- .gitignore | 3 +- README.md | 111 +++++- grafana/rpi-cluster-dashboard.json | 416 ++++++++++++++++++++++ inventory/hosts.ini | 2 +- roles/telegraf/tasks/main.yml | 69 ++++ roles/telegraf/templates/telegraf.conf.j2 | 62 ++++ telegraf.yml | 27 ++ 7 files changed, 684 insertions(+), 6 deletions(-) create mode 100644 grafana/rpi-cluster-dashboard.json create mode 100644 roles/telegraf/tasks/main.yml create mode 100644 roles/telegraf/templates/telegraf.conf.j2 create mode 100644 telegraf.yml diff --git a/.gitignore b/.gitignore index 5dafb99..0d1b10d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ kubeconfig -.claude \ No newline at end of file +.claude +.env \ No newline at end of file diff --git a/README.md b/README.md index 9c4bee9..cc35c12 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ pi-worker-3 ansible_host=192.168.30.104 ansible_user=pi In `inventory/hosts.ini`, you can customize: -- `k3s_version`: K3s version to install (default: v1.28.3+k3s1) +- `k3s_version`: K3s version to install (default: v1.34.2+k3s1) - `extra_server_args`: Additional arguments for k3s server - `extra_agent_args`: Additional arguments for k3s agent - `extra_packages`: List of additional packages to install on all nodes @@ -86,10 +86,113 @@ To add packages, append them to the comma-separated list. To disable extra packa ### Test Connectivity +Basic connectivity test: + ```bash ansible all -m ping ``` +### Gather Node Information + +Display critical information from all nodes (uptime, temperature, memory, disk usage, load average): + +### Deploy Telegraf for Metrics Collection + +Stream system metrics from all nodes to InfluxDB using Telegraf client. 
+ +**Prerequisites:** + +- InfluxDB instance running and accessible +- API token with write permissions to your bucket + +**Setup:** + +1. Create a `.env` file in the repository root (it is git-ignored) with your InfluxDB credentials: + +```bash +# .env file (keep this secret, never commit!) +INFLUXDB_HOST=192.168.10.10 +INFLUXDB_PORT=8086 +INFLUXDB_ORG=family +INFLUXDB_BUCKET=rpi-cluster +INFLUXDB_TOKEN=your-api-token-here +``` + +2. Deploy Telegraf to all nodes: + +```bash +ansible-playbook telegraf.yml +``` + +Or deploy to specific nodes: + +```bash +# Only worker nodes +ansible-playbook telegraf.yml --limit worker + +# Only master nodes +ansible-playbook telegraf.yml --limit master + +# Specific node +ansible-playbook telegraf.yml --limit cm4-02 +``` + +**Metrics Collected:** + +- **System**: CPU (per-core and total), memory, swap, processes, system load +- **Disk**: Disk I/O, disk usage, inodes +- **Network**: Network interfaces, packets, errors +- **Thermal**: CPU temperature (Raspberry Pi specific) +- **K3s**: Process metrics for k3s components + +**Verify Installation:** + +Check Telegraf status on a node: + +```bash +ssh pi@<node-ip> +sudo systemctl status telegraf +sudo journalctl -u telegraf -f +``` + +**View Metrics in InfluxDB:** + +Once configured, metrics will appear in your InfluxDB instance under the `rpi-cluster` bucket with tags for each node hostname and node type (master/worker). + +### Grafana Dashboard for Telegraf Metrics + +A pre-built Grafana dashboard is included to visualize all collected metrics. The dashboard displays: + +- CPU usage across all nodes +- Memory usage (percentage) +- CPU temperature (Raspberry Pi specific) +- System load averages +- Disk usage +- Network traffic + +**Import the Dashboard:** + +1. Open Grafana and go to **Dashboards** → **New** → **Import** +2. Upload the dashboard file: `grafana/rpi-cluster-dashboard.json` +3. Select your InfluxDB datasource if prompted (the dashboard references the datasource UID `influxdb`) +4. 
Click **Import** + +**Datasource Requirements:** + +The dashboard expects your InfluxDB datasource in Grafana to have the UID `influxdb` and to use the Flux query language. If your datasource has a different UID, either: + +- Provision (or recreate) your datasource with the UID `influxdb`, or +- Edit the dashboard JSON and replace all `"uid": "influxdb"` references with your datasource UID + +**Customize the Dashboard:** + +You can modify the dashboard after import to: + +- Adjust time ranges (default: last 6 hours) +- Add alerts for high CPU/temperature/memory +- Add more panels for network metrics +- Create node-specific views using Grafana variables + ### Deploy K3s Cluster ```bash @@ -169,9 +272,9 @@ You should see all your nodes in Ready state: ```bash NAME STATUS ROLES AGE VERSION -pi-master Ready control-plane,master 5m v1.28.3+k3s1 -pi-worker-1 Ready 3m v1.28.3+k3s1 -pi-worker-2 Ready 3m v1.28.3+k3s1 +pi-master Ready control-plane,master 5m v1.34.2+k3s1 +pi-worker-1 Ready 3m v1.34.2+k3s1 +pi-worker-2 Ready 3m v1.34.2+k3s1 ``` ## Accessing the Cluster diff --git a/grafana/rpi-cluster-dashboard.json b/grafana/rpi-cluster-dashboard.json new file mode 100644 index 0000000..83a9bb0 --- /dev/null +++ b/grafana/rpi-cluster-dashboard.json @@ -0,0 +1,416 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasourceUID": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + 
"legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "refId": "A", + "query": "from(bucket:\"rpi-cluster\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"cpu\")\n |> filter(fn: (r) => r[\"_field\"] == \"usage_user\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)", + "format": "table" + } + ], + "title": "CPU Usage - All Nodes", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", 
+ "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "refId": "A", + "query": "from(bucket:\"rpi-cluster\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"mem\")\n |> filter(fn: (r) => r[\"_field\"] == \"used_percent\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)", + "format": "table" + } + ], + "title": "Memory Usage - All Nodes", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": 
"influxdb", + "uid": "influxdb" + }, + "refId": "A", + "query": "from(bucket:\"rpi-cluster\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"cpu_temp_thermal\")\n |> filter(fn: (r) => r[\"_field\"] == \"value\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)", + "format": "table" + } + ], + "title": "CPU Temperature - All Nodes", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "refId": "A", + "query": "from(bucket:\"rpi-cluster\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"system\")\n |> filter(fn: (r) => r[\"_field\"] == \"load1\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)", + "format": "table" + } + ], + "title": "System Load - All Nodes", + "type": "timeseries" + } + 
], + "schemaVersion": 38, + "style": "dark", + "tags": [ + "raspberry-pi", + "k3s", + "telegraf", + "system-monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Raspberry Pi K3s Cluster Metrics", + "uid": "rpi-cluster-metrics", + "version": 1, + "weekStart": "" +} diff --git a/inventory/hosts.ini b/inventory/hosts.ini index 93cbe60..5f65909 100644 --- a/inventory/hosts.ini +++ b/inventory/hosts.ini @@ -17,7 +17,7 @@ worker [k3s_cluster:vars] # K3s version to install -k3s_version=v1.28.3+k3s1 +k3s_version=v1.34.2+k3s1 # Network settings ansible_user=pi diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml new file mode 100644 index 0000000..920ea7b --- /dev/null +++ b/roles/telegraf/tasks/main.yml @@ -0,0 +1,69 @@ +--- +- name: Install Telegraf from package archive + shell: | + cd /tmp + wget -q https://dl.influxdata.com/telegraf/releases/telegraf_1.29.2-1_arm64.deb + dpkg -i telegraf_1.29.2-1_arm64.deb || apt-get install -f -y + rm -f telegraf_1.29.2-1_arm64.deb + args: + creates: /usr/bin/telegraf + become: true + register: telegraf_install + +- name: Create Telegraf configuration directory + file: + path: /etc/telegraf/telegraf.d + state: directory + owner: root + group: root + mode: '0755' + become: true + +- name: Deploy Telegraf configuration + template: + src: telegraf.conf.j2 + dest: /etc/telegraf/telegraf.conf + owner: root + group: telegraf + mode: '0640' + backup: true + become: true + register: telegraf_config + +- name: Ensure Telegraf service is started and enabled + systemd: + name: telegraf + state: started + enabled: true + daemon_reload: true + become: true + +- name: Restart Telegraf if configuration changed + systemd: + name: telegraf + state: restarted + become: true + when: telegraf_config.changed + +- name: Verify Telegraf is running + systemd: + name: telegraf + state: started + become: true + register: telegraf_status + +- 
name: Display Telegraf installation status + debug: + msg: | + Telegraf installation complete on {{ inventory_hostname }} + InfluxDB Configuration: + - Host: {{ influxdb_host }}:{{ influxdb_port }} + - Organization: {{ influxdb_org }} + - Bucket: {{ influxdb_bucket }} + - Node Tag: {{ inventory_hostname }} + - Node Type: {{ 'master' if inventory_hostname in groups['master'] else 'worker' }} + + Service Status: {{ telegraf_status.status.ActiveState }} + + To view logs: + sudo journalctl -u telegraf -f diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2 new file mode 100644 index 0000000..0ae4e37 --- /dev/null +++ b/roles/telegraf/templates/telegraf.conf.j2 @@ -0,0 +1,62 @@ +[global_tags] + hostname = "{{ inventory_hostname }}" + node_type = "{{ 'master' if inventory_hostname in groups['master'] else 'worker' }}" + +[agent] + interval = "10s" + round_interval = true + metric_batch_size = 1000 + metric_buffer_limit = 10000 + collection_jitter = "0s" + flush_interval = "10s" + flush_jitter = "0s" + precision = "" + hostname = "{{ inventory_hostname }}" + omit_hostname = false + +[[outputs.influxdb_v2]] + urls = ["http://{{ influxdb_host }}:{{ influxdb_port }}"] + token = "{{ influxdb_token }}" + organization = "{{ influxdb_org }}" + bucket = "{{ influxdb_bucket }}" + timeout = "5s" + user_agent = "telegraf-rpi-cluster" + +# System metrics +[[inputs.cpu]] + percpu = true + totalcpu = true + collect_cpu_time = false + report_active = false + +[[inputs.disk]] + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + +[[inputs.diskio]] + +[[inputs.mem]] + +[[inputs.processes]] + +[[inputs.swap]] + +[[inputs.system]] + +[[inputs.netstat]] + +[[inputs.net]] + interfaces = ["eth0", "wlan0", "enp*", "wlp*", "docker*", "veth*"] + +# Thermal metrics (Raspberry Pi specific) +[[inputs.exec]] + commands = [ + "python3 -c \"import os; temp_file='/sys/class/thermal/thermal_zone0/temp'; 
print('cpu_temp,unit=celsius value=' + str(int(open(temp_file).read())/1000) if os.path.exists(temp_file) else 'cpu_temp,unit=celsius value=0') if os.path.exists(temp_file) else ''\"" + ] + timeout = "5s" + data_format = "influx" + name_suffix = "_thermal" + +# Process metrics for k3s components +[[inputs.procstat]] + pattern = "k3s" + prefix = "k3s" diff --git a/telegraf.yml b/telegraf.yml new file mode 100644 index 0000000..4fc1b85 --- /dev/null +++ b/telegraf.yml @@ -0,0 +1,27 @@ +--- +- name: Deploy Telegraf to all nodes + hosts: all + become: yes + pre_tasks: + - name: Parse .env file and set variables + block: + - name: Read .env file + slurp: + src: "{{ playbook_dir }}/.env" + register: env_file + delegate_to: localhost + become: false + run_once: true + + - name: Set InfluxDB variables from .env + set_fact: + influxdb_host: "{{ (env_file.content | b64decode | regex_search('^INFLUXDB_HOST=(.+)$', '\\1', multiline=True) | first | trim) }}" + influxdb_port: "{{ (env_file.content | b64decode | regex_search('^INFLUXDB_PORT=(.+)$', '\\1', multiline=True) | first | trim) }}" + influxdb_org: "{{ (env_file.content | b64decode | regex_search('^INFLUXDB_ORG=(.+)$', '\\1', multiline=True) | first | trim) }}" + influxdb_bucket: "{{ (env_file.content | b64decode | regex_search('^INFLUXDB_BUCKET=(.+)$', '\\1', multiline=True) | first | trim) }}" + influxdb_token: "{{ (env_file.content | b64decode | regex_search('^INFLUXDB_TOKEN=(.+)$', '\\1', multiline=True) | first | trim) }}" + run_once: true + become: false + + roles: + - telegraf