diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..5c7d37c
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,95 @@
name: Release iperf3-monitor

on:
  push:
    tags:
      - 'v*.*.*'

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  lint-and-test:
    name: Lint and Test
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.10.0

      - name: Helm Lint
        run: helm lint ./charts/iperf3-monitor

  build-and-publish-image:
    name: Build and Publish Docker Image
    runs-on: ubuntu-latest
    needs: lint-and-test
    permissions:
      contents: read
      packages: write
    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: ./exporter
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  package-and-publish-chart:
    name: Package and Publish Helm Chart
    runs-on: ubuntu-latest
    needs: build-and-publish-image
    permissions:
      contents: write # Needed by stefanprodan/helm-gh-pages to push to gh-pages branch
    steps:
      - name: Check out code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0 # Fetch all history for helm-gh-pages to calculate chart index

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.10.0

      - name: Install yq
        run: |
          sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && \
          sudo chmod +x /usr/bin/yq

      - name: Set Chart Version from Tag
        run: |
          VERSION=$(echo "${{ github.ref_name }}" | sed 's/^v//')
          export VERSION # yq's strenv() reads from the environment, so the variable must be exported
          yq e -i '.version = strenv(VERSION)' ./charts/iperf3-monitor/Chart.yaml
          yq e -i '.appVersion = strenv(VERSION)' ./charts/iperf3-monitor/Chart.yaml
          cat ./charts/iperf3-monitor/Chart.yaml # Optional: print updated Chart.yaml

      - name: Publish Helm chart
        uses: stefanprodan/helm-gh-pages@v1.6.0
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          charts_dir: ./charts
          charts_url: https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e58ae8f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,40 @@
# Byte-code files
*.pyc
*.pyo
*.pyd
__pycache__/

# Distribution / build outputs
dist/
build/
*.egg-info/
.tox/

# Virtual environments
.venv/
venv/
env/

# Editor/IDE specific files
.idea/
.vscode/
*.swp
*.swo

# Logs and temporary files
*.log
*.tmp

# OS generated files
.DS_Store
Thumbs.db

# Docker
!Dockerfile
.dockerignore

# Helm
!charts/iperf3-monitor/.helmignore
# Ignore packaged chart files (a trailing comment on a pattern line is not supported by .gitignore)
charts/*.tgz
diff --git a/Kubernetes Network Performance Monitoring.md b/Kubernetes Network Performance Monitoring.md
new file mode 100755
index 0000000..2868fa5
--- /dev/null
+++ b/Kubernetes Network Performance Monitoring.md
@@ -0,0 +1,1505 @@
# **Architecting a Kubernetes-Native Network Performance Monitoring Service with iperf3,
Prometheus, and Helm** + +## **Section 1: Architectural Blueprint for Continuous Network Validation** + +### **1.1 Introduction to Proactive Network Monitoring in Kubernetes** {#introduction-to-proactive-network-monitoring-in-kubernetes} + +In modern cloud-native infrastructures, Kubernetes has emerged as the de +facto standard for container orchestration, simplifying the deployment, +scaling, and management of complex applications.^1^ However, the very +dynamism and abstraction that make Kubernetes powerful also introduce +significant challenges in diagnosing network performance issues. The +ephemeral nature of pods, the complexity of overlay networks provided by +Container Network Interfaces (CNIs), and the multi-layered traffic +routing through Services and Ingress controllers can obscure the root +causes of latency, packet loss, and throughput degradation. + +Traditional, reactive troubleshooting---investigating network problems +only after an application has failed---is insufficient in these +environments. Performance bottlenecks can be subtle, intermittent, and +difficult to reproduce, often manifesting as degraded user experience +long before they trigger hard failures.^1^ To maintain the reliability +and performance of critical workloads, engineering teams must shift from +a reactive to a proactive stance. This requires a system that performs +continuous, automated validation of the underlying network fabric, +treating network health not as an assumption but as a measurable, +time-series metric. + +This document outlines the architecture and implementation of a +comprehensive, Kubernetes-native network performance monitoring service. +The solution leverages a suite of industry-standard, open-source tools +to provide continuous, actionable insights into cluster network health. +The core components are: + +- **iperf3:** A widely adopted tool for active network performance + > measurement, used to generate traffic and measure maximum achievable + > bandwidth, jitter, and packet loss between two points.^2^ + +- **Prometheus:** A powerful, open-source monitoring and alerting system + > that has become the standard for collecting and storing time-series + > metrics in the Kubernetes ecosystem.^3^ + +- **Grafana:** A leading visualization tool for creating rich, + > interactive dashboards from various data sources, including + > Prometheus, enabling intuitive analysis of complex datasets.^4^ + +By combining these components into a cohesive, automated service, we can +transform abstract network performance into a concrete, queryable, and +visualizable stream of data, enabling teams to detect and address +infrastructure-level issues before they impact end-users.^6^ + +### **1.2 The Core Architectural Pattern: Decoupled Test Endpoints and a Central Orchestrator** {#the-core-architectural-pattern-decoupled-test-endpoints-and-a-central-orchestrator} + +The foundation of this monitoring service is a robust, decoupled +architectural pattern designed for scalability and resilience within a +dynamic Kubernetes environment. The design separates the passive test +endpoints from the active test orchestrator, a critical distinction that +ensures the system is both efficient and aligned with Kubernetes +operational principles. + +The data flow and component interaction can be visualized as follows: + +1. A **DaemonSet** deploys an iperf3 server pod onto every node in the + > cluster, creating a mesh of passive test targets. + +2. 
A central **Deployment**, the iperf3-exporter, uses the Kubernetes + > API to discover the IP addresses of all iperf3 server pods. + +3. The iperf3-exporter periodically orchestrates tests, running an + > iperf3 client to connect to each server pod and measure network + > performance. + +4. The exporter parses the JSON output from iperf3, transforms the + > results into Prometheus metrics, and exposes them on a /metrics + > HTTP endpoint. + +5. A **Prometheus** server, configured via a **ServiceMonitor**, + > scrapes the /metrics endpoint of the exporter, ingesting the + > performance data into its time-series database. + +6. A **Grafana** instance, using Prometheus as a data source, + > visualizes the metrics in a purpose-built dashboard, providing + > heatmaps and time-series graphs of node-to-node bandwidth, jitter, + > and packet loss. + +This architecture is composed of three primary logical components: + +- **Component 1: The iperf3-server DaemonSet.** To accurately measure + > network performance between any two nodes (N-to-N), an iperf3 server + > process must be running and accessible on every node. The DaemonSet + > is the canonical Kubernetes controller for this exact use case. It + > guarantees that a copy of a specific pod runs on all, or a selected + > subset of, nodes within the cluster.^7^ When a new node joins the + > cluster, the + > DaemonSet controller automatically deploys an iperf3-server pod to + > it; conversely, when a node is removed, the pod is garbage + > collected. This ensures the mesh of test endpoints is always in sync + > with the state of the cluster, requiring zero manual + > intervention.^9^ This pattern of using a + > DaemonSet to deploy iperf3 across a cluster is a well-established + > practice for network validation.^11^ + +- **Component 2: The iperf3-exporter Deployment.** A separate, + > centralized component is required to act as the test orchestrator. + > This component is responsible for initiating the iperf3 client + > connections, executing the tests, parsing the results, and exposing + > them as Prometheus metrics. Since this is a stateless service whose + > primary function is to perform a periodic task, a Deployment is the + > ideal controller.^8^ A + > Deployment ensures a specified number of replicas are running, + > provides mechanisms for rolling updates, and allows for independent + > resource management and lifecycle control, decoupled from the + > iperf3-server pods it tests against.^10^ + +- **Component 3: The Prometheus & Grafana Stack.** The monitoring + > backend is provided by the kube-prometheus-stack, a comprehensive + > Helm chart that deploys Prometheus, Grafana, Alertmanager, and the + > necessary exporters for cluster monitoring.^4^ Our custom monitoring + > service is designed to integrate seamlessly with this stack, + > leveraging its Prometheus Operator for automatic scrape + > configuration and its Grafana instance for visualization. + +### **1.3 Architectural Justification and Design Rationale** {#architectural-justification-and-design-rationale} + +The primary strength of this architecture lies in its deliberate +separation of concerns, a design choice that yields significant benefits +in resilience, scalability, and operational efficiency. The DaemonSet is +responsible for the *presence* of test endpoints, while the Deployment +handles the *orchestration* of the tests. This decoupling is not +arbitrary; it is a direct consequence of applying Kubernetes-native +principles to the problem. 
+ +The logical progression is as follows: The requirement to continuously +measure N-to-N node bandwidth necessitates that iperf3 server processes +are available on all N nodes to act as targets. The most reliable, +self-healing, and automated method to achieve this \"one-pod-per-node\" +pattern in Kubernetes is to use a DaemonSet.^7^ This makes the server +deployment automatically scale with the cluster itself. Next, a process +is needed to trigger the tests against these servers. This +\"orchestrator\" is a logically distinct, active service. It needs to be +reliable and potentially scalable, but it does not need to run on every +single node. The standard Kubernetes object for managing such stateless +services is a + +Deployment.^8^ + +This separation allows for independent and appropriate resource +allocation. The iperf3-server pods are extremely lightweight, consuming +minimal resources while idle. The iperf3-exporter, however, may be more +CPU-intensive during the brief periods it is actively running tests. By +placing them in different workload objects (DaemonSet and Deployment), +we can configure their resource requests and limits independently. This +prevents the monitoring workload from interfering with or being starved +by application workloads, a crucial consideration for any +production-grade system. This design is fundamentally more robust and +scalable than simpler, monolithic approaches, such as a single script +that attempts to manage both server and client lifecycles.^12^ + +## **Section 2: Implementing the iperf3-prometheus-exporter** + +The heart of this monitoring solution is the iperf3-prometheus-exporter, +a custom application responsible for orchestrating the network tests and +translating their results into a format that Prometheus can ingest. This +section provides a detailed breakdown of its implementation, from +technology selection to the final container image. + +### **2.1 Technology Selection: Python for Agility and Ecosystem** {#technology-selection-python-for-agility-and-ecosystem} + +Python was selected as the implementation language for the exporter due +to its powerful ecosystem and rapid development capabilities. The +availability of mature, well-maintained libraries for interacting with +both Prometheus and Kubernetes significantly accelerates the development +of a robust, cloud-native application. + +The key libraries leveraged are: + +- **prometheus-client:** The official Python client library for + > instrumenting applications with Prometheus metrics. It provides a + > simple API for defining metrics (Gauges, Counters, etc.) and + > exposing them via an HTTP server, handling much of the boilerplate + > required for creating a valid exporter.^13^ + +- **iperf3-python:** A clean, high-level Python wrapper around the + > iperf3 C library. It allows for programmatic control of iperf3 + > clients and servers, and it can directly parse the JSON output of a + > test into a convenient Python object, eliminating the need for + > manual process management and output parsing.^15^ + +- **kubernetes:** The official Python client library for the Kubernetes + > API. This library is essential for the exporter to become + > \"Kubernetes-aware,\" enabling it to dynamically discover the + > iperf3-server pods it needs to test against by querying the API + > server directly. 
+ +### **2.2 Core Exporter Logic (Annotated Python Code)** {#core-exporter-logic-annotated-python-code} + +The exporter\'s logic can be broken down into five distinct steps, which +together form a continuous loop of discovery, testing, and reporting. + +#### **Step 1: Initialization and Metric Definition** + +The application begins by importing the necessary libraries and defining +the Prometheus metrics that will be exposed. We use a Gauge metric, as +bandwidth is a value that can go up or down. Labels are crucial for +providing context; they allow us to slice and dice the data in +Prometheus and Grafana. + +> Python + +import os +import time +import logging +from kubernetes import client, config +from prometheus_client import start_http_server, Gauge +import iperf3 + +\# \-\-- Configuration \-\-- +\# Configure logging +logging.basicConfig(level=logging.INFO, format=\'%(asctime)s - +%(levelname)s - %(message)s\') + +\# \-\-- Prometheus Metrics Definition \-\-- +IPERF_BANDWIDTH_MBPS = Gauge( +\'iperf_network_bandwidth_mbps\', +\'Network bandwidth measured by iperf3 in Megabits per second\', +\[\'source_node\', \'destination_node\', \'protocol\'\] +) +IPERF_JITTER_MS = Gauge( +\'iperf_network_jitter_ms\', +\'Network jitter measured by iperf3 in milliseconds\', +\[\'source_node\', \'destination_node\', \'protocol\'\] +) +IPERF_PACKETS_TOTAL = Gauge( +\'iperf_network_packets_total\', +\'Total packets transmitted or received during the iperf3 test\', +\[\'source_node\', \'destination_node\', \'protocol\'\] +) +IPERF_LOST_PACKETS = Gauge( +\'iperf_network_lost_packets_total\', +\'Total lost packets during the iperf3 UDP test\', +\[\'source_node\', \'destination_node\', \'protocol\'\] +) +IPERF_TEST_SUCCESS = Gauge( +\'iperf_test_success\', +\'Indicates if the iperf3 test was successful (1) or failed (0)\', +\[\'source_node\', \'destination_node\', \'protocol\'\] +) + +#### **Step 2: Kubernetes-Aware Target Discovery** + +A static list of test targets is an anti-pattern in a dynamic +environment like Kubernetes.^16^ The exporter must dynamically discover +its targets. This is achieved by using the Kubernetes Python client to +query the API server for all pods that match the label selector of our + +iperf3-server DaemonSet (e.g., app=iperf3-server). The function returns +a list of dictionaries, each containing the pod\'s IP address and the +name of the node it is running on. + +This dynamic discovery is what transforms the exporter from a simple +script into a resilient, automated service. It adapts to cluster scaling +events without any manual intervention. The logical path is clear: +Kubernetes clusters are dynamic, so a hardcoded list of IPs would become +stale instantly. The API server is the single source of truth for the +cluster\'s state. Therefore, the exporter must query this API, which in +turn necessitates including the Kubernetes client library and +configuring the appropriate Role-Based Access Control (RBAC) permissions +for its ServiceAccount. + +> Python + +def discover_iperf_servers(): +\"\"\" +Discover iperf3 server pods in the cluster using the Kubernetes API. 
+\"\"\" +try: +\# Load in-cluster configuration +config.load_incluster_config() +v1 = client.CoreV1Api() + +namespace = os.getenv(\'IPERF_SERVER_NAMESPACE\', \'default\') +label_selector = os.getenv(\'IPERF_SERVER_LABEL_SELECTOR\', +\'app=iperf3-server\') + +logging.info(f\"Discovering iperf3 servers with label +\'{label_selector}\' in namespace \'{namespace}\'\") + +ret = v1.list_pod_for_all_namespaces(label_selector=label_selector, +watch=False) + +servers = +for i in ret.items: +\# Ensure pod has an IP and is running +if i.status.pod_ip and i.status.phase == \'Running\': +servers.append({ +\'ip\': i.status.pod_ip, +\'node_name\': i.spec.node_name +}) +logging.info(f\"Discovered {len(servers)} iperf3 server pods.\") +return servers +except Exception as e: +logging.error(f\"Error discovering iperf servers: {e}\") +return + +#### **Step 3: The Test Orchestration Loop** + +The main function of the application contains an infinite while True +loop that orchestrates the entire process. It periodically discovers the +servers, creates a list of test pairs (node-to-node), and then executes +an iperf3 test for each pair. + +> Python + +def run_iperf_test(server_ip, server_port, protocol, source_node, +dest_node): +\"\"\" +Runs a single iperf3 test and updates Prometheus metrics. +\"\"\" +logging.info(f\"Running iperf3 test from {source_node} to {dest_node} +({server_ip}:{server_port}) using {protocol.upper()}\") + +client = iperf3.Client() +client.server_hostname = server_ip +client.port = server_port +client.protocol = protocol +client.duration = int(os.getenv(\'IPERF_TEST_DURATION\', 5)) +client.json_output = True \# Critical for parsing + +result = client.run() + +\# Parse results and update metrics +parse_and_publish_metrics(result, source_node, dest_node, protocol) + +def main_loop(): +\"\"\" +Main orchestration loop. +\"\"\" +test_interval = int(os.getenv(\'IPERF_TEST_INTERVAL\', 300)) +server_port = int(os.getenv(\'IPERF_SERVER_PORT\', 5201)) +protocol = os.getenv(\'IPERF_TEST_PROTOCOL\', \'tcp\').lower() +source_node_name = os.getenv(\'SOURCE_NODE_NAME\') \# Injected via +Downward API + +if not source_node_name: +logging.error(\"SOURCE_NODE_NAME environment variable not set. +Exiting.\") +return + +while True: +servers = discover_iperf_servers() + +for server in servers: +\# Avoid testing a node against itself +if server\[\'node_name\'\] == source_node_name: +continue + +run_iperf_test(server\[\'ip\'\], server_port, protocol, +source_node_name, server\[\'node_name\'\]) + +logging.info(f\"Completed test cycle. Sleeping for {test_interval} +seconds.\") +time.sleep(test_interval) + +#### **Step 4: Parsing and Publishing Metrics** + +After each test run, a dedicated function parses the JSON result object +provided by the iperf3-python library.^15^ It extracts the key +performance indicators and uses them to set the value of the +corresponding Prometheus + +Gauge, applying the correct labels for source and destination nodes. +Robust error handling ensures that failed tests are also recorded as a +metric, which is vital for alerting. + +> Python + +def parse_and_publish_metrics(result, source_node, dest_node, +protocol): +\"\"\" +Parses the iperf3 result and updates Prometheus gauges. 
+\"\"\" +labels = {\'source_node\': source_node, \'destination_node\': dest_node, +\'protocol\': protocol} + +if result.error: +logging.error(f\"Test from {source_node} to {dest_node} failed: +{result.error}\") +IPERF_TEST_SUCCESS.labels(\*\*labels).set(0) +\# Clear previous successful metrics for this path +IPERF_BANDWIDTH_MBPS.labels(\*\*labels).set(0) +IPERF_JITTER_MS.labels(\*\*labels).set(0) +return + +IPERF_TEST_SUCCESS.labels(\*\*labels).set(1) + +\# The summary data is in result.sent_Mbps or result.received_Mbps +depending on direction +\# For simplicity, we check for available attributes. +if hasattr(result, \'sent_Mbps\'): +bandwidth_mbps = result.sent_Mbps +elif hasattr(result, \'received_Mbps\'): +bandwidth_mbps = result.received_Mbps +else: +\# Fallback for different iperf3 versions/outputs +bandwidth_mbps = result.Mbps if hasattr(result, \'Mbps\') else 0 + +IPERF_BANDWIDTH_MBPS.labels(\*\*labels).set(bandwidth_mbps) + +if protocol == \'udp\': +IPERF_JITTER_MS.labels(\*\*labels).set(result.jitter_ms if +hasattr(result, \'jitter_ms\') else 0) +IPERF_PACKETS_TOTAL.labels(\*\*labels).set(result.packets if +hasattr(result, \'packets\') else 0) +IPERF_LOST_PACKETS.labels(\*\*labels).set(result.lost_packets if +hasattr(result, \'lost_packets\') else 0) + +#### **Step 5: Exposing the /metrics Endpoint** + +Finally, the main execution block starts a simple HTTP server using the +prometheus-client library. This server exposes the collected metrics on +the standard /metrics path, ready to be scraped by Prometheus.^13^ + +> Python + +if \_\_name\_\_ == \'\_\_main\_\_\': +\# Start the Prometheus metrics server +listen_port = int(os.getenv(\'LISTEN_PORT\', 9876)) +start_http_server(listen_port) +logging.info(f\"Prometheus exporter listening on port {listen_port}\") + +\# Start the main orchestration loop +main_loop() + +### **2.3 Containerizing the Exporter (Dockerfile)** {#containerizing-the-exporter-dockerfile} + +To deploy the exporter in Kubernetes, it must be packaged into a +container image. A multi-stage Dockerfile is used to create a minimal +and more secure final image by separating the build environment from the +runtime environment. This is a standard best practice for producing +production-ready containers.^14^ + +> Dockerfile + +\# Stage 1: Build stage with dependencies +FROM python:3.9-slim as builder + +WORKDIR /app + +\# Install iperf3 and build dependencies +RUN apt-get update && \\ +apt-get install -y \--no-install-recommends gcc iperf3 libiperf-dev && +\\ +rm -rf /var/lib/apt/lists/\* + +\# Install Python dependencies +COPY requirements.txt. +RUN pip install \--no-cache-dir -r requirements.txt + +\# Stage 2: Final runtime stage +FROM python:3.9-slim + +WORKDIR /app + +\# Copy iperf3 binary and library from the builder stage +COPY \--from=builder /usr/bin/iperf3 /usr/bin/iperf3 +COPY \--from=builder /usr/lib/x86_64-linux-gnu/libiperf.so.0 +/usr/lib/x86_64-linux-gnu/libiperf.so.0 + +\# Copy installed Python packages from the builder stage +COPY \--from=builder /usr/local/lib/python3.9/site-packages +/usr/local/lib/python3.9/site-packages + +\# Copy the exporter application code +COPY exporter.py. 
### **2.3 Containerizing the Exporter (Dockerfile)** {#containerizing-the-exporter-dockerfile}

To deploy the exporter in Kubernetes, it must be packaged into a
container image. A multi-stage Dockerfile is used to create a minimal
and more secure final image by separating the build environment from the
runtime environment. This is a standard best practice for producing
production-ready containers.^14^

```dockerfile
# Stage 1: Build stage with dependencies
FROM python:3.9-slim as builder

WORKDIR /app

# Install iperf3 and build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends gcc iperf3 libiperf-dev && \
    rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Stage 2: Final runtime stage
FROM python:3.9-slim

WORKDIR /app

# Copy the iperf3 binary and library from the builder stage
COPY --from=builder /usr/bin/iperf3 /usr/bin/iperf3
COPY --from=builder /usr/lib/x86_64-linux-gnu/libiperf.so.0 /usr/lib/x86_64-linux-gnu/libiperf.so.0

# Copy installed Python packages from the builder stage
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages

# Copy the exporter application code
COPY exporter.py .

# Expose the metrics port
EXPOSE 9876

# Set the entrypoint
CMD ["python", "exporter.py"]
```

The corresponding requirements.txt would contain:

```
prometheus-client
iperf3
kubernetes
```
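Building and pushing the image is then a routine Docker workflow. The
registry path and tag below are placeholders to substitute with your
own:

```sh
docker build -t ghcr.io/my-org/iperf3-prometheus-exporter:0.1.0 .
docker push ghcr.io/my-org/iperf3-prometheus-exporter:0.1.0
```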
## **Section 3: Kubernetes Manifests and Deployment Strategy**

With the architectural blueprint defined and the exporter application
containerized, the next step is to translate this design into
declarative Kubernetes manifests. These YAML files define the necessary
Kubernetes objects to deploy, configure, and manage the monitoring
service. Using static manifests here provides a clear foundation before
they are parameterized into a Helm chart in the next section.

### **3.1 The iperf3-server DaemonSet** {#the-iperf3-server-daemonset}

The iperf3-server component is deployed as a DaemonSet to ensure an
instance of the server pod runs on every eligible node in the
cluster.^7^ This creates the ubiquitous grid of test endpoints required
for comprehensive N-to-N testing.

Key fields in this manifest include:

- **spec.selector**: Connects the DaemonSet to the pods it manages via
  labels.

- **spec.template.metadata.labels**: The label app: iperf3-server is
  applied to the pods, which is crucial for discovery by both the
  iperf3-exporter and Kubernetes Services.

- **spec.template.spec.containers**: Defines the iperf3 container, using
  a public image and running the iperf3 -s command to start it in
  server mode.

- **spec.template.spec.tolerations**: This is often necessary to allow
  the DaemonSet to schedule pods on control-plane (master) nodes,
  which may have taints preventing normal workloads from running
  there. This ensures the entire cluster, including masters, is part
  of the test mesh.

- **spec.template.spec.hostNetwork: true**: This is a critical setting.
  By running the server pods on the host's network namespace, we
  bypass the Kubernetes network overlay (CNI) for the server side.
  This allows the test to measure the raw performance of the
  underlying node network interface, which is often the primary goal
  of infrastructure-level testing.

```yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: iperf3-server
  labels:
    app: iperf3-server
spec:
  selector:
    matchLabels:
      app: iperf3-server
  template:
    metadata:
      labels:
        app: iperf3-server
    spec:
      # Run on the host network to measure raw node-to-node performance
      hostNetwork: true
      # Tolerations to allow scheduling on control-plane nodes
      tolerations:
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Exists"
          effect: "NoSchedule"
        - key: "node-role.kubernetes.io/master"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - name: iperf3-server
          image: networkstatic/iperf3:latest
          args: ["-s"]  # Start in server mode
          ports:
            - containerPort: 5201
              name: iperf3
              protocol: TCP
            - containerPort: 5201
              name: iperf3-udp
              protocol: UDP
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "100m"
              memory: "128Mi"
```

### **3.2 The iperf3-exporter Deployment** {#the-iperf3-exporter-deployment}

The iperf3-exporter is deployed as a Deployment, as it is a stateless
application that orchestrates the tests.^14^ Only one replica is
typically needed, as it can sequentially test all nodes.

Key fields in this manifest are:

- **spec.replicas: 1**: A single instance is sufficient for most
  clusters.

- **spec.template.spec.serviceAccountName**: This assigns the custom
  ServiceAccount (defined next) to the pod, granting it the necessary
  permissions to talk to the Kubernetes API.

- **spec.template.spec.containers.env**: The SOURCE_NODE_NAME
  environment variable is populated using the Downward API. This is
  how the exporter pod knows which node *it* is running on, allowing
  it to skip testing against itself.

- **spec.template.spec.containers.image**: This points to the custom
  exporter image built in the previous section.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: iperf3-exporter
  labels:
    app: iperf3-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: iperf3-exporter
  template:
    metadata:
      labels:
        app: iperf3-exporter
    spec:
      serviceAccountName: iperf3-exporter-sa
      containers:
        - name: iperf3-exporter
          image: your-repo/iperf3-prometheus-exporter:latest  # Replace with your image
          ports:
            - containerPort: 9876
              name: metrics
          env:
            # Use the Downward API to inject the node name this pod is running on
            - name: SOURCE_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # Other configuration for the exporter script
            - name: IPERF_TEST_INTERVAL
              value: "300"
            - name: IPERF_SERVER_LABEL_SELECTOR
              value: "app=iperf3-server"
          resources:
            requests:
              cpu: "100m"
              memory: "128Mi"
            limits:
              cpu: "500m"
              memory: "256Mi"
```

### **3.3 RBAC: Granting Necessary Permissions** {#rbac-granting-necessary-permissions}

For the exporter to perform its dynamic discovery of iperf3-server pods,
it must be granted specific, limited permissions to read information
from the Kubernetes API. This is accomplished through a ServiceAccount,
a ClusterRole, and a ClusterRoleBinding.

- **ServiceAccount**: Provides an identity for the exporter pod within
  the cluster.

- **ClusterRole**: Defines a set of permissions. Here, we grant get,
  list, and watch access to pods. These are the minimum required
  permissions for the discovery function to work. The role is a
  ClusterRole because the exporter needs to find pods across all
  namespaces where servers might be running.

- **ClusterRoleBinding**: Links the ServiceAccount to the ClusterRole,
  effectively granting the permissions to any pod that uses the
  ServiceAccount.

```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: iperf3-exporter-sa
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: iperf3-exporter-role
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: iperf3-exporter-rb
subjects:
  - kind: ServiceAccount
    name: iperf3-exporter-sa
    namespace: default  # The namespace where the exporter is deployed
roleRef:
  kind: ClusterRole
  name: iperf3-exporter-role
  apiGroup: rbac.authorization.k8s.io
```
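Once these objects are applied, the grant can be verified without
deploying anything, using kubectl's built-in authorization check (the
namespace matches the ClusterRoleBinding above):

```sh
kubectl auth can-i list pods \
  --as=system:serviceaccount:default:iperf3-exporter-sa
# Expected output: yes
```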
### **3.4 Network Exposure: Service and ServiceMonitor** {#network-exposure-service-and-servicemonitor}

To make the exporter's metrics available to Prometheus, we need two
final objects. The Service exposes the exporter pod's metrics port
within the cluster, and the ServiceMonitor tells the Prometheus Operator
how to find and scrape that service.

This ServiceMonitor-based approach is the linchpin for a GitOps-friendly
integration. Instead of manually editing the central Prometheus
configuration file---a brittle and non-declarative process---we deploy a
ServiceMonitor custom resource alongside our application.^14^ The
Prometheus Operator, a key component of the kube-prometheus-stack,
continuously watches for these objects. When it discovers our
iperf3-exporter-sm, it automatically generates the necessary scrape
configuration and reloads Prometheus without any manual
intervention.^4^ This empowers the application team to define *how
their application should be monitored* as part of the application's own
deployment package, a cornerstone of scalable, "you build it, you run
it" observability.

```yaml
apiVersion: v1
kind: Service
metadata:
  name: iperf3-exporter-svc
  labels:
    app: iperf3-exporter
spec:
  selector:
    app: iperf3-exporter
  ports:
    - name: metrics
      port: 9876
      targetPort: metrics
      protocol: TCP
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: iperf3-exporter-sm
  labels:
    # Label for the Prometheus Operator to discover this ServiceMonitor
    release: prometheus-operator
spec:
  selector:
    matchLabels:
      # This must match the labels on the Service object above
      app: iperf3-exporter
  endpoints:
    - port: metrics
      interval: 60s
      scrapeTimeout: 30s
```
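With scraping in place, the iperf_test_success gauge is a natural basis
for alerting. The following PrometheusRule is an illustrative sketch: it
assumes the Prometheus Operator's PrometheusRule CRD is installed, and
the rule name, duration, and severity are placeholders to adapt. The
release label mirrors the discovery convention used by the
ServiceMonitor above.

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: iperf3-monitor-rules
  labels:
    release: prometheus-operator
spec:
  groups:
    - name: iperf3.rules
      rules:
        - alert: IperfPathDown
          expr: iperf_test_success == 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "iperf3 test failing from {{ $labels.source_node }} to {{ $labels.destination_node }}"
```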
## **Section 4: Packaging with Helm for Reusability and Distribution**

While static YAML manifests are excellent for defining Kubernetes
resources, they lack the flexibility needed for easy configuration,
distribution, and lifecycle management. Helm, the package manager for
Kubernetes, solves this by bundling applications into
version-controlled, reusable packages called charts.^17^ This section
details how to package the entire iperf3 monitoring service into a
professional, flexible, and distributable Helm chart.

### **4.1 Helm Chart Structure** {#helm-chart-structure}

A well-organized Helm chart follows a standard directory structure. This
convention makes charts easier to understand and maintain.^19^

```
iperf3-monitor/
├── Chart.yaml          # Metadata about the chart (name, version, etc.)
├── values.yaml         # Default configuration values for the chart
├── charts/             # Directory for sub-chart dependencies (empty for this project)
├── templates/          # Directory containing the templated Kubernetes manifests
│   ├── _helpers.tpl    # A place for reusable template helpers
│   ├── server-daemonset.yaml
│   ├── exporter-deployment.yaml
│   ├── rbac.yaml
│   ├── service.yaml
│   └── servicemonitor.yaml
└── README.md           # Documentation for the chart
```

### **4.2 Templating the Kubernetes Manifests** {#templating-the-kubernetes-manifests}

The core of Helm's power lies in its templating engine, which uses Go
templates. We convert the static manifests from Section 3 into dynamic
templates by replacing hardcoded values with references to variables
defined in the values.yaml file.

A crucial best practice is to use a \_helpers.tpl file to define common
functions and partial templates, especially for generating resource
names and labels. This reduces boilerplate, ensures consistency, and
makes the chart easier to manage.^19^

**Example: templates/\_helpers.tpl**

```
{{/*
Expand the name of the chart.
*/}}
{{- define "iperf3-monitor.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited
to this (by the DNS naming spec).
*/}}
{{- define "iperf3-monitor.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end -}}

{{/*
Common labels
*/}}
{{- define "iperf3-monitor.labels" -}}
helm.sh/chart: {{ include "iperf3-monitor.name" . }}
{{ include "iperf3-monitor.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}

{{/*
Selector labels
*/}}
{{- define "iperf3-monitor.selectorLabels" -}}
app.kubernetes.io/name: {{ include "iperf3-monitor.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}
```

**Example: Templated exporter-deployment.yaml**

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "iperf3-monitor.fullname" . }}-exporter
  labels:
    {{- include "iperf3-monitor.labels" . | nindent 4 }}
    app.kubernetes.io/component: exporter
spec:
  replicas: {{ .Values.exporter.replicaCount }}
  selector:
    matchLabels:
      {{- include "iperf3-monitor.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: exporter
  template:
    metadata:
      labels:
        {{- include "iperf3-monitor.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: exporter
    spec:
      {{- if .Values.rbac.create }}
      serviceAccountName: {{ include "iperf3-monitor.fullname" . }}-sa
      {{- else }}
      serviceAccountName: {{ .Values.serviceAccount.name }}
      {{- end }}
      containers:
        - name: iperf3-exporter
          image: "{{ .Values.exporter.image.repository }}:{{ .Values.exporter.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.exporter.image.pullPolicy }}
          ports:
            - containerPort: 9876
              name: metrics
          env:
            - name: SOURCE_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: IPERF_TEST_INTERVAL
              value: "{{ .Values.exporter.testInterval }}"
          resources:
            {{- toYaml .Values.exporter.resources | nindent 12 }}
```

### **4.3 Designing a Comprehensive values.yaml** {#designing-a-comprehensive-values.yaml}

The values.yaml file is the public API of a Helm chart. A well-designed
values file is intuitive, clearly documented, and provides users with
the flexibility to adapt the chart to their specific needs. Best
practices include using clear, camelCase naming conventions and
providing comments for every parameter.^21^

A particularly powerful feature of Helm is conditional logic. By
wrapping entire resource definitions in if blocks based on boolean flags
in values.yaml (e.g., {{- if .Values.rbac.create }}), the chart becomes
highly adaptable. A user in a high-security environment can disable the
automatic creation of ClusterRoles by setting rbac.create: false,
allowing them to manage permissions manually without causing the Helm
installation to fail.^20^ Similarly, a user not running the Prometheus
Operator can set serviceMonitor.enabled: false. This adaptability
transforms the chart from a rigid, all-or-nothing package into a
flexible building block, dramatically increasing its utility across
different organizations and security postures.
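To make the pattern concrete, here is a hedged sketch of the relevant
values.yaml defaults and a guarded template, with names chosen to match
the parameter table that follows:

```yaml
# values.yaml (excerpt)
rbac:
  create: true    # set to false to manage RBAC objects yourself
serviceAccount:
  name: ""        # consulted only when rbac.create is false
serviceMonitor:
  enabled: true
  interval: 60s
```

```yaml
# templates/rbac.yaml (excerpt) -- nothing is rendered when rbac.create is false
{{- if .Values.rbac.create }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "iperf3-monitor.fullname" . }}-sa
{{- end }}
```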
+ +The following table documents the comprehensive set of configurable +parameters for the iperf3-monitor chart. This serves as the primary +documentation for any user wishing to install and customize the service. + +| Parameter | Description | Type | Default | +|------------------------------|----------------------------------------------------------------------|---------|-------------------------------------------| +| nameOverride | Override the name of the chart. | string | \"\" | +| fullnameOverride | Override the fully qualified app name. | string | \"\" | +| exporter.image.repository | The container image repository for the exporter. | string | ghcr.io/my-org/iperf3-prometheus-exporter | +| exporter.image.tag | The container image tag for the exporter. | string | (Chart.AppVersion) | +| exporter.image.pullPolicy | The image pull policy for the exporter. | string | IfNotPresent | +| exporter.replicaCount | Number of exporter pod replicas. | integer | 1 | +| exporter.testInterval | Interval in seconds between test cycles. | integer | 300 | +| exporter.testTimeout | Timeout in seconds for a single iperf3 test. | integer | 10 | +| exporter.testProtocol | Protocol to use for testing (tcp or udp). | string | tcp | +| exporter.resources | CPU/memory resource requests and limits for the exporter. | object | {} | +| server.image.repository | The container image repository for the iperf3 server. | string | networkstatic/iperf3 | +| server.image.tag | The container image tag for the iperf3 server. | string | latest | +| server.resources | CPU/memory resource requests and limits for the server pods. | object | {} | +| server.nodeSelector | Node selector for scheduling server pods. | object | {} | +| server.tolerations | Tolerations for scheduling server pods on tainted nodes. | array | \`\` | +| rbac.create | If true, create ServiceAccount, ClusterRole, and ClusterRoleBinding. | boolean | true | +| serviceAccount.name | The name of the ServiceAccount to use. Used if rbac.create is false. | string | \"\" | +| serviceMonitor.enabled | If true, create a ServiceMonitor for Prometheus Operator. | boolean | true | +| serviceMonitor.interval | Scrape interval for the ServiceMonitor. | string | 60s | +| serviceMonitor.scrapeTimeout | Scrape timeout for the ServiceMonitor. | string | 30s | + +## **Section 5: Visualizing Network Performance with a Custom Grafana Dashboard** + +The final piece of the user experience is a purpose-built Grafana +dashboard that transforms the raw, time-series metrics from Prometheus +into intuitive, actionable visualizations. A well-designed dashboard +does more than just display data; it tells a story, guiding an operator +from a high-level overview of cluster health to a deep-dive analysis of +a specific problematic network path.^5^ + +### **5.1 Dashboard Design Principles** {#dashboard-design-principles} + +The primary goals for this network performance dashboard are: + +1. **At-a-Glance Overview:** Provide an immediate, cluster-wide view of + > network health, allowing operators to quickly spot systemic issues + > or anomalies. + +2. **Intuitive Drill-Down:** Enable users to seamlessly transition from + > a high-level view to a detailed analysis of performance between + > specific nodes. + +3. **Correlation:** Display multiple related metrics (bandwidth, + > jitter, packet loss) on the same timeline to help identify causal + > relationships. + +4. 
**Clarity and Simplicity:** Avoid clutter and overly complex panels + > that can obscure meaningful data.^4^ + +### **5.2 Key Visualizations and Panels** {#key-visualizations-and-panels} + +The dashboard is constructed from several key panel types, each serving +a specific analytical purpose. + +- **Panel 1: Node-to-Node Bandwidth Heatmap.** This is the centerpiece + > of the dashboard\'s overview. It uses Grafana\'s \"Heatmap\" + > visualization to create a matrix of network performance. + + - **Y-Axis:** Source Node (source_node label). + + - **X-Axis:** Destination Node (destination_node label). + + - **Cell Color:** The value of the iperf_network_bandwidth_mbps + > metric. + + - PromQL Query: avg(iperf_network_bandwidth_mbps) by (source_node, + > destination_node) + > This panel provides an instant visual summary of the entire + > cluster\'s network fabric. A healthy cluster will show a uniformly + > \"hot\" (high bandwidth) grid, while any \"cold\" spots + > immediately draw attention to underperforming network paths. + +- **Panel 2: Time-Series Performance Graphs.** These panels use the + > \"Time series\" visualization to plot performance over time, + > allowing for trend analysis and historical investigation. + + - **Bandwidth (Mbps):** Plots + > iperf_network_bandwidth_mbps{source_node=\"\$source_node\", + > destination_node=\"\$destination_node\"}. + + - **Jitter (ms):** Plots + > iperf_network_jitter_ms{source_node=\"\$source_node\", + > destination_node=\"\$destination_node\", protocol=\"udp\"}. + + - Packet Loss (%): Plots (iperf_network_lost_packets_total{\...} / + > iperf_network_packets_total{\...}) \* 100. + > These graphs are filtered by the dashboard variables, enabling the + > drill-down analysis. + +- **Panel 3: Stat Panels.** These panels use the \"Stat\" visualization + > to display single, key performance indicators (KPIs) for the + > selected time range and nodes. + + - **Average Bandwidth:** avg(iperf_network_bandwidth_mbps{\...}) + + - **Minimum Bandwidth:** min(iperf_network_bandwidth_mbps{\...}) + + - **Maximum Jitter:** max(iperf_network_jitter_ms{\...}) + +### **5.3 Enabling Interactivity with Grafana Variables** {#enabling-interactivity-with-grafana-variables} + +The dashboard\'s interactivity is powered by Grafana\'s template +variables. These variables are dynamically populated from Prometheus and +are used to filter the data displayed in the panels.^4^ + +- **\$source_node**: A dropdown variable populated by the PromQL query + > label_values(iperf_network_bandwidth_mbps, source_node). + +- **\$destination_node**: A dropdown variable populated by + > label_values(iperf_network_bandwidth_mbps{source_node=\"\$source_node\"}, + > destination_node). This query is cascaded, meaning it only shows + > destinations relevant to the selected source. + +- **\$protocol**: A custom variable with the options tcp and udp. + +This combination of a high-level heatmap with interactive, +variable-driven drill-down graphs creates a powerful analytical +workflow. An operator can begin with a bird\'s-eye view of the cluster. +Upon spotting an anomaly on the heatmap (e.g., a low-bandwidth link +between Node-5 and Node-8), they can use the \$source_node and +\$destination_node dropdowns to select that specific path. All the +time-series panels will instantly update to show the detailed +performance history for that link, allowing the operator to correlate +bandwidth drops with jitter spikes or other events. 
This workflow transforms raw data into actionable insight, dramatically
reducing the Mean Time to Identification (MTTI) for network issues.

### **5.4 The Complete Grafana Dashboard JSON Model** {#the-complete-grafana-dashboard-json-model}

To facilitate easy deployment, the entire dashboard is defined in a
single JSON model. This model can be imported directly into any Grafana
instance.

```json
{
  "__inputs": [],
  "__requires": [
    {
      "type": "grafana",
      "id": "grafana",
      "name": "Grafana",
      "version": "8.0.0"
    },
    {
      "type": "datasource",
      "id": "prometheus",
      "name": "Prometheus",
      "version": "1.0.0"
    }
  ],
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": null,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "panels": [
    {
      "title": "Node-to-Node Bandwidth Heatmap",
      "type": "heatmap",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": { "h": 9, "w": 24, "x": 0, "y": 0 },
      "targets": [
        {
          "expr": "avg(iperf_network_bandwidth_mbps) by (source_node, destination_node)",
          "format": "heatmap",
          "legendFormat": "{{source_node}} -> {{destination_node}}",
          "refId": "A"
        }
      ],
      "cards": { "cardPadding": null, "cardRound": null },
      "color": {
        "mode": "spectrum",
        "scheme": "red-yellow-green",
        "exponent": 0.5,
        "reverse": false
      },
      "dataFormat": "tsbuckets",
      "yAxis": { "show": true, "format": "short" },
      "xAxis": { "show": true }
    },
    {
      "title": "Bandwidth Over Time (Source: $source_node, Dest: $destination_node)",
      "type": "timeseries",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
      "targets": [
        {
          "expr": "iperf_network_bandwidth_mbps{source_node=\"$source_node\", destination_node=\"$destination_node\"}",
          "legendFormat": "Bandwidth",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "mbps"
        }
      }
    },
    {
      "title": "Jitter Over Time (Source: $source_node, Dest: $destination_node)",
      "type": "timeseries",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
      "targets": [
        {
          "expr": "iperf_network_jitter_ms{source_node=\"$source_node\", destination_node=\"$destination_node\", protocol=\"udp\"}",
          "legendFormat": "Jitter",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "ms"
        }
      }
    }
  ],
  "refresh": "30s",
  "schemaVersion": 36,
  "style": "dark",
  "tags": ["iperf3", "network", "kubernetes"],
  "templating": {
    "list": [
      {
        "current": {},
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "definition": "label_values(iperf_network_bandwidth_mbps, source_node)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "source_node",
        "options": [],
        "query": "label_values(iperf_network_bandwidth_mbps, source_node)",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      },
      {
        "current": {},
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "definition": "label_values(iperf_network_bandwidth_mbps{source_node=\"$source_node\"}, destination_node)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "destination_node",
        "options": [],
        "query": "label_values(iperf_network_bandwidth_mbps{source_node=\"$source_node\"}, destination_node)",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      },
      {
        "current": { "selected": true, "text": "tcp", "value": "tcp" },
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "protocol",
        "options": [
          { "selected": true, "text": "tcp", "value": "tcp" },
          { "selected": false, "text": "udp", "value": "udp" }
        ],
        "query": "tcp,udp",
        "skipUrlSync": false,
        "type": "custom"
      }
    ]
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "browser",
  "title": "Kubernetes iperf3 Network Performance",
  "uid": "k8s-iperf3-dashboard",
  "version": 1,
  "weekStart": ""
}
```
\"tcp\", \"value\": \"tcp\" +}, +\"hide\": 0, +\"includeAll\": false, +\"multi\": false, +\"name\": \"protocol\", +\"options\": \[ +{ \"selected\": true, \"text\": \"tcp\", \"value\": \"tcp\" }, +{ \"selected\": false, \"text\": \"udp\", \"value\": \"udp\" } +\], +\"query\": \"tcp,udp\", +\"skipUrlSync\": false, +\"type\": \"custom\" +} +\] +}, +\"time\": { +\"from\": \"now-1h\", +\"to\": \"now\" +}, +\"timepicker\": {}, +\"timezone\": \"browser\", +\"title\": \"Kubernetes iperf3 Network Performance\", +\"uid\": \"k8s-iperf3-dashboard\", +\"version\": 1, +\"weekStart\": \"\" +} + +## **Section 6: GitHub Repository Structure and CI/CD Workflow** + +To deliver this monitoring service as a professional, open-source-ready +project, it is essential to package it within a well-structured GitHub +repository and implement a robust Continuous Integration and Continuous +Deployment (CI/CD) pipeline. This automates the build, test, and release +process, ensuring that every version of the software is consistent, +trustworthy, and easy for consumers to adopt. + +### **6.1 Recommended Repository Structure** {#recommended-repository-structure} + +A clean, logical directory structure is fundamental for project +maintainability and ease of navigation for contributors and users. + +. +├──.github/ +│ └── workflows/ +│ └── release.yml \# GitHub Actions workflow for CI/CD +├── charts/ +│ └── iperf3-monitor/ \# The Helm chart for the service +│ ├── Chart.yaml +│ ├── values.yaml +│ └── templates/ +│ └──\... +└── exporter/ +├── Dockerfile \# Dockerfile for the exporter +├── requirements.txt \# Python dependencies +└── exporter.py \# Exporter source code +├──.gitignore +├── LICENSE +└── README.md + +This structure cleanly separates the exporter application code +(/exporter) from its deployment packaging (/charts/iperf3-monitor), and +its release automation (/.github/workflows). + +### **6.2 CI/CD Pipeline with GitHub Actions** {#cicd-pipeline-with-github-actions} + +A fully automated CI/CD pipeline is the hallmark of a mature software +project. It eliminates manual, error-prone release steps and provides +strong guarantees about the integrity of the published artifacts. By +triggering the pipeline on the creation of a Git tag (e.g., v1.2.3), we +use the tag as a single source of truth for versioning both the Docker +image and the Helm chart. This ensures that chart version 1.2.3 is built +to use image version 1.2.3, and that both have been validated before +release. This automated, atomic release process provides trust and +velocity, elevating the project from a collection of files into a +reliable, distributable piece of software. 
The following GitHub Actions workflow automates the entire release
process:

```yaml
# .github/workflows/release.yml
name: Release iperf3-monitor

on:
  push:
    tags:
      - 'v*.*.*'

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  lint-and-test:
    name: Lint and Test
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.10.0

      - name: Helm Lint
        run: helm lint ./charts/iperf3-monitor

  build-and-publish-image:
    name: Build and Publish Docker Image
    runs-on: ubuntu-latest
    needs: lint-and-test
    permissions:
      contents: read
      packages: write
    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: ./exporter
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  package-and-publish-chart:
    name: Package and Publish Helm Chart
    runs-on: ubuntu-latest
    needs: build-and-publish-image
    permissions:
      contents: write
    steps:
      - name: Check out code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.10.0

      - name: Install yq
        run: |
          sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && \
          sudo chmod +x /usr/bin/yq

      - name: Set Chart Version
        run: |
          VERSION=$(echo "${{ github.ref_name }}" | sed 's/^v//')
          export VERSION # yq's strenv() reads from the environment
          yq e -i '.version = strenv(VERSION)' ./charts/iperf3-monitor/Chart.yaml
          yq e -i '.appVersion = strenv(VERSION)' ./charts/iperf3-monitor/Chart.yaml

      - name: Publish Helm chart
        uses: stefanprodan/helm-gh-pages@v1.6.0
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          charts_dir: ./charts
          charts_url: https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}
```

### **6.3 Documentation and Usability** {#documentation-and-usability}

The final, and arguably most critical, component for project success is
high-quality documentation. The README.md file at the root of the
repository is the primary entry point for any user. It should clearly
explain what the project does, its architecture, and how to deploy and
use it.

A common failure point in software projects is documentation that falls
out of sync with the code. For Helm charts, the values.yaml file
frequently changes, adding new parameters and options. To combat this,
it is a best practice to automate the documentation of these parameters.
The helm-docs tool can be integrated directly into the CI/CD pipeline to
automatically generate the "Parameters" section of the README.md by
parsing the comments directly from the values.yaml file.^20^ This
ensures that the documentation is always an accurate reflection of the
chart's configurable options, providing a seamless and trustworthy
experience for users.

## **Conclusion**

The proliferation of distributed microservices on Kubernetes has made
network performance a critical, yet often opaque, component of overall
application health. This report has detailed a comprehensive,
production-grade solution for establishing continuous network validation
within a Kubernetes cluster.
By architecting a system around the robust, +decoupled pattern of an iperf3-server DaemonSet and a Kubernetes-aware +iperf3-exporter Deployment, this service provides a resilient and +automated foundation for network observability. + +The implementation leverages industry-standard tools---Python for the +exporter, Prometheus for metrics storage, and Grafana for +visualization---to create a powerful and flexible monitoring pipeline. +The entire service is packaged into a professional Helm chart, following +best practices for templating, configuration, and adaptability. This +allows for simple, version-controlled deployment across a wide range of +environments. The final Grafana dashboard transforms the collected data +into an intuitive, interactive narrative, enabling engineers to move +swiftly from high-level anomaly detection to root-cause analysis. + +Ultimately, by treating network performance not as a given but as a +continuously measured metric, organizations can proactively identify and +resolve infrastructure bottlenecks, enhance application reliability, and +ensure a consistent, high-quality experience for their users in the +dynamic world of Kubernetes. + +#### Works cited + +1. How to Identify Performance Issues in Kubernetes - LabEx, accessed + > June 17, 2025, + > [[https://labex.io/questions/how-to-identify-performance-issues-in-kubernetes-11358]{.underline}](https://labex.io/questions/how-to-identify-performance-issues-in-kubernetes-11358) + +2. Performing large-scale network testing on Red Hat OpenShift: A 100 + > Gbps approach, accessed June 17, 2025, + > [[https://www.redhat.com/en/blog/performing-large-scale-network-testing-red-hat-openshift]{.underline}](https://www.redhat.com/en/blog/performing-large-scale-network-testing-red-hat-openshift) + +3. How to Implement Full-Stack Monitoring with Prometheus/Grafana on + > FreeBSD Operating System \| Siberoloji, accessed June 17, 2025, + > [[https://www.siberoloji.com/how-to-implement-full-stack-monitoring-with-prometheusgrafana-on-freebsd-operating-system/]{.underline}](https://www.siberoloji.com/how-to-implement-full-stack-monitoring-with-prometheusgrafana-on-freebsd-operating-system/) + +4. Kubernetes Metrics and Monitoring with Prometheus and Grafana - DEV + > Community, accessed June 17, 2025, + > [[https://dev.to/abhay_yt_52a8e72b213be229/kubernetes-metrics-and-monitoring-with-prometheus-and-grafana-4e9n]{.underline}](https://dev.to/abhay_yt_52a8e72b213be229/kubernetes-metrics-and-monitoring-with-prometheus-and-grafana-4e9n) + +5. The Top 30 Grafana Dashboard Examples - Logit.io, accessed June 17, + > 2025, + > [[https://logit.io/blog/post/top-grafana-dashboards-and-visualisations/]{.underline}](https://logit.io/blog/post/top-grafana-dashboards-and-visualisations/) + +6. Autopilot Metrics \| Grafana Labs, accessed June 17, 2025, + > [[https://grafana.com/grafana/dashboards/23123-autopilot-metrics/]{.underline}](https://grafana.com/grafana/dashboards/23123-autopilot-metrics/) + +7. Kubernetes DaemonSet: Practical Guide to Monitoring in Kubernetes - + > Cast AI, accessed June 17, 2025, + > [[https://cast.ai/blog/kubernetes-daemonset-practical-guide-to-monitoring-in-kubernetes/]{.underline}](https://cast.ai/blog/kubernetes-daemonset-practical-guide-to-monitoring-in-kubernetes/) + +8. 
Kubernetes DaemonSets vs Deployments: Key Differences and Use + > Cases - RubixKube™, accessed June 17, 2025, + > [[https://www.rubixkube.io/blog/kubernetes-daemonsets-vs-deployments-key-differences-and-use-cases-4a5i]{.underline}](https://www.rubixkube.io/blog/kubernetes-daemonsets-vs-deployments-key-differences-and-use-cases-4a5i) + +9. Kubernetes DaemonSet: Examples, Use Cases & Best Practices - + > groundcover, accessed June 17, 2025, + > [[https://www.groundcover.com/blog/kubernetes-daemonset]{.underline}](https://www.groundcover.com/blog/kubernetes-daemonset) + +10. Complete Comparison of Kubernetes Daemonset Vs Deployment \| + > Zeet.co, accessed June 17, 2025, + > [[https://zeet.co/blog/kubernetes-daemonset-vs-deployment]{.underline}](https://zeet.co/blog/kubernetes-daemonset-vs-deployment) + +11. Testing Connectivity Between Kubernetes Pods with Iperf3 \| + > Support - SUSE, accessed June 17, 2025, + > [[https://www.suse.com/support/kb/doc/?id=000020954]{.underline}](https://www.suse.com/support/kb/doc/?id=000020954) + +12. Pharb/kubernetes-iperf3: Simple wrapper around iperf3 to \... - + > GitHub, accessed June 17, 2025, + > [[https://github.com/Pharb/kubernetes-iperf3]{.underline}](https://github.com/Pharb/kubernetes-iperf3) + +13. Building a Custom Prometheus Exporter in Python - SysOpsPro, + > accessed June 17, 2025, + > [[https://sysopspro.com/how-to-build-your-own-prometheus-exporter-in-python/]{.underline}](https://sysopspro.com/how-to-build-your-own-prometheus-exporter-in-python/) + +14. How to Create a Prometheus Exporter? - Enix.io, accessed June 17, + > 2025, + > [[https://enix.io/en/blog/create-prometheus-exporter/]{.underline}](https://enix.io/en/blog/create-prometheus-exporter/) + +15. Examples --- iperf3 0.1.10 documentation - iperf3 python wrapper, + > accessed June 17, 2025, + > [[https://iperf3-python.readthedocs.io/en/latest/examples.html]{.underline}](https://iperf3-python.readthedocs.io/en/latest/examples.html) + +16. markormesher/iperf-prometheus-collector - GitHub, accessed June 17, + > 2025, + > [[https://github.com/markormesher/iperf-prometheus-collector]{.underline}](https://github.com/markormesher/iperf-prometheus-collector) + +17. Using Helm with Kubernetes: A Guide to Helm Charts and Their + > Implementation, accessed June 17, 2025, + > [[https://dev.to/alexmercedcoder/using-helm-with-kubernetes-a-guide-to-helm-charts-and-their-implementation-8dg]{.underline}](https://dev.to/alexmercedcoder/using-helm-with-kubernetes-a-guide-to-helm-charts-and-their-implementation-8dg) + +18. Kubernetes Helm Charts: The Basics and a Quick Tutorial - Spot.io, + > accessed June 17, 2025, + > [[https://spot.io/resources/kubernetes-architecture/kubernetes-helm-charts-the-basics-and-a-quick-tutorial/]{.underline}](https://spot.io/resources/kubernetes-architecture/kubernetes-helm-charts-the-basics-and-a-quick-tutorial/) + +19. Understand a Helm chart structure - Bitnami Documentation, accessed + > June 17, 2025, + > [[https://docs.bitnami.com/kubernetes/faq/administration/understand-helm-chart/]{.underline}](https://docs.bitnami.com/kubernetes/faq/administration/understand-helm-chart/) + +20. Helm Chart Essentials & Writing Effective Charts - DEV Community, + > accessed June 17, 2025, + > [[https://dev.to/hkhelil/helm-chart-essentials-writing-effective-charts-11ca]{.underline}](https://dev.to/hkhelil/helm-chart-essentials-writing-effective-charts-11ca) + +21. 
Values - Helm, accessed June 17, 2025, + > [[https://helm.sh/docs/chart_best_practices/values/]{.underline}](https://helm.sh/docs/chart_best_practices/values/) + +22. grafana.com, accessed June 17, 2025, + > [[https://grafana.com/grafana/dashboards/22901-traffic-monitoring/#:\~:text=host%20traffic%20breakdowns.-,Grafana%20Dashboard,traffic%20statistics%20by%20source%2Fdestination.]{.underline}](https://grafana.com/grafana/dashboards/22901-traffic-monitoring/#:~:text=host%20traffic%20breakdowns.-,Grafana%20Dashboard,traffic%20statistics%20by%20source%2Fdestination.) diff --git a/README.md b/README.md new file mode 100644 index 0000000..81a9f30 --- /dev/null +++ b/README.md @@ -0,0 +1,437 @@ +# Kubernetes-Native Network Performance Monitoring Service + +This project provides a comprehensive solution for continuous network validation within a Kubernetes cluster. Leveraging industry-standard tools like `iperf3`, `Prometheus`, and `Grafana`, it offers proactive monitoring of network performance between nodes, helping to identify and troubleshoot latency, bandwidth, and packet loss issues before they impact applications. + +## Features + +* **Continuous N-to-N Testing:** Automatically measures network performance between all nodes in the cluster. +* **Kubernetes-Native:** Deploys as standard Kubernetes workloads (DaemonSet, Deployment). +* **Dynamic Discovery:** Exporter automatically discovers iperf3 server pods using the Kubernetes API. +* **Prometheus Integration:** Translates iperf3 results into standard Prometheus metrics for time-series storage. +* **Grafana Visualization:** Provides a rich, interactive dashboard with heatmaps and time-series graphs. +* **Helm Packaging:** Packaged as a Helm chart for easy deployment and configuration management. +* **Automated CI/CD:** Includes a GitHub Actions workflow for building and publishing the exporter image and Helm chart. + +## Architecture + +The service is based on a decoupled architecture: + +1. **iperf3-server DaemonSet:** Deploys an `iperf3` server pod on every node to act as a test endpoint. Running on the host network to measure raw node performance. +2. **iperf3-exporter Deployment:** A centralized service that uses the Kubernetes API to discover server pods, orchestrates `iperf3` client tests against them, parses the JSON output, and exposes performance metrics via an HTTP endpoint. +3. **Prometheus & Grafana Stack:** A standard monitoring backend (like `kube-prometheus-stack`) that scrapes the exporter's metrics and visualizes them in a custom dashboard. + +This separation of concerns ensures scalability, resilience, and aligns with Kubernetes operational principles. + +## Getting Started + +### Prerequisites + +* A running Kubernetes cluster. +* `kubectl` configured to connect to your cluster. +* Helm v3+ installed. +* A Prometheus instance configured to scrape services (ideally using the Prometheus Operator and ServiceMonitors). +* A Grafana instance accessible and configured with Prometheus as a data source. + +### Installation with Helm + +1. Add the Helm chart repository (replace with your actual repo URL once published): + + ```/dev/null/helm-install.sh#L1-1 + helm repo add iperf3-monitor https://your-github-org.github.io/iperf3-monitor/ + ``` + +2. Update your Helm repositories: + + ```/dev/null/helm-install.sh#L3-3 + helm repo update + ``` + +3. 
Install the chart:

   ```/dev/null/helm-install.sh#L5-10
   # Install into the monitoring namespace (or your preferred namespace).
   # The --values flag is optional: use it to pass a custom values file.
   helm install iperf3-monitor iperf3-monitor/iperf3-monitor \
     --namespace monitoring \
     --create-namespace \
     --values values.yaml
   ```

   > **Note:** Ensure your Prometheus instance is configured to scrape services in the namespace where you install the chart and that it recognizes `ServiceMonitor` resources with the label `release: prometheus-operator` (if using the standard `kube-prometheus-stack` setup).

### Configuration

The Helm chart is highly configurable via the `values.yaml` file. You can override default settings by creating your own `values.yaml` and passing it during installation (`--values my-values.yaml`).

Refer to the comments in the default `values.yaml` for a detailed explanation of each parameter:

```iperf3-monitor/charts/iperf3-monitor/values.yaml
# Default values for iperf3-monitor.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# -- Override the name of the chart.
nameOverride: ""

# -- Override the fully qualified app name.
fullnameOverride: ""

exporter:
  # -- Configuration for the exporter container image.
  image:
    # -- The container image repository for the exporter.
    repository: ghcr.io/my-org/iperf3-prometheus-exporter # Replace with your repo URL
    # -- The container image tag for the exporter. If not set, the chart's appVersion is used.
    tag: ""
    # -- The image pull policy for the exporter container.
    pullPolicy: IfNotPresent

  # -- Number of exporter pod replicas. Typically 1 is sufficient.
  replicaCount: 1

  # -- Interval in seconds between complete test cycles (i.e., testing all server nodes).
  testInterval: 300

  # -- Timeout in seconds for a single iperf3 test run.
  testTimeout: 10

  # -- Protocol to use for testing (tcp or udp).
  testProtocol: tcp

  # -- CPU and memory resource requests and limits for the exporter pod.
  # @default -- A small default is provided if commented out.
  resources: {}
  #  requests:
  #    cpu: "100m"
  #    memory: "128Mi"
  #  limits:
  #    cpu: "500m"
  #    memory: "256Mi"

server:
  # -- Configuration for the iperf3 server container image (DaemonSet).
  image:
    # -- The container image repository for the iperf3 server.
    repository: networkstatic/iperf3
    # -- The container image tag for the iperf3 server.
    tag: latest

  # -- CPU and memory resource requests and limits for the iperf3 server pods (DaemonSet).
  # These should be very low as the server is mostly idle.
  # @default -- A small default is provided if commented out.
  resources: {}
  #  requests:
  #    cpu: "50m"
  #    memory: "64Mi"
  #  limits:
  #    cpu: "100m"
  #    memory: "128Mi"

  # -- Node selector for scheduling iperf3 server pods.
  # Use this to restrict the DaemonSet to a subset of nodes.
  # @default -- {} (schedule on all nodes)
  nodeSelector: {}

  # -- Tolerations for scheduling iperf3 server pods on tainted nodes (e.g., control-plane nodes).
  # This is often necessary to include master nodes in the test mesh.
  # @default -- Tolerates control-plane and master taints.
  tolerations:
    - key: "node-role.kubernetes.io/control-plane"
      operator: "Exists"
      effect: "NoSchedule"
    - key: "node-role.kubernetes.io/master"
      operator: "Exists"
      effect: "NoSchedule"

rbac:
  # -- If true, create ServiceAccount, ClusterRole, and ClusterRoleBinding for the exporter.
  # Set to false if you manage RBAC externally.
+ create: true + +serviceAccount: + # -- The name of the ServiceAccount to use for the exporter pod. + # Only used if rbac.create is false. If not set, it defaults to the chart's fullname. + name: "" + +serviceMonitor: + # -- If true, create a ServiceMonitor resource for integration with Prometheus Operator. + # Requires a running Prometheus Operator in the cluster. + enabled: true + + # -- Scrape interval for the ServiceMonitor. How often Prometheus scrapes the exporter metrics. + interval: 60s + + # -- Scrape timeout for the ServiceMonitor. How long Prometheus waits for metrics response. + scrapeTimeout: 30s + +# -- Configuration for the exporter Service. +service: + # -- Service type. ClusterIP is typically sufficient. + type: ClusterIP + # -- Port on which the exporter service is exposed. + port: 9876 + # -- Target port on the exporter pod. + targetPort: 9876 + +# -- Optional configuration for a network policy to allow traffic to the iperf3 server DaemonSet. +# This is often necessary if you are using a network policy controller. +networkPolicy: + # -- If true, create a NetworkPolicy resource. + enabled: false + # -- Specify source selectors if needed (e.g., pods in a specific namespace). + from: [] + # -- Specify namespace selectors if needed. + namespaceSelector: {} + # -- Specify pod selectors if needed. + podSelector: {} +``` + +## Grafana Dashboard + +A custom Grafana dashboard is provided to visualize the collected `iperf3` metrics. + +1. Log in to your Grafana instance. +2. Navigate to `Dashboards` -> `Import`. +3. Paste the full JSON model provided below into the text area and click `Load`. +4. Select your Prometheus data source and click `Import`. + +```/dev/null/grafana-dashboard.json +{ +"__inputs": [], +"__requires": [ +{ +"type": "grafana", +"id": "grafana", +"name": "Grafana", +"version": "8.0.0" +}, +{ +"type": "datasource", +"id": "prometheus", +"name": "Prometheus", +"version": "1.0.0" +} +], +"annotations": { +"list": [ +{ +"builtIn": 1, +"datasource": { +"type": "grafana", +"uid": "-- Grafana --" +}, +"enable": true, +"hide": true, +"iconColor": "rgba(0, 211, 255, 1)", +"name": "Annotations & Alerts", +"type": "dashboard" +} +] +}, +"editable": true, +"fiscalYearStartMonth": 0, +"gnetId": null, +"graphTooltip": 0, +"id": null, +"links": [], +"panels": [ +{ +"datasource": { +"type": "prometheus", +"uid": "prometheus" +}, +"gridPos": { +"h": 9, +"w": 24, +"x": 0, +"y": 0 +}, +"id": 2, +"targets": [ +{ +"expr": "avg(iperf_network_bandwidth_mbps) by (source_node, destination_node)", +"format": "heatmap", +"legendFormat": "{{source_node}} -> {{destination_node}}", +"refId": "A" +} +], +"cards": { "cardPadding": null, "cardRound": null }, +"color": { +"mode": "spectrum", +"scheme": "red-yellow-green", +"exponent": 0.5, +"reverse": false +}, +"dataFormat": "tsbuckets", +"yAxis": { "show": true, "format": "short" }, +"xAxis": { "show": true } +}, +{ +"title": "Bandwidth Over Time (Source: $source_node, Dest: $destination_node)", +"type": "timeseries", +"datasource": { +"type": "prometheus", +"uid": "prometheus" +}, +"gridPos": { +"h": 8, +"w": 12, +"x": 0, +"y": 9 +}, +"targets": [ +{ +"expr": "iperf_network_bandwidth_mbps{source_node=~\"^$source_node$\", destination_node=~\"^$destination_node$\", protocol=~\"^$protocol$\"}", +"legendFormat": "Bandwidth", +"refId": "A" +} +], +"fieldConfig": { +"defaults": { +"unit": "mbps" +} +} +}, +{ +"title": "Jitter Over Time (Source: $source_node, Dest: $destination_node)", +"type": "timeseries", +"datasource": { +"type": "prometheus", 
"uid": "prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 9
},
"targets": [
{
"expr": "iperf_network_jitter_ms{source_node=~\"^$source_node$\", destination_node=~\"^$destination_node$\", protocol=\"udp\"}",
"legendFormat": "Jitter",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms"
}
}
}
],
"refresh": "30s",
"schemaVersion": 36,
"style": "dark",
"tags": ["iperf3", "network", "kubernetes"],
"templating": {
"list": [
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(iperf_network_bandwidth_mbps, source_node)",
"hide": 0,
"includeAll": false,
"multi": false,
"name": "source_node",
"options": [],
"query": "label_values(iperf_network_bandwidth_mbps, source_node)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(iperf_network_bandwidth_mbps{source_node=~\"^$source_node$\"}, destination_node)",
"hide": 0,
"includeAll": false,
"multi": false,
"name": "destination_node",
"options": [],
"query": "label_values(iperf_network_bandwidth_mbps{source_node=~\"^$source_node$\"}, destination_node)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": { "selected": true, "text": "tcp", "value": "tcp" },
"hide": 0,
"includeAll": false,
"multi": false,
"name": "protocol",
"options": [
{ "selected": true, "text": "tcp", "value": "tcp" },
{ "selected": false, "text": "udp", "value": "udp" }
],
"query": "tcp,udp",
"skipUrlSync": false,
"type": "custom"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Kubernetes iperf3 Network Performance",
"uid": "k8s-iperf3-dashboard",
"version": 1,
"weekStart": ""
}
```

## Repository Structure

The project follows a standard structure:

```/dev/null/repo-structure.txt
.
├── .github/
│   └── workflows/
│       └── release.yml            # GitHub Actions workflow for CI/CD
├── charts/
│   └── iperf3-monitor/            # The Helm chart for the service
│       ├── Chart.yaml
│       ├── values.yaml
│       └── templates/
│           ├── _helpers.tpl
│           ├── server-daemonset.yaml
│           ├── exporter-deployment.yaml
│           ├── rbac.yaml
│           ├── service.yaml
│           └── servicemonitor.yaml
├── exporter/
│   ├── Dockerfile                 # Dockerfile for the exporter
│   ├── requirements.txt           # Python dependencies
│   └── exporter.py                # Exporter source code
├── .gitignore                     # Specifies intentionally untracked files
├── LICENSE                        # Project license
└── README.md                      # This file
```

## Development and CI/CD

The project includes a GitHub Actions workflow (`.github/workflows/release.yml`) triggered on Git tags (`v*.*.*`) to automate:

1. Linting the Helm chart.
2. Building and publishing the Docker image for the exporter to GitHub Container Registry (`ghcr.io`).
3. Updating the Helm chart version based on the Git tag.
4. Packaging and publishing the Helm chart to GitHub Pages.

## License

This project is licensed under the terms defined in the `LICENSE` file.

```iperf3-monitor/LICENSE
This project is currently unlicensed. Please see the project's documentation or repository for licensing information when it becomes available.
+``` diff --git a/bootstrap.md b/bootstrap.md new file mode 100644 index 0000000..0b4c1c4 --- /dev/null +++ b/bootstrap.md @@ -0,0 +1,1418 @@ +# **Architecting a Kubernetes-Native Network Performance Monitoring Service with iperf3, Prometheus, and Helm** + +## **Section 1: Architectural Blueprint for Continuous Network Validation** + +### **1.1 Introduction to Proactive Network Monitoring in Kubernetes** {#introduction-to-proactive-network-monitoring-in-kubernetes} + +In modern cloud-native infrastructures, Kubernetes has emerged as the de +facto standard for container orchestration, simplifying the deployment, +scaling, and management of complex applications.^1^ However, the very +dynamism and abstraction that make Kubernetes powerful also introduce +significant challenges in diagnosing network performance issues. The +ephemeral nature of pods, the complexity of overlay networks provided by +Container Network Interfaces (CNIs), and the multi-layered traffic +routing through Services and Ingress controllers can obscure the root +causes of latency, packet loss, and throughput degradation. + +Traditional, reactive troubleshooting---investigating network problems +only after an application has failed---is insufficient in these +environments. Performance bottlenecks can be subtle, intermittent, and +difficult to reproduce, often manifesting as degraded user experience +long before they trigger hard failures.^1^ To maintain the reliability +and performance of critical workloads, engineering teams must shift from +a reactive to a proactive stance. This requires a system that performs +continuous, automated validation of the underlying network fabric, +treating network health not as an assumption but as a measurable, +time-series metric. + +This document outlines the architecture and implementation of a +comprehensive, Kubernetes-native network performance monitoring service. +The solution leverages a suite of industry-standard, open-source tools +to provide continuous, actionable insights into cluster network health. +The core components are: + +- **iperf3:** A widely adopted tool for active network performance + > measurement, used to generate traffic and measure maximum achievable + > bandwidth, jitter, and packet loss between two points.^2^ + +- **Prometheus:** A powerful, open-source monitoring and alerting system + > that has become the standard for collecting and storing time-series + > metrics in the Kubernetes ecosystem.^3^ + +- **Grafana:** A leading visualization tool for creating rich, + > interactive dashboards from various data sources, including + > Prometheus, enabling intuitive analysis of complex datasets.^4^ + +By combining these components into a cohesive, automated service, we can +transform abstract network performance into a concrete, queryable, and +visualizable stream of data, enabling teams to detect and address +infrastructure-level issues before they impact end-users.^6^ + +### **1.2 The Core Architectural Pattern: Decoupled Test Endpoints and a Central Orchestrator** {#the-core-architectural-pattern-decoupled-test-endpoints-and-a-central-orchestrator} + +The foundation of this monitoring service is a robust, decoupled +architectural pattern designed for scalability and resilience within a +dynamic Kubernetes environment. The design separates the passive test +endpoints from the active test orchestrator, a critical distinction that +ensures the system is both efficient and aligned with Kubernetes +operational principles. 
+ +The data flow and component interaction can be visualized as follows: + +1. A **DaemonSet** deploys an iperf3 server pod onto every node in the + > cluster, creating a mesh of passive test targets. + +2. A central **Deployment**, the iperf3-exporter, uses the Kubernetes + > API to discover the IP addresses of all iperf3 server pods. + +3. The iperf3-exporter periodically orchestrates tests, running an + > iperf3 client to connect to each server pod and measure network + > performance. + +4. The exporter parses the JSON output from iperf3, transforms the + > results into Prometheus metrics, and exposes them on a /metrics + > HTTP endpoint. + +5. A **Prometheus** server, configured via a **ServiceMonitor**, + > scrapes the /metrics endpoint of the exporter, ingesting the + > performance data into its time-series database. + +6. A **Grafana** instance, using Prometheus as a data source, + > visualizes the metrics in a purpose-built dashboard, providing + > heatmaps and time-series graphs of node-to-node bandwidth, jitter, + > and packet loss. + +This architecture is composed of three primary logical components: + +- **Component 1: The iperf3-server DaemonSet.** To accurately measure + > network performance between any two nodes (N-to-N), an iperf3 server + > process must be running and accessible on every node. The DaemonSet + > is the canonical Kubernetes controller for this exact use case. It + > guarantees that a copy of a specific pod runs on all, or a selected + > subset of, nodes within the cluster.^7^ When a new node joins the + > cluster, the + > DaemonSet controller automatically deploys an iperf3-server pod to + > it; conversely, when a node is removed, the pod is garbage + > collected. This ensures the mesh of test endpoints is always in sync + > with the state of the cluster, requiring zero manual + > intervention.^9^ This pattern of using a + > DaemonSet to deploy iperf3 across a cluster is a well-established + > practice for network validation.^11^ + +- **Component 2: The iperf3-exporter Deployment.** A separate, + > centralized component is required to act as the test orchestrator. + > This component is responsible for initiating the iperf3 client + > connections, executing the tests, parsing the results, and exposing + > them as Prometheus metrics. Since this is a stateless service whose + > primary function is to perform a periodic task, a Deployment is the + > ideal controller.^8^ A + > Deployment ensures a specified number of replicas are running, + > provides mechanisms for rolling updates, and allows for independent + > resource management and lifecycle control, decoupled from the + > iperf3-server pods it tests against.^10^ + +- **Component 3: The Prometheus & Grafana Stack.** The monitoring + > backend is provided by the kube-prometheus-stack, a comprehensive + > Helm chart that deploys Prometheus, Grafana, Alertmanager, and the + > necessary exporters for cluster monitoring.^4^ Our custom monitoring + > service is designed to integrate seamlessly with this stack, + > leveraging its Prometheus Operator for automatic scrape + > configuration and its Grafana instance for visualization. + +### **1.3 Architectural Justification and Design Rationale** {#architectural-justification-and-design-rationale} + +The primary strength of this architecture lies in its deliberate +separation of concerns, a design choice that yields significant benefits +in resilience, scalability, and operational efficiency. 
The DaemonSet is +responsible for the *presence* of test endpoints, while the Deployment +handles the *orchestration* of the tests. This decoupling is not +arbitrary; it is a direct consequence of applying Kubernetes-native +principles to the problem. + +The logical progression is as follows: The requirement to continuously +measure N-to-N node bandwidth necessitates that iperf3 server processes +are available on all N nodes to act as targets. The most reliable, +self-healing, and automated method to achieve this \"one-pod-per-node\" +pattern in Kubernetes is to use a DaemonSet.^7^ This makes the server +deployment automatically scale with the cluster itself. Next, a process +is needed to trigger the tests against these servers. This +\"orchestrator\" is a logically distinct, active service. It needs to be +reliable and potentially scalable, but it does not need to run on every +single node. The standard Kubernetes object for managing such stateless +services is a + +Deployment.^8^ + +This separation allows for independent and appropriate resource +allocation. The iperf3-server pods are extremely lightweight, consuming +minimal resources while idle. The iperf3-exporter, however, may be more +CPU-intensive during the brief periods it is actively running tests. By +placing them in different workload objects (DaemonSet and Deployment), +we can configure their resource requests and limits independently. This +prevents the monitoring workload from interfering with or being starved +by application workloads, a crucial consideration for any +production-grade system. This design is fundamentally more robust and +scalable than simpler, monolithic approaches, such as a single script +that attempts to manage both server and client lifecycles.^12^ + +## **Section 2: Implementing the iperf3-prometheus-exporter** + +The heart of this monitoring solution is the iperf3-prometheus-exporter, +a custom application responsible for orchestrating the network tests and +translating their results into a format that Prometheus can ingest. This +section provides a detailed breakdown of its implementation, from +technology selection to the final container image. + +### **2.1 Technology Selection: Python for Agility and Ecosystem** {#technology-selection-python-for-agility-and-ecosystem} + +Python was selected as the implementation language for the exporter due +to its powerful ecosystem and rapid development capabilities. The +availability of mature, well-maintained libraries for interacting with +both Prometheus and Kubernetes significantly accelerates the development +of a robust, cloud-native application. + +The key libraries leveraged are: + +- **prometheus-client:** The official Python client library for + > instrumenting applications with Prometheus metrics. It provides a + > simple API for defining metrics (Gauges, Counters, etc.) and + > exposing them via an HTTP server, handling much of the boilerplate + > required for creating a valid exporter.^13^ + +- **iperf3-python:** A clean, high-level Python wrapper around the + > iperf3 C library. It allows for programmatic control of iperf3 + > clients and servers, and it can directly parse the JSON output of a + > test into a convenient Python object, eliminating the need for + > manual process management and output parsing.^15^ + +- **kubernetes:** The official Python client library for the Kubernetes + > API. 
This library is essential for the exporter to become "Kubernetes-aware," enabling it to dynamically discover the iperf3-server pods it needs to test against by querying the API server directly.

### **2.2 Core Exporter Logic (Annotated Python Code)** {#core-exporter-logic-annotated-python-code}

The exporter's logic can be broken down into five distinct steps, which
together form a continuous loop of discovery, testing, and reporting.

#### **Step 1: Initialization and Metric Definition**

The application begins by importing the necessary libraries and defining
the Prometheus metrics that will be exposed. We use a Gauge metric, as
bandwidth is a value that can go up or down. Labels are crucial for
providing context; they allow us to slice and dice the data in
Prometheus and Grafana.

> Python

import os
import time
import logging
from kubernetes import client, config
from prometheus_client import start_http_server, Gauge
import iperf3

# --- Configuration ---
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Prometheus Metrics Definition ---
IPERF_BANDWIDTH_MBPS = Gauge(
    'iperf_network_bandwidth_mbps',
    'Network bandwidth measured by iperf3 in Megabits per second',
    ['source_node', 'destination_node', 'protocol']
)
IPERF_JITTER_MS = Gauge(
    'iperf_network_jitter_ms',
    'Network jitter measured by iperf3 in milliseconds',
    ['source_node', 'destination_node', 'protocol']
)
IPERF_PACKETS_TOTAL = Gauge(
    'iperf_network_packets_total',
    'Total packets transmitted or received during the iperf3 test',
    ['source_node', 'destination_node', 'protocol']
)
IPERF_LOST_PACKETS = Gauge(
    'iperf_network_lost_packets_total',
    'Total lost packets during the iperf3 UDP test',
    ['source_node', 'destination_node', 'protocol']
)
IPERF_TEST_SUCCESS = Gauge(
    'iperf_test_success',
    'Indicates if the iperf3 test was successful (1) or failed (0)',
    ['source_node', 'destination_node', 'protocol']
)

#### **Step 2: Kubernetes-Aware Target Discovery**

A static list of test targets is an anti-pattern in a dynamic
environment like Kubernetes.^16^ The exporter must dynamically discover
its targets. This is achieved by using the Kubernetes Python client to
query the API server for all pods that match the label selector of our
iperf3-server DaemonSet (e.g., app=iperf3-server). The function returns
a list of dictionaries, each containing the pod's IP address and the
name of the node it is running on.

This dynamic discovery is what transforms the exporter from a simple
script into a resilient, automated service. It adapts to cluster scaling
events without any manual intervention. The logical path is clear:
Kubernetes clusters are dynamic, so a hardcoded list of IPs would become
stale instantly. The API server is the single source of truth for the
cluster's state. Therefore, the exporter must query this API, which in
turn necessitates including the Kubernetes client library and
configuring the appropriate Role-Based Access Control (RBAC) permissions
for its ServiceAccount.

> Python

def discover_iperf_servers():
    """
    Discover iperf3 server pods in the cluster using the Kubernetes API.
    """
    try:
        # Load in-cluster configuration
        config.load_incluster_config()
        v1 = client.CoreV1Api()

        namespace = os.getenv('IPERF_SERVER_NAMESPACE', 'default')
        label_selector = os.getenv('IPERF_SERVER_LABEL_SELECTOR', 'app=iperf3-server')

        logging.info(f"Discovering iperf3 servers with label '{label_selector}' in namespace '{namespace}'")

        ret = v1.list_pod_for_all_namespaces(label_selector=label_selector, watch=False)

        servers = []
        for i in ret.items:
            # Ensure the pod has an IP and is running
            if i.status.pod_ip and i.status.phase == 'Running':
                servers.append({
                    'ip': i.status.pod_ip,
                    'node_name': i.spec.node_name
                })
        logging.info(f"Discovered {len(servers)} iperf3 server pods.")
        return servers
    except Exception as e:
        logging.error(f"Error discovering iperf servers: {e}")
        return []
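Before wiring the exporter into a cluster, the discovery query itself can be exercised from a workstation with cluster access. The sketch below is illustrative rather than part of the exporter: it assumes a valid local kubeconfig (hence `load_kube_config` instead of `load_incluster_config`) and the default `app=iperf3-server` label.

```python
# check_discovery.py - a minimal sketch to exercise pod discovery from a
# workstation, assuming kubectl access and the label used above.
from kubernetes import client, config

config.load_kube_config()  # outside the cluster; in-cluster code uses load_incluster_config()
v1 = client.CoreV1Api()

pods = v1.list_pod_for_all_namespaces(label_selector="app=iperf3-server", watch=False)
for pod in pods.items:
    # Mirror the exporter's filter: only running pods with an assigned IP count.
    if pod.status.pod_ip and pod.status.phase == "Running":
        print(f"{pod.spec.node_name}: {pod.status.pod_ip}")
```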
#### **Step 3: The Test Orchestration Loop**

The main function of the application contains an infinite while True
loop that orchestrates the entire process. It periodically discovers the
servers, creates a list of test pairs (node-to-node), and then executes
an iperf3 test for each pair.

> Python

def run_iperf_test(server_ip, server_port, protocol, source_node, dest_node):
    """
    Runs a single iperf3 test and updates Prometheus metrics.
    """
    logging.info(f"Running iperf3 test from {source_node} to {dest_node} "
                 f"({server_ip}:{server_port}) using {protocol.upper()}")

    client = iperf3.Client()
    client.server_hostname = server_ip
    client.port = server_port
    client.protocol = protocol
    client.duration = int(os.getenv('IPERF_TEST_DURATION', 5))
    client.json_output = True  # Critical for parsing

    result = client.run()

    # Parse results and update metrics
    parse_and_publish_metrics(result, source_node, dest_node, protocol)


def main_loop():
    """
    Main orchestration loop.
    """
    test_interval = int(os.getenv('IPERF_TEST_INTERVAL', 300))
    server_port = int(os.getenv('IPERF_SERVER_PORT', 5201))
    protocol = os.getenv('IPERF_TEST_PROTOCOL', 'tcp').lower()
    source_node_name = os.getenv('SOURCE_NODE_NAME')  # Injected via the Downward API

    if not source_node_name:
        logging.error("SOURCE_NODE_NAME environment variable not set. Exiting.")
        return

    while True:
        servers = discover_iperf_servers()

        for server in servers:
            # Avoid testing a node against itself
            if server['node_name'] == source_node_name:
                continue

            run_iperf_test(server['ip'], server_port, protocol,
                           source_node_name, server['node_name'])

        logging.info(f"Completed test cycle. Sleeping for {test_interval} seconds.")
        time.sleep(test_interval)

#### **Step 4: Parsing and Publishing Metrics**

After each test run, a dedicated function parses the JSON result object
provided by the iperf3-python library.^15^ It extracts the key
performance indicators and uses them to set the value of the
corresponding Prometheus Gauge, applying the correct labels for source
and destination nodes. Robust error handling ensures that failed tests
are also recorded as a metric, which is vital for alerting.

> Python

def parse_and_publish_metrics(result, source_node, dest_node, protocol):
    """
    Parses the iperf3 result and updates Prometheus gauges.
    """
    labels = {'source_node': source_node, 'destination_node': dest_node, 'protocol': protocol}

    if result.error:
        logging.error(f"Test from {source_node} to {dest_node} failed: {result.error}")
        IPERF_TEST_SUCCESS.labels(**labels).set(0)
        # Clear previous successful metrics for this path
        IPERF_BANDWIDTH_MBPS.labels(**labels).set(0)
        IPERF_JITTER_MS.labels(**labels).set(0)
        return

    IPERF_TEST_SUCCESS.labels(**labels).set(1)

    # The summary data is in result.sent_Mbps or result.received_Mbps
    # depending on direction. For simplicity, we check for available attributes.
    if hasattr(result, 'sent_Mbps'):
        bandwidth_mbps = result.sent_Mbps
    elif hasattr(result, 'received_Mbps'):
        bandwidth_mbps = result.received_Mbps
    else:
        # Fallback for different iperf3 versions/outputs
        bandwidth_mbps = result.Mbps if hasattr(result, 'Mbps') else 0

    IPERF_BANDWIDTH_MBPS.labels(**labels).set(bandwidth_mbps)

    if protocol == 'udp':
        IPERF_JITTER_MS.labels(**labels).set(result.jitter_ms if hasattr(result, 'jitter_ms') else 0)
        IPERF_PACKETS_TOTAL.labels(**labels).set(result.packets if hasattr(result, 'packets') else 0)
        IPERF_LOST_PACKETS.labels(**labels).set(result.lost_packets if hasattr(result, 'lost_packets') else 0)
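Because this function depends only on attribute access, it is easy to sanity-check offline. The following is a minimal sketch, assuming the code in this section is saved as exporter.py, its dependencies from requirements.txt are installed, and a stand-in object mimics a successful TCP result:

```python
# test_parse_offline.py - a quick offline check of the parsing logic,
# assuming exporter.py (from this section) is on the import path.
from types import SimpleNamespace
from prometheus_client import generate_latest
import exporter

# Stub mimicking the attributes of a successful iperf3 TCP result.
fake_result = SimpleNamespace(error=None, sent_Mbps=941.2)
exporter.parse_and_publish_metrics(fake_result, "node-a", "node-b", "tcp")

# Dump the default registry; expect iperf_test_success == 1 and the
# bandwidth gauge set to 941.2 for the node-a -> node-b path.
print(generate_latest().decode())
```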
#### **Step 5: Exposing the /metrics Endpoint**

Finally, the main execution block starts a simple HTTP server using the
prometheus-client library. This server exposes the collected metrics on
the standard /metrics path, ready to be scraped by Prometheus.^13^

> Python

if __name__ == '__main__':
    # Start the Prometheus metrics server
    listen_port = int(os.getenv('LISTEN_PORT', 9876))
    start_http_server(listen_port)
    logging.info(f"Prometheus exporter listening on port {listen_port}")

    # Start the main orchestration loop
    main_loop()

### **2.3 Containerizing the Exporter (Dockerfile)** {#containerizing-the-exporter-dockerfile}

To deploy the exporter in Kubernetes, it must be packaged into a
container image. A multi-stage Dockerfile is used to create a minimal
and more secure final image by separating the build environment from the
runtime environment. This is a standard best practice for producing
production-ready containers.^14^

> Dockerfile

# Stage 1: Build stage with dependencies
FROM python:3.9-slim as builder

WORKDIR /app

# Install iperf3 and build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends gcc iperf3 libiperf-dev && \
    rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Stage 2: Final runtime stage
FROM python:3.9-slim

WORKDIR /app

# Copy the iperf3 binary and library from the builder stage
COPY --from=builder /usr/bin/iperf3 /usr/bin/iperf3
COPY --from=builder /usr/lib/x86_64-linux-gnu/libiperf.so.0 /usr/lib/x86_64-linux-gnu/libiperf.so.0

# Copy installed Python packages from the builder stage
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages

# Copy the exporter application code
COPY exporter.py .

# Expose the metrics port
EXPOSE 9876

# Set the entrypoint
CMD ["python", "exporter.py"]

The corresponding requirements.txt would contain:

prometheus-client
iperf3
kubernetes
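Before moving on to the Kubernetes manifests, the image can be smoke-tested locally. The names below are illustrative: the sketch assumes the image was built from the Dockerfile above and started with the metrics port published and a dummy SOURCE_NODE_NAME (so the main loop keeps running even though in-cluster discovery fails), and that the requests library is available.

```python
# smoke_test_metrics.py - fetch /metrics from a locally running exporter,
# e.g. started with (hypothetical image tag):
#   docker run --rm -p 9876:9876 -e SOURCE_NODE_NAME=dev iperf3-prometheus-exporter:dev
import requests

resp = requests.get("http://localhost:9876/metrics", timeout=5)
resp.raise_for_status()

# The metric families are registered at startup, so their HELP/TYPE lines
# should appear even before any test cycle has produced samples.
assert "iperf_test_success" in resp.text
print(resp.text[:400])
```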
## **Section 3: Kubernetes Manifests and Deployment Strategy**

With the architectural blueprint defined and the exporter application
containerized, the next step is to translate this design into
declarative Kubernetes manifests. These YAML files define the necessary
Kubernetes objects to deploy, configure, and manage the monitoring
service. Using static manifests here provides a clear foundation before
they are parameterized into a Helm chart in the next section.

### **3.1 The iperf3-server DaemonSet** {#the-iperf3-server-daemonset}

The iperf3-server component is deployed as a DaemonSet to ensure an
instance of the server pod runs on every eligible node in the
cluster.^7^ This creates the ubiquitous grid of test endpoints required
for comprehensive N-to-N testing.

Key fields in this manifest include:

- **spec.selector**: Connects the DaemonSet to the pods it manages via
    > labels.

- **spec.template.metadata.labels**: The label app: iperf3-server is
    > applied to the pods, which is crucial for discovery by both the
    > iperf3-exporter and Kubernetes Services.

- **spec.template.spec.containers**: Defines the iperf3 container, using
    > a public image and running the iperf3 -s command to start it in
    > server mode.

- **spec.template.spec.tolerations**: This is often necessary to allow
    > the DaemonSet to schedule pods on control-plane (master) nodes,
    > which may have taints preventing normal workloads from running
    > there. This ensures the entire cluster, including masters, is part
    > of the test mesh.

- **spec.template.spec.hostNetwork: true**: This is a critical setting.
    > By running the server pods on the host's network namespace, we
    > bypass the Kubernetes network overlay (CNI) for the server side.
    > This allows the test to measure the raw performance of the
    > underlying node network interface, which is often the primary goal
    > of infrastructure-level testing.

> YAML

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: iperf3-server
  labels:
    app: iperf3-server
spec:
  selector:
    matchLabels:
      app: iperf3-server
  template:
    metadata:
      labels:
        app: iperf3-server
    spec:
      # Run on the host network to measure raw node-to-node performance
      hostNetwork: true
      # Tolerations to allow scheduling on control-plane nodes
      tolerations:
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Exists"
          effect: "NoSchedule"
        - key: "node-role.kubernetes.io/master"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - name: iperf3-server
          image: networkstatic/iperf3:latest
          args: ["-s"]  # Start in server mode
          ports:
            - containerPort: 5201
              name: iperf3
              protocol: TCP
            - containerPort: 5201
              name: iperf3-udp
              protocol: UDP
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "100m"
              memory: "128Mi"

### **3.2 The iperf3-exporter Deployment** {#the-iperf3-exporter-deployment}

The iperf3-exporter is deployed as a Deployment, as it is a stateless
application that orchestrates the tests.^14^ Only one replica is
typically needed, as it can sequentially test all nodes.

Key fields in this manifest are:

- **spec.replicas: 1**: A single instance is sufficient for most
    > clusters.
- **spec.template.spec.serviceAccountName**: This assigns the custom
    > ServiceAccount (defined next) to the pod, granting it the necessary
    > permissions to talk to the Kubernetes API.

- **spec.template.spec.containers.env**: The SOURCE_NODE_NAME
    > environment variable is populated using the Downward API. This is
    > how the exporter pod knows which node *it* is running on, allowing
    > it to skip testing against itself.

- **spec.template.spec.containers.image**: This points to the custom
    > exporter image built in the previous section.

> YAML

apiVersion: apps/v1
kind: Deployment
metadata:
  name: iperf3-exporter
  labels:
    app: iperf3-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: iperf3-exporter
  template:
    metadata:
      labels:
        app: iperf3-exporter
    spec:
      serviceAccountName: iperf3-exporter-sa
      containers:
        - name: iperf3-exporter
          image: your-repo/iperf3-prometheus-exporter:latest # Replace with your image
          ports:
            - containerPort: 9876
              name: metrics
          env:
            # Use the Downward API to inject the node name this pod is running on
            - name: SOURCE_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # Other configuration for the exporter script
            - name: IPERF_TEST_INTERVAL
              value: "300"
            - name: IPERF_SERVER_LABEL_SELECTOR
              value: "app=iperf3-server"
          resources:
            requests:
              cpu: "100m"
              memory: "128Mi"
            limits:
              cpu: "500m"
              memory: "256Mi"

### **3.3 RBAC: Granting Necessary Permissions** {#rbac-granting-necessary-permissions}

For the exporter to perform its dynamic discovery of iperf3-server pods,
it must be granted specific, limited permissions to read information
from the Kubernetes API. This is accomplished through a ServiceAccount,
a ClusterRole, and a ClusterRoleBinding.

- **ServiceAccount**: Provides an identity for the exporter pod within
    > the cluster.

- **ClusterRole**: Defines a set of permissions. Here, we grant get,
    > list, and watch access to pods. These are the minimum required
    > permissions for the discovery function to work. The role is a
    > ClusterRole because the exporter needs to find pods across all
    > namespaces where servers might be running.

- **ClusterRoleBinding**: Links the ServiceAccount to the ClusterRole,
    > effectively granting the permissions to any pod that uses the
    > ServiceAccount.

> YAML

apiVersion: v1
kind: ServiceAccount
metadata:
  name: iperf3-exporter-sa
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: iperf3-exporter-role
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: iperf3-exporter-rb
subjects:
  - kind: ServiceAccount
    name: iperf3-exporter-sa
    namespace: default # The namespace where the exporter is deployed
roleRef:
  kind: ClusterRole
  name: iperf3-exporter-role
  apiGroup: rbac.authorization.k8s.io
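A quick way to confirm the binding is effective is to attempt the same discovery call from a pod running under this ServiceAccount and check for a 403. The sketch below is illustrative and assumes it runs in-cluster, for example via kubectl exec into the exporter pod, which already ships the kubernetes client:

```python
# rbac_check.py - run inside a pod using the iperf3-exporter-sa
# ServiceAccount; a 403 ApiException means the ClusterRoleBinding is missing.
from kubernetes import client, config
from kubernetes.client.rest import ApiException

config.load_incluster_config()
v1 = client.CoreV1Api()

try:
    pods = v1.list_pod_for_all_namespaces(label_selector="app=iperf3-server", watch=False)
    print(f"RBAC OK: this ServiceAccount can list pods ({len(pods.items)} matched)")
except ApiException as e:
    if e.status == 403:
        print("RBAC misconfigured: listing pods is forbidden for this ServiceAccount")
    else:
        raise
```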
### **3.4 Network Exposure: Service and ServiceMonitor** {#network-exposure-service-and-servicemonitor}

To make the exporter's metrics available to Prometheus, we need two
final objects. The Service exposes the exporter pod's metrics port
within the cluster, and the ServiceMonitor tells the Prometheus Operator
how to find and scrape that service.

This ServiceMonitor-based approach is the linchpin for a GitOps-friendly
integration. Instead of manually editing the central Prometheus
configuration file---a brittle and non-declarative process---we deploy a
ServiceMonitor custom resource alongside our application.^14^ The
Prometheus Operator, a key component of the kube-prometheus-stack,
continuously watches for these objects. When it discovers our
iperf3-exporter-sm, it automatically generates the necessary scrape
configuration and reloads Prometheus without any manual
intervention.^4^ This empowers the application team to define *how
their application should be monitored* as part of the application's own
deployment package, a cornerstone of scalable, "you build it, you run
it" observability.

> YAML

apiVersion: v1
kind: Service
metadata:
  name: iperf3-exporter-svc
  labels:
    app: iperf3-exporter
spec:
  selector:
    app: iperf3-exporter
  ports:
    - name: metrics
      port: 9876
      targetPort: metrics
      protocol: TCP
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: iperf3-exporter-sm
  labels:
    # Label for the Prometheus Operator to discover this ServiceMonitor
    release: prometheus-operator
spec:
  selector:
    matchLabels:
      # This must match the labels on the Service object above
      app: iperf3-exporter
  endpoints:
    - port: metrics
      interval: 60s
      scrapeTimeout: 30s
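Once the ServiceMonitor has been picked up and a scrape cycle has completed, the new series can be confirmed end-to-end through the Prometheus HTTP API. The sketch below is illustrative only; it assumes Prometheus has been port-forwarded to localhost:9090 (for example, `kubectl port-forward svc/prometheus-operated 9090` in a kube-prometheus-stack install) and that the requests library is available.

```python
# verify_scrape.py - confirm Prometheus is ingesting the exporter's series,
# assuming the Prometheus API is reachable on localhost:9090.
import requests

resp = requests.get(
    "http://localhost:9090/api/v1/query",
    params={"query": "iperf_network_bandwidth_mbps"},
    timeout=5,
)
resp.raise_for_status()

# Print one line per node pair currently present in the TSDB.
for series in resp.json()["data"]["result"]:
    labels = series["metric"]
    print(f"{labels.get('source_node')} -> {labels.get('destination_node')}: {series['value'][1]} Mbps")
```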
## **Section 4: Packaging with Helm for Reusability and Distribution**

While static YAML manifests are excellent for defining Kubernetes
resources, they lack the flexibility needed for easy configuration,
distribution, and lifecycle management. Helm, the package manager for
Kubernetes, solves this by bundling applications into
version-controlled, reusable packages called charts.^17^ This section
details how to package the entire iperf3 monitoring service into a
professional, flexible, and distributable Helm chart.

### **4.1 Helm Chart Structure** {#helm-chart-structure}

A well-organized Helm chart follows a standard directory structure. This
convention makes charts easier to understand and maintain.^19^

iperf3-monitor/
├── Chart.yaml        # Metadata about the chart (name, version, etc.)
├── values.yaml       # Default configuration values for the chart
├── charts/           # Directory for sub-chart dependencies (empty for this project)
├── templates/        # Directory containing the templated Kubernetes manifests
│   ├── _helpers.tpl  # A place for reusable template helpers
│   ├── server-daemonset.yaml
│   ├── exporter-deployment.yaml
│   ├── rbac.yaml
│   ├── service.yaml
│   └── servicemonitor.yaml
└── README.md         # Documentation for the chart

### **4.2 Templating the Kubernetes Manifests** {#templating-the-kubernetes-manifests}

The core of Helm's power lies in its templating engine, which uses Go
templates. We convert the static manifests from Section 3 into dynamic
templates by replacing hardcoded values with references to variables
defined in the values.yaml file.

A crucial best practice is to use a _helpers.tpl file to define common
functions and partial templates, especially for generating resource
names and labels. This reduces boilerplate, ensures consistency, and
makes the chart easier to manage.^19^

**Example: templates/_helpers.tpl**

> Code snippet

{{/*
Expand the name of the chart.
*/}}
{{- define "iperf3-monitor.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited
to this (by the DNS naming spec).
*/}}
{{- define "iperf3-monitor.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end -}}

{{/*
Common labels
*/}}
{{- define "iperf3-monitor.labels" -}}
helm.sh/chart: {{ include "iperf3-monitor.name" . }}
{{ include "iperf3-monitor.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}

{{/*
Selector labels
*/}}
{{- define "iperf3-monitor.selectorLabels" -}}
app.kubernetes.io/name: {{ include "iperf3-monitor.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}

**Example: Templated exporter-deployment.yaml**

> YAML

apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "iperf3-monitor.fullname" . }}-exporter
  labels:
    {{- include "iperf3-monitor.labels" . | nindent 4 }}
    app.kubernetes.io/component: exporter
spec:
  replicas: {{ .Values.exporter.replicaCount }}
  selector:
    matchLabels:
      {{- include "iperf3-monitor.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: exporter
  template:
    metadata:
      labels:
        {{- include "iperf3-monitor.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: exporter
    spec:
      {{- if .Values.rbac.create }}
      serviceAccountName: {{ include "iperf3-monitor.fullname" . }}-sa
      {{- else }}
      serviceAccountName: {{ .Values.serviceAccount.name }}
      {{- end }}
      containers:
        - name: iperf3-exporter
          image: "{{ .Values.exporter.image.repository }}:{{ .Values.exporter.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.exporter.image.pullPolicy }}
          ports:
            - containerPort: 9876
              name: metrics
          env:
            - name: SOURCE_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: IPERF_TEST_INTERVAL
              value: "{{ .Values.exporter.testInterval }}"
          resources:
            {{- toYaml .Values.exporter.resources | nindent 12 }}

### **4.3 Designing a Comprehensive values.yaml** {#designing-a-comprehensive-values.yaml}

The values.yaml file is the public API of a Helm chart. A well-designed
values file is intuitive, clearly documented, and provides users with
the flexibility to adapt the chart to their specific needs. Best
practices include using clear, camelCase naming conventions and
providing comments for every parameter.^21^

A particularly powerful feature of Helm is conditional logic. By
wrapping entire resource definitions in if blocks based on boolean flags
in values.yaml (e.g., {{- if .Values.rbac.create }}), the chart becomes
highly adaptable. A user in a high-security environment can disable the
automatic creation of ClusterRoles by setting rbac.create: false,
allowing them to manage permissions manually without causing the Helm
installation to fail.^20^ Similarly, a user not running the Prometheus
Operator can set serviceMonitor.enabled: false. This adaptability
transforms the chart from a rigid, all-or-nothing package into a
flexible building block, dramatically increasing its utility across
different organizations and security postures.
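Because these flags gate whole resource definitions, their effect can be verified without touching a cluster by rendering the chart locally. A minimal sketch, assuming helm is on the PATH, the command is run from the repository root, and the RBAC manifests are wrapped in the rbac.create conditional as described:

```python
# render_check.py - verify conditional rendering of the RBAC objects,
# assuming `helm` is installed and run from the repository root.
import subprocess

def render(flag: str) -> str:
    """Render the chart with rbac.create toggled and return the manifests."""
    return subprocess.run(
        ["helm", "template", "test", "./charts/iperf3-monitor", "--set", f"rbac.create={flag}"],
        check=True, capture_output=True, text=True,
    ).stdout

assert "kind: ClusterRole" in render("true")
assert "kind: ClusterRole" not in render("false")
print("rbac.create toggles ClusterRole rendering as expected")
```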
The following table documents the comprehensive set of configurable
parameters for the iperf3-monitor chart. This serves as the primary
documentation for any user wishing to install and customize the service.

| Parameter | Description | Type | Default |
|---|---|---|---|
| nameOverride | Override the name of the chart. | string | "" |
| fullnameOverride | Override the fully qualified app name. | string | "" |
| exporter.image.repository | The container image repository for the exporter. | string | ghcr.io/my-org/iperf3-prometheus-exporter |
| exporter.image.tag | The container image tag for the exporter. | string | (Chart.AppVersion) |
| exporter.image.pullPolicy | The image pull policy for the exporter. | string | IfNotPresent |
| exporter.replicaCount | Number of exporter pod replicas. | integer | 1 |
| exporter.testInterval | Interval in seconds between test cycles. | integer | 300 |
| exporter.testTimeout | Timeout in seconds for a single iperf3 test. | integer | 10 |
| exporter.testProtocol | Protocol to use for testing (tcp or udp). | string | tcp |
| exporter.resources | CPU/memory resource requests and limits for the exporter. | object | {} |
| server.image.repository | The container image repository for the iperf3 server. | string | networkstatic/iperf3 |
| server.image.tag | The container image tag for the iperf3 server. | string | latest |
| server.resources | CPU/memory resource requests and limits for the server pods. | object | {} |
| server.nodeSelector | Node selector for scheduling server pods. | object | {} |
| server.tolerations | Tolerations for scheduling server pods on tainted nodes. | array | control-plane and master tolerations |
| rbac.create | If true, create ServiceAccount, ClusterRole, and ClusterRoleBinding. | boolean | true |
| serviceAccount.name | The name of the ServiceAccount to use. Used if rbac.create is false. | string | "" |
| serviceMonitor.enabled | If true, create a ServiceMonitor for Prometheus Operator. | boolean | true |
| serviceMonitor.interval | Scrape interval for the ServiceMonitor. | string | 60s |
| serviceMonitor.scrapeTimeout | Scrape timeout for the ServiceMonitor. | string | 30s |

## **Section 5: Visualizing Network Performance with a Custom Grafana Dashboard**

The final piece of the user experience is a purpose-built Grafana
dashboard that transforms the raw, time-series metrics from Prometheus
into intuitive, actionable visualizations. A well-designed dashboard
does more than just display data; it tells a story, guiding an operator
from a high-level overview of cluster health to a deep-dive analysis of
a specific problematic network path.^5^

### **5.1 Dashboard Design Principles** {#dashboard-design-principles}

The primary goals for this network performance dashboard are:

1. **At-a-Glance Overview:** Provide an immediate, cluster-wide view of
    > network health, allowing operators to quickly spot systemic issues
    > or anomalies.

2. **Intuitive Drill-Down:** Enable users to seamlessly transition from
    > a high-level view to a detailed analysis of performance between
    > specific nodes.

3. **Correlation:** Display multiple related metrics (bandwidth,
    > jitter, packet loss) on the same timeline to help identify causal
    > relationships.

4. **Clarity and Simplicity:** Avoid clutter and overly complex panels
    > that can obscure meaningful data.^4^
### **5.2 Key Visualizations and Panels** {#key-visualizations-and-panels}

The dashboard is constructed from several key panel types, each serving
a specific analytical purpose.

- **Panel 1: Node-to-Node Bandwidth Heatmap.** This is the centerpiece
    > of the dashboard's overview. It uses Grafana's "Heatmap"
    > visualization to create a matrix of network performance.

  - **Y-Axis:** Source Node (source_node label).

  - **X-Axis:** Destination Node (destination_node label).

  - **Cell Color:** The value of the iperf_network_bandwidth_mbps
      > metric.

  - **PromQL Query:** avg(iperf_network_bandwidth_mbps) by (source_node, destination_node)
      > This panel provides an instant visual summary of the entire
      > cluster's network fabric. A healthy cluster will show a uniformly
      > "hot" (high bandwidth) grid, while any "cold" spots
      > immediately draw attention to underperforming network paths.

- **Panel 2: Time-Series Performance Graphs.** These panels use the
    > "Time series" visualization to plot performance over time,
    > allowing for trend analysis and historical investigation.

  - **Bandwidth (Mbps):** Plots
      > iperf_network_bandwidth_mbps{source_node="$source_node", destination_node="$destination_node"}.

  - **Jitter (ms):** Plots
      > iperf_network_jitter_ms{source_node="$source_node", destination_node="$destination_node", protocol="udp"}.

  - **Packet Loss (%):** Plots (iperf_network_lost_packets_total{...} / iperf_network_packets_total{...}) * 100.
      > These graphs are filtered by the dashboard variables, enabling the
      > drill-down analysis.

- **Panel 3: Stat Panels.** These panels use the "Stat" visualization
    > to display single, key performance indicators (KPIs) for the
    > selected time range and nodes.

  - **Average Bandwidth:** avg(iperf_network_bandwidth_mbps{...})

  - **Minimum Bandwidth:** min(iperf_network_bandwidth_mbps{...})

  - **Maximum Jitter:** max(iperf_network_jitter_ms{...})

### **5.3 Enabling Interactivity with Grafana Variables** {#enabling-interactivity-with-grafana-variables}

The dashboard's interactivity is powered by Grafana's template
variables. These variables are dynamically populated from Prometheus and
are used to filter the data displayed in the panels.^4^

- **$source_node**: A dropdown variable populated by the PromQL query
    > label_values(iperf_network_bandwidth_mbps, source_node).

- **$destination_node**: A dropdown variable populated by
    > label_values(iperf_network_bandwidth_mbps{source_node="$source_node"},
    > destination_node). This query is cascaded, meaning it only shows
    > destinations relevant to the selected source.

- **$protocol**: A custom variable with the options tcp and udp.

This combination of a high-level heatmap with interactive,
variable-driven drill-down graphs creates a powerful analytical
workflow. An operator can begin with a bird's-eye view of the cluster.
Upon spotting an anomaly on the heatmap (e.g., a low-bandwidth link
between Node-5 and Node-8), they can use the $source_node and
$destination_node dropdowns to select that specific path. All the
time-series panels will instantly update to show the detailed
performance history for that link, allowing the operator to correlate
bandwidth drops with jitter spikes or other events. This workflow
transforms raw data into actionable insight, dramatically reducing the
Mean Time to Identification (MTTI) for network issues.
### **5.3 Enabling Interactivity with Grafana Variables** {#enabling-interactivity-with-grafana-variables}

The dashboard's interactivity is powered by Grafana's template
variables. These variables are dynamically populated from Prometheus and
are used to filter the data displayed in the panels.^4^

- **$source_node**: A dropdown variable populated by the PromQL query
  label_values(iperf_network_bandwidth_mbps, source_node).

- **$destination_node**: A dropdown variable populated by
  label_values(iperf_network_bandwidth_mbps{source_node="$source_node"}, destination_node).
  This query is cascaded, meaning it only shows
  destinations relevant to the selected source.

- **$protocol**: A custom variable with the options tcp and udp.

This combination of a high-level heatmap with interactive,
variable-driven drill-down graphs creates a powerful analytical
workflow. An operator can begin with a bird's-eye view of the cluster.
Upon spotting an anomaly on the heatmap (e.g., a low-bandwidth link
between Node-5 and Node-8), they can use the $source_node and
$destination_node dropdowns to select that specific path. All the
time-series panels will instantly update to show the detailed
performance history for that link, allowing the operator to correlate
bandwidth drops with jitter spikes or other events. This workflow
transforms raw data into actionable insight, dramatically reducing the
Mean Time to Identification (MTTI) for network issues.

### **5.4 The Complete Grafana Dashboard JSON Model** {#the-complete-grafana-dashboard-json-model}

To facilitate easy deployment, the entire dashboard is defined in a
single JSON model. This model can be imported directly into any Grafana
instance.

> JSON

{
  "__inputs": [],
  "__requires": [
    { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "8.0.0" },
    { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }
  ],
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": { "type": "grafana", "uid": "-- Grafana --" },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": null,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "panels": [
    {
      "title": "Node-to-Node Bandwidth Heatmap",
      "type": "heatmap",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 9, "w": 24, "x": 0, "y": 0 },
      "targets": [
        {
          "expr": "avg(iperf_network_bandwidth_mbps) by (source_node, destination_node)",
          "format": "heatmap",
          "legendFormat": "{{source_node}} -> {{destination_node}}",
          "refId": "A"
        }
      ],
      "cards": { "cardPadding": null, "cardRound": null },
      "color": {
        "mode": "spectrum",
        "scheme": "red-yellow-green",
        "exponent": 0.5,
        "reverse": false
      },
      "dataFormat": "tsbuckets",
      "yAxis": { "show": true, "format": "short" },
      "xAxis": { "show": true }
    },
    {
      "title": "Bandwidth Over Time (Source: $source_node, Dest: $destination_node)",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
      "targets": [
        {
          "expr": "iperf_network_bandwidth_mbps{source_node=\"$source_node\", destination_node=\"$destination_node\"}",
          "legendFormat": "Bandwidth",
          "refId": "A"
        }
      ],
      "fieldConfig": { "defaults": { "unit": "mbps" } }
    },
    {
      "title": "Jitter Over Time (Source: $source_node, Dest: $destination_node)",
      "type": "timeseries",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
      "targets": [
        {
          "expr": "iperf_network_jitter_ms{source_node=\"$source_node\", destination_node=\"$destination_node\", protocol=\"udp\"}",
          "legendFormat": "Jitter",
          "refId": "A"
        }
      ],
      "fieldConfig": { "defaults": { "unit": "ms" } }
    }
  ],
  "refresh": "30s",
  "schemaVersion": 36,
  "style": "dark",
  "tags": ["iperf3", "network", "kubernetes"],
  "templating": {
    "list": [
      {
        "current": {},
        "datasource": { "type": "prometheus", "uid": "prometheus" },
        "definition": "label_values(iperf_network_bandwidth_mbps, source_node)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "source_node",
        "options": [],
        "query": "label_values(iperf_network_bandwidth_mbps, source_node)",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      },
      {
        "current": {},
        "datasource": { "type": "prometheus", "uid": "prometheus" },
        "definition": "label_values(iperf_network_bandwidth_mbps{source_node=\"$source_node\"}, destination_node)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "destination_node",
        "options": [],
        "query": "label_values(iperf_network_bandwidth_mbps{source_node=\"$source_node\"}, destination_node)",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      },
      {
        "current": { "selected": true, "text": "tcp", "value": "tcp" },
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "protocol",
        "options": [
          { "selected": true, "text": "tcp", "value": "tcp" },
          { "selected": false, "text": "udp", "value": "udp" }
        ],
        "query": "tcp,udp",
        "skipUrlSync": false,
        "type": "custom"
      }
    ]
  },
  "time": { "from": "now-1h", "to": "now" },
  "timepicker": {},
  "timezone": "browser",
  "title": "Kubernetes iperf3 Network Performance",
  "uid": "k8s-iperf3-dashboard",
  "version": 1,
  "weekStart": ""
}
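Rather than importing the model by hand, the dashboard can also be
provisioned declaratively. The sketch below assumes Grafana is deployed
by kube-prometheus-stack with its dashboard sidecar enabled, which
discovers ConfigMaps carrying the grafana_dashboard label; the resource
name and namespace are illustrative:

> YAML

apiVersion: v1
kind: ConfigMap
metadata:
  name: iperf3-dashboard          # hypothetical name
  namespace: monitoring
  labels:
    grafana_dashboard: "1"        # label the Grafana sidecar watches for
data:
  # Paste the full JSON model from Section 5.4 as the value below
  iperf3-dashboard.json: |
    {}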
## **Section 6: GitHub Repository Structure and CI/CD Workflow**

To deliver this monitoring service as a professional, open-source-ready
project, it is essential to package it within a well-structured GitHub
repository and implement a robust Continuous Integration and Continuous
Deployment (CI/CD) pipeline. This automates the build, test, and release
process, ensuring that every version of the software is consistent,
trustworthy, and easy for consumers to adopt.

### **6.1 Recommended Repository Structure** {#recommended-repository-structure}

A clean, logical directory structure is fundamental for project
maintainability and ease of navigation for contributors and users.

.
├── .github/
│   └── workflows/
│       └── release.yml        # GitHub Actions workflow for CI/CD
├── charts/
│   └── iperf3-monitor/        # The Helm chart for the service
│       ├── Chart.yaml
│       ├── values.yaml
│       └── templates/
│           └── ...
├── exporter/
│   ├── Dockerfile             # Dockerfile for the exporter
│   ├── requirements.txt       # Python dependencies
│   └── exporter.py            # Exporter source code
├── .gitignore
├── LICENSE
└── README.md

This structure cleanly separates the exporter application code
(/exporter) from its deployment packaging (/charts/iperf3-monitor), and
its release automation (/.github/workflows).

### **6.2 CI/CD Pipeline with GitHub Actions** {#cicd-pipeline-with-github-actions}

A fully automated CI/CD pipeline is the hallmark of a mature software
project. It eliminates manual, error-prone release steps and provides
strong guarantees about the integrity of the published artifacts. By
triggering the pipeline on the creation of a Git tag (e.g., v1.2.3), we
use the tag as a single source of truth for versioning both the Docker
image and the Helm chart. This ensures that chart version 1.2.3 is built
to use image version 1.2.3, and that both have been validated before
release. This automated, atomic release process provides trust and
velocity, elevating the project from a collection of files into a
reliable, distributable piece of software.
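In practice, cutting a release is then a matter of pushing a
semantic-version tag (the version number below is illustrative):

> Bash

git tag v1.2.3
git push origin v1.2.3   # pushing the tag triggers the release workflow below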
The following GitHub Actions workflow automates the entire release
process:

> YAML

# .github/workflows/release.yml
name: Release iperf3-monitor

on:
  push:
    tags:
      - 'v*.*.*'

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  lint-and-test:
    name: Lint and Test
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.10.0

      - name: Helm Lint
        run: helm lint ./charts/iperf3-monitor

  build-and-publish-image:
    name: Build and Publish Docker Image
    runs-on: ubuntu-latest
    needs: lint-and-test
    permissions:
      contents: read
      packages: write
    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: ./exporter
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  package-and-publish-chart:
    name: Package and Publish Helm Chart
    runs-on: ubuntu-latest
    needs: build-and-publish-image
    permissions:
      contents: write
    steps:
      - name: Check out code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.10.0

      - name: Set Chart Version
        run: |
          export VERSION=$(echo "${{ github.ref_name }}" | sed 's/^v//')
          yq e -i '.version = strenv(VERSION)' ./charts/iperf3-monitor/Chart.yaml
          yq e -i '.appVersion = strenv(VERSION)' ./charts/iperf3-monitor/Chart.yaml

      - name: Publish Helm chart
        uses: stefanprodan/helm-gh-pages@v1.6.0
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          charts_dir: ./charts
          charts_url: https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}

### **6.3 Documentation and Usability** {#documentation-and-usability}

The final, and arguably most critical, component for project success is
high-quality documentation. The README.md file at the root of the
repository is the primary entry point for any user. It should clearly
explain what the project does, its architecture, and how to deploy and
use it.

A common failure point in software projects is documentation that falls
out of sync with the code. For Helm charts, the values.yaml file
frequently changes, adding new parameters and options. To combat this,
it is a best practice to automate the documentation of these parameters.
The helm-docs tool can be integrated directly into the CI/CD pipeline to
automatically generate the "Parameters" section of the README.md by
parsing the comments directly from the values.yaml file.^20^ This
ensures that the documentation is always an accurate reflection of the
chart's configurable options, providing a seamless and trustworthy
experience for users.
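The chart in this repository already follows the comment convention that
helm-docs parses: each value is annotated with a leading "# --" comment,
as in this excerpt from values.yaml:

> YAML

exporter:
  # -- Interval in seconds between complete test cycles (i.e., testing all server nodes).
  testInterval: 300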
## **Conclusion**

The proliferation of distributed microservices on Kubernetes has made
network performance a critical, yet often opaque, component of overall
application health. This report has detailed a comprehensive,
production-grade solution for establishing continuous network validation
within a Kubernetes cluster. By architecting a system around the robust,
decoupled pattern of an iperf3-server DaemonSet and a Kubernetes-aware
iperf3-exporter Deployment, this service provides a resilient and
automated foundation for network observability.

The implementation leverages industry-standard tools (Python for the
exporter, Prometheus for metrics storage, and Grafana for
visualization) to create a powerful and flexible monitoring pipeline.
The entire service is packaged into a professional Helm chart, following
best practices for templating, configuration, and adaptability. This
allows for simple, version-controlled deployment across a wide range of
environments. The final Grafana dashboard transforms the collected data
into an intuitive, interactive narrative, enabling engineers to move
swiftly from high-level anomaly detection to root-cause analysis.

Ultimately, by treating network performance not as a given but as a
continuously measured metric, organizations can proactively identify and
resolve infrastructure bottlenecks, enhance application reliability, and
ensure a consistent, high-quality experience for their users in the
dynamic world of Kubernetes.
diff --git a/charts/iperf3-monitor/Chart.yaml b/charts/iperf3-monitor/Chart.yaml
new file mode 100644
index 0000000..912aa46
--- /dev/null
+++ b/charts/iperf3-monitor/Chart.yaml
@@ -0,0 +1,30 @@
+apiVersion: v2
+name: iperf3-monitor
+version: 0.1.0
+appVersion: "0.1.0"
+description: A Helm chart for deploying a Kubernetes-native iperf3 network performance monitoring service with Prometheus and Grafana.
+type: application
+keywords:
+  - iperf3
+  - network
+  - performance
+  - monitoring
+  - kubernetes
+  - prometheus
+  - grafana
+home: https://github.com/malarinv/iperf3-monitor # Replace with your repo URL
+sources:
+  - https://github.com/malarinv/iperf3-monitor # Replace with your repo URL
+maintainers:
+  - name: Malar Invention # Replace with your name
+    email: malarkannan.invention@gmail.com # Replace with your email
+icon: https://raw.githubusercontent.com/malarinv/iperf3-monitor/main/icon.png # Optional icon URL
+annotations:
+  artifacthub.io/changes: |
+    - Add initial Helm chart structure.
+  artifacthub.io/category: networking
+dependencies:
+  - name: kube-prometheus-stack # Example dependency if you package the whole monitoring stack
+    version: ">=30.0.0" # Specify a compatible version range
+    repository: https://prometheus-community.github.io/helm-charts
+    condition: serviceMonitor.enabled # Only include if ServiceMonitor is enabled (assuming Prometheus Operator)
diff --git a/charts/iperf3-monitor/templates/_helpers.tpl b/charts/iperf3-monitor/templates/_helpers.tpl
new file mode 100644
index 0000000..9e7d6c4
--- /dev/null
+++ b/charts/iperf3-monitor/templates/_helpers.tpl
@@ -0,0 +1,55 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "iperf3-monitor.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "iperf3-monitor.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart's labels
+*/}}
+{{- define "iperf3-monitor.labels" -}}
+helm.sh/chart: {{ include "iperf3-monitor.name" . }}-{{ .Chart.Version | replace "+" "_" }}
+{{ include "iperf3-monitor.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end -}}
+
+{{/*
+Selector labels
+*/}}
+{{- define "iperf3-monitor.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "iperf3-monitor.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end -}}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "iperf3-monitor.serviceAccountName" -}}
+{{- if .Values.rbac.create -}}
+  {{- default (include "iperf3-monitor.fullname" .) .Values.serviceAccount.name -}}
+{{- else -}}
+  {{- default "default" .Values.serviceAccount.name -}}
+{{- end -}}
+{{- end -}}
\ No newline at end of file
diff --git a/charts/iperf3-monitor/templates/exporter-deployment.yaml b/charts/iperf3-monitor/templates/exporter-deployment.yaml
new file mode 100644
index 0000000..943aa45
--- /dev/null
+++ b/charts/iperf3-monitor/templates/exporter-deployment.yaml
@@ -0,0 +1,49 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "iperf3-monitor.fullname" . }}-exporter
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+    app.kubernetes.io/component: exporter
+spec:
+  replicas: {{ .Values.exporter.replicaCount }}
+  selector:
+    matchLabels:
+      {{- include "iperf3-monitor.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: exporter
+  template:
+    metadata:
+      labels:
+        {{- include "iperf3-monitor.selectorLabels" . | nindent 8 }}
+        app.kubernetes.io/component: exporter
+    spec:
+      serviceAccountName: {{ include "iperf3-monitor.serviceAccountName" . }}
+      containers:
+        - name: iperf3-exporter
+          image: "{{ .Values.exporter.image.repository }}:{{ .Values.exporter.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.exporter.image.pullPolicy }}
+          ports:
+            - containerPort: {{ .Values.service.targetPort }}
+              name: metrics
+          env:
+            - name: SOURCE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: IPERF_TEST_INTERVAL
+              value: "{{ .Values.exporter.testInterval }}"
+            - name: IPERF_TEST_PROTOCOL
+              value: "{{ .Values.exporter.testProtocol }}"
+            - name: IPERF_SERVER_PORT
+              value: "5201" # Hardcoded to match the server DaemonSet port
+            - name: IPERF_SERVER_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+            - name: IPERF_SERVER_LABEL_SELECTOR
+              value: 'app.kubernetes.io/name={{ include "iperf3-monitor.name" . }},app.kubernetes.io/instance={{ .Release.Name }},app.kubernetes.io/component=server'
+          {{- with .Values.exporter.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
diff --git a/charts/iperf3-monitor/templates/rbac.yaml b/charts/iperf3-monitor/templates/rbac.yaml
new file mode 100644
index 0000000..0a931e1
--- /dev/null
+++ b/charts/iperf3-monitor/templates/rbac.yaml
@@ -0,0 +1,34 @@
+{{- if .Values.rbac.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "iperf3-monitor.serviceAccountName" . }}
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "iperf3-monitor.fullname" . }}-role
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+rules:
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "iperf3-monitor.fullname" . }}-rb
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "iperf3-monitor.serviceAccountName" . }}
+    namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: ClusterRole
+  name: {{ include "iperf3-monitor.fullname" . }}-role
+  apiGroup: rbac.authorization.k8s.io
+{{- end -}}
\ No newline at end of file
diff --git a/charts/iperf3-monitor/templates/server-daemonset.yaml b/charts/iperf3-monitor/templates/server-daemonset.yaml
new file mode 100644
index 0000000..9523270
--- /dev/null
+++ b/charts/iperf3-monitor/templates/server-daemonset.yaml
@@ -0,0 +1,45 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: {{ include "iperf3-monitor.fullname" . }}-server
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+    app.kubernetes.io/component: server
+spec:
+  selector:
+    matchLabels:
+      {{- include "iperf3-monitor.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: server
+  template:
+    metadata:
+      labels:
+        {{- include "iperf3-monitor.selectorLabels" . | nindent 8 }}
+        app.kubernetes.io/component: server
+    spec:
+      # Run on the host network to measure raw node-to-node performance
+      hostNetwork: true
+      {{- with .Values.server.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.server.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: iperf3-server
+          image: "{{ .Values.server.image.repository }}:{{ .Values.server.image.tag }}"
+          imagePullPolicy: {{ .Values.server.image.pullPolicy }}
+          args: ["-s"] # Start in server mode
+          ports:
+            - containerPort: 5201
+              name: iperf3-tcp
+              protocol: TCP
+            - containerPort: 5201
+              name: iperf3-udp
+              protocol: UDP
+          {{- with .Values.server.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
diff --git a/charts/iperf3-monitor/templates/service.yaml b/charts/iperf3-monitor/templates/service.yaml
new file mode 100644
index 0000000..2b60f7b
--- /dev/null
+++ b/charts/iperf3-monitor/templates/service.yaml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "iperf3-monitor.fullname" . }}-exporter-svc
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+    app.kubernetes.io/component: exporter
+spec:
+  type: {{ .Values.service.type }}
+  selector:
+    {{- include "iperf3-monitor.selectorLabels" . | nindent 4 }}
+    app.kubernetes.io/component: exporter
+  ports:
+    - name: metrics
+      port: {{ .Values.service.port }}
+      targetPort: {{ .Values.service.targetPort }}
+      protocol: TCP
\ No newline at end of file
diff --git a/charts/iperf3-monitor/templates/servicemonitor.yaml b/charts/iperf3-monitor/templates/servicemonitor.yaml
new file mode 100644
index 0000000..a8041cf
--- /dev/null
+++ b/charts/iperf3-monitor/templates/servicemonitor.yaml
@@ -0,0 +1,20 @@
+{{- if .Values.serviceMonitor.enabled -}}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "iperf3-monitor.fullname" . }}-sm
+  labels:
+    {{- include "iperf3-monitor.labels" . | nindent 4 }}
+    release: prometheus-operator # Adjust to match the serviceMonitorSelector of your Prometheus Operator installation
+    app.kubernetes.io/component: exporter
+spec:
+  selector:
+    matchLabels:
+      {{- include "iperf3-monitor.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: exporter
+  endpoints:
+    - port: metrics
+      interval: {{ .Values.serviceMonitor.interval }}
+      scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }}
+      path: /metrics
+{{- end -}}
\ No newline at end of file
diff --git a/charts/iperf3-monitor/values.yaml b/charts/iperf3-monitor/values.yaml
new file mode 100644
index 0000000..62be7ef
--- /dev/null
+++ b/charts/iperf3-monitor/values.yaml
@@ -0,0 +1,120 @@
+# Default values for iperf3-monitor.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# -- Override the name of the chart.
+nameOverride: ""
+
+# -- Override the fully qualified app name.
+fullnameOverride: ""
+
+exporter:
+  # -- Configuration for the exporter container image.
+  image:
+    # -- The container image repository for the exporter.
+    repository: ghcr.io/malarinv/iperf3-prometheus-exporter # Replace with your repo URL
+    # -- The container image tag for the exporter. If not set, the chart's appVersion is used.
+    tag: ""
+    # -- The image pull policy for the exporter container.
+    pullPolicy: IfNotPresent
+
+  # -- Number of exporter pod replicas. Typically 1 is sufficient.
+  replicaCount: 1
+
+  # -- Interval in seconds between complete test cycles (i.e., testing all server nodes).
+  testInterval: 300
+
+  # -- Timeout in seconds for a single iperf3 test run.
+  testTimeout: 10
+
+  # -- Protocol to use for testing (tcp or udp).
+  testProtocol: tcp
+
+  # -- CPU and memory resource requests and limits for the exporter pod.
+  # @default -- A small default is provided if commented out.
+  resources:
+    {}
+    # requests:
+    #   cpu: "100m"
+    #   memory: "128Mi"
+    # limits:
+    #   cpu: "500m"
+    #   memory: "256Mi"
+
+server:
+  # -- Configuration for the iperf3 server container image (DaemonSet).
+  image:
+    # -- The container image repository for the iperf3 server.
+    repository: networkstatic/iperf3
+    # -- The container image tag for the iperf3 server.
+    tag: latest
+    # -- The image pull policy for the iperf3 server container.
+    pullPolicy: IfNotPresent
+
+  # -- CPU and memory resource requests and limits for the iperf3 server pods (DaemonSet).
+  # These should be very low as the server is mostly idle.
+  # @default -- A small default is provided if commented out.
+  resources:
+    {}
+    # requests:
+    #   cpu: "50m"
+    #   memory: "64Mi"
+    # limits:
+    #   cpu: "100m"
+    #   memory: "128Mi"
+
+  # -- Node selector for scheduling iperf3 server pods.
+  # Use this to restrict the DaemonSet to a subset of nodes.
+  # @default -- {} (schedule on all nodes)
+  nodeSelector: {}
+
+  # -- Tolerations for scheduling iperf3 server pods on tainted nodes (e.g., control-plane nodes).
+  # This is often necessary to include master nodes in the test mesh.
+  # @default -- Tolerates control-plane and master taints.
+  tolerations:
+    - key: "node-role.kubernetes.io/control-plane"
+      operator: "Exists"
+      effect: "NoSchedule"
+    - key: "node-role.kubernetes.io/master"
+      operator: "Exists"
+      effect: "NoSchedule"
+
+rbac:
+  # -- If true, create ServiceAccount, ClusterRole, and ClusterRoleBinding for the exporter.
+  # Set to false if you manage RBAC externally.
+  create: true
+
+serviceAccount:
+  # -- The name of the ServiceAccount to use for the exporter pod.
+  # If not set and rbac.create is true, a name is generated from the chart's fullname;
+  # if rbac.create is false, it defaults to "default".
+  name: ""
+
+serviceMonitor:
+  # -- If true, create a ServiceMonitor resource for integration with Prometheus Operator.
+  # Requires a running Prometheus Operator in the cluster.
+  enabled: true
+
+  # -- Scrape interval for the ServiceMonitor. How often Prometheus scrapes the exporter metrics.
+  interval: 60s
+
+  # -- Scrape timeout for the ServiceMonitor. How long Prometheus waits for a metrics response.
+  scrapeTimeout: 30s
+
+# -- Configuration for the exporter Service.
+service:
+  # -- Service type. ClusterIP is typically sufficient.
+  type: ClusterIP
+  # -- Port on which the exporter service is exposed.
+  port: 9876
+  # -- Target port on the exporter pod.
+  targetPort: 9876
+
+# -- Optional configuration for a network policy to allow traffic to the iperf3 server DaemonSet.
+# This is often necessary if you are using a network policy controller.
+networkPolicy:
+  # -- If true, create a NetworkPolicy resource.
+  enabled: false
+  # -- Specify source selectors if needed (e.g., pods in a specific namespace).
+  from: []
+  # -- Specify namespace selectors if needed.
+  namespaceSelector: {}
+  # -- Specify pod selectors if needed.
+  podSelector: {}
diff --git a/exporter/Dockerfile b/exporter/Dockerfile
new file mode 100644
index 0000000..76a2138
--- /dev/null
+++ b/exporter/Dockerfile
@@ -0,0 +1,34 @@
+# Stage 1: Build stage with dependencies
+FROM python:3.9-slim AS builder
+
+WORKDIR /app
+
+# Install iperf3 and build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends gcc iperf3 libiperf-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Stage 2: Final runtime stage
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Copy iperf3 binary and library from the builder stage
+COPY --from=builder /usr/bin/iperf3 /usr/bin/iperf3
+COPY --from=builder /usr/lib/x86_64-linux-gnu/libiperf.so.0 /usr/lib/x86_64-linux-gnu/libiperf.so.0
+
+# Refresh the shared-library cache so the iperf3 Python bindings can locate libiperf
+RUN ldconfig
+
+# Copy installed Python packages from the builder stage
+COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
+
+# Copy the exporter application code
+COPY exporter.py .
+
+# Expose the metrics port
+EXPOSE 9876
+
+# Set the entrypoint
+CMD ["python", "exporter.py"]
\ No newline at end of file
diff --git a/exporter/exporter.py b/exporter/exporter.py
new file mode 100644
index 0000000..c53c6c7
--- /dev/null
+++ b/exporter/exporter.py
@@ -0,0 +1,159 @@
+import os
+import time
+import logging
+from kubernetes import client, config
+from prometheus_client import start_http_server, Gauge
+import iperf3
+
+# --- Configuration ---
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# --- Prometheus Metrics Definition ---
+IPERF_BANDWIDTH_MBPS = Gauge(
+    'iperf_network_bandwidth_mbps',
+    'Network bandwidth measured by iperf3 in Megabits per second',
+    ['source_node', 'destination_node', 'protocol']
+)
+IPERF_JITTER_MS = Gauge(
+    'iperf_network_jitter_ms',
+    'Network jitter measured by iperf3 in milliseconds',
+    ['source_node', 'destination_node', 'protocol']
+)
+IPERF_PACKETS_TOTAL = Gauge(
+    'iperf_network_packets_total',
+    'Total packets transmitted or received during the iperf3 test',
+    ['source_node', 'destination_node', 'protocol']
+)
+IPERF_LOST_PACKETS = Gauge(
+    'iperf_network_lost_packets_total',
+    'Total lost packets during the iperf3 UDP test',
+    ['source_node', 'destination_node', 'protocol']
+)
+IPERF_TEST_SUCCESS = Gauge(
+    'iperf_test_success',
+    'Indicates if the iperf3 test was successful (1) or failed (0)',
+    ['source_node', 'destination_node', 'protocol']
+)
+
+def discover_iperf_servers():
+    """
+    Discover iperf3 server pods in the cluster using the Kubernetes API.
+    """
+    try:
+        # Load in-cluster configuration.
+        # Assumes the exporter runs in a pod with a service account having permissions.
+        config.load_incluster_config()
+        v1 = client.CoreV1Api()
+
+        namespace = os.getenv('IPERF_SERVER_NAMESPACE', 'default')
+        label_selector = os.getenv('IPERF_SERVER_LABEL_SELECTOR', 'app=iperf3-server')
+
+        logging.info(f"Discovering iperf3 servers with label '{label_selector}' in namespace '{namespace}'")
+
+        # Restrict discovery to the namespace the servers are deployed in
+        ret = v1.list_namespaced_pod(namespace, label_selector=label_selector, watch=False)
+
+        servers = []
+        for i in ret.items:
+            # Ensure pod has an IP and is running
+            if i.status.pod_ip and i.status.phase == 'Running':
+                servers.append({
+                    'ip': i.status.pod_ip,
+                    'node_name': i.spec.node_name
+                })
+        logging.info(f"Discovered {len(servers)} iperf3 server pods.")
+        return servers
+    except Exception as e:
+        logging.error(f"Error discovering iperf servers: {e}")
+        return []  # Return empty list on error to avoid crashing the loop
+
+def run_iperf_test(server_ip, server_port, protocol, source_node, dest_node):
+    """
+    Runs a single iperf3 test and updates Prometheus metrics.
+    """
+    logging.info(f"Running iperf3 test from {source_node} to {dest_node} ({server_ip}:{server_port}) using {protocol.upper()}")
+
+    # Use a distinct name so we do not shadow the imported kubernetes 'client' module
+    iperf_client = iperf3.Client()
+    iperf_client.server_hostname = server_ip
+    iperf_client.port = server_port
+    iperf_client.protocol = protocol
+    # Duration of the test (seconds)
+    iperf_client.duration = int(os.getenv('IPERF_TEST_DURATION', 5))
+    # Output results as JSON for easy parsing
+    iperf_client.json_output = True
+
+    result = iperf_client.run()
+
+    # Parse results and update metrics
+    parse_and_publish_metrics(result, source_node, dest_node, protocol)
+
+def parse_and_publish_metrics(result, source_node, dest_node, protocol):
+    """
+    Parses the iperf3 result and updates Prometheus gauges.
+    Handles both successful and failed tests.
+    """
+    labels = {'source_node': source_node, 'destination_node': dest_node, 'protocol': protocol}
+
+    if result is None or result.error:
+        error = result.error if result else 'no result object returned'
+        logging.error(f"Test from {source_node} to {dest_node} failed: {error}")
+        IPERF_TEST_SUCCESS.labels(**labels).set(0)
+        # Zero the performance metrics so stale values are not scraped.
+        # Gauge.labels() creates the child on first use, so no guard is needed.
+        IPERF_BANDWIDTH_MBPS.labels(**labels).set(0)
+        IPERF_JITTER_MS.labels(**labels).set(0)
+        IPERF_PACKETS_TOTAL.labels(**labels).set(0)
+        IPERF_LOST_PACKETS.labels(**labels).set(0)
+        return
+
+    IPERF_TEST_SUCCESS.labels(**labels).set(1)
+
+    # The summary data is typically in result.json['end']['sum_sent'] or
+    # result.json['end']['sum_received']. The iperf3-python client often exposes
+    # this directly as attributes like sent_Mbps or received_Mbps. For TCP, the
+    # received bandwidth on the client side (the exporter) is the most relevant
+    # figure; for UDP, the client report contains jitter, lost packets, etc.
+    bandwidth_mbps = 0
+    if getattr(result, 'received_Mbps', None) is not None:
+        bandwidth_mbps = result.received_Mbps
+    elif getattr(result, 'sent_Mbps', None) is not None:
+        # Fallback, though received_Mbps is usually more relevant for a TCP client
+        bandwidth_mbps = result.sent_Mbps
+    elif result.json and 'end' in result.json:
+        # Fall back to the raw JSON output structure
+        end = result.json['end']
+        if end.get('sum_received', {}).get('bits_per_second') is not None:
+            bandwidth_mbps = end['sum_received']['bits_per_second'] / 1_000_000
+        elif end.get('sum_sent', {}).get('bits_per_second') is not None:
+            bandwidth_mbps = end['sum_sent']['bits_per_second'] / 1_000_000
+
+    IPERF_BANDWIDTH_MBPS.labels(**labels).set(bandwidth_mbps)
+
+    # UDP specific metrics
+    if protocol == 'udp':
+        # iperf3-python exposes UDP results directly as attributes
+        IPERF_JITTER_MS.labels(**labels).set(getattr(result, 'jitter_ms', None) or 0)
+        IPERF_PACKETS_TOTAL.labels(**labels).set(getattr(result, 'packets', None) or 0)
+        IPERF_LOST_PACKETS.labels(**labels).set(getattr(result, 'lost_packets', None) or 0)
+    else:
+        # Zero the UDP-only metrics for TCP tests
+        IPERF_JITTER_MS.labels(**labels).set(0)
+        IPERF_PACKETS_TOTAL.labels(**labels).set(0)
+        IPERF_LOST_PACKETS.labels(**labels).set(0)
+
+def main():
+    """
+    Start the metrics endpoint and run the test loop forever.
+    """
+    # METRICS_PORT is optional; the default matches the Service targetPort (9876)
+    metrics_port = int(os.getenv('METRICS_PORT', 9876))
+    start_http_server(metrics_port)
+    logging.info(f"Prometheus metrics server started on port {metrics_port}")
+
+    source_node = os.getenv('SOURCE_NODE_NAME', 'unknown')
+    server_port = int(os.getenv('IPERF_SERVER_PORT', 5201))
+    protocol = os.getenv('IPERF_TEST_PROTOCOL', 'tcp')
+    interval = int(os.getenv('IPERF_TEST_INTERVAL', 300))
+
+    while True:
+        servers = discover_iperf_servers()
+        for server in servers:
+            # Skip the node the exporter itself is running on
+            if server['node_name'] == source_node:
+                continue
+            run_iperf_test(server['ip'], server_port, protocol, source_node, server['node_name'])
+        logging.info(f"Test cycle complete; sleeping for {interval} seconds")
+        time.sleep(interval)
+
+if __name__ == '__main__':
+    main()
diff --git a/exporter/requirements.txt b/exporter/requirements.txt
new file mode 100644
index 0000000..05f83d6
--- /dev/null
+++ b/exporter/requirements.txt
@@ -0,0 +1,3 @@
+prometheus-client
+iperf3
+kubernetes
\ No newline at end of file