From 84e26ddd33bc17a38a50594cca25b05afb21dc1b Mon Sep 17 00:00:00 2001 From: Sander Saares Date: Sun, 22 Dec 2019 18:10:34 +0000 Subject: [PATCH] Implemented basic probing --- Constants.cs | 16 ++- ContainerTracker.cs | 223 +++++++++++++++++++++++++++++ ContainerTrackerResourceMetrics.cs | 77 ++++++++++ ContainerTrackerStateMetrics.cs | 51 +++++++ DockerExporter.csproj | 1 + DockerTracker.cs | 145 +++++++++++++++++++ DockerTrackerMetrics.cs | 13 ++ ExporterLogic.cs | 83 +++++++++-- ExporterLogicMetrics.cs | 9 ++ Program.cs | 18 +-- 10 files changed, 610 insertions(+), 26 deletions(-) create mode 100644 ContainerTracker.cs create mode 100644 ContainerTrackerResourceMetrics.cs create mode 100644 ContainerTrackerStateMetrics.cs create mode 100644 DockerTracker.cs create mode 100644 DockerTrackerMetrics.cs create mode 100644 ExporterLogicMetrics.cs diff --git a/Constants.cs b/Constants.cs index 9a93cb1..8e3935c 100644 --- a/Constants.cs +++ b/Constants.cs @@ -8,8 +8,22 @@ namespace DockerExporter public const string VersionString = "__VERSIONSTRING__"; /// - /// Docker can sometimes be slow to respond. If that is the case, we just give up and try again later. + /// Docker can sometimes be slow to respond. If that is the case, we just give up and try + /// again later. This limit is applied per individual API call, so does not reflect the + /// total possible duration of a scrape, which is handled by the timeout values below. /// public static readonly TimeSpan DockerCommandTimeout = TimeSpan.FromSeconds(30); + + /// + /// We are willing to delay a single scrape up to this long to wait for fresh data. + /// Beyond this point, the update can still continue but will be done in the background. + /// + public static readonly TimeSpan MaxInlineUpdateDuration = TimeSpan.FromSeconds(20); + + /// + /// Even if the update happens in the background, it will be cancelled if it takes + /// more time than this. The next scrape will try again from scratch. + /// + public static readonly TimeSpan MaxTotalUpdateDuration = TimeSpan.FromMinutes(2); } } diff --git a/ContainerTracker.cs b/ContainerTracker.cs new file mode 100644 index 0000000..142a484 --- /dev/null +++ b/ContainerTracker.cs @@ -0,0 +1,223 @@ +using Axinom.Toolkit; +using Prometheus; +using Docker.DotNet; +using Docker.DotNet.Models; +using System; +using System.Collections.Generic; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using System.Diagnostics; +using System.Linq; + +namespace DockerExporter +{ + /// + /// Tracks the status of one container and exports metrics, updating the data when new scrapes are requested. + /// + /// + /// NOT thread-safe! No concurrent usage is expected. + /// DockerTracker performs the necessary synchronization logic. + /// + sealed class ContainerTracker : IDisposable + { + public string Id { get; } + + public ContainerTracker(string id) + { + Id = id; + } + + public void Dispose() + { + _resourceMetrics?.Dispose(); + _stateMetrics?.Dispose(); + } + + /// + /// Requests the tracker to update its data set. + /// + /// + /// May be called multiple times concurrently. + /// + /// Method does not throw exceptions on transient failures, merely logs and ignores them. + /// + public async Task TryUpdateAsync(DockerClient client, CancellationToken cancel) + { + ContainerInspectResponse container; + StatsRecorder resourceStatsRecorder = new StatsRecorder(); + + try + { + // First, inspect to get some basic information. 
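+                // Note: each Docker API call made below is also individually bounded by
+                // Constants.DockerCommandTimeout (applied per call through the client
+                // configuration created in DockerTracker), so a single hung call fails fast
+                // instead of consuming the entire MaxTotalUpdateDuration budget.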
+ container = await client.Containers.InspectContainerAsync(Id, cancel); + + // Then query for the latest resource usage stats (if container is running). + if (container.State.Running) + { + await client.Containers.GetContainerStatsAsync(Id, new ContainerStatsParameters + { + Stream = false // Only get latest, then stop. + }, resourceStatsRecorder, cancel); + } + } + catch (Exception ex) + { + // TODO: DockerTrackerMetrics.ListContainersErrorCount.Inc(); + _log.Error(Helpers.Debug.GetAllExceptionMessages(ex)); + _log.Debug(ex.ToString()); // Only to verbose output. + + // Errors are ignored - if we fail to get data, we just skip an update and log the failure. + // The next update will hopefully get past the error. + return; + } + + // If anything goes wrong below, it is a fatal error not to be ignored, so not in the try block. + + // Now that we have the data assembled, update the metrics. + if (_stateMetrics == null) + { + var displayName = GetDisplayNameOrId(container); + _log.Debug($"First update of state metrics for {displayName} ({Id})."); + _stateMetrics = new ContainerTrackerStateMetrics(Id, displayName); + } + + UpdateStateMetrics(_stateMetrics, container); + + if (resourceStatsRecorder.Response != null) + { + if (_resourceMetrics == null) + { + var displayName = GetDisplayNameOrId(container); + _log.Debug($"Initializing resource metrics for {displayName} ({Id})."); + _resourceMetrics = new ContainerTrackerResourceMetrics(Id, displayName); + } + + UpdateResourceMetrics(_resourceMetrics, container, resourceStatsRecorder.Response); + } + else + { + // TODO: It could be we already had resource metrics and now they should go away. + _resourceMetrics?.Dispose(); + _resourceMetrics = null; + } + } + + private void UpdateStateMetrics(ContainerTrackerStateMetrics metrics, ContainerInspectResponse container) + { + metrics.RestartCount.Set(container.RestartCount); + + if (container.State.Running) + metrics.RunningState.Set(1); + else if (container.State.Restarting) + metrics.RunningState.Set(0.5); + else + metrics.RunningState.Set(0); + + if (container.State.Running && !string.IsNullOrWhiteSpace(container.State.StartedAt)) + metrics.StartTime.SetToTimeUtc(DateTimeOffset.Parse(container.State.StartedAt)); + } + + private void UpdateResourceMetrics(ContainerTrackerResourceMetrics metrics, ContainerInspectResponse container, ContainerStatsResponse resources) + { + // The resource reporting is very different for different operating systems. + // This field is only used on Windows. We assume a container can't exist with 0 memory. + bool isWindowsContainer = resources.MemoryStats.Commit != 0; + + // CPU usage + // The mechanism of calculation is the rate of increase in container CPU time versus available ("system") CPU time. + // The idea here is that we build two series - one counting used CPU in whatever units + // the other counting potentially available CPU in whatever units. The % always comes right. + // Docker CPU usage on Windows counts 100ns ticks. + // Docker CPU usage on Linux counts unspecified ticks in relation to some other stats. + // See https://github.com/moby/moby/blob/eb131c5383db8cac633919f82abad86c99bffbe5/cli/command/container/stats_helpers.go#L175 + if (isWindowsContainer) + { + // To compensate for core count on Windows, we normalize the container usage to a single core. + // We also normalize the available CPU time to a single core. + // This way the Windows calculation is always per-core averaged. 
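+                // (For reference: consumers of these two series would typically divide their
+                // rates, e.g. rate(docker_container_cpu_used_total[1m]) /
+                // rate(docker_container_cpu_capacity_total[1m]), which yields a 0..1
+                // utilization fraction on both platforms because the units cancel out.)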
+ // A .NET DateTimeOffset tick is 100ns, exactly, so matches what Docker uses. + metrics.CpuCapacity.Set(CpuBaselineTimer.Elapsed.Ticks); + metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage / resources.NumProcs); + } + else + { + // This is counting all cores (right?). + metrics.CpuCapacity.Set(resources.CPUStats.SystemUsage); + metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage); + } + + // Memory usage + if (isWindowsContainer) + { + // Windows reports Private Working Set in Docker stats... but seems to use Commit Bytes to enforce limit! + // We want to report the same metric that is limited, so there we go. + metrics.MemoryUsage.Set(resources.MemoryStats.Commit); + } + else + { + metrics.MemoryUsage.Set(resources.MemoryStats.Usage); + } + + // Network I/O + if (resources.Networks == null) + { + metrics.TotalNetworkBytesIn.Set(0); + metrics.TotalNetworkBytesOut.Set(0); + } + else + { + metrics.TotalNetworkBytesIn.Set(resources.Networks.Values.Sum(n => (double)n.RxBytes)); + metrics.TotalNetworkBytesOut.Set(resources.Networks.Values.Sum(n => (double)n.TxBytes)); + } + + // Disk I/O + if (isWindowsContainer) + { + metrics.TotalDiskBytesRead.Set(resources.StorageStats.ReadSizeBytes); + metrics.TotalDiskBytesWrite.Set(resources.StorageStats.WriteSizeBytes); + } + else + { + var readEntries = resources.BlkioStats.IoServiceBytesRecursive + .Where(entry => entry.Op.Equals("read", StringComparison.InvariantCultureIgnoreCase)) + .ToArray(); + + var writeEntries = resources.BlkioStats.IoServiceBytesRecursive + .Where(entry => entry.Op.Equals("write", StringComparison.InvariantCultureIgnoreCase)) + .ToArray(); + + var totalRead = readEntries.Any() ? readEntries.Sum(entry => (long)entry.Value) : 0; + var totalWrite = writeEntries.Any() ? writeEntries.Sum(entry => (long)entry.Value) : 0; + + metrics.TotalDiskBytesRead.Set(totalRead); + metrics.TotalDiskBytesWrite.Set(totalWrite); + } + } + + private sealed class StatsRecorder : IProgress + { + public ContainerStatsResponse? Response { get; private set; } + public void Report(ContainerStatsResponse value) => Response = value; + } + + /// + /// If a display name can be determined, returns it. Otherwise returns the container ID. + /// + private static string GetDisplayNameOrId(ContainerInspectResponse container) + { + if (!string.IsNullOrWhiteSpace(container.Name)) + return container.Name.Trim('/'); + + return container.ID; + } + + // We just need a monotonically increasing timer that does not use excessively large numbers (no 1970 base). + private static readonly Stopwatch CpuBaselineTimer = Stopwatch.StartNew(); + + private ContainerTrackerStateMetrics? _stateMetrics; + private ContainerTrackerResourceMetrics? 
_resourceMetrics; + + private readonly LogSource _log = Log.Default; + } +} diff --git a/ContainerTrackerResourceMetrics.cs b/ContainerTrackerResourceMetrics.cs new file mode 100644 index 0000000..037f561 --- /dev/null +++ b/ContainerTrackerResourceMetrics.cs @@ -0,0 +1,77 @@ +using Prometheus; +using System; +using System.Linq; + +namespace DockerExporter +{ + sealed class ContainerTrackerResourceMetrics : IDisposable + { + public Gauge.Child CpuUsage { get; private set; } + public Gauge.Child CpuCapacity { get; private set; } + public Gauge.Child MemoryUsage { get; private set; } + public Gauge.Child TotalNetworkBytesIn { get; private set; } + public Gauge.Child TotalNetworkBytesOut { get; private set; } + public Gauge.Child TotalDiskBytesRead { get; private set; } + public Gauge.Child TotalDiskBytesWrite { get; private set; } + + public ContainerTrackerResourceMetrics(string id, string displayName) + { + _id = id; + _displayName = displayName; + + CpuUsage = BaseCpuUsage.WithLabels(id, displayName); + CpuCapacity = BaseCpuCapacity.WithLabels(id, displayName); + MemoryUsage = BaseMemoryUsage.WithLabels(id, displayName); + TotalNetworkBytesIn = BaseTotalNetworkBytesIn.WithLabels(id, displayName); + TotalNetworkBytesOut = BaseTotalNetworkBytesOut.WithLabels(id, displayName); + TotalDiskBytesRead = BaseTotalDiskBytesRead.WithLabels(id, displayName); + TotalDiskBytesWrite = BaseTotalDiskBytesWrite.WithLabels(id, displayName); + } + + private readonly string _id; + private readonly string _displayName; + + public void Dispose() + { + BaseCpuUsage.RemoveLabelled(_id, _displayName); + BaseCpuCapacity.RemoveLabelled(_id, _displayName); + BaseMemoryUsage.RemoveLabelled(_id, _displayName); + BaseTotalNetworkBytesIn.RemoveLabelled(_id, _displayName); + BaseTotalNetworkBytesOut.RemoveLabelled(_id, _displayName); + BaseTotalDiskBytesRead.RemoveLabelled(_id, _displayName); + BaseTotalDiskBytesWrite.RemoveLabelled(_id, _displayName); + } + + // While logically counters, all of these are gauges because we do not know when Docker might reset the values. + + private static readonly Gauge BaseCpuUsage = Metrics + .CreateGauge("docker_container_cpu_used_total", "Accumulated CPU usage of a container, in unspecified units, averaged for all logical CPUs usable by the container.", ConfigureGauge()); + + private static readonly Gauge BaseCpuCapacity = Metrics + .CreateGauge("docker_container_cpu_capacity_total", "All potential CPU usage available to a container, in unspecified units, averaged for all logical CPUs usable by the container. 
Start point of measurement is undefined - only relative values should be used in analytics.", ConfigureGauge()); + + private static readonly Gauge BaseMemoryUsage = Metrics + .CreateGauge("docker_container_memory_used_bytes", "Memory usage of a container.", ConfigureGauge()); + + private static readonly Gauge BaseTotalNetworkBytesIn = Metrics + .CreateGauge("docker_container_network_in_bytes", "Total bytes received by the container's network interfaces.", ConfigureGauge()); + + private static readonly Gauge BaseTotalNetworkBytesOut = Metrics + .CreateGauge("docker_container_network_out_bytes", "Total bytes sent by the container's network interfaces.", ConfigureGauge()); + + private static readonly Gauge BaseTotalDiskBytesRead = Metrics + .CreateGauge("docker_container_disk_read_bytes", "Total bytes read from disk by a container.", ConfigureGauge()); + + private static readonly Gauge BaseTotalDiskBytesWrite = Metrics + .CreateGauge("docker_container_disk_write_bytes", "Total bytes written to disk by a container.", ConfigureGauge()); + + private static string[] LabelNames(params string[] extra) => + new[] { "id", "display_name" }.Concat(extra).ToArray(); + + private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration + { + LabelNames = LabelNames(), + SuppressInitialValue = true + }; + } +} diff --git a/ContainerTrackerStateMetrics.cs b/ContainerTrackerStateMetrics.cs new file mode 100644 index 0000000..a098a39 --- /dev/null +++ b/ContainerTrackerStateMetrics.cs @@ -0,0 +1,51 @@ +using Prometheus; +using System; +using System.Linq; + +namespace DockerExporter +{ + sealed class ContainerTrackerStateMetrics : IDisposable + { + public Gauge.Child RestartCount { get; private set; } + public Gauge.Child RunningState { get; private set; } + public Gauge.Child StartTime { get; private set; } + + public ContainerTrackerStateMetrics(string id, string displayName) + { + _id = id; + _displayName = displayName; + + RestartCount = BaseRestartCount.WithLabels(id, displayName); + RunningState = BaseRunningState.WithLabels(id, displayName); + StartTime = BaseStartTime.WithLabels(id, displayName); + } + + private readonly string _id; + private readonly string _displayName; + + public void Dispose() + { + BaseRestartCount.RemoveLabelled(_id, _displayName); + BaseRunningState.RemoveLabelled(_id, _displayName); + BaseStartTime.RemoveLabelled(_id, _displayName); + } + + private static readonly Gauge BaseRestartCount = Metrics + .CreateGauge("docker_container_restart_count", "Number of times the runtime has restarted this container without explicit user action, since the container was last started.", ConfigureGauge()); + + private static readonly Gauge BaseRunningState = Metrics + .CreateGauge("docker_container_running_state", "Whether the container is running (value 1), restarting (value 0.5) or stopped (value 0).", ConfigureGauge()); + + private static readonly Gauge BaseStartTime = Metrics + .CreateGauge("docker_container_start_time", "Timestamp indicating when the container was started. 
Does not get reset by automatic restarts.", ConfigureGauge()); + + private static string[] LabelNames(params string[] extra) => + new[] { "id", "display_name" }.Concat(extra).ToArray(); + + private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration + { + LabelNames = LabelNames(), + SuppressInitialValue = true + }; + } +} diff --git a/DockerExporter.csproj b/DockerExporter.csproj index b110c6d..385899f 100644 --- a/DockerExporter.csproj +++ b/DockerExporter.csproj @@ -25,6 +25,7 @@ + diff --git a/DockerTracker.cs b/DockerTracker.cs new file mode 100644 index 0000000..a6c7510 --- /dev/null +++ b/DockerTracker.cs @@ -0,0 +1,145 @@ +using Axinom.Toolkit; +using Docker.DotNet; +using Docker.DotNet.Models; +using Prometheus; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace DockerExporter +{ + /// + /// Tracks the status of one instance of Docker and exports metrics, updating the data when new scrapes are requested. + /// + /// + /// Thread-safe. + /// + sealed class DockerTracker + { + public Uri DockerUrl { get; } + + private readonly DockerClientConfiguration _clientConfiguration; + + // If an execution can get the lock on first try, it will really perform the update. + // Otherwise, it will wait for the lock and then perform a no-op update to just leave + // the tracker with the same data the just-finished update generated. + // This acts as basic rate control. + private readonly SemaphoreSlim _updateLock = new SemaphoreSlim(1); + + public DockerTracker(Uri dockerUrl) + { + DockerUrl = dockerUrl; + + // TODO: Support mutual authentication via certificates. + _clientConfiguration = new DockerClientConfiguration(dockerUrl, null, Constants.DockerCommandTimeout); + } + + /// + /// Requests the tracker to update its data set. + /// + /// + /// May be called multiple times concurrently. + /// + /// The method returns to signal that the trackerss of all containers + /// when the method was called have attempted an update to their data. + /// It may be that some updates failed - all we can say is that we tried. + /// + /// Method does not throw exceptions on transient failures, merely logs and ignores them. + /// + public async Task TryUpdateAsync() + { + using var cts = new CancellationTokenSource(Constants.MaxTotalUpdateDuration); + + // If we get this lock, we will actually perform the update. + using var writeLock = await SemaphoreLock.TryTakeAsync(_updateLock, TimeSpan.Zero); + + if (writeLock == null) + { + // Otherwise, we just no-op once the one that came before has updated the data. + await WaitForPredecessorUpdateAsync(cts.Token); + return; + } + + using var client = _clientConfiguration.CreateClient(); + + IList allContainers; + + try + { + allContainers = await client.Containers.ListContainersAsync(new ContainersListParameters + { + All = true + }, cts.Token); + } + catch (Exception ex) + { + DockerTrackerMetrics.ListContainersErrorCount.Inc(); + _log.Error(Helpers.Debug.GetAllExceptionMessages(ex)); + _log.Debug(ex.ToString()); // Only to verbose output. + + // Errors are ignored - if we fail to get data, we just skip an update and log the failure. + // The next update will hopefully get past the error. + + // We won't even try update the trackers if we can't even list the containers. + // TODO: Is this wise? What if individual container data is still available? + // Then again, if listing containers already does not work, can you expect anything to work? 
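+                // Until a later update succeeds, any previously published per-container
+                // metrics simply keep reporting their last known values.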
+ return; + } + + DockerTrackerMetrics.ContainerCount.Set(allContainers.Count); + SynchronizeTrackerSet(allContainers); + + // Update each tracker. We do them in parallel to minimize the total time span spent on probing. + var updateTasks = new List(); + + foreach (var tracker in _containerTrackers.Values) + updateTasks.Add(tracker.TryUpdateAsync(client, cts.Token)); + + // Only exceptions from the update calls should be terminal exceptions, + // so it is fine not to catch anything that may be thrown here. + await Task.WhenAll(updateTasks); + } + + private async Task WaitForPredecessorUpdateAsync(CancellationToken cancel) + { + _log.Debug("Will not trigger new probe as it overlaps with existing probe."); + using var readLock = await SemaphoreLock.TakeAsync(_updateLock, cancel); + } + + /// + /// Ensures that we have a tracker for every listed container + /// and removes trackers for any containers not in the list. + /// + private void SynchronizeTrackerSet(IList allContainers) + { + var containerIds = allContainers.Select(c => c.ID).ToArray(); + var trackedIds = _containerTrackers.Keys.ToArray(); + + // Create a tracker for any new containers. + var newIds = containerIds.Except(trackedIds); + foreach (var id in newIds) + { + _log.Debug($"Encountered container for the first time: {id}"); + _containerTrackers[id] = new ContainerTracker(id); + } + + // Remove the trackers of any removed containers. + var removedIds = trackedIds.Except(containerIds); + foreach (var id in removedIds) + { + _log.Debug($"Tracked container no longer exists. Removing: {id}"); + var tracker = _containerTrackers[id]; + tracker.Dispose(); + _containerTrackers.Remove(id); + } + } + + // Synchronized - only single threaded access occurs. + private readonly Dictionary _containerTrackers = new Dictionary(); + + private readonly LogSource _log = Log.Default; + } +} diff --git a/DockerTrackerMetrics.cs b/DockerTrackerMetrics.cs new file mode 100644 index 0000000..a6a9513 --- /dev/null +++ b/DockerTrackerMetrics.cs @@ -0,0 +1,13 @@ +using Prometheus; + +namespace DockerExporter +{ + sealed class DockerTrackerMetrics + { + public static readonly Gauge ContainerCount = Metrics + .CreateGauge("docker_containers", "Number of containers that exist."); + + public static readonly Counter ListContainersErrorCount = Metrics + .CreateCounter("docker_list_containers_failed_total", "How many times the attempt to list all containers has failed."); + } +} diff --git a/ExporterLogic.cs b/ExporterLogic.cs index a942e01..ff2d146 100644 --- a/ExporterLogic.cs +++ b/ExporterLogic.cs @@ -1,8 +1,7 @@ using Axinom.Toolkit; -using Docker.DotNet; +using Prometheus; using System; -using System.Collections.Generic; -using System.Text; +using System.Diagnostics; using System.Threading; using System.Threading.Tasks; @@ -15,6 +14,7 @@ namespace DockerExporter public ExporterLogic() { // Default value only valid if not running as container. + // This is intended for development purposes only. 
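+            // When running as a container, the URL is expected to be configured explicitly
+            // rather than relying on this default.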
if (Helpers.Environment.IsMicrosoftOperatingSystem()) { DockerUrl = "npipe://./pipe/docker_engine"; @@ -27,18 +27,81 @@ namespace DockerExporter public async Task RunAsync(CancellationToken cancel) { - _log.Info($"Connecting to Docker via {DockerUrl}"); + _log.Info($"Configured to probe Docker on {DockerUrl}"); - var clientConfig = new DockerClientConfiguration(new Uri(DockerUrl), null, Constants.DockerCommandTimeout); + _tracker = new DockerTracker(new Uri(DockerUrl)); - using (var client = clientConfig.CreateClient()) + Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetrics); + +#if DEBUG + var server = new MetricServer("localhost", 3652); + _log.Info($"Open http://localhost:3652/metrics to initiate a probe."); +#else + var server = new MetricServer(80); +#endif + + server.Start(); + + while (!cancel.IsCancellationRequested) { - var allContainers = await client.Containers.ListContainersAsync(new Docker.DotNet.Models.ContainersListParameters + try { - All = true - }, cancel); + await Task.Delay(-1, cancel); + } + catch (TaskCanceledException) when (cancel.IsCancellationRequested) + { + // Totally normal - we are exiting. + break; + } + } - _log.Info(Helpers.Debug.ToDebugString(allContainers)); + await server.StopAsync(); + } + + private DockerTracker? _tracker; + + /// + /// Called before every Prometheus collection in order to update metrics. + /// + /// + /// The Docker API can be very slow at times, so there is a risk that the scrape will + /// just time out under load. To avoid that, we enforce a maximum update duration and + /// will give up on fetching new values if the update takes longer than that. If the + /// threshold is crossed, we simply allow the scrape to proceed with stale data, while + /// the update keeps running in the background, hopefully eventually succeeding. + /// + /// If multiple parallel scrapes are made, the results from the first one will be used + /// to satisfy all requests that come in while the data loading triggered by the first + /// scrape is still being performed (even if we give up with the scrape before loading finishes). + /// This acts as a primitive form of rate control to avoid overloading the fragile Docker API. + /// The implementation for this is in DockerTracker. + /// + private void UpdateMetrics() + { + _log.Debug("Probing Docker."); + + using var inlineCancellation = new CancellationTokenSource(Constants.MaxInlineUpdateDuration); + var updateTask = _tracker!.TryUpdateAsync() + .WithAbandonment(inlineCancellation.Token); + + try + { + updateTask.WaitAndUnwrapExceptions(); + } + catch (TaskCanceledException) when (inlineCancellation.IsCancellationRequested) + { + _log.Debug("Probe took too long - will return stale results and finish probe in background."); + + // This is expected if it goes above the inline threshold, and will be ignored. + // Other exceptions are caught, logged, and ignored in DockerState itself. + ExporterLogicMetrics.InlineTimeouts.Inc(); + } + catch (Exception ex) + { + // TODO: Now what? If we throw here prometheus-net will just reject the scrape... + // ... but what if this is a fatal error that we want to crash the app with? 
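+                // For now the error is only logged; the scrape itself still completes using
+                // whatever values are already in the registry, just as in the timeout case.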
+ _log.Error(Helpers.Debug.GetAllExceptionMessages(ex)); + Debugger.Break(); } } diff --git a/ExporterLogicMetrics.cs b/ExporterLogicMetrics.cs new file mode 100644 index 0000000..e4cf518 --- /dev/null +++ b/ExporterLogicMetrics.cs @@ -0,0 +1,9 @@ +using Prometheus; + +namespace DockerExporter +{ + static class ExporterLogicMetrics + { + public static readonly Counter InlineTimeouts = Metrics.CreateCounter("docker_probe_inline_timeouts_total", "Total number of times we have forced the scrape to happen in the background and returned outdated data because performing an update inline took too long."); + } +} diff --git a/Program.cs b/Program.cs index fb4f241..936a305 100644 --- a/Program.cs +++ b/Program.cs @@ -54,23 +54,9 @@ namespace DockerExporter Environment.ExitCode = -1; } - catch (AggregateException ex) - { - foreach (var innerException in ex.InnerExceptions) - { - _log.Error(innerException.Message); - _log.Error(innerException.GetType().Name); - } - - Environment.ExitCode = -1; - } catch (Exception ex) { - if (!string.IsNullOrWhiteSpace(ex.Message)) - { - _log.Error(ex.Message); - _log.Error(ex.GetType().Name); - } + _log.Error(Helpers.Debug.GetAllExceptionMessages(ex)); Environment.ExitCode = -1; } @@ -141,7 +127,9 @@ namespace DockerExporter // We default to displaying Info or higher but allow this to be reconfiured later, if the user wishes. _filteringLogListener = new FilteringLogListener(new ConsoleLogListener()) { +#if !DEBUG MinimumSeverity = LogEntrySeverity.Info +#endif }; Log.Default.RegisterListener(_filteringLogListener);