diff --git a/Constants.cs b/Constants.cs
index 9a93cb1..8e3935c 100644
--- a/Constants.cs
+++ b/Constants.cs
@@ -8,8 +8,22 @@ namespace DockerExporter
public const string VersionString = "__VERSIONSTRING__";
/// <summary>
- /// Docker can sometimes be slow to respond. If that is the case, we just give up and try again later.
+ /// Docker can sometimes be slow to respond. If that is the case, we just give up and try
+ /// again later. This limit is applied per individual API call, so it does not bound the
+ /// total duration of a scrape - that is governed by the timeout values below.
/// </summary>
public static readonly TimeSpan DockerCommandTimeout = TimeSpan.FromSeconds(30);
+
+ /// <summary>
+ /// We are willing to delay a single scrape up to this long to wait for fresh data.
+ /// Beyond this point, the update can still continue but will be done in the background.
+ /// </summary>
+ public static readonly TimeSpan MaxInlineUpdateDuration = TimeSpan.FromSeconds(20);
+
+ /// <summary>
+ /// Even if the update happens in the background, it will be cancelled if it takes
+ /// more time than this. The next scrape will try again from scratch.
+ /// </summary>
+ public static readonly TimeSpan MaxTotalUpdateDuration = TimeSpan.FromMinutes(2);
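+
+ // Note on relative magnitudes (derived from the values above): a single Docker API call may take up to
+ // DockerCommandTimeout (30 s), which already exceeds MaxInlineUpdateDuration (20 s), so one slow call is
+ // enough to push an update into the background, where MaxTotalUpdateDuration (2 min) still bounds it.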
}
}
diff --git a/ContainerTracker.cs b/ContainerTracker.cs
new file mode 100644
index 0000000..142a484
--- /dev/null
+++ b/ContainerTracker.cs
@@ -0,0 +1,223 @@
+using Axinom.Toolkit;
+using Prometheus;
+using Docker.DotNet;
+using Docker.DotNet.Models;
+using System;
+using System.Collections.Generic;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using System.Diagnostics;
+using System.Linq;
+
+namespace DockerExporter
+{
+ /// <summary>
+ /// Tracks the status of one container and exports metrics, updating the data when new scrapes are requested.
+ /// </summary>
+ /// <remarks>
+ /// NOT thread-safe! No concurrent usage is expected.
+ /// DockerTracker performs the necessary synchronization logic.
+ /// </remarks>
+ sealed class ContainerTracker : IDisposable
+ {
+ public string Id { get; }
+
+ public ContainerTracker(string id)
+ {
+ Id = id;
+ }
+
+ public void Dispose()
+ {
+ _resourceMetrics?.Dispose();
+ _stateMetrics?.Dispose();
+ }
+
+ /// <summary>
+ /// Requests the tracker to update its data set.
+ /// </summary>
+ /// <remarks>
+ /// May be called multiple times concurrently.
+ ///
+ /// Method does not throw exceptions on transient failures, merely logs and ignores them.
+ /// </remarks>
+ public async Task TryUpdateAsync(DockerClient client, CancellationToken cancel)
+ {
+ ContainerInspectResponse container;
+ StatsRecorder resourceStatsRecorder = new StatsRecorder();
+
+ try
+ {
+ // First, inspect to get some basic information.
+ container = await client.Containers.InspectContainerAsync(Id, cancel);
+
+ // Then query for the latest resource usage stats (if container is running).
+ if (container.State.Running)
+ {
+ await client.Containers.GetContainerStatsAsync(Id, new ContainerStatsParameters
+ {
+ Stream = false // Only get latest, then stop.
+ }, resourceStatsRecorder, cancel);
+ }
+ }
+ catch (Exception ex)
+ {
+ // TODO: DockerTrackerMetrics.ListContainersErrorCount.Inc();
+ _log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
+ _log.Debug(ex.ToString()); // Only to verbose output.
+
+ // Errors are ignored - if we fail to get data, we just skip an update and log the failure.
+ // The next update will hopefully get past the error.
+ return;
+ }
+
+ // If anything goes wrong below, it is a fatal error not to be ignored, so not in the try block.
+
+ // Now that we have the data assembled, update the metrics.
+ if (_stateMetrics == null)
+ {
+ var displayName = GetDisplayNameOrId(container);
+ _log.Debug($"First update of state metrics for {displayName} ({Id}).");
+ _stateMetrics = new ContainerTrackerStateMetrics(Id, displayName);
+ }
+
+ UpdateStateMetrics(_stateMetrics, container);
+
+ if (resourceStatsRecorder.Response != null)
+ {
+ if (_resourceMetrics == null)
+ {
+ var displayName = GetDisplayNameOrId(container);
+ _log.Debug($"Initializing resource metrics for {displayName} ({Id}).");
+ _resourceMetrics = new ContainerTrackerResourceMetrics(Id, displayName);
+ }
+
+ UpdateResourceMetrics(_resourceMetrics, container, resourceStatsRecorder.Response);
+ }
+ else
+ {
+ // TODO: It could be we already had resource metrics and now they should go away.
+ _resourceMetrics?.Dispose();
+ _resourceMetrics = null;
+ }
+ }
+
+ private void UpdateStateMetrics(ContainerTrackerStateMetrics metrics, ContainerInspectResponse container)
+ {
+ metrics.RestartCount.Set(container.RestartCount);
+
+ if (container.State.Running)
+ metrics.RunningState.Set(1);
+ else if (container.State.Restarting)
+ metrics.RunningState.Set(0.5);
+ else
+ metrics.RunningState.Set(0);
+
+ if (container.State.Running && !string.IsNullOrWhiteSpace(container.State.StartedAt))
+ metrics.StartTime.SetToTimeUtc(DateTimeOffset.Parse(container.State.StartedAt));
+ }
+
+ private void UpdateResourceMetrics(ContainerTrackerResourceMetrics metrics, ContainerInspectResponse container, ContainerStatsResponse resources)
+ {
+ // The resource reporting is very different for different operating systems.
+ // This field is only used on Windows. We assume a container can't exist with 0 memory.
+ bool isWindowsContainer = resources.MemoryStats.Commit != 0;
+
+ // CPU usage
+ // The mechanism of calculation is the rate of increase in container CPU time versus available ("system") CPU time.
+ // The idea here is that we build two series - one counting used CPU and the other counting
+ // potentially available CPU, both in whatever units, as long as the two use the same units.
+ // The percentage then always comes out right.
+ // Docker CPU usage on Windows counts 100ns ticks.
+ // Docker CPU usage on Linux counts unspecified ticks in relation to some other stats.
+ // See https://github.com/moby/moby/blob/eb131c5383db8cac633919f82abad86c99bffbe5/cli/command/container/stats_helpers.go#L175
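+ // For example, a Prometheus-side query for the CPU usage percentage could be something like
+ // (an assumed PromQL expression, not part of this exporter):
+ //   rate(docker_container_cpu_used_total[1m]) / rate(docker_container_cpu_capacity_total[1m])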
+ if (isWindowsContainer)
+ {
+ // To compensate for core count on Windows, we normalize the container usage to a single core.
+ // We also normalize the available CPU time to a single core.
+ // This way the Windows calculation is always per-core averaged.
+ // A .NET DateTimeOffset tick is 100ns, exactly, so matches what Docker uses.
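+ // Worked example (illustration only): with 4 logical CPUs and 10 seconds between scrapes, the capacity
+ // series grows by about 100 000 000 ticks, while a fully busy container adds about 400 000 000 ticks of
+ // TotalUsage, i.e. 400 000 000 / 4 = 100 000 000 after normalization - a per-core usage of 100%.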
+ metrics.CpuCapacity.Set(CpuBaselineTimer.Elapsed.Ticks);
+ metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage / resources.NumProcs);
+ }
+ else
+ {
+ // This is counting all cores (right?).
+ metrics.CpuCapacity.Set(resources.CPUStats.SystemUsage);
+ metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage);
+ }
+
+ // Memory usage
+ if (isWindowsContainer)
+ {
+ // Windows reports Private Working Set in Docker stats... but seems to use Commit Bytes to enforce the limit!
+ // We want to report the same metric that is limited, so there we go.
+ metrics.MemoryUsage.Set(resources.MemoryStats.Commit);
+ }
+ else
+ {
+ metrics.MemoryUsage.Set(resources.MemoryStats.Usage);
+ }
+
+ // Network I/O
+ if (resources.Networks == null)
+ {
+ metrics.TotalNetworkBytesIn.Set(0);
+ metrics.TotalNetworkBytesOut.Set(0);
+ }
+ else
+ {
+ metrics.TotalNetworkBytesIn.Set(resources.Networks.Values.Sum(n => (double)n.RxBytes));
+ metrics.TotalNetworkBytesOut.Set(resources.Networks.Values.Sum(n => (double)n.TxBytes));
+ }
+
+ // Disk I/O
+ if (isWindowsContainer)
+ {
+ metrics.TotalDiskBytesRead.Set(resources.StorageStats.ReadSizeBytes);
+ metrics.TotalDiskBytesWrite.Set(resources.StorageStats.WriteSizeBytes);
+ }
+ else
+ {
+ var readEntries = resources.BlkioStats.IoServiceBytesRecursive
+ .Where(entry => entry.Op.Equals("read", StringComparison.InvariantCultureIgnoreCase))
+ .ToArray();
+
+ var writeEntries = resources.BlkioStats.IoServiceBytesRecursive
+ .Where(entry => entry.Op.Equals("write", StringComparison.InvariantCultureIgnoreCase))
+ .ToArray();
+
+ var totalRead = readEntries.Any() ? readEntries.Sum(entry => (long)entry.Value) : 0;
+ var totalWrite = writeEntries.Any() ? writeEntries.Sum(entry => (long)entry.Value) : 0;
+
+ metrics.TotalDiskBytesRead.Set(totalRead);
+ metrics.TotalDiskBytesWrite.Set(totalWrite);
+ }
+ }
+
+ private sealed class StatsRecorder : IProgress<ContainerStatsResponse>
+ {
+ public ContainerStatsResponse? Response { get; private set; }
+ public void Report(ContainerStatsResponse value) => Response = value;
+ }
+
+ /// <summary>
+ /// If a display name can be determined, returns it. Otherwise returns the container ID.
+ /// </summary>
+ private static string GetDisplayNameOrId(ContainerInspectResponse container)
+ {
+ if (!string.IsNullOrWhiteSpace(container.Name))
+ return container.Name.Trim('/');
+
+ return container.ID;
+ }
+
+ // We just need a monotonically increasing timer that does not use excessively large numbers (no 1970 base).
+ private static readonly Stopwatch CpuBaselineTimer = Stopwatch.StartNew();
+
+ private ContainerTrackerStateMetrics? _stateMetrics;
+ private ContainerTrackerResourceMetrics? _resourceMetrics;
+
+ private readonly LogSource _log = Log.Default;
+ }
+}
diff --git a/ContainerTrackerResourceMetrics.cs b/ContainerTrackerResourceMetrics.cs
new file mode 100644
index 0000000..037f561
--- /dev/null
+++ b/ContainerTrackerResourceMetrics.cs
@@ -0,0 +1,77 @@
+using Prometheus;
+using System;
+using System.Linq;
+
+namespace DockerExporter
+{
+ sealed class ContainerTrackerResourceMetrics : IDisposable
+ {
+ public Gauge.Child CpuUsage { get; private set; }
+ public Gauge.Child CpuCapacity { get; private set; }
+ public Gauge.Child MemoryUsage { get; private set; }
+ public Gauge.Child TotalNetworkBytesIn { get; private set; }
+ public Gauge.Child TotalNetworkBytesOut { get; private set; }
+ public Gauge.Child TotalDiskBytesRead { get; private set; }
+ public Gauge.Child TotalDiskBytesWrite { get; private set; }
+
+ public ContainerTrackerResourceMetrics(string id, string displayName)
+ {
+ _id = id;
+ _displayName = displayName;
+
+ CpuUsage = BaseCpuUsage.WithLabels(id, displayName);
+ CpuCapacity = BaseCpuCapacity.WithLabels(id, displayName);
+ MemoryUsage = BaseMemoryUsage.WithLabels(id, displayName);
+ TotalNetworkBytesIn = BaseTotalNetworkBytesIn.WithLabels(id, displayName);
+ TotalNetworkBytesOut = BaseTotalNetworkBytesOut.WithLabels(id, displayName);
+ TotalDiskBytesRead = BaseTotalDiskBytesRead.WithLabels(id, displayName);
+ TotalDiskBytesWrite = BaseTotalDiskBytesWrite.WithLabels(id, displayName);
+ }
+
+ private readonly string _id;
+ private readonly string _displayName;
+
+ public void Dispose()
+ {
+ BaseCpuUsage.RemoveLabelled(_id, _displayName);
+ BaseCpuCapacity.RemoveLabelled(_id, _displayName);
+ BaseMemoryUsage.RemoveLabelled(_id, _displayName);
+ BaseTotalNetworkBytesIn.RemoveLabelled(_id, _displayName);
+ BaseTotalNetworkBytesOut.RemoveLabelled(_id, _displayName);
+ BaseTotalDiskBytesRead.RemoveLabelled(_id, _displayName);
+ BaseTotalDiskBytesWrite.RemoveLabelled(_id, _displayName);
+ }
+
+ // While logically counters, all of these are gauges because we do not know when Docker might reset the values.
+
+ private static readonly Gauge BaseCpuUsage = Metrics
+ .CreateGauge("docker_container_cpu_used_total", "Accumulated CPU usage of a container, in unspecified units, averaged for all logical CPUs usable by the container.", ConfigureGauge());
+
+ private static readonly Gauge BaseCpuCapacity = Metrics
+ .CreateGauge("docker_container_cpu_capacity_total", "All potential CPU usage available to a container, in unspecified units, averaged for all logical CPUs usable by the container. Start point of measurement is undefined - only relative values should be used in analytics.", ConfigureGauge());
+
+ private static readonly Gauge BaseMemoryUsage = Metrics
+ .CreateGauge("docker_container_memory_used_bytes", "Memory usage of a container.", ConfigureGauge());
+
+ private static readonly Gauge BaseTotalNetworkBytesIn = Metrics
+ .CreateGauge("docker_container_network_in_bytes", "Total bytes received by the container's network interfaces.", ConfigureGauge());
+
+ private static readonly Gauge BaseTotalNetworkBytesOut = Metrics
+ .CreateGauge("docker_container_network_out_bytes", "Total bytes sent by the container's network interfaces.", ConfigureGauge());
+
+ private static readonly Gauge BaseTotalDiskBytesRead = Metrics
+ .CreateGauge("docker_container_disk_read_bytes", "Total bytes read from disk by a container.", ConfigureGauge());
+
+ private static readonly Gauge BaseTotalDiskBytesWrite = Metrics
+ .CreateGauge("docker_container_disk_write_bytes", "Total bytes written to disk by a container.", ConfigureGauge());
+
+ private static string[] LabelNames(params string[] extra) =>
+ new[] { "id", "display_name" }.Concat(extra).ToArray();
+
+ private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration
+ {
+ LabelNames = LabelNames(),
+ SuppressInitialValue = true
+ };
+ }
+}
diff --git a/ContainerTrackerStateMetrics.cs b/ContainerTrackerStateMetrics.cs
new file mode 100644
index 0000000..a098a39
--- /dev/null
+++ b/ContainerTrackerStateMetrics.cs
@@ -0,0 +1,51 @@
+using Prometheus;
+using System;
+using System.Linq;
+
+namespace DockerExporter
+{
+ sealed class ContainerTrackerStateMetrics : IDisposable
+ {
+ public Gauge.Child RestartCount { get; private set; }
+ public Gauge.Child RunningState { get; private set; }
+ public Gauge.Child StartTime { get; private set; }
+
+ public ContainerTrackerStateMetrics(string id, string displayName)
+ {
+ _id = id;
+ _displayName = displayName;
+
+ RestartCount = BaseRestartCount.WithLabels(id, displayName);
+ RunningState = BaseRunningState.WithLabels(id, displayName);
+ StartTime = BaseStartTime.WithLabels(id, displayName);
+ }
+
+ private readonly string _id;
+ private readonly string _displayName;
+
+ public void Dispose()
+ {
+ BaseRestartCount.RemoveLabelled(_id, _displayName);
+ BaseRunningState.RemoveLabelled(_id, _displayName);
+ BaseStartTime.RemoveLabelled(_id, _displayName);
+ }
+
+ private static readonly Gauge BaseRestartCount = Metrics
+ .CreateGauge("docker_container_restart_count", "Number of times the runtime has restarted this container without explicit user action, since the container was last started.", ConfigureGauge());
+
+ private static readonly Gauge BaseRunningState = Metrics
+ .CreateGauge("docker_container_running_state", "Whether the container is running (value 1), restarting (value 0.5) or stopped (value 0).", ConfigureGauge());
+
+ private static readonly Gauge BaseStartTime = Metrics
+ .CreateGauge("docker_container_start_time", "Timestamp indicating when the container was started. Does not get reset by automatic restarts.", ConfigureGauge());
+
+ private static string[] LabelNames(params string[] extra) =>
+ new[] { "id", "display_name" }.Concat(extra).ToArray();
+
+ private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration
+ {
+ LabelNames = LabelNames(),
+ SuppressInitialValue = true
+ };
+ }
+}
diff --git a/DockerExporter.csproj b/DockerExporter.csproj
index b110c6d..385899f 100644
--- a/DockerExporter.csproj
+++ b/DockerExporter.csproj
@@ -25,6 +25,7 @@
+
diff --git a/DockerTracker.cs b/DockerTracker.cs
new file mode 100644
index 0000000..a6c7510
--- /dev/null
+++ b/DockerTracker.cs
@@ -0,0 +1,145 @@
+using Axinom.Toolkit;
+using Docker.DotNet;
+using Docker.DotNet.Models;
+using Prometheus;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace DockerExporter
+{
+ /// <summary>
+ /// Tracks the status of one instance of Docker and exports metrics, updating the data when new scrapes are requested.
+ /// </summary>
+ /// <remarks>
+ /// Thread-safe.
+ /// </remarks>
+ sealed class DockerTracker
+ {
+ public Uri DockerUrl { get; }
+
+ private readonly DockerClientConfiguration _clientConfiguration;
+
+ // If a caller gets the lock on the first try, it actually performs the update.
+ // Otherwise, it waits for the lock and then performs a no-op update, leaving the
+ // tracker with the same data that the just-finished update generated.
+ // This acts as basic rate control.
+ private readonly SemaphoreSlim _updateLock = new SemaphoreSlim(1);
+
+ public DockerTracker(Uri dockerUrl)
+ {
+ DockerUrl = dockerUrl;
+
+ // TODO: Support mutual authentication via certificates.
+ _clientConfiguration = new DockerClientConfiguration(dockerUrl, null, Constants.DockerCommandTimeout);
+ }
+
+ /// <summary>
+ /// Requests the tracker to update its data set.
+ /// </summary>
+ /// <remarks>
+ /// May be called multiple times concurrently.
+ ///
+ /// The method returns to signal that the trackers of all containers that existed
+ /// when the method was called have attempted an update to their data.
+ /// It may be that some updates failed - all we can say is that we tried.
+ ///
+ /// Method does not throw exceptions on transient failures, merely logs and ignores them.
+ /// </remarks>
+ public async Task TryUpdateAsync()
+ {
+ using var cts = new CancellationTokenSource(Constants.MaxTotalUpdateDuration);
+
+ // If we get this lock, we will actually perform the update.
+ using var writeLock = await SemaphoreLock.TryTakeAsync(_updateLock, TimeSpan.Zero);
+
+ if (writeLock == null)
+ {
+ // Otherwise, we just wait for the update that came before us to finish and then no-op.
+ await WaitForPredecessorUpdateAsync(cts.Token);
+ return;
+ }
+
+ using var client = _clientConfiguration.CreateClient();
+
+ IList<ContainerListResponse> allContainers;
+
+ try
+ {
+ allContainers = await client.Containers.ListContainersAsync(new ContainersListParameters
+ {
+ All = true
+ }, cts.Token);
+ }
+ catch (Exception ex)
+ {
+ DockerTrackerMetrics.ListContainersErrorCount.Inc();
+ _log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
+ _log.Debug(ex.ToString()); // Only to verbose output.
+
+ // Errors are ignored - if we fail to get data, we just skip an update and log the failure.
+ // The next update will hopefully get past the error.
+
+ // We do not even try to update the trackers if we cannot list the containers.
+ // TODO: Is this wise? What if individual container data is still available?
+ // Then again, if listing containers already does not work, can you expect anything to work?
+ return;
+ }
+
+ DockerTrackerMetrics.ContainerCount.Set(allContainers.Count);
+ SynchronizeTrackerSet(allContainers);
+
+ // Update each tracker. We do them in parallel to minimize the total time span spent on probing.
+ var updateTasks = new List<Task>();
+
+ foreach (var tracker in _containerTrackers.Values)
+ updateTasks.Add(tracker.TryUpdateAsync(client, cts.Token));
+
+ // Only exceptions from the update calls should be terminal exceptions,
+ // so it is fine not to catch anything that may be thrown here.
+ await Task.WhenAll(updateTasks);
+ }
+
+ private async Task WaitForPredecessorUpdateAsync(CancellationToken cancel)
+ {
+ _log.Debug("Will not trigger new probe as it overlaps with existing probe.");
+ using var readLock = await SemaphoreLock.TakeAsync(_updateLock, cancel);
+ }
+
+ /// <summary>
+ /// Ensures that we have a tracker for every listed container
+ /// and removes trackers for any containers not in the list.
+ /// </summary>
+ private void SynchronizeTrackerSet(IList<ContainerListResponse> allContainers)
+ {
+ var containerIds = allContainers.Select(c => c.ID).ToArray();
+ var trackedIds = _containerTrackers.Keys.ToArray();
+
+ // Create a tracker for any new containers.
+ var newIds = containerIds.Except(trackedIds);
+ foreach (var id in newIds)
+ {
+ _log.Debug($"Encountered container for the first time: {id}");
+ _containerTrackers[id] = new ContainerTracker(id);
+ }
+
+ // Remove the trackers of any removed containers.
+ var removedIds = trackedIds.Except(containerIds);
+ foreach (var id in removedIds)
+ {
+ _log.Debug($"Tracked container no longer exists. Removing: {id}");
+ var tracker = _containerTrackers[id];
+ tracker.Dispose();
+ _containerTrackers.Remove(id);
+ }
+ }
+
+ // Synchronized via the update lock - only single-threaded access occurs.
+ private readonly Dictionary<string, ContainerTracker> _containerTrackers = new Dictionary<string, ContainerTracker>();
+
+ private readonly LogSource _log = Log.Default;
+ }
+}
diff --git a/DockerTrackerMetrics.cs b/DockerTrackerMetrics.cs
new file mode 100644
index 0000000..a6a9513
--- /dev/null
+++ b/DockerTrackerMetrics.cs
@@ -0,0 +1,13 @@
+using Prometheus;
+
+namespace DockerExporter
+{
+ sealed class DockerTrackerMetrics
+ {
+ public static readonly Gauge ContainerCount = Metrics
+ .CreateGauge("docker_containers", "Number of containers that exist.");
+
+ public static readonly Counter ListContainersErrorCount = Metrics
+ .CreateCounter("docker_list_containers_failed_total", "How many times the attempt to list all containers has failed.");
+ }
+}
diff --git a/ExporterLogic.cs b/ExporterLogic.cs
index a942e01..ff2d146 100644
--- a/ExporterLogic.cs
+++ b/ExporterLogic.cs
@@ -1,8 +1,7 @@
using Axinom.Toolkit;
-using Docker.DotNet;
+using Prometheus;
using System;
-using System.Collections.Generic;
-using System.Text;
+using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;
@@ -15,6 +14,7 @@ namespace DockerExporter
public ExporterLogic()
{
// Default value only valid if not running as container.
+ // This is intended for development purposes only.
if (Helpers.Environment.IsMicrosoftOperatingSystem())
{
DockerUrl = "npipe://./pipe/docker_engine";
@@ -27,18 +27,81 @@ namespace DockerExporter
public async Task RunAsync(CancellationToken cancel)
{
- _log.Info($"Connecting to Docker via {DockerUrl}");
+ _log.Info($"Configured to probe Docker on {DockerUrl}");
- var clientConfig = new DockerClientConfiguration(new Uri(DockerUrl), null, Constants.DockerCommandTimeout);
+ _tracker = new DockerTracker(new Uri(DockerUrl));
- using (var client = clientConfig.CreateClient())
+ Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetrics);
+
+#if DEBUG
+ var server = new MetricServer("localhost", 3652);
+ _log.Info($"Open http://localhost:3652/metrics to initiate a probe.");
+#else
+ var server = new MetricServer(80);
+#endif
+
+ server.Start();
+
+ while (!cancel.IsCancellationRequested)
{
- var allContainers = await client.Containers.ListContainersAsync(new Docker.DotNet.Models.ContainersListParameters
+ try
{
- All = true
- }, cancel);
+ await Task.Delay(-1, cancel);
+ }
+ catch (TaskCanceledException) when (cancel.IsCancellationRequested)
+ {
+ // Totally normal - we are exiting.
+ break;
+ }
+ }
- _log.Info(Helpers.Debug.ToDebugString(allContainers));
+ await server.StopAsync();
+ }
+
+ private DockerTracker? _tracker;
+
+ /// <summary>
+ /// Called before every Prometheus collection in order to update metrics.
+ /// </summary>
+ /// <remarks>
+ /// The Docker API can be very slow at times, so there is a risk that the scrape will
+ /// just time out under load. To avoid that, we enforce a maximum update duration and
+ /// will give up on fetching new values if the update takes longer than that. If the
+ /// threshold is crossed, we simply allow the scrape to proceed with stale data, while
+ /// the update keeps running in the background, hopefully eventually succeeding.
+ ///
+ /// If multiple parallel scrapes are made, the results from the first one will be used
+ /// to satisfy all requests that come in while the data loading triggered by the first
+ /// scrape is still being performed (even if that scrape gives up waiting before the loading finishes).
+ /// This acts as a primitive form of rate control to avoid overloading the fragile Docker API.
+ /// The implementation for this is in DockerTracker.
+ /// </remarks>
+ private void UpdateMetrics()
+ {
+ _log.Debug("Probing Docker.");
+
+ using var inlineCancellation = new CancellationTokenSource(Constants.MaxInlineUpdateDuration);
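+ // WithAbandonment is assumed (likely an Axinom.Toolkit extension) to return a task that gets cancelled when
+ // the token fires, while the underlying update task keeps running in the background to completion.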
+ var updateTask = _tracker!.TryUpdateAsync()
+ .WithAbandonment(inlineCancellation.Token);
+
+ try
+ {
+ updateTask.WaitAndUnwrapExceptions();
+ }
+ catch (TaskCanceledException) when (inlineCancellation.IsCancellationRequested)
+ {
+ _log.Debug("Probe took too long - will return stale results and finish probe in background.");
+
+ // This is expected if it goes above the inline threshold, and will be ignored.
+ // Other exceptions are caught, logged, and ignored in DockerTracker itself.
+ ExporterLogicMetrics.InlineTimeouts.Inc();
+ }
+ catch (Exception ex)
+ {
+ // TODO: Now what? If we throw here prometheus-net will just reject the scrape...
+ // ... but what if this is a fatal error that we want to crash the app with?
+ _log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
+ Debugger.Break();
}
}
diff --git a/ExporterLogicMetrics.cs b/ExporterLogicMetrics.cs
new file mode 100644
index 0000000..e4cf518
--- /dev/null
+++ b/ExporterLogicMetrics.cs
@@ -0,0 +1,9 @@
+using Prometheus;
+
+namespace DockerExporter
+{
+ static class ExporterLogicMetrics
+ {
+ public static readonly Counter InlineTimeouts = Metrics.CreateCounter("docker_probe_inline_timeouts_total", "Total number of times we have forced the scrape to happen in the background and returned outdated data because performing an update inline took too long.");
+ }
+}
diff --git a/Program.cs b/Program.cs
index fb4f241..936a305 100644
--- a/Program.cs
+++ b/Program.cs
@@ -54,23 +54,9 @@ namespace DockerExporter
Environment.ExitCode = -1;
}
- catch (AggregateException ex)
- {
- foreach (var innerException in ex.InnerExceptions)
- {
- _log.Error(innerException.Message);
- _log.Error(innerException.GetType().Name);
- }
-
- Environment.ExitCode = -1;
- }
catch (Exception ex)
{
- if (!string.IsNullOrWhiteSpace(ex.Message))
- {
- _log.Error(ex.Message);
- _log.Error(ex.GetType().Name);
- }
+ _log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
Environment.ExitCode = -1;
}
@@ -141,7 +127,9 @@ namespace DockerExporter
// We default to displaying Info or higher but allow this to be reconfigured later, if the user wishes.
_filteringLogListener = new FilteringLogListener(new ConsoleLogListener())
{
+#if !DEBUG
MinimumSeverity = LogEntrySeverity.Info
+#endif
};
Log.Default.RegisterListener(_filteringLogListener);