diff --git a/Constants.cs b/Constants.cs index 8e3935c..64aa8c7 100644 --- a/Constants.cs +++ b/Constants.cs @@ -1,4 +1,5 @@ -using System; +using Prometheus; +using System; namespace DockerExporter { @@ -25,5 +26,10 @@ namespace DockerExporter /// more time than this. The next scrape will try again from scratch. /// public static readonly TimeSpan MaxTotalUpdateDuration = TimeSpan.FromMinutes(2); + + /// + /// The default buckets used to measure Docker probe operation durations. + /// + public static readonly double[] DurationBuckets = Histogram.ExponentialBuckets(0.5, 1.5, 14); } } diff --git a/ContainerTracker.cs b/ContainerTracker.cs index 142a484..0b3dab3 100644 --- a/ContainerTracker.cs +++ b/ContainerTracker.cs @@ -1,14 +1,12 @@ using Axinom.Toolkit; -using Prometheus; using Docker.DotNet; using Docker.DotNet.Models; +using Prometheus; using System; -using System.Collections.Generic; -using System.Text; -using System.Threading; -using System.Threading.Tasks; using System.Diagnostics; using System.Linq; +using System.Threading; +using System.Threading.Tasks; namespace DockerExporter { @@ -22,10 +20,14 @@ namespace DockerExporter sealed class ContainerTracker : IDisposable { public string Id { get; } + public string DisplayName { get; } - public ContainerTracker(string id) + public ContainerTracker(string id, string displayName) { Id = id; + DisplayName = displayName; + + _metrics = new ContainerTrackerMetrics(id, displayName); } public void Dispose() @@ -34,27 +36,33 @@ namespace DockerExporter _stateMetrics?.Dispose(); } + public void Unpublish() + { + _resourceMetrics?.Unpublish(); + _stateMetrics?.Unpublish(); + } + /// /// Requests the tracker to update its data set. /// /// - /// May be called multiple times concurrently. - /// /// Method does not throw exceptions on transient failures, merely logs and ignores them. /// public async Task TryUpdateAsync(DockerClient client, CancellationToken cancel) { ContainerInspectResponse container; - StatsRecorder resourceStatsRecorder = new StatsRecorder(); + var resourceStatsRecorder = new StatsRecorder(); try { // First, inspect to get some basic information. - container = await client.Containers.InspectContainerAsync(Id, cancel); + using (_metrics.InspectContainerDuration.NewTimer()) + container = await client.Containers.InspectContainerAsync(Id, cancel); // Then query for the latest resource usage stats (if container is running). if (container.State.Running) { + using var statsTimer = _metrics.GetResourceStatsDuration.NewTimer(); await client.Containers.GetContainerStatsAsync(Id, new ContainerStatsParameters { Stream = false // Only get latest, then stop. @@ -63,12 +71,13 @@ namespace DockerExporter } catch (Exception ex) { - // TODO: DockerTrackerMetrics.ListContainersErrorCount.Inc(); + _metrics.FailedProbeCount.Inc(); _log.Error(Helpers.Debug.GetAllExceptionMessages(ex)); _log.Debug(ex.ToString()); // Only to verbose output. // Errors are ignored - if we fail to get data, we just skip an update and log the failure. - // The next update will hopefully get past the error. + // The next update will hopefully get past the error. For now, we just unpublish. + Unpublish(); return; } @@ -77,9 +86,8 @@ namespace DockerExporter // Now that we have the data assembled, update the metrics. if (_stateMetrics == null) { - var displayName = GetDisplayNameOrId(container); - _log.Debug($"First update of state metrics for {displayName} ({Id})."); - _stateMetrics = new ContainerTrackerStateMetrics(Id, displayName); + _log.Debug($"First update of state metrics for {DisplayName} ({Id})."); + _stateMetrics = new ContainerTrackerStateMetrics(Id, DisplayName); } UpdateStateMetrics(_stateMetrics, container); @@ -88,16 +96,16 @@ namespace DockerExporter { if (_resourceMetrics == null) { - var displayName = GetDisplayNameOrId(container); - _log.Debug($"Initializing resource metrics for {displayName} ({Id})."); - _resourceMetrics = new ContainerTrackerResourceMetrics(Id, displayName); + _log.Debug($"Initializing resource metrics for {DisplayName} ({Id})."); + _resourceMetrics = new ContainerTrackerResourceMetrics(Id, DisplayName); } UpdateResourceMetrics(_resourceMetrics, container, resourceStatsRecorder.Response); } else { - // TODO: It could be we already had resource metrics and now they should go away. + // It could be we already had resource metrics and now they should go away. + // They'll be recreated once we get the resource metrics again (e.g. after it starts). _resourceMetrics?.Dispose(); _resourceMetrics = null; } @@ -201,20 +209,10 @@ namespace DockerExporter public void Report(ContainerStatsResponse value) => Response = value; } - /// - /// If a display name can be determined, returns it. Otherwise returns the container ID. - /// - private static string GetDisplayNameOrId(ContainerInspectResponse container) - { - if (!string.IsNullOrWhiteSpace(container.Name)) - return container.Name.Trim('/'); - - return container.ID; - } - // We just need a monotonically increasing timer that does not use excessively large numbers (no 1970 base). private static readonly Stopwatch CpuBaselineTimer = Stopwatch.StartNew(); + private ContainerTrackerMetrics _metrics; private ContainerTrackerStateMetrics? _stateMetrics; private ContainerTrackerResourceMetrics? _resourceMetrics; diff --git a/ContainerTrackerMetrics.cs b/ContainerTrackerMetrics.cs new file mode 100644 index 0000000..6ace054 --- /dev/null +++ b/ContainerTrackerMetrics.cs @@ -0,0 +1,42 @@ +using Prometheus; +using System; + +namespace DockerExporter +{ + sealed class ContainerTrackerMetrics : IDisposable + { + public Counter.Child FailedProbeCount { get; } + + // These two are NOT differentiated by container, just to avoid a large number of series for each container. + // Aggregate results seem useful, container scope less so. Can be expanded in the future if need be. + public Histogram InspectContainerDuration => BaseInspectContainerDuration; + public Histogram GetResourceStatsDuration => BaseGetResourceStatsDuration; + + public ContainerTrackerMetrics(string id, string displayName) + { + FailedProbeCount = BaseFailedProbeCount.WithLabels(id, displayName); + } + + public void Dispose() + { + FailedProbeCount.Remove(); + } + + private static readonly Counter BaseFailedProbeCount = Metrics.CreateCounter("docker_probe_container_failed_total", "Number of times the exporter failed to collect information about a specific container.", new CounterConfiguration + { + LabelNames = new[] { "id", "display_name" } + }); + + private static readonly Histogram BaseInspectContainerDuration = Metrics + .CreateHistogram("docker_probe_inspect_duration_seconds", "How long it takes to query Docker for the basic information about a single container. Includes failed requests.", new HistogramConfiguration + { + Buckets = Constants.DurationBuckets + }); + + private static readonly Histogram BaseGetResourceStatsDuration = Metrics + .CreateHistogram("docker_probe_stats_duration_seconds", "How long it takes to query Docker for the resource usage of a single container. Includes failed requests.", new HistogramConfiguration + { + Buckets = Constants.DurationBuckets + }); + } +} diff --git a/ContainerTrackerResourceMetrics.cs b/ContainerTrackerResourceMetrics.cs index 037f561..3586775 100644 --- a/ContainerTrackerResourceMetrics.cs +++ b/ContainerTrackerResourceMetrics.cs @@ -33,13 +33,24 @@ namespace DockerExporter public void Dispose() { - BaseCpuUsage.RemoveLabelled(_id, _displayName); - BaseCpuCapacity.RemoveLabelled(_id, _displayName); - BaseMemoryUsage.RemoveLabelled(_id, _displayName); - BaseTotalNetworkBytesIn.RemoveLabelled(_id, _displayName); - BaseTotalNetworkBytesOut.RemoveLabelled(_id, _displayName); - BaseTotalDiskBytesRead.RemoveLabelled(_id, _displayName); - BaseTotalDiskBytesWrite.RemoveLabelled(_id, _displayName); + CpuUsage.Remove(); + CpuCapacity.Remove(); + MemoryUsage.Remove(); + TotalNetworkBytesIn.Remove(); + TotalNetworkBytesOut.Remove(); + TotalDiskBytesRead.Remove(); + TotalDiskBytesWrite.Remove(); + } + + public void Unpublish() + { + CpuUsage.Unpublish(); + CpuCapacity.Unpublish(); + MemoryUsage.Unpublish(); + TotalNetworkBytesIn.Unpublish(); + TotalNetworkBytesOut.Unpublish(); + TotalDiskBytesRead.Unpublish(); + TotalDiskBytesWrite.Unpublish(); } // While logically counters, all of these are gauges because we do not know when Docker might reset the values. diff --git a/ContainerTrackerStateMetrics.cs b/ContainerTrackerStateMetrics.cs index a098a39..e71d13e 100644 --- a/ContainerTrackerStateMetrics.cs +++ b/ContainerTrackerStateMetrics.cs @@ -12,22 +12,23 @@ namespace DockerExporter public ContainerTrackerStateMetrics(string id, string displayName) { - _id = id; - _displayName = displayName; - RestartCount = BaseRestartCount.WithLabels(id, displayName); RunningState = BaseRunningState.WithLabels(id, displayName); StartTime = BaseStartTime.WithLabels(id, displayName); } - private readonly string _id; - private readonly string _displayName; - public void Dispose() { - BaseRestartCount.RemoveLabelled(_id, _displayName); - BaseRunningState.RemoveLabelled(_id, _displayName); - BaseStartTime.RemoveLabelled(_id, _displayName); + RestartCount.Remove(); + RunningState.Remove(); + StartTime.Remove(); + } + + public void Unpublish() + { + RestartCount.Unpublish(); + RunningState.Unpublish(); + StartTime.Unpublish(); } private static readonly Gauge BaseRestartCount = Metrics diff --git a/DockerExporter.csproj b/DockerExporter.csproj index 385899f..17b145b 100644 --- a/DockerExporter.csproj +++ b/DockerExporter.csproj @@ -25,7 +25,7 @@ - + diff --git a/DockerTracker.cs b/DockerTracker.cs index a6c7510..f456bc7 100644 --- a/DockerTracker.cs +++ b/DockerTracker.cs @@ -58,17 +58,21 @@ namespace DockerExporter if (writeLock == null) { - // Otherwise, we just no-op once the one that came before has updated the data. + // Otherwise, we just no-op once the earlier probe request has updated the data. await WaitForPredecessorUpdateAsync(cts.Token); return; } + using var probeDurationTimer = DockerTrackerMetrics.ProbeDuration.NewTimer(); + using var client = _clientConfiguration.CreateClient(); IList allContainers; try { + using var listDurationTimer = DockerTrackerMetrics.ListContainersDuration.NewTimer(); + allContainers = await client.Containers.ListContainersAsync(new ContainersListParameters { All = true @@ -83,9 +87,10 @@ namespace DockerExporter // Errors are ignored - if we fail to get data, we just skip an update and log the failure. // The next update will hopefully get past the error. - // We won't even try update the trackers if we can't even list the containers. - // TODO: Is this wise? What if individual container data is still available? - // Then again, if listing containers already does not work, can you expect anything to work? + // We will not remove the trackers yet but we will unpublish so we don't keep stale data published. + foreach (var tracker in _containerTrackers.Values) + tracker.Unpublish(); + return; } @@ -122,21 +127,38 @@ namespace DockerExporter var newIds = containerIds.Except(trackedIds); foreach (var id in newIds) { - _log.Debug($"Encountered container for the first time: {id}"); - _containerTrackers[id] = new ContainerTracker(id); + var displayName = GetDisplayNameOrId(allContainers.Single(c => c.ID == id)); + _log.Debug($"Encountered container for the first time: {displayName} ({id})."); + + _containerTrackers[id] = new ContainerTracker(id, displayName); } // Remove the trackers of any removed containers. var removedIds = trackedIds.Except(containerIds); foreach (var id in removedIds) { - _log.Debug($"Tracked container no longer exists. Removing: {id}"); var tracker = _containerTrackers[id]; + + _log.Debug($"Tracked container no longer exists. Removing: {tracker.DisplayName} ({id})."); + tracker.Dispose(); _containerTrackers.Remove(id); } } + /// + /// If a display name can be determined, returns it. Otherwise returns the container ID. + /// + private static string GetDisplayNameOrId(ContainerListResponse container) + { + var name = container.Names.FirstOrDefault(); + + if (!string.IsNullOrWhiteSpace(name)) + return name.Trim('/'); + + return container.ID; + } + // Synchronized - only single threaded access occurs. private readonly Dictionary _containerTrackers = new Dictionary(); diff --git a/DockerTrackerMetrics.cs b/DockerTrackerMetrics.cs index a6a9513..9abbb06 100644 --- a/DockerTrackerMetrics.cs +++ b/DockerTrackerMetrics.cs @@ -8,6 +8,18 @@ namespace DockerExporter .CreateGauge("docker_containers", "Number of containers that exist."); public static readonly Counter ListContainersErrorCount = Metrics - .CreateCounter("docker_list_containers_failed_total", "How many times the attempt to list all containers has failed."); + .CreateCounter("docker_probe_list_containers_failed_total", "How many times the attempt to list all containers has failed."); + + public static readonly Histogram ProbeDuration = Metrics + .CreateHistogram("docker_probe_duration_seconds", "How long it takes to query Docker for the complete data set. Includes failed requests.", new HistogramConfiguration + { + Buckets = Constants.DurationBuckets + }); + + public static readonly Histogram ListContainersDuration = Metrics + .CreateHistogram("docker_probe_list_containers_duration_seconds", "How long it takes to query Docker for the list of containers. Includes failed requests.", new HistogramConfiguration + { + Buckets = Constants.DurationBuckets + }); } } diff --git a/ExporterLogic.cs b/ExporterLogic.cs index 1ae03c7..6314db9 100644 --- a/ExporterLogic.cs +++ b/ExporterLogic.cs @@ -31,7 +31,7 @@ namespace DockerExporter _tracker = new DockerTracker(new Uri(DockerUrl)); - Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetrics); + Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetricsAsync); var server = new MetricServer(9417); #if DEBUG @@ -74,17 +74,19 @@ namespace DockerExporter /// This acts as a primitive form of rate control to avoid overloading the fragile Docker API. /// The implementation for this is in DockerTracker. /// - private void UpdateMetrics() + private async Task UpdateMetricsAsync(CancellationToken cancel) { _log.Debug("Probing Docker."); using var inlineCancellation = new CancellationTokenSource(Constants.MaxInlineUpdateDuration); + using var combinedCancellation = CancellationTokenSource.CreateLinkedTokenSource(inlineCancellation.Token, cancel); + var updateTask = _tracker!.TryUpdateAsync() - .WithAbandonment(inlineCancellation.Token); + .WithAbandonment(combinedCancellation.Token); try { - updateTask.WaitAndUnwrapExceptions(); + await updateTask; } catch (TaskCanceledException) when (inlineCancellation.IsCancellationRequested) {