A whole bunch of updates to logic
This commit is contained in:
parent
0f9886f076
commit
b55e8f33b4
9 changed files with 154 additions and 60 deletions
|
@ -1,4 +1,5 @@
|
|||
using System;
|
||||
using Prometheus;
|
||||
using System;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
|
@ -25,5 +26,10 @@ namespace DockerExporter
|
|||
/// more time than this. The next scrape will try again from scratch.
|
||||
/// </summary>
|
||||
public static readonly TimeSpan MaxTotalUpdateDuration = TimeSpan.FromMinutes(2);
|
||||
|
||||
/// <summary>
|
||||
/// The default buckets used to measure Docker probe operation durations.
|
||||
/// </summary>
|
||||
public static readonly double[] DurationBuckets = Histogram.ExponentialBuckets(0.5, 1.5, 14);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
using Axinom.Toolkit;
|
||||
using Prometheus;
|
||||
using Docker.DotNet;
|
||||
using Docker.DotNet.Models;
|
||||
using Prometheus;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
|
@ -22,10 +20,14 @@ namespace DockerExporter
|
|||
sealed class ContainerTracker : IDisposable
|
||||
{
|
||||
public string Id { get; }
|
||||
public string DisplayName { get; }
|
||||
|
||||
public ContainerTracker(string id)
|
||||
public ContainerTracker(string id, string displayName)
|
||||
{
|
||||
Id = id;
|
||||
DisplayName = displayName;
|
||||
|
||||
_metrics = new ContainerTrackerMetrics(id, displayName);
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
|
@ -34,27 +36,33 @@ namespace DockerExporter
|
|||
_stateMetrics?.Dispose();
|
||||
}
|
||||
|
||||
/// <summary>
/// Forwards the unpublish request to the resource and state metric sets,
/// skipping whichever of them has not been created.
/// </summary>
public void Unpublish()
{
    // Each field is read once into a local so the null check and the call see the same instance.
    var resourceMetrics = _resourceMetrics;
    if (resourceMetrics != null)
        resourceMetrics.Unpublish();

    var stateMetrics = _stateMetrics;
    if (stateMetrics != null)
        stateMetrics.Unpublish();
}
|
||||
|
||||
/// <summary>
|
||||
/// Requests the tracker to update its data set.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// May be called multiple times concurrently.
|
||||
///
|
||||
/// Method does not throw exceptions on transient failures, merely logs and ignores them.
|
||||
/// </remarks>
|
||||
public async Task TryUpdateAsync(DockerClient client, CancellationToken cancel)
|
||||
{
|
||||
ContainerInspectResponse container;
|
||||
StatsRecorder resourceStatsRecorder = new StatsRecorder();
|
||||
var resourceStatsRecorder = new StatsRecorder();
|
||||
|
||||
try
|
||||
{
|
||||
// First, inspect to get some basic information.
|
||||
container = await client.Containers.InspectContainerAsync(Id, cancel);
|
||||
using (_metrics.InspectContainerDuration.NewTimer())
|
||||
container = await client.Containers.InspectContainerAsync(Id, cancel);
|
||||
|
||||
// Then query for the latest resource usage stats (if container is running).
|
||||
if (container.State.Running)
|
||||
{
|
||||
using var statsTimer = _metrics.GetResourceStatsDuration.NewTimer();
|
||||
await client.Containers.GetContainerStatsAsync(Id, new ContainerStatsParameters
|
||||
{
|
||||
Stream = false // Only get latest, then stop.
|
||||
|
@ -63,12 +71,13 @@ namespace DockerExporter
|
|||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// TODO: DockerTrackerMetrics.ListContainersErrorCount.Inc();
|
||||
_metrics.FailedProbeCount.Inc();
|
||||
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
|
||||
_log.Debug(ex.ToString()); // Only to verbose output.
|
||||
|
||||
// Errors are ignored - if we fail to get data, we just skip an update and log the failure.
|
||||
// The next update will hopefully get past the error.
|
||||
// The next update will hopefully get past the error. For now, we just unpublish.
|
||||
Unpublish();
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -77,9 +86,8 @@ namespace DockerExporter
|
|||
// Now that we have the data assembled, update the metrics.
|
||||
if (_stateMetrics == null)
|
||||
{
|
||||
var displayName = GetDisplayNameOrId(container);
|
||||
_log.Debug($"First update of state metrics for {displayName} ({Id}).");
|
||||
_stateMetrics = new ContainerTrackerStateMetrics(Id, displayName);
|
||||
_log.Debug($"First update of state metrics for {DisplayName} ({Id}).");
|
||||
_stateMetrics = new ContainerTrackerStateMetrics(Id, DisplayName);
|
||||
}
|
||||
|
||||
UpdateStateMetrics(_stateMetrics, container);
|
||||
|
@ -88,16 +96,16 @@ namespace DockerExporter
|
|||
{
|
||||
if (_resourceMetrics == null)
|
||||
{
|
||||
var displayName = GetDisplayNameOrId(container);
|
||||
_log.Debug($"Initializing resource metrics for {displayName} ({Id}).");
|
||||
_resourceMetrics = new ContainerTrackerResourceMetrics(Id, displayName);
|
||||
_log.Debug($"Initializing resource metrics for {DisplayName} ({Id}).");
|
||||
_resourceMetrics = new ContainerTrackerResourceMetrics(Id, DisplayName);
|
||||
}
|
||||
|
||||
UpdateResourceMetrics(_resourceMetrics, container, resourceStatsRecorder.Response);
|
||||
}
|
||||
else
|
||||
{
|
||||
// TODO: It could be we already had resource metrics and now they should go away.
|
||||
// It could be we already had resource metrics and now they should go away.
|
||||
// They'll be recreated once we get the resource metrics again (e.g. after it starts).
|
||||
_resourceMetrics?.Dispose();
|
||||
_resourceMetrics = null;
|
||||
}
|
||||
|
@ -201,20 +209,10 @@ namespace DockerExporter
|
|||
public void Report(ContainerStatsResponse value) => Response = value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// If a display name can be determined, returns it. Otherwise returns the container ID.
|
||||
/// </summary>
|
||||
private static string GetDisplayNameOrId(ContainerInspectResponse container)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(container.Name))
|
||||
return container.Name.Trim('/');
|
||||
|
||||
return container.ID;
|
||||
}
|
||||
|
||||
// We just need a monotonically increasing timer that does not use excessively large numbers (no 1970 base).
|
||||
private static readonly Stopwatch CpuBaselineTimer = Stopwatch.StartNew();
|
||||
|
||||
private ContainerTrackerMetrics _metrics;
|
||||
private ContainerTrackerStateMetrics? _stateMetrics;
|
||||
private ContainerTrackerResourceMetrics? _resourceMetrics;
|
||||
|
||||
|
|
42
ContainerTrackerMetrics.cs
Normal file
42
ContainerTrackerMetrics.cs
Normal file
|
@ -0,0 +1,42 @@
|
|||
using Prometheus;
|
||||
using System;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
/// <summary>
/// Metrics that describe the probing of a single container: a per-container failure counter
/// plus the shared histograms that time the inspect and stats queries.
/// </summary>
sealed class ContainerTrackerMetrics : IDisposable
{
    private static readonly Counter BaseFailedProbeCount = Metrics.CreateCounter(
        "docker_probe_container_failed_total",
        "Number of times the exporter failed to collect information about a specific container.",
        new CounterConfiguration
        {
            LabelNames = new[] { "id", "display_name" }
        });

    private static readonly Histogram BaseInspectContainerDuration = CreateDurationHistogram(
        "docker_probe_inspect_duration_seconds",
        "How long it takes to query Docker for the basic information about a single container. Includes failed requests.");

    private static readonly Histogram BaseGetResourceStatsDuration = CreateDurationHistogram(
        "docker_probe_stats_duration_seconds",
        "How long it takes to query Docker for the resource usage of a single container. Includes failed requests.");

    /// <summary>
    /// Creates the failure counter child labelled with the given container ID and display name.
    /// </summary>
    public ContainerTrackerMetrics(string id, string displayName) =>
        FailedProbeCount = BaseFailedProbeCount.WithLabels(id, displayName);

    /// <summary>
    /// Failure counter labelled with this container's ID and display name.
    /// </summary>
    public Counter.Child FailedProbeCount { get; }

    // The two duration histograms carry no container labels, so every tracker shares the same series.
    // This avoids a large number of series per container; aggregate timings seem useful, per-container
    // ones less so. Can be expanded in the future if need be.
    public Histogram InspectContainerDuration
    {
        get { return BaseInspectContainerDuration; }
    }

    public Histogram GetResourceStatsDuration
    {
        get { return BaseGetResourceStatsDuration; }
    }

    /// <summary>
    /// Removes this container's labelled failure counter. The shared histograms are left untouched.
    /// </summary>
    public void Dispose() => FailedProbeCount.Remove();

    // Builds an unlabelled histogram that uses the exporter-wide duration buckets.
    private static Histogram CreateDurationHistogram(string name, string help)
    {
        var configuration = new HistogramConfiguration
        {
            Buckets = Constants.DurationBuckets
        };

        return Metrics.CreateHistogram(name, help, configuration);
    }
}
|
||||
}
|
|
@ -33,13 +33,24 @@ namespace DockerExporter
|
|||
|
||||
public void Dispose()
|
||||
{
|
||||
BaseCpuUsage.RemoveLabelled(_id, _displayName);
|
||||
BaseCpuCapacity.RemoveLabelled(_id, _displayName);
|
||||
BaseMemoryUsage.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalNetworkBytesIn.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalNetworkBytesOut.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalDiskBytesRead.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalDiskBytesWrite.RemoveLabelled(_id, _displayName);
|
||||
CpuUsage.Remove();
|
||||
CpuCapacity.Remove();
|
||||
MemoryUsage.Remove();
|
||||
TotalNetworkBytesIn.Remove();
|
||||
TotalNetworkBytesOut.Remove();
|
||||
TotalDiskBytesRead.Remove();
|
||||
TotalDiskBytesWrite.Remove();
|
||||
}
|
||||
|
||||
/// <summary>
/// Unpublishes each resource usage metric of this container, in declaration order.
/// </summary>
public void Unpublish()
{
    var resourceMetrics = new[]
    {
        CpuUsage,
        CpuCapacity,
        MemoryUsage,
        TotalNetworkBytesIn,
        TotalNetworkBytesOut,
        TotalDiskBytesRead,
        TotalDiskBytesWrite
    };

    foreach (var metric in resourceMetrics)
        metric.Unpublish();
}
|
||||
|
||||
// While logically counters, all of these are gauges because we do not know when Docker might reset the values.
|
||||
|
|
|
@ -12,22 +12,23 @@ namespace DockerExporter
|
|||
|
||||
public ContainerTrackerStateMetrics(string id, string displayName)
|
||||
{
|
||||
_id = id;
|
||||
_displayName = displayName;
|
||||
|
||||
RestartCount = BaseRestartCount.WithLabels(id, displayName);
|
||||
RunningState = BaseRunningState.WithLabels(id, displayName);
|
||||
StartTime = BaseStartTime.WithLabels(id, displayName);
|
||||
}
|
||||
|
||||
private readonly string _id;
|
||||
private readonly string _displayName;
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
BaseRestartCount.RemoveLabelled(_id, _displayName);
|
||||
BaseRunningState.RemoveLabelled(_id, _displayName);
|
||||
BaseStartTime.RemoveLabelled(_id, _displayName);
|
||||
RestartCount.Remove();
|
||||
RunningState.Remove();
|
||||
StartTime.Remove();
|
||||
}
|
||||
|
||||
/// <summary>
/// Unpublishes each state metric of this container: restart count, running state and start time.
/// </summary>
public void Unpublish()
{
    var stateMetrics = new[] { RestartCount, RunningState, StartTime };

    foreach (var metric in stateMetrics)
        metric.Unpublish();
}
|
||||
|
||||
private static readonly Gauge BaseRestartCount = Metrics
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
<PackageReference Include="Axinom.Toolkit" Version="14.0.0" />
|
||||
<PackageReference Include="Docker.DotNet" Version="3.125.2" />
|
||||
<PackageReference Include="Mono.Options" Version="5.3.0.1" />
|
||||
<PackageReference Include="prometheus-net" Version="3.4.0-pre-000079-eff2a83" />
|
||||
<PackageReference Include="prometheus-net" Version="3.4.0-pre-000082-546478d" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
|
|
@ -58,17 +58,21 @@ namespace DockerExporter
|
|||
|
||||
if (writeLock == null)
|
||||
{
|
||||
// Otherwise, we just no-op once the one that came before has updated the data.
|
||||
// Otherwise, we just no-op once the earlier probe request has updated the data.
|
||||
await WaitForPredecessorUpdateAsync(cts.Token);
|
||||
return;
|
||||
}
|
||||
|
||||
using var probeDurationTimer = DockerTrackerMetrics.ProbeDuration.NewTimer();
|
||||
|
||||
using var client = _clientConfiguration.CreateClient();
|
||||
|
||||
IList<ContainerListResponse> allContainers;
|
||||
|
||||
try
|
||||
{
|
||||
using var listDurationTimer = DockerTrackerMetrics.ListContainersDuration.NewTimer();
|
||||
|
||||
allContainers = await client.Containers.ListContainersAsync(new ContainersListParameters
|
||||
{
|
||||
All = true
|
||||
|
@ -83,9 +87,10 @@ namespace DockerExporter
|
|||
// Errors are ignored - if we fail to get data, we just skip an update and log the failure.
|
||||
// The next update will hopefully get past the error.
|
||||
|
||||
// We won't even try to update the trackers if we can't even list the containers.
|
||||
// TODO: Is this wise? What if individual container data is still available?
|
||||
// Then again, if listing containers already does not work, can you expect anything to work?
|
||||
// We will not remove the trackers yet but we will unpublish so we don't keep stale data published.
|
||||
foreach (var tracker in _containerTrackers.Values)
|
||||
tracker.Unpublish();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -122,21 +127,38 @@ namespace DockerExporter
|
|||
var newIds = containerIds.Except(trackedIds);
|
||||
foreach (var id in newIds)
|
||||
{
|
||||
_log.Debug($"Encountered container for the first time: {id}");
|
||||
_containerTrackers[id] = new ContainerTracker(id);
|
||||
var displayName = GetDisplayNameOrId(allContainers.Single(c => c.ID == id));
|
||||
_log.Debug($"Encountered container for the first time: {displayName} ({id}).");
|
||||
|
||||
_containerTrackers[id] = new ContainerTracker(id, displayName);
|
||||
}
|
||||
|
||||
// Remove the trackers of any removed containers.
|
||||
var removedIds = trackedIds.Except(containerIds);
|
||||
foreach (var id in removedIds)
|
||||
{
|
||||
_log.Debug($"Tracked container no longer exists. Removing: {id}");
|
||||
var tracker = _containerTrackers[id];
|
||||
|
||||
_log.Debug($"Tracked container no longer exists. Removing: {tracker.DisplayName} ({id}).");
|
||||
|
||||
tracker.Dispose();
|
||||
_containerTrackers.Remove(id);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// If a display name can be determined, returns it. Otherwise returns the container ID.
/// </summary>
/// <remarks>
/// The display name is the first entry of the container's name list with any leading or
/// trailing '/' characters removed. The ID is returned when the name list is missing or empty,
/// or when the first name is blank (including when nothing is left after removing the slashes).
/// </remarks>
/// <param name="container">The container entry to derive a display name from.</param>
/// <returns>The trimmed first name, or the container ID when no usable name exists.</returns>
private static string GetDisplayNameOrId(ContainerListResponse container)
{
    // Null-conditional access: a missing name list must lead to the ID fallback, not an exception.
    var name = container.Names?.FirstOrDefault()?.Trim('/');

    // Checked after trimming so a name made only of slashes does not become an empty display name.
    if (!string.IsNullOrWhiteSpace(name))
        return name;

    return container.ID;
}
|
||||
|
||||
// Synchronized - only single threaded access occurs.
|
||||
private readonly Dictionary<string, ContainerTracker> _containerTrackers = new Dictionary<string, ContainerTracker>();
|
||||
|
||||
|
|
|
@ -8,6 +8,18 @@ namespace DockerExporter
|
|||
.CreateGauge("docker_containers", "Number of containers that exist.");
|
||||
|
||||
public static readonly Counter ListContainersErrorCount = Metrics
|
||||
.CreateCounter("docker_list_containers_failed_total", "How many times the attempt to list all containers has failed.");
|
||||
.CreateCounter("docker_probe_list_containers_failed_total", "How many times the attempt to list all containers has failed.");
|
||||
|
||||
public static readonly Histogram ProbeDuration = Metrics
|
||||
.CreateHistogram("docker_probe_duration_seconds", "How long it takes to query Docker for the complete data set. Includes failed requests.", new HistogramConfiguration
|
||||
{
|
||||
Buckets = Constants.DurationBuckets
|
||||
});
|
||||
|
||||
public static readonly Histogram ListContainersDuration = Metrics
|
||||
.CreateHistogram("docker_probe_list_containers_duration_seconds", "How long it takes to query Docker for the list of containers. Includes failed requests.", new HistogramConfiguration
|
||||
{
|
||||
Buckets = Constants.DurationBuckets
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ namespace DockerExporter
|
|||
|
||||
_tracker = new DockerTracker(new Uri(DockerUrl));
|
||||
|
||||
Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetrics);
|
||||
Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetricsAsync);
|
||||
|
||||
var server = new MetricServer(9417);
|
||||
#if DEBUG
|
||||
|
@ -74,17 +74,19 @@ namespace DockerExporter
|
|||
/// This acts as a primitive form of rate control to avoid overloading the fragile Docker API.
|
||||
/// The implementation for this is in DockerTracker.
|
||||
/// </remarks>
|
||||
private void UpdateMetrics()
|
||||
private async Task UpdateMetricsAsync(CancellationToken cancel)
|
||||
{
|
||||
_log.Debug("Probing Docker.");
|
||||
|
||||
using var inlineCancellation = new CancellationTokenSource(Constants.MaxInlineUpdateDuration);
|
||||
using var combinedCancellation = CancellationTokenSource.CreateLinkedTokenSource(inlineCancellation.Token, cancel);
|
||||
|
||||
var updateTask = _tracker!.TryUpdateAsync()
|
||||
.WithAbandonment(inlineCancellation.Token);
|
||||
.WithAbandonment(combinedCancellation.Token);
|
||||
|
||||
try
|
||||
{
|
||||
updateTask.WaitAndUnwrapExceptions();
|
||||
await updateTask;
|
||||
}
|
||||
catch (TaskCanceledException) when (inlineCancellation.IsCancellationRequested)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue