Implemented basic probing
This commit is contained in:
parent
e5127dd413
commit
84e26ddd33
10 changed files with 610 additions and 26 deletions
16
Constants.cs
16
Constants.cs
|
@ -8,8 +8,22 @@ namespace DockerExporter
|
|||
public const string VersionString = "__VERSIONSTRING__";
|
||||
|
||||
/// <summary>
|
||||
/// Docker can sometimes be slow to respond. If that is the case, we just give up and try again later.
|
||||
/// Docker can sometimes be slow to respond. If that is the case, we just give up and try
|
||||
/// again later. This limit is applied per individual API call, so does not reflect the
|
||||
/// total possible duration of a scrape, which is handled by the timeout values below.
|
||||
/// </summary>
|
||||
public static readonly TimeSpan DockerCommandTimeout = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>
|
||||
/// We are willing to delay a single scrape up to this long to wait for fresh data.
|
||||
/// Beyond this point, the update can still continue but will be done in the background.
|
||||
/// </summary>
|
||||
public static readonly TimeSpan MaxInlineUpdateDuration = TimeSpan.FromSeconds(20);
|
||||
|
||||
/// <summary>
|
||||
/// Even if the update happens in the background, it will be cancelled if it takes
|
||||
/// more time than this. The next scrape will try again from scratch.
|
||||
/// </summary>
|
||||
public static readonly TimeSpan MaxTotalUpdateDuration = TimeSpan.FromMinutes(2);
|
||||
}
|
||||
}
|
||||
|
|
223
ContainerTracker.cs
Normal file
223
ContainerTracker.cs
Normal file
|
@ -0,0 +1,223 @@
|
|||
using Axinom.Toolkit;
|
||||
using Prometheus;
|
||||
using Docker.DotNet;
|
||||
using Docker.DotNet.Models;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
/// <summary>
|
||||
/// Tracks the status of one container and exports metrics, updating the data when new scrapes are requested.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// NOT thread-safe! No concurrent usage is expected.
|
||||
/// DockerTracker performs the necessary synchronization logic.
|
||||
/// </remarks>
|
||||
sealed class ContainerTracker : IDisposable
|
||||
{
|
||||
public string Id { get; }
|
||||
|
||||
public ContainerTracker(string id)
|
||||
{
|
||||
Id = id;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_resourceMetrics?.Dispose();
|
||||
_stateMetrics?.Dispose();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Requests the tracker to update its data set.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// May be called multiple times concurrently.
|
||||
///
|
||||
/// Method does not throw exceptions on transient failures, merely logs and ignores them.
|
||||
/// </remarks>
|
||||
public async Task TryUpdateAsync(DockerClient client, CancellationToken cancel)
|
||||
{
|
||||
ContainerInspectResponse container;
|
||||
StatsRecorder resourceStatsRecorder = new StatsRecorder();
|
||||
|
||||
try
|
||||
{
|
||||
// First, inspect to get some basic information.
|
||||
container = await client.Containers.InspectContainerAsync(Id, cancel);
|
||||
|
||||
// Then query for the latest resource usage stats (if container is running).
|
||||
if (container.State.Running)
|
||||
{
|
||||
await client.Containers.GetContainerStatsAsync(Id, new ContainerStatsParameters
|
||||
{
|
||||
Stream = false // Only get latest, then stop.
|
||||
}, resourceStatsRecorder, cancel);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// TODO: DockerTrackerMetrics.ListContainersErrorCount.Inc();
|
||||
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
|
||||
_log.Debug(ex.ToString()); // Only to verbose output.
|
||||
|
||||
// Errors are ignored - if we fail to get data, we just skip an update and log the failure.
|
||||
// The next update will hopefully get past the error.
|
||||
return;
|
||||
}
|
||||
|
||||
// If anything goes wrong below, it is a fatal error not to be ignored, so not in the try block.
|
||||
|
||||
// Now that we have the data assembled, update the metrics.
|
||||
if (_stateMetrics == null)
|
||||
{
|
||||
var displayName = GetDisplayNameOrId(container);
|
||||
_log.Debug($"First update of state metrics for {displayName} ({Id}).");
|
||||
_stateMetrics = new ContainerTrackerStateMetrics(Id, displayName);
|
||||
}
|
||||
|
||||
UpdateStateMetrics(_stateMetrics, container);
|
||||
|
||||
if (resourceStatsRecorder.Response != null)
|
||||
{
|
||||
if (_resourceMetrics == null)
|
||||
{
|
||||
var displayName = GetDisplayNameOrId(container);
|
||||
_log.Debug($"Initializing resource metrics for {displayName} ({Id}).");
|
||||
_resourceMetrics = new ContainerTrackerResourceMetrics(Id, displayName);
|
||||
}
|
||||
|
||||
UpdateResourceMetrics(_resourceMetrics, container, resourceStatsRecorder.Response);
|
||||
}
|
||||
else
|
||||
{
|
||||
// TODO: It could be we already had resource metrics and now they should go away.
|
||||
_resourceMetrics?.Dispose();
|
||||
_resourceMetrics = null;
|
||||
}
|
||||
}
|
||||
|
||||
private void UpdateStateMetrics(ContainerTrackerStateMetrics metrics, ContainerInspectResponse container)
|
||||
{
|
||||
metrics.RestartCount.Set(container.RestartCount);
|
||||
|
||||
if (container.State.Running)
|
||||
metrics.RunningState.Set(1);
|
||||
else if (container.State.Restarting)
|
||||
metrics.RunningState.Set(0.5);
|
||||
else
|
||||
metrics.RunningState.Set(0);
|
||||
|
||||
if (container.State.Running && !string.IsNullOrWhiteSpace(container.State.StartedAt))
|
||||
metrics.StartTime.SetToTimeUtc(DateTimeOffset.Parse(container.State.StartedAt));
|
||||
}
|
||||
|
||||
private void UpdateResourceMetrics(ContainerTrackerResourceMetrics metrics, ContainerInspectResponse container, ContainerStatsResponse resources)
|
||||
{
|
||||
// The resource reporting is very different for different operating systems.
|
||||
// This field is only used on Windows. We assume a container can't exist with 0 memory.
|
||||
bool isWindowsContainer = resources.MemoryStats.Commit != 0;
|
||||
|
||||
// CPU usage
|
||||
// The mechanism of calculation is the rate of increase in container CPU time versus available ("system") CPU time.
|
||||
// The idea here is that we build two series - one counting used CPU in whatever units
|
||||
// the other counting potentially available CPU in whatever units. The % always comes right.
|
||||
// Docker CPU usage on Windows counts 100ns ticks.
|
||||
// Docker CPU usage on Linux counts unspecified ticks in relation to some other stats.
|
||||
// See https://github.com/moby/moby/blob/eb131c5383db8cac633919f82abad86c99bffbe5/cli/command/container/stats_helpers.go#L175
|
||||
if (isWindowsContainer)
|
||||
{
|
||||
// To compensate for core count on Windows, we normalize the container usage to a single core.
|
||||
// We also normalize the available CPU time to a single core.
|
||||
// This way the Windows calculation is always per-core averaged.
|
||||
// A .NET DateTimeOffset tick is 100ns, exactly, so matches what Docker uses.
|
||||
metrics.CpuCapacity.Set(CpuBaselineTimer.Elapsed.Ticks);
|
||||
metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage / resources.NumProcs);
|
||||
}
|
||||
else
|
||||
{
|
||||
// This is counting all cores (right?).
|
||||
metrics.CpuCapacity.Set(resources.CPUStats.SystemUsage);
|
||||
metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage);
|
||||
}
|
||||
|
||||
// Memory usage
|
||||
if (isWindowsContainer)
|
||||
{
|
||||
// Windows reports Private Working Set in Docker stats... but seems to use Commit Bytes to enforce limit!
|
||||
// We want to report the same metric that is limited, so there we go.
|
||||
metrics.MemoryUsage.Set(resources.MemoryStats.Commit);
|
||||
}
|
||||
else
|
||||
{
|
||||
metrics.MemoryUsage.Set(resources.MemoryStats.Usage);
|
||||
}
|
||||
|
||||
// Network I/O
|
||||
if (resources.Networks == null)
|
||||
{
|
||||
metrics.TotalNetworkBytesIn.Set(0);
|
||||
metrics.TotalNetworkBytesOut.Set(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
metrics.TotalNetworkBytesIn.Set(resources.Networks.Values.Sum(n => (double)n.RxBytes));
|
||||
metrics.TotalNetworkBytesOut.Set(resources.Networks.Values.Sum(n => (double)n.TxBytes));
|
||||
}
|
||||
|
||||
// Disk I/O
|
||||
if (isWindowsContainer)
|
||||
{
|
||||
metrics.TotalDiskBytesRead.Set(resources.StorageStats.ReadSizeBytes);
|
||||
metrics.TotalDiskBytesWrite.Set(resources.StorageStats.WriteSizeBytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
var readEntries = resources.BlkioStats.IoServiceBytesRecursive
|
||||
.Where(entry => entry.Op.Equals("read", StringComparison.InvariantCultureIgnoreCase))
|
||||
.ToArray();
|
||||
|
||||
var writeEntries = resources.BlkioStats.IoServiceBytesRecursive
|
||||
.Where(entry => entry.Op.Equals("write", StringComparison.InvariantCultureIgnoreCase))
|
||||
.ToArray();
|
||||
|
||||
var totalRead = readEntries.Any() ? readEntries.Sum(entry => (long)entry.Value) : 0;
|
||||
var totalWrite = writeEntries.Any() ? writeEntries.Sum(entry => (long)entry.Value) : 0;
|
||||
|
||||
metrics.TotalDiskBytesRead.Set(totalRead);
|
||||
metrics.TotalDiskBytesWrite.Set(totalWrite);
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class StatsRecorder : IProgress<ContainerStatsResponse>
|
||||
{
|
||||
public ContainerStatsResponse? Response { get; private set; }
|
||||
public void Report(ContainerStatsResponse value) => Response = value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// If a display name can be determined, returns it. Otherwise returns the container ID.
|
||||
/// </summary>
|
||||
private static string GetDisplayNameOrId(ContainerInspectResponse container)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(container.Name))
|
||||
return container.Name.Trim('/');
|
||||
|
||||
return container.ID;
|
||||
}
|
||||
|
||||
// We just need a monotonically increasing timer that does not use excessively large numbers (no 1970 base).
|
||||
private static readonly Stopwatch CpuBaselineTimer = Stopwatch.StartNew();
|
||||
|
||||
private ContainerTrackerStateMetrics? _stateMetrics;
|
||||
private ContainerTrackerResourceMetrics? _resourceMetrics;
|
||||
|
||||
private readonly LogSource _log = Log.Default;
|
||||
}
|
||||
}
|
77
ContainerTrackerResourceMetrics.cs
Normal file
77
ContainerTrackerResourceMetrics.cs
Normal file
|
@ -0,0 +1,77 @@
|
|||
using Prometheus;
|
||||
using System;
|
||||
using System.Linq;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
sealed class ContainerTrackerResourceMetrics : IDisposable
|
||||
{
|
||||
public Gauge.Child CpuUsage { get; private set; }
|
||||
public Gauge.Child CpuCapacity { get; private set; }
|
||||
public Gauge.Child MemoryUsage { get; private set; }
|
||||
public Gauge.Child TotalNetworkBytesIn { get; private set; }
|
||||
public Gauge.Child TotalNetworkBytesOut { get; private set; }
|
||||
public Gauge.Child TotalDiskBytesRead { get; private set; }
|
||||
public Gauge.Child TotalDiskBytesWrite { get; private set; }
|
||||
|
||||
public ContainerTrackerResourceMetrics(string id, string displayName)
|
||||
{
|
||||
_id = id;
|
||||
_displayName = displayName;
|
||||
|
||||
CpuUsage = BaseCpuUsage.WithLabels(id, displayName);
|
||||
CpuCapacity = BaseCpuCapacity.WithLabels(id, displayName);
|
||||
MemoryUsage = BaseMemoryUsage.WithLabels(id, displayName);
|
||||
TotalNetworkBytesIn = BaseTotalNetworkBytesIn.WithLabels(id, displayName);
|
||||
TotalNetworkBytesOut = BaseTotalNetworkBytesOut.WithLabels(id, displayName);
|
||||
TotalDiskBytesRead = BaseTotalDiskBytesRead.WithLabels(id, displayName);
|
||||
TotalDiskBytesWrite = BaseTotalDiskBytesWrite.WithLabels(id, displayName);
|
||||
}
|
||||
|
||||
private readonly string _id;
|
||||
private readonly string _displayName;
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
BaseCpuUsage.RemoveLabelled(_id, _displayName);
|
||||
BaseCpuCapacity.RemoveLabelled(_id, _displayName);
|
||||
BaseMemoryUsage.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalNetworkBytesIn.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalNetworkBytesOut.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalDiskBytesRead.RemoveLabelled(_id, _displayName);
|
||||
BaseTotalDiskBytesWrite.RemoveLabelled(_id, _displayName);
|
||||
}
|
||||
|
||||
// While logically counters, all of these are gauges because we do not know when Docker might reset the values.
|
||||
|
||||
private static readonly Gauge BaseCpuUsage = Metrics
|
||||
.CreateGauge("docker_container_cpu_used_total", "Accumulated CPU usage of a container, in unspecified units, averaged for all logical CPUs usable by the container.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseCpuCapacity = Metrics
|
||||
.CreateGauge("docker_container_cpu_capacity_total", "All potential CPU usage available to a container, in unspecified units, averaged for all logical CPUs usable by the container. Start point of measurement is undefined - only relative values should be used in analytics.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseMemoryUsage = Metrics
|
||||
.CreateGauge("docker_container_memory_used_bytes", "Memory usage of a container.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseTotalNetworkBytesIn = Metrics
|
||||
.CreateGauge("docker_container_network_in_bytes", "Total bytes received by the container's network interfaces.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseTotalNetworkBytesOut = Metrics
|
||||
.CreateGauge("docker_container_network_out_bytes", "Total bytes sent by the container's network interfaces.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseTotalDiskBytesRead = Metrics
|
||||
.CreateGauge("docker_container_disk_read_bytes", "Total bytes read from disk by a container.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseTotalDiskBytesWrite = Metrics
|
||||
.CreateGauge("docker_container_disk_write_bytes", "Total bytes written to disk by a container.", ConfigureGauge());
|
||||
|
||||
private static string[] LabelNames(params string[] extra) =>
|
||||
new[] { "id", "display_name" }.Concat(extra).ToArray();
|
||||
|
||||
private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration
|
||||
{
|
||||
LabelNames = LabelNames(),
|
||||
SuppressInitialValue = true
|
||||
};
|
||||
}
|
||||
}
|
51
ContainerTrackerStateMetrics.cs
Normal file
51
ContainerTrackerStateMetrics.cs
Normal file
|
@ -0,0 +1,51 @@
|
|||
using Prometheus;
|
||||
using System;
|
||||
using System.Linq;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
sealed class ContainerTrackerStateMetrics : IDisposable
|
||||
{
|
||||
public Gauge.Child RestartCount { get; private set; }
|
||||
public Gauge.Child RunningState { get; private set; }
|
||||
public Gauge.Child StartTime { get; private set; }
|
||||
|
||||
public ContainerTrackerStateMetrics(string id, string displayName)
|
||||
{
|
||||
_id = id;
|
||||
_displayName = displayName;
|
||||
|
||||
RestartCount = BaseRestartCount.WithLabels(id, displayName);
|
||||
RunningState = BaseRunningState.WithLabels(id, displayName);
|
||||
StartTime = BaseStartTime.WithLabels(id, displayName);
|
||||
}
|
||||
|
||||
private readonly string _id;
|
||||
private readonly string _displayName;
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
BaseRestartCount.RemoveLabelled(_id, _displayName);
|
||||
BaseRunningState.RemoveLabelled(_id, _displayName);
|
||||
BaseStartTime.RemoveLabelled(_id, _displayName);
|
||||
}
|
||||
|
||||
private static readonly Gauge BaseRestartCount = Metrics
|
||||
.CreateGauge("docker_container_restart_count", "Number of times the runtime has restarted this container without explicit user action, since the container was last started.", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseRunningState = Metrics
|
||||
.CreateGauge("docker_container_running_state", "Whether the container is running (value 1), restarting (value 0.5) or stopped (value 0).", ConfigureGauge());
|
||||
|
||||
private static readonly Gauge BaseStartTime = Metrics
|
||||
.CreateGauge("docker_container_start_time", "Timestamp indicating when the container was started. Does not get reset by automatic restarts.", ConfigureGauge());
|
||||
|
||||
private static string[] LabelNames(params string[] extra) =>
|
||||
new[] { "id", "display_name" }.Concat(extra).ToArray();
|
||||
|
||||
private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration
|
||||
{
|
||||
LabelNames = LabelNames(),
|
||||
SuppressInitialValue = true
|
||||
};
|
||||
}
|
||||
}
|
|
@ -25,6 +25,7 @@
|
|||
<PackageReference Include="Axinom.Toolkit" Version="14.0.0" />
|
||||
<PackageReference Include="Docker.DotNet" Version="3.125.2" />
|
||||
<PackageReference Include="Mono.Options" Version="5.3.0.1" />
|
||||
<PackageReference Include="prometheus-net" Version="3.4.0-pre-000079-eff2a83" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
|
145
DockerTracker.cs
Normal file
145
DockerTracker.cs
Normal file
|
@ -0,0 +1,145 @@
|
|||
using Axinom.Toolkit;
|
||||
using Docker.DotNet;
|
||||
using Docker.DotNet.Models;
|
||||
using Prometheus;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
/// <summary>
|
||||
/// Tracks the status of one instance of Docker and exports metrics, updating the data when new scrapes are requested.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Thread-safe.
|
||||
/// </remarks>
|
||||
sealed class DockerTracker
|
||||
{
|
||||
public Uri DockerUrl { get; }
|
||||
|
||||
private readonly DockerClientConfiguration _clientConfiguration;
|
||||
|
||||
// If an execution can get the lock on first try, it will really perform the update.
|
||||
// Otherwise, it will wait for the lock and then perform a no-op update to just leave
|
||||
// the tracker with the same data the just-finished update generated.
|
||||
// This acts as basic rate control.
|
||||
private readonly SemaphoreSlim _updateLock = new SemaphoreSlim(1);
|
||||
|
||||
public DockerTracker(Uri dockerUrl)
|
||||
{
|
||||
DockerUrl = dockerUrl;
|
||||
|
||||
// TODO: Support mutual authentication via certificates.
|
||||
_clientConfiguration = new DockerClientConfiguration(dockerUrl, null, Constants.DockerCommandTimeout);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Requests the tracker to update its data set.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// May be called multiple times concurrently.
|
||||
///
|
||||
/// The method returns to signal that the trackerss of all containers
|
||||
/// when the method was called have attempted an update to their data.
|
||||
/// It may be that some updates failed - all we can say is that we tried.
|
||||
///
|
||||
/// Method does not throw exceptions on transient failures, merely logs and ignores them.
|
||||
/// </remarks>
|
||||
public async Task TryUpdateAsync()
|
||||
{
|
||||
using var cts = new CancellationTokenSource(Constants.MaxTotalUpdateDuration);
|
||||
|
||||
// If we get this lock, we will actually perform the update.
|
||||
using var writeLock = await SemaphoreLock.TryTakeAsync(_updateLock, TimeSpan.Zero);
|
||||
|
||||
if (writeLock == null)
|
||||
{
|
||||
// Otherwise, we just no-op once the one that came before has updated the data.
|
||||
await WaitForPredecessorUpdateAsync(cts.Token);
|
||||
return;
|
||||
}
|
||||
|
||||
using var client = _clientConfiguration.CreateClient();
|
||||
|
||||
IList<ContainerListResponse> allContainers;
|
||||
|
||||
try
|
||||
{
|
||||
allContainers = await client.Containers.ListContainersAsync(new ContainersListParameters
|
||||
{
|
||||
All = true
|
||||
}, cts.Token);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
DockerTrackerMetrics.ListContainersErrorCount.Inc();
|
||||
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
|
||||
_log.Debug(ex.ToString()); // Only to verbose output.
|
||||
|
||||
// Errors are ignored - if we fail to get data, we just skip an update and log the failure.
|
||||
// The next update will hopefully get past the error.
|
||||
|
||||
// We won't even try update the trackers if we can't even list the containers.
|
||||
// TODO: Is this wise? What if individual container data is still available?
|
||||
// Then again, if listing containers already does not work, can you expect anything to work?
|
||||
return;
|
||||
}
|
||||
|
||||
DockerTrackerMetrics.ContainerCount.Set(allContainers.Count);
|
||||
SynchronizeTrackerSet(allContainers);
|
||||
|
||||
// Update each tracker. We do them in parallel to minimize the total time span spent on probing.
|
||||
var updateTasks = new List<Task>();
|
||||
|
||||
foreach (var tracker in _containerTrackers.Values)
|
||||
updateTasks.Add(tracker.TryUpdateAsync(client, cts.Token));
|
||||
|
||||
// Only exceptions from the update calls should be terminal exceptions,
|
||||
// so it is fine not to catch anything that may be thrown here.
|
||||
await Task.WhenAll(updateTasks);
|
||||
}
|
||||
|
||||
private async Task WaitForPredecessorUpdateAsync(CancellationToken cancel)
|
||||
{
|
||||
_log.Debug("Will not trigger new probe as it overlaps with existing probe.");
|
||||
using var readLock = await SemaphoreLock.TakeAsync(_updateLock, cancel);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Ensures that we have a tracker for every listed container
|
||||
/// and removes trackers for any containers not in the list.
|
||||
/// </summary>
|
||||
private void SynchronizeTrackerSet(IList<ContainerListResponse> allContainers)
|
||||
{
|
||||
var containerIds = allContainers.Select(c => c.ID).ToArray();
|
||||
var trackedIds = _containerTrackers.Keys.ToArray();
|
||||
|
||||
// Create a tracker for any new containers.
|
||||
var newIds = containerIds.Except(trackedIds);
|
||||
foreach (var id in newIds)
|
||||
{
|
||||
_log.Debug($"Encountered container for the first time: {id}");
|
||||
_containerTrackers[id] = new ContainerTracker(id);
|
||||
}
|
||||
|
||||
// Remove the trackers of any removed containers.
|
||||
var removedIds = trackedIds.Except(containerIds);
|
||||
foreach (var id in removedIds)
|
||||
{
|
||||
_log.Debug($"Tracked container no longer exists. Removing: {id}");
|
||||
var tracker = _containerTrackers[id];
|
||||
tracker.Dispose();
|
||||
_containerTrackers.Remove(id);
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronized - only single threaded access occurs.
|
||||
private readonly Dictionary<string, ContainerTracker> _containerTrackers = new Dictionary<string, ContainerTracker>();
|
||||
|
||||
private readonly LogSource _log = Log.Default;
|
||||
}
|
||||
}
|
13
DockerTrackerMetrics.cs
Normal file
13
DockerTrackerMetrics.cs
Normal file
|
@ -0,0 +1,13 @@
|
|||
using Prometheus;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
sealed class DockerTrackerMetrics
|
||||
{
|
||||
public static readonly Gauge ContainerCount = Metrics
|
||||
.CreateGauge("docker_containers", "Number of containers that exist.");
|
||||
|
||||
public static readonly Counter ListContainersErrorCount = Metrics
|
||||
.CreateCounter("docker_list_containers_failed_total", "How many times the attempt to list all containers has failed.");
|
||||
}
|
||||
}
|
|
@ -1,8 +1,7 @@
|
|||
using Axinom.Toolkit;
|
||||
using Docker.DotNet;
|
||||
using Prometheus;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
using System.Diagnostics;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
|
@ -15,6 +14,7 @@ namespace DockerExporter
|
|||
public ExporterLogic()
|
||||
{
|
||||
// Default value only valid if not running as container.
|
||||
// This is intended for development purposes only.
|
||||
if (Helpers.Environment.IsMicrosoftOperatingSystem())
|
||||
{
|
||||
DockerUrl = "npipe://./pipe/docker_engine";
|
||||
|
@ -27,18 +27,81 @@ namespace DockerExporter
|
|||
|
||||
public async Task RunAsync(CancellationToken cancel)
|
||||
{
|
||||
_log.Info($"Connecting to Docker via {DockerUrl}");
|
||||
_log.Info($"Configured to probe Docker on {DockerUrl}");
|
||||
|
||||
var clientConfig = new DockerClientConfiguration(new Uri(DockerUrl), null, Constants.DockerCommandTimeout);
|
||||
_tracker = new DockerTracker(new Uri(DockerUrl));
|
||||
|
||||
using (var client = clientConfig.CreateClient())
|
||||
Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetrics);
|
||||
|
||||
#if DEBUG
|
||||
var server = new MetricServer("localhost", 3652);
|
||||
_log.Info($"Open http://localhost:3652/metrics to initiate a probe.");
|
||||
#else
|
||||
var server = new MetricServer(80);
|
||||
#endif
|
||||
|
||||
server.Start();
|
||||
|
||||
while (!cancel.IsCancellationRequested)
|
||||
{
|
||||
var allContainers = await client.Containers.ListContainersAsync(new Docker.DotNet.Models.ContainersListParameters
|
||||
try
|
||||
{
|
||||
All = true
|
||||
}, cancel);
|
||||
await Task.Delay(-1, cancel);
|
||||
}
|
||||
catch (TaskCanceledException) when (cancel.IsCancellationRequested)
|
||||
{
|
||||
// Totally normal - we are exiting.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
_log.Info(Helpers.Debug.ToDebugString(allContainers));
|
||||
await server.StopAsync();
|
||||
}
|
||||
|
||||
private DockerTracker? _tracker;
|
||||
|
||||
/// <summary>
|
||||
/// Called before every Prometheus collection in order to update metrics.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The Docker API can be very slow at times, so there is a risk that the scrape will
|
||||
/// just time out under load. To avoid that, we enforce a maximum update duration and
|
||||
/// will give up on fetching new values if the update takes longer than that. If the
|
||||
/// threshold is crossed, we simply allow the scrape to proceed with stale data, while
|
||||
/// the update keeps running in the background, hopefully eventually succeeding.
|
||||
///
|
||||
/// If multiple parallel scrapes are made, the results from the first one will be used
|
||||
/// to satisfy all requests that come in while the data loading triggered by the first
|
||||
/// scrape is still being performed (even if we give up with the scrape before loading finishes).
|
||||
/// This acts as a primitive form of rate control to avoid overloading the fragile Docker API.
|
||||
/// The implementation for this is in DockerTracker.
|
||||
/// </remarks>
|
||||
private void UpdateMetrics()
|
||||
{
|
||||
_log.Debug("Probing Docker.");
|
||||
|
||||
using var inlineCancellation = new CancellationTokenSource(Constants.MaxInlineUpdateDuration);
|
||||
var updateTask = _tracker!.TryUpdateAsync()
|
||||
.WithAbandonment(inlineCancellation.Token);
|
||||
|
||||
try
|
||||
{
|
||||
updateTask.WaitAndUnwrapExceptions();
|
||||
}
|
||||
catch (TaskCanceledException) when (inlineCancellation.IsCancellationRequested)
|
||||
{
|
||||
_log.Debug("Probe took too long - will return stale results and finish probe in background.");
|
||||
|
||||
// This is expected if it goes above the inline threshold, and will be ignored.
|
||||
// Other exceptions are caught, logged, and ignored in DockerState itself.
|
||||
ExporterLogicMetrics.InlineTimeouts.Inc();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// TODO: Now what? If we throw here prometheus-net will just reject the scrape...
|
||||
// ... but what if this is a fatal error that we want to crash the app with?
|
||||
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
|
||||
Debugger.Break();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
9
ExporterLogicMetrics.cs
Normal file
9
ExporterLogicMetrics.cs
Normal file
|
@ -0,0 +1,9 @@
|
|||
using Prometheus;
|
||||
|
||||
namespace DockerExporter
|
||||
{
|
||||
static class ExporterLogicMetrics
|
||||
{
|
||||
public static readonly Counter InlineTimeouts = Metrics.CreateCounter("docker_probe_inline_timeouts_total", "Total number of times we have forced the scrape to happen in the background and returned outdated data because performing an update inline took too long.");
|
||||
}
|
||||
}
|
18
Program.cs
18
Program.cs
|
@ -54,23 +54,9 @@ namespace DockerExporter
|
|||
|
||||
Environment.ExitCode = -1;
|
||||
}
|
||||
catch (AggregateException ex)
|
||||
{
|
||||
foreach (var innerException in ex.InnerExceptions)
|
||||
{
|
||||
_log.Error(innerException.Message);
|
||||
_log.Error(innerException.GetType().Name);
|
||||
}
|
||||
|
||||
Environment.ExitCode = -1;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(ex.Message))
|
||||
{
|
||||
_log.Error(ex.Message);
|
||||
_log.Error(ex.GetType().Name);
|
||||
}
|
||||
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
|
||||
|
||||
Environment.ExitCode = -1;
|
||||
}
|
||||
|
@ -141,7 +127,9 @@ namespace DockerExporter
|
|||
// We default to displaying Info or higher but allow this to be reconfiured later, if the user wishes.
|
||||
_filteringLogListener = new FilteringLogListener(new ConsoleLogListener())
|
||||
{
|
||||
#if !DEBUG
|
||||
MinimumSeverity = LogEntrySeverity.Info
|
||||
#endif
|
||||
};
|
||||
|
||||
Log.Default.RegisterListener(_filteringLogListener);
|
||||
|
|
Loading…
Add table
Reference in a new issue