Implemented basic probing

Sander Saares 2019-12-22 18:10:34 +00:00
parent e5127dd413
commit 84e26ddd33
10 changed files with 610 additions and 26 deletions

Constants.cs

@ -8,8 +8,22 @@ namespace DockerExporter
public const string VersionString = "__VERSIONSTRING__";
/// <summary>
/// Docker can sometimes be slow to respond. If that is the case, we just give up and try again later.
/// Docker can sometimes be slow to respond. If that is the case, we just give up and try
/// again later. This limit is applied per individual API call, so does not reflect the
/// total possible duration of a scrape, which is handled by the timeout values below.
/// </summary>
public static readonly TimeSpan DockerCommandTimeout = TimeSpan.FromSeconds(30);
/// <summary>
/// We are willing to delay a single scrape up to this long to wait for fresh data.
/// Beyond this point, the update can still continue but will be done in the background.
/// </summary>
public static readonly TimeSpan MaxInlineUpdateDuration = TimeSpan.FromSeconds(20);
/// <summary>
/// Even if the update happens in the background, it will be cancelled if it takes
/// more time than this. The next scrape will try again from scratch.
/// </summary>
public static readonly TimeSpan MaxTotalUpdateDuration = TimeSpan.FromMinutes(2);
}
}
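
For orientation, the three timeouts nest roughly as follows in the rest of the commit. This is a simplified sketch, not code from the diff; the client configuration and the two cancellation token sources appear in DockerTracker and ExporterLogic below, and dockerUrl stands for the Docker endpoint Uri.

// Per API call: every individual Docker request is given up on after DockerCommandTimeout.
var configuration = new DockerClientConfiguration(dockerUrl, null, Constants.DockerCommandTimeout);

// Per update: the entire update attempt is cancelled once MaxTotalUpdateDuration elapses.
using var updateCts = new CancellationTokenSource(Constants.MaxTotalUpdateDuration);

// Per scrape: a scrape waits at most MaxInlineUpdateDuration for fresh data, after which
// it serves the previous values while the update continues in the background.
using var inlineCts = new CancellationTokenSource(Constants.MaxInlineUpdateDuration);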

ContainerTracker.cs (new file, 223 lines)

@ -0,0 +1,223 @@
using Axinom.Toolkit;
using Prometheus;
using Docker.DotNet;
using Docker.DotNet.Models;
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Diagnostics;
using System.Linq;
namespace DockerExporter
{
/// <summary>
/// Tracks the status of one container and exports metrics, updating the data when new scrapes are requested.
/// </summary>
/// <remarks>
/// NOT thread-safe! No concurrent usage is expected.
/// DockerTracker performs the necessary synchronization logic.
/// </remarks>
sealed class ContainerTracker : IDisposable
{
public string Id { get; }
public ContainerTracker(string id)
{
Id = id;
}
public void Dispose()
{
_resourceMetrics?.Dispose();
_stateMetrics?.Dispose();
}
/// <summary>
/// Requests the tracker to update its data set.
/// </summary>
/// <remarks>
/// May be called multiple times concurrently.
///
/// Method does not throw exceptions on transient failures, merely logs and ignores them.
/// </remarks>
public async Task TryUpdateAsync(DockerClient client, CancellationToken cancel)
{
ContainerInspectResponse container;
StatsRecorder resourceStatsRecorder = new StatsRecorder();
try
{
// First, inspect to get some basic information.
container = await client.Containers.InspectContainerAsync(Id, cancel);
// Then query for the latest resource usage stats (if container is running).
if (container.State.Running)
{
await client.Containers.GetContainerStatsAsync(Id, new ContainerStatsParameters
{
Stream = false // Only get latest, then stop.
}, resourceStatsRecorder, cancel);
}
}
catch (Exception ex)
{
// TODO: DockerTrackerMetrics.ListContainersErrorCount.Inc();
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
_log.Debug(ex.ToString()); // Only to verbose output.
// Errors are ignored - if we fail to get data, we just skip an update and log the failure.
// The next update will hopefully get past the error.
return;
}
// If anything goes wrong below, it is a fatal error not to be ignored, so it is deliberately outside the try block.
// Now that we have the data assembled, update the metrics.
if (_stateMetrics == null)
{
var displayName = GetDisplayNameOrId(container);
_log.Debug($"First update of state metrics for {displayName} ({Id}).");
_stateMetrics = new ContainerTrackerStateMetrics(Id, displayName);
}
UpdateStateMetrics(_stateMetrics, container);
if (resourceStatsRecorder.Response != null)
{
if (_resourceMetrics == null)
{
var displayName = GetDisplayNameOrId(container);
_log.Debug($"Initializing resource metrics for {displayName} ({Id}).");
_resourceMetrics = new ContainerTrackerResourceMetrics(Id, displayName);
}
UpdateResourceMetrics(_resourceMetrics, container, resourceStatsRecorder.Response);
}
else
{
// TODO: It could be we already had resource metrics and now they should go away.
_resourceMetrics?.Dispose();
_resourceMetrics = null;
}
}
private void UpdateStateMetrics(ContainerTrackerStateMetrics metrics, ContainerInspectResponse container)
{
metrics.RestartCount.Set(container.RestartCount);
if (container.State.Running)
metrics.RunningState.Set(1);
else if (container.State.Restarting)
metrics.RunningState.Set(0.5);
else
metrics.RunningState.Set(0);
if (container.State.Running && !string.IsNullOrWhiteSpace(container.State.StartedAt))
metrics.StartTime.SetToTimeUtc(DateTimeOffset.Parse(container.State.StartedAt));
}
private void UpdateResourceMetrics(ContainerTrackerResourceMetrics metrics, ContainerInspectResponse container, ContainerStatsResponse resources)
{
// The resource reporting is very different for different operating systems.
// The Commit field is only populated on Windows. We assume a container can't exist with 0 bytes of memory.
bool isWindowsContainer = resources.MemoryStats.Commit != 0;
// CPU usage
// The mechanism of calculation is the rate of increase in container CPU time versus available ("system") CPU time.
// The idea here is that we build two series - one counting used CPU in whatever units,
// the other counting potentially available CPU in the same units. The percentage always comes out right.
// Docker CPU usage on Windows counts 100ns ticks.
// Docker CPU usage on Linux counts unspecified ticks in relation to some other stats.
// See https://github.com/moby/moby/blob/eb131c5383db8cac633919f82abad86c99bffbe5/cli/command/container/stats_helpers.go#L175
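// For example (not part of this commit), the two exported series could be combined on the
// Prometheus side into a per-container CPU percentage with a query along the lines of
//   rate(docker_container_cpu_used_total[1m]) / rate(docker_container_cpu_capacity_total[1m])
// because both series advance in the same (unspecified) units.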
if (isWindowsContainer)
{
// To compensate for core count on Windows, we normalize the container usage to a single core.
// We also normalize the available CPU time to a single core.
// This way the Windows calculation is always per-core averaged.
// A .NET DateTimeOffset tick is 100ns, exactly, so matches what Docker uses.
metrics.CpuCapacity.Set(CpuBaselineTimer.Elapsed.Ticks);
metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage / resources.NumProcs);
}
else
{
// This is counting all cores (right?).
metrics.CpuCapacity.Set(resources.CPUStats.SystemUsage);
metrics.CpuUsage.Set(resources.CPUStats.CPUUsage.TotalUsage);
}
// Memory usage
if (isWindowsContainer)
{
// Windows reports Private Working Set in Docker stats... but seems to use Commit Bytes to enforce the limit!
// We want to report the same metric that the limit applies to, so that is what we use.
metrics.MemoryUsage.Set(resources.MemoryStats.Commit);
}
else
{
metrics.MemoryUsage.Set(resources.MemoryStats.Usage);
}
// Network I/O
if (resources.Networks == null)
{
metrics.TotalNetworkBytesIn.Set(0);
metrics.TotalNetworkBytesOut.Set(0);
}
else
{
metrics.TotalNetworkBytesIn.Set(resources.Networks.Values.Sum(n => (double)n.RxBytes));
metrics.TotalNetworkBytesOut.Set(resources.Networks.Values.Sum(n => (double)n.TxBytes));
}
// Disk I/O
if (isWindowsContainer)
{
metrics.TotalDiskBytesRead.Set(resources.StorageStats.ReadSizeBytes);
metrics.TotalDiskBytesWrite.Set(resources.StorageStats.WriteSizeBytes);
}
else
{
var readEntries = resources.BlkioStats.IoServiceBytesRecursive
.Where(entry => entry.Op.Equals("read", StringComparison.InvariantCultureIgnoreCase))
.ToArray();
var writeEntries = resources.BlkioStats.IoServiceBytesRecursive
.Where(entry => entry.Op.Equals("write", StringComparison.InvariantCultureIgnoreCase))
.ToArray();
var totalRead = readEntries.Any() ? readEntries.Sum(entry => (long)entry.Value) : 0;
var totalWrite = writeEntries.Any() ? writeEntries.Sum(entry => (long)entry.Value) : 0;
metrics.TotalDiskBytesRead.Set(totalRead);
metrics.TotalDiskBytesWrite.Set(totalWrite);
}
}
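// Receives the single stats response that GetContainerStatsAsync reports via IProgress
// when streaming is disabled (Stream = false above).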
private sealed class StatsRecorder : IProgress<ContainerStatsResponse>
{
public ContainerStatsResponse? Response { get; private set; }
public void Report(ContainerStatsResponse value) => Response = value;
}
/// <summary>
/// If a display name can be determined, returns it. Otherwise returns the container ID.
/// </summary>
private static string GetDisplayNameOrId(ContainerInspectResponse container)
{
if (!string.IsNullOrWhiteSpace(container.Name))
return container.Name.Trim('/');
return container.ID;
}
// We just need a monotonically increasing timer that does not use excessively large numbers (no 1970 base).
private static readonly Stopwatch CpuBaselineTimer = Stopwatch.StartNew();
private ContainerTrackerStateMetrics? _stateMetrics;
private ContainerTrackerResourceMetrics? _resourceMetrics;
private readonly LogSource _log = Log.Default;
}
}

ContainerTrackerResourceMetrics.cs (new file, 77 lines)

@ -0,0 +1,77 @@
using Prometheus;
using System;
using System.Linq;
namespace DockerExporter
{
sealed class ContainerTrackerResourceMetrics : IDisposable
{
public Gauge.Child CpuUsage { get; private set; }
public Gauge.Child CpuCapacity { get; private set; }
public Gauge.Child MemoryUsage { get; private set; }
public Gauge.Child TotalNetworkBytesIn { get; private set; }
public Gauge.Child TotalNetworkBytesOut { get; private set; }
public Gauge.Child TotalDiskBytesRead { get; private set; }
public Gauge.Child TotalDiskBytesWrite { get; private set; }
public ContainerTrackerResourceMetrics(string id, string displayName)
{
_id = id;
_displayName = displayName;
CpuUsage = BaseCpuUsage.WithLabels(id, displayName);
CpuCapacity = BaseCpuCapacity.WithLabels(id, displayName);
MemoryUsage = BaseMemoryUsage.WithLabels(id, displayName);
TotalNetworkBytesIn = BaseTotalNetworkBytesIn.WithLabels(id, displayName);
TotalNetworkBytesOut = BaseTotalNetworkBytesOut.WithLabels(id, displayName);
TotalDiskBytesRead = BaseTotalDiskBytesRead.WithLabels(id, displayName);
TotalDiskBytesWrite = BaseTotalDiskBytesWrite.WithLabels(id, displayName);
}
private readonly string _id;
private readonly string _displayName;
public void Dispose()
{
BaseCpuUsage.RemoveLabelled(_id, _displayName);
BaseCpuCapacity.RemoveLabelled(_id, _displayName);
BaseMemoryUsage.RemoveLabelled(_id, _displayName);
BaseTotalNetworkBytesIn.RemoveLabelled(_id, _displayName);
BaseTotalNetworkBytesOut.RemoveLabelled(_id, _displayName);
BaseTotalDiskBytesRead.RemoveLabelled(_id, _displayName);
BaseTotalDiskBytesWrite.RemoveLabelled(_id, _displayName);
}
// While logically counters, all of these are gauges because we do not know when Docker might reset the values.
private static readonly Gauge BaseCpuUsage = Metrics
.CreateGauge("docker_container_cpu_used_total", "Accumulated CPU usage of a container, in unspecified units, averaged for all logical CPUs usable by the container.", ConfigureGauge());
private static readonly Gauge BaseCpuCapacity = Metrics
.CreateGauge("docker_container_cpu_capacity_total", "All potential CPU usage available to a container, in unspecified units, averaged for all logical CPUs usable by the container. Start point of measurement is undefined - only relative values should be used in analytics.", ConfigureGauge());
private static readonly Gauge BaseMemoryUsage = Metrics
.CreateGauge("docker_container_memory_used_bytes", "Memory usage of a container.", ConfigureGauge());
private static readonly Gauge BaseTotalNetworkBytesIn = Metrics
.CreateGauge("docker_container_network_in_bytes", "Total bytes received by the container's network interfaces.", ConfigureGauge());
private static readonly Gauge BaseTotalNetworkBytesOut = Metrics
.CreateGauge("docker_container_network_out_bytes", "Total bytes sent by the container's network interfaces.", ConfigureGauge());
private static readonly Gauge BaseTotalDiskBytesRead = Metrics
.CreateGauge("docker_container_disk_read_bytes", "Total bytes read from disk by a container.", ConfigureGauge());
private static readonly Gauge BaseTotalDiskBytesWrite = Metrics
.CreateGauge("docker_container_disk_write_bytes", "Total bytes written to disk by a container.", ConfigureGauge());
private static string[] LabelNames(params string[] extra) =>
new[] { "id", "display_name" }.Concat(extra).ToArray();
private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration
{
LabelNames = LabelNames(),
SuppressInitialValue = true
};
}
}

ContainerTrackerStateMetrics.cs (new file, 51 lines)

@ -0,0 +1,51 @@
using Prometheus;
using System;
using System.Linq;
namespace DockerExporter
{
sealed class ContainerTrackerStateMetrics : IDisposable
{
public Gauge.Child RestartCount { get; private set; }
public Gauge.Child RunningState { get; private set; }
public Gauge.Child StartTime { get; private set; }
public ContainerTrackerStateMetrics(string id, string displayName)
{
_id = id;
_displayName = displayName;
RestartCount = BaseRestartCount.WithLabels(id, displayName);
RunningState = BaseRunningState.WithLabels(id, displayName);
StartTime = BaseStartTime.WithLabels(id, displayName);
}
private readonly string _id;
private readonly string _displayName;
public void Dispose()
{
BaseRestartCount.RemoveLabelled(_id, _displayName);
BaseRunningState.RemoveLabelled(_id, _displayName);
BaseStartTime.RemoveLabelled(_id, _displayName);
}
private static readonly Gauge BaseRestartCount = Metrics
.CreateGauge("docker_container_restart_count", "Number of times the runtime has restarted this container without explicit user action, since the container was last started.", ConfigureGauge());
private static readonly Gauge BaseRunningState = Metrics
.CreateGauge("docker_container_running_state", "Whether the container is running (value 1), restarting (value 0.5) or stopped (value 0).", ConfigureGauge());
private static readonly Gauge BaseStartTime = Metrics
.CreateGauge("docker_container_start_time", "Timestamp indicating when the container was started. Does not get reset by automatic restarts.", ConfigureGauge());
private static string[] LabelNames(params string[] extra) =>
new[] { "id", "display_name" }.Concat(extra).ToArray();
private static GaugeConfiguration ConfigureGauge() => new GaugeConfiguration
{
LabelNames = LabelNames(),
SuppressInitialValue = true
};
}
}


@ -25,6 +25,7 @@
<PackageReference Include="Axinom.Toolkit" Version="14.0.0" />
<PackageReference Include="Docker.DotNet" Version="3.125.2" />
<PackageReference Include="Mono.Options" Version="5.3.0.1" />
<PackageReference Include="prometheus-net" Version="3.4.0-pre-000079-eff2a83" />
</ItemGroup>
</Project>

DockerTracker.cs (new file, 145 lines)

@ -0,0 +1,145 @@
using Axinom.Toolkit;
using Docker.DotNet;
using Docker.DotNet.Models;
using Prometheus;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace DockerExporter
{
/// <summary>
/// Tracks the status of one instance of Docker and exports metrics, updating the data when new scrapes are requested.
/// </summary>
/// <remarks>
/// Thread-safe.
/// </remarks>
sealed class DockerTracker
{
public Uri DockerUrl { get; }
private readonly DockerClientConfiguration _clientConfiguration;
// If an execution can get the lock on first try, it will really perform the update.
// Otherwise, it will wait for the lock and then skip its own update, leaving the
// tracker with the data that the just-finished update generated.
// This acts as basic rate control.
private readonly SemaphoreSlim _updateLock = new SemaphoreSlim(1);
public DockerTracker(Uri dockerUrl)
{
DockerUrl = dockerUrl;
// TODO: Support mutual authentication via certificates.
_clientConfiguration = new DockerClientConfiguration(dockerUrl, null, Constants.DockerCommandTimeout);
}
/// <summary>
/// Requests the tracker to update its data set.
/// </summary>
/// <remarks>
/// May be called multiple times concurrently.
///
/// The method returns to signal that the trackers of all containers that existed
/// when the method was called have attempted an update to their data.
/// It may be that some updates failed - all we can say is that we tried.
///
/// Method does not throw exceptions on transient failures, merely logs and ignores them.
/// </remarks>
public async Task TryUpdateAsync()
{
using var cts = new CancellationTokenSource(Constants.MaxTotalUpdateDuration);
// If we get this lock, we will actually perform the update.
using var writeLock = await SemaphoreLock.TryTakeAsync(_updateLock, TimeSpan.Zero);
if (writeLock == null)
{
// Otherwise, we just no-op once the one that came before has updated the data.
await WaitForPredecessorUpdateAsync(cts.Token);
return;
}
using var client = _clientConfiguration.CreateClient();
IList<ContainerListResponse> allContainers;
try
{
allContainers = await client.Containers.ListContainersAsync(new ContainersListParameters
{
All = true
}, cts.Token);
}
catch (Exception ex)
{
DockerTrackerMetrics.ListContainersErrorCount.Inc();
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
_log.Debug(ex.ToString()); // Only to verbose output.
// Errors are ignored - if we fail to get data, we just skip an update and log the failure.
// The next update will hopefully get past the error.
// We won't even try to update the trackers if we can't list the containers.
// TODO: Is this wise? What if individual container data is still available?
// Then again, if listing containers already does not work, can you expect anything to work?
return;
}
DockerTrackerMetrics.ContainerCount.Set(allContainers.Count);
SynchronizeTrackerSet(allContainers);
// Update each tracker. We do them in parallel to minimize the total time span spent on probing.
var updateTasks = new List<Task>();
foreach (var tracker in _containerTrackers.Values)
updateTasks.Add(tracker.TryUpdateAsync(client, cts.Token));
// Only exceptions from the update calls should be terminal exceptions,
// so it is fine not to catch anything that may be thrown here.
await Task.WhenAll(updateTasks);
}
private async Task WaitForPredecessorUpdateAsync(CancellationToken cancel)
{
_log.Debug("Will not trigger new probe as it overlaps with existing probe.");
using var readLock = await SemaphoreLock.TakeAsync(_updateLock, cancel);
}
/// <summary>
/// Ensures that we have a tracker for every listed container
/// and removes trackers for any containers not in the list.
/// </summary>
private void SynchronizeTrackerSet(IList<ContainerListResponse> allContainers)
{
var containerIds = allContainers.Select(c => c.ID).ToArray();
var trackedIds = _containerTrackers.Keys.ToArray();
// Create a tracker for any new containers.
var newIds = containerIds.Except(trackedIds);
foreach (var id in newIds)
{
_log.Debug($"Encountered container for the first time: {id}");
_containerTrackers[id] = new ContainerTracker(id);
}
// Remove the trackers of any removed containers.
var removedIds = trackedIds.Except(containerIds);
foreach (var id in removedIds)
{
_log.Debug($"Tracked container no longer exists. Removing: {id}");
var tracker = _containerTrackers[id];
tracker.Dispose();
_containerTrackers.Remove(id);
}
}
// Synchronized - only single threaded access occurs.
private readonly Dictionary<string, ContainerTracker> _containerTrackers = new Dictionary<string, ContainerTracker>();
private readonly LogSource _log = Log.Default;
}
}

DockerTrackerMetrics.cs (new file, 13 lines)

@ -0,0 +1,13 @@
using Prometheus;
namespace DockerExporter
{
sealed class DockerTrackerMetrics
{
public static readonly Gauge ContainerCount = Metrics
.CreateGauge("docker_containers", "Number of containers that exist.");
public static readonly Counter ListContainersErrorCount = Metrics
.CreateCounter("docker_list_containers_failed_total", "How many times the attempt to list all containers has failed.");
}
}

ExporterLogic.cs

@ -1,8 +1,7 @@
using Axinom.Toolkit;
using Docker.DotNet;
using Prometheus;
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;
@ -15,6 +14,7 @@ namespace DockerExporter
public ExporterLogic()
{
// Default value is only valid if not running as a container.
// This is intended for development purposes only.
if (Helpers.Environment.IsMicrosoftOperatingSystem())
{
DockerUrl = "npipe://./pipe/docker_engine";
@ -27,18 +27,81 @@ namespace DockerExporter
public async Task RunAsync(CancellationToken cancel)
{
_log.Info($"Connecting to Docker via {DockerUrl}");
_log.Info($"Configured to probe Docker on {DockerUrl}");
var clientConfig = new DockerClientConfiguration(new Uri(DockerUrl), null, Constants.DockerCommandTimeout);
_tracker = new DockerTracker(new Uri(DockerUrl));
using (var client = clientConfig.CreateClient())
Metrics.DefaultRegistry.AddBeforeCollectCallback(UpdateMetrics);
#if DEBUG
var server = new MetricServer("localhost", 3652);
_log.Info($"Open http://localhost:3652/metrics to initiate a probe.");
#else
var server = new MetricServer(80);
#endif
server.Start();
while (!cancel.IsCancellationRequested)
{
var allContainers = await client.Containers.ListContainersAsync(new Docker.DotNet.Models.ContainersListParameters
try
{
All = true
}, cancel);
await Task.Delay(-1, cancel);
}
catch (TaskCanceledException) when (cancel.IsCancellationRequested)
{
// Totally normal - we are exiting.
break;
}
}
_log.Info(Helpers.Debug.ToDebugString(allContainers));
await server.StopAsync();
}
private DockerTracker? _tracker;
/// <summary>
/// Called before every Prometheus collection in order to update metrics.
/// </summary>
/// <remarks>
/// The Docker API can be very slow at times, so there is a risk that the scrape will
/// just time out under load. To avoid that, we enforce a maximum update duration and
/// will give up on fetching new values if the update takes longer than that. If the
/// threshold is crossed, we simply allow the scrape to proceed with stale data, while
/// the update keeps running in the background, hopefully eventually succeeding.
///
/// If multiple parallel scrapes are made, the results from the first one will be used
/// to satisfy all requests that come in while the data loading triggered by the first
/// scrape is still being performed (even if that scrape gives up waiting before the loading finishes).
/// This acts as a primitive form of rate control to avoid overloading the fragile Docker API.
/// The implementation for this is in DockerTracker.
/// </remarks>
private void UpdateMetrics()
{
_log.Debug("Probing Docker.");
using var inlineCancellation = new CancellationTokenSource(Constants.MaxInlineUpdateDuration);
var updateTask = _tracker!.TryUpdateAsync()
.WithAbandonment(inlineCancellation.Token);
try
{
updateTask.WaitAndUnwrapExceptions();
}
catch (TaskCanceledException) when (inlineCancellation.IsCancellationRequested)
{
_log.Debug("Probe took too long - will return stale results and finish probe in background.");
// This is expected if it goes above the inline threshold, and will be ignored.
// Other exceptions are caught, logged, and ignored in DockerTracker itself.
ExporterLogicMetrics.InlineTimeouts.Inc();
}
catch (Exception ex)
{
// TODO: Now what? If we throw here prometheus-net will just reject the scrape...
// ... but what if this is a fatal error that we want to crash the app with?
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
Debugger.Break();
}
}

ExporterLogicMetrics.cs (new file, 9 lines)

@ -0,0 +1,9 @@
using Prometheus;
namespace DockerExporter
{
static class ExporterLogicMetrics
{
public static readonly Counter InlineTimeouts = Metrics.CreateCounter("docker_probe_inline_timeouts_total", "Total number of times we have forced the scrape to happen in the background and returned outdated data because performing an update inline took too long.");
}
}


@ -54,23 +54,9 @@ namespace DockerExporter
Environment.ExitCode = -1;
}
catch (AggregateException ex)
{
foreach (var innerException in ex.InnerExceptions)
{
_log.Error(innerException.Message);
_log.Error(innerException.GetType().Name);
}
Environment.ExitCode = -1;
}
catch (Exception ex)
{
if (!string.IsNullOrWhiteSpace(ex.Message))
{
_log.Error(ex.Message);
_log.Error(ex.GetType().Name);
}
_log.Error(Helpers.Debug.GetAllExceptionMessages(ex));
Environment.ExitCode = -1;
}
@ -141,7 +127,9 @@ namespace DockerExporter
// We default to displaying Info or higher but allow this to be reconfigured later, if the user wishes.
_filteringLogListener = new FilteringLogListener(new ConsoleLogListener())
{
#if !DEBUG
MinimumSeverity = LogEntrySeverity.Info
#endif
};
Log.Default.RegisterListener(_filteringLogListener);