WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / IT CANNOT LOGIN OR REGISTER ACCOUNTS / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FOUND SOMETHING MAY NOT GOOD FOR EVERYONE, CONTACT ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit 93d595c

Browse files
dkurepapremun
andauthored
Add AttemptNumber and OperationId to WorkItemExecuted custom events, improve exception handling/logging (#4514)
<!-- Link the GitHub or AzDO issue this pull request is associated with. Please copy and paste the full URL rather than using the dotnet/arcade-services# syntax --> #4198 --------- Co-authored-by: Přemek Vysoký <[email protected]>
1 parent f74cf36 commit 93d595c

File tree

8 files changed

+126
-89
lines changed

8 files changed

+126
-89
lines changed

src/Microsoft.DotNet.Darc/DarcLib/TelemetryRecorder.cs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System;
5+
using System.Collections.Generic;
56

67
namespace Microsoft.DotNet.DarcLib;
78

@@ -12,17 +13,27 @@ public enum TrackedGitOperation
1213
Push,
1314
}
1415

16+
public enum CustomEventType
17+
{
18+
PullRequestUpdateFailed
19+
}
20+
1521
public interface ITelemetryRecorder
1622
{
1723
/// <summary>
1824
/// Records work item duration and result in the customEvents table, with the `workItem.{type}` name
1925
/// </summary>
20-
ITelemetryScope RecordWorkItemCompletion(string workItemName);
26+
ITelemetryScope RecordWorkItemCompletion(string workItemName, long attemptNumber, string operationId);
2127

2228
/// <summary>
2329
/// Records git operation duration and result.
2430
/// </summary>
2531
ITelemetryScope RecordGitOperation(TrackedGitOperation operation, string repoUri);
32+
33+
/// <summary>
34+
/// Records a custom event with the given name and custom properties.
35+
/// </summary>
36+
void RecordCustomEvent(CustomEventType customEvent, Dictionary<string, string> customProperties);
2637
}
2738

2839
public interface ITelemetryScope : IDisposable
@@ -40,8 +51,9 @@ public class NoTelemetryRecorder : ITelemetryRecorder
4051
{
4152
private static readonly NoTelemetryScope _instance = new();
4253

43-
public ITelemetryScope RecordWorkItemCompletion(string workItemName) => _instance;
54+
public ITelemetryScope RecordWorkItemCompletion(string workItemName, long attemptNumber, string operationId) => _instance;
4455
public ITelemetryScope RecordGitOperation(TrackedGitOperation operation, string repoUri) => _instance;
56+
public void RecordCustomEvent(CustomEventType eventName, Dictionary<string, string> customProperties) { }
4557

4658
public class NoTelemetryScope : ITelemetryScope
4759
{

src/ProductConstructionService/ProductConstructionService.Api/Telemetry/TelemetryRecorder.cs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,22 @@ public class TelemetryRecorder(
1515
private readonly ILogger<TelemetryRecorder> _logger = logger;
1616
private readonly TelemetryClient _telemetryClient = telemetryClient;
1717

18-
public ITelemetryScope RecordWorkItemCompletion(string workItemType)
19-
=> CreateScope("WorkItemExecuted", new() {{ "WorkItemType", workItemType }});
18+
public ITelemetryScope RecordWorkItemCompletion(string workItemType, long attemptNumber, string operationId)
19+
=> CreateScope("WorkItemExecuted", new()
20+
{
21+
{ "WorkItemType", workItemType },
22+
{ "Attempt", attemptNumber.ToString()},
23+
{ "OperationId", operationId }
24+
});
2025

2126
public ITelemetryScope RecordGitOperation(TrackedGitOperation operation, string repoUri)
2227
=> CreateScope($"Git{operation}", new() { { "Uri", repoUri } });
2328

29+
public void RecordCustomEvent(CustomEventType eventName, Dictionary<string, string> customProperties)
30+
{
31+
_telemetryClient.TrackEvent(eventName.ToString(), customProperties);
32+
}
33+
2434
private TelemetryScope CreateScope(string metricName, Dictionary<string, string> customDimensions)
2535
{
2636
return new TelemetryScope(metricName, _logger, _telemetryClient, customDimensions, []);

src/ProductConstructionService/ProductConstructionService.DependencyFlow/PullRequestUpdater.cs

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -998,15 +998,14 @@ await HandlePrUpdateConflictAsync(
998998
}
999999
return;
10001000
}
1001-
catch (Exception e)
1001+
catch (Exception)
10021002
{
1003-
_logger.LogError(e, "Failed to flow source changes for build {buildId} in subscription {subscriptionId}",
1003+
_logger.LogError("Failed to flow source changes for build {buildId} in subscription {subscriptionId}",
10041004
build.Id,
10051005
subscription.Id);
10061006
throw;
10071007
}
10081008

1009-
10101009
if (codeFlowRes.hadUpdates)
10111010
{
10121011
_logger.LogInformation("Code changes for {subscriptionId} ready in local branch {branch}",
@@ -1031,18 +1030,8 @@ await HandlePrUpdateConflictAsync(
10311030
}
10321031
else if (pr != null)
10331032
{
1034-
try
1035-
{
1036-
await UpdateCodeFlowPullRequestAsync(update, pr, previousSourceSha, isForwardFlow, subscription, localRepoPath);
1037-
_logger.LogInformation("Code flow update processed for pull request {prUrl}", pr.Url);
1038-
}
1039-
catch (Exception e)
1040-
{
1041-
// TODO https://github.com/dotnet/arcade-services/issues/4198: Notify us about these kind of failures
1042-
_logger.LogError(e, "Failed to update PR {url} of subscription {subscriptionId}",
1043-
pr.Url,
1044-
update.SubscriptionId);
1045-
}
1033+
await UpdateCodeFlowPullRequestAsync(update, pr, previousSourceSha, isForwardFlow, subscription, localRepoPath);
1034+
_logger.LogInformation("Code flow update processed for pull request {prUrl}", pr.Url);
10461035
}
10471036
}
10481037

@@ -1077,18 +1066,38 @@ private async Task UpdateCodeFlowPullRequestAsync(
10771066

10781067
var description = _pullRequestBuilder.GenerateCodeFlowPRDescription(update, build, previousSourceSha, realPR.Description);
10791068

1080-
await remote.UpdatePullRequestAsync(pullRequest.Url, new PullRequest
1069+
try
10811070
{
1082-
Title = title,
1083-
Description = description
1084-
});
1085-
1086-
pullRequest.SourceSha = update.SourceSha;
1087-
pullRequest.LastUpdate = DateTime.UtcNow;
1088-
pullRequest.MergeState = InProgressPullRequestState.Mergeable;
1089-
pullRequest.NextBuildsToProcess.Remove(update.SubscriptionId);
1090-
await SetPullRequestCheckReminder(pullRequest, isCodeFlow: true);
1091-
await _pullRequestUpdateReminders.UnsetReminderAsync(isCodeFlow: true);
1071+
await remote.UpdatePullRequestAsync(pullRequest.Url, new PullRequest
1072+
{
1073+
Title = title,
1074+
Description = description
1075+
});
1076+
}
1077+
catch (Exception e)
1078+
{
1079+
// If we get here, we already pushed the code updates, but failed to update things like the PR title and description
1080+
// and enqueue a PullRequestCheck, so we'll just log a custom event for it
1081+
_telemetryRecorder.RecordCustomEvent(CustomEventType.PullRequestUpdateFailed, new Dictionary<string, string>
1082+
{
1083+
{ "SubscriptionId", update.SubscriptionId.ToString() },
1084+
{ "PullRequestUrl", pullRequest.Url }
1085+
});
1086+
_logger.LogError(e, "Failed to update PR {url} of subscription {subscriptionId}",
1087+
pullRequest.Url,
1088+
update.SubscriptionId);
1089+
}
1090+
finally
1091+
{
1092+
// Even if we fail to update the PR title and description, the changes already got pushed, so we want to enqueue a
1093+
// PullRequestCheck
1094+
pullRequest.SourceSha = update.SourceSha;
1095+
pullRequest.LastUpdate = DateTime.UtcNow;
1096+
pullRequest.MergeState = InProgressPullRequestState.Mergeable;
1097+
pullRequest.NextBuildsToProcess.Remove(update.SubscriptionId);
1098+
await SetPullRequestCheckReminder(pullRequest, isCodeFlow: true);
1099+
await _pullRequestUpdateReminders.UnsetReminderAsync(isCodeFlow: true);
1100+
}
10921101
}
10931102

10941103
private async Task CreateCodeFlowPullRequestAsync(
@@ -1147,9 +1156,9 @@ await AddDependencyFlowEventsAsync(
11471156

11481157
_logger.LogInformation("Code flow pull request created: {prUrl}", prUrl);
11491158
}
1150-
catch (Exception e)
1159+
catch (Exception)
11511160
{
1152-
_logger.LogError(e, "Failed to create code flow pull request for subscription {subscriptionId}",
1161+
_logger.LogError("Failed to create code flow pull request for subscription {subscriptionId}",
11531162
update.SubscriptionId);
11541163
await darcRemote.DeleteBranchAsync(targetRepository, prBranch);
11551164
throw;

src/ProductConstructionService/ProductConstructionService.DependencyFlow/WorkItemProcessors/SubscriptionTriggerProcessor.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,10 @@ private async Task<bool> UpdateSubscriptionAsync(Guid subscriptionId, int buildI
115115
await triggerer.UpdateSubscriptionAsync(buildId);
116116
return true;
117117
}
118-
catch (Exception e)
118+
catch (Exception)
119119
{
120-
_logger.LogError(e, "Failed to update subscription '{subscriptionId}' with build '{buildId}'", subscriptionId, buildId);
121-
return false;
120+
_logger.LogError("Failed to update subscription '{subscriptionId}' with build '{buildId}'", subscriptionId, buildId);
121+
throw;
122122
}
123123
}
124124
}

src/ProductConstructionService/ProductConstructionService.WorkItems/WorkItemConsumer.cs

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
using System.Text.Json.Nodes;
55
using Azure.Storage.Queues;
66
using Azure.Storage.Queues.Models;
7+
using Microsoft.ApplicationInsights;
8+
using Microsoft.ApplicationInsights.DataContracts;
9+
using Microsoft.DotNet.DarcLib;
710
using Microsoft.Extensions.Hosting;
811
using Microsoft.Extensions.Logging;
912
using Microsoft.Extensions.Options;
@@ -18,7 +21,8 @@ internal class WorkItemConsumer(
1821
IOptions<WorkItemConsumerOptions> options,
1922
WorkItemScopeManager scopeManager,
2023
QueueServiceClient queueServiceClient,
21-
IMetricRecorder metricRecorder)
24+
IMetricRecorder metricRecorder,
25+
ITelemetryRecorder telemetryRecorder)
2226
: BackgroundService
2327
{
2428
private readonly string _consumerId = consumerId;
@@ -93,28 +97,38 @@ private async Task ReadAndProcessWorkItemAsync(QueueClient queueClient, WorkItem
9397

9498
_metricRecorder.QueueMessageReceived(message, delay ?? 0);
9599

96-
try
97-
{
98-
_logger.LogInformation("Starting attempt {attemptNumber} for {workItemType}", message.DequeueCount, workItemType);
99-
await workItemScope.RunWorkItemAsync(node, cancellationToken);
100-
await queueClient.DeleteMessageAsync(message.MessageId, message.PopReceipt, cancellationToken);
101-
}
102-
// If the cancellation token gets cancelled, don't retry, just exit without deleting the message, we'll handle it later
103-
catch (OperationCanceledException)
104-
{
105-
throw;
106-
}
107-
catch (Exception ex)
100+
TelemetryClient telemetryClient = workItemScope.GetRequiredService<TelemetryClient>();
101+
102+
using (var operation = telemetryClient.StartOperation<RequestTelemetry>(workItemType))
103+
using (ITelemetryScope telemetryScope = telemetryRecorder.RecordWorkItemCompletion(
104+
workItemType,
105+
message.DequeueCount,
106+
operation.Telemetry.Context.Operation.Id))
108107
{
109-
_logger.LogError(ex, "Processing work item {workItemType} attempt {attempt}/{maxAttempts} failed",
110-
workItemType, message.DequeueCount, _options.Value.MaxWorkItemRetries);
111-
// Let the workItem retry a few times. If it fails a few times, delete it from the queue, it's a bad work item
112-
if (message.DequeueCount == _options.Value.MaxWorkItemRetries || ex is NonRetriableException)
108+
try
113109
{
114-
_logger.LogError("Work item {type} has failed {maxAttempts} times. Discarding the message {message} from the queue",
115-
workItemType, _options.Value.MaxWorkItemRetries, message.Body.ToString());
110+
_logger.LogInformation("Starting attempt {attemptNumber} for {workItemType}", message.DequeueCount, workItemType);
111+
await workItemScope.RunWorkItemAsync(node, telemetryScope, cancellationToken);
116112
await queueClient.DeleteMessageAsync(message.MessageId, message.PopReceipt, cancellationToken);
117113
}
114+
// If the cancellation token gets cancelled, don't retry, just exit without deleting the message, we'll handle it later
115+
catch (OperationCanceledException)
116+
{
117+
throw;
118+
}
119+
catch (Exception ex)
120+
{
121+
operation.Telemetry.Success = false;
122+
_logger.LogError(ex, "Processing work item {workItemType} attempt {attempt}/{maxAttempts} failed",
123+
workItemType, message.DequeueCount, _options.Value.MaxWorkItemRetries);
124+
// Let the workItem retry a few times. If it fails a few times, delete it from the queue, it's a bad work item
125+
if (message.DequeueCount == _options.Value.MaxWorkItemRetries || ex is NonRetriableException)
126+
{
127+
_logger.LogError("Work item {type} has failed {maxAttempts} times. Discarding the message {message} from the queue",
128+
workItemType, _options.Value.MaxWorkItemRetries, message.Body.ToString());
129+
await queueClient.DeleteMessageAsync(message.MessageId, message.PopReceipt, cancellationToken);
130+
}
131+
}
118132
}
119133
}
120134
}

src/ProductConstructionService/ProductConstructionService.WorkItems/WorkItemScope.cs

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ public async ValueTask DisposeAsync()
3838
_serviceScope.Dispose();
3939
}
4040

41-
public async Task RunWorkItemAsync(JsonNode node, CancellationToken cancellationToken)
41+
public async Task RunWorkItemAsync(JsonNode node, ITelemetryScope telemetryScope, CancellationToken cancellationToken)
4242
{
4343
var type = node["type"]!.ToString();
4444

@@ -60,29 +60,18 @@ public async Task RunWorkItemAsync(JsonNode node, CancellationToken cancellation
6060

6161
async Task ProcessWorkItemAsync()
6262
{
63-
using (var operation = telemetryClient.StartOperation<RequestTelemetry>(type))
64-
using (ITelemetryScope telemetryScope = _telemetryRecorder.RecordWorkItemCompletion(type))
6563
using (logger.BeginScope(processor.GetLoggingContextData(workItem)))
6664
{
67-
try
65+
logger.LogInformation("Processing work item {type}", type);
66+
var success = await processor.ProcessWorkItemAsync(workItem, cancellationToken);
67+
if (success)
6868
{
69-
logger.LogInformation("Processing work item {type}", type);
70-
var success = await processor.ProcessWorkItemAsync(workItem, cancellationToken);
71-
if (success)
72-
{
73-
telemetryScope.SetSuccess();
74-
logger.LogInformation("Work item {type} processed successfully", type);
75-
}
76-
else
77-
{
78-
logger.LogInformation("Work item {type} processed unsuccessfully", type);
79-
}
69+
telemetryScope.SetSuccess();
70+
logger.LogInformation("Work item {type} processed successfully", type);
8071
}
81-
catch (Exception e)
72+
else
8273
{
83-
operation.Telemetry.Success = false;
84-
logger.LogError(e, "Failed to process work item {type}", type);
85-
throw;
74+
logger.LogInformation("Work item {type} processed unsuccessfully", type);
8675
}
8776
}
8877
}
@@ -107,4 +96,9 @@ async Task ProcessWorkItemAsync()
10796
}
10897
} while (@lock == null);
10998
}
99+
100+
public T GetRequiredService<T>() where T : notnull
101+
{
102+
return _serviceScope.ServiceProvider.GetRequiredService<T>();
103+
}
110104
}

test/ProductConstructionService.DependencyFlow.Tests/PullRequestUpdaterTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ protected void ThenUpdateReminderIsRemoved()
598598
protected IPullRequestUpdater CreatePullRequestActor(IServiceProvider context)
599599
{
600600
var updaterFactory = context.GetRequiredService<IPullRequestUpdaterFactory>();
601-
return updaterFactory.CreatePullRequestUpdater(GetPullRequestUpdaterId());
601+
return updaterFactory.CreatePullRequestUpdater(GetPullRequestUpdaterId());
602602
}
603603

604604
protected SubscriptionUpdateWorkItem CreateSubscriptionUpdate(Build forBuild, bool isCodeFlow = false)

0 commit comments

Comments
 (0)