Skip to content

Commit 9bfa7b9

Browse files
committed
Await previous worker exit before spawning new run-service worker
Fixes #4357
1 parent c87d955 commit 9bfa7b9

2 files changed

Lines changed: 86 additions & 0 deletions

File tree

src/Runner.Listener/JobDispatcher.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,20 @@ private async Task EnsureDispatchFinished(WorkerDispatcher jobDispatch, bool can
254254
{
255255
Trace.Error($"We are not yet checking the state of jobrequest {jobDispatch.JobId} status. Cancel running worker right away.");
256256
jobDispatch.WorkerCancellationTokenSource.Cancel();
257+
258+
// Wait for the previous worker process to fully exit (including
259+
// TempDirectoryManager cleanup) before letting the new worker
260+
// start. Without this wait the exiting worker's _temp cleanup
261+
// races with the new worker and deletes files such as
262+
// _runner_file_commands/* out from under the active job. (#4357)
263+
Task completedTask = await Task.WhenAny(jobDispatch.WorkerDispatch, Task.Delay(TimeSpan.FromSeconds(45)));
264+
if (completedTask != jobDispatch.WorkerDispatch)
265+
{
266+
// at this point, the job execution might encounter some dead lock and even not able to be cancelled.
267+
// no need to localize the exception string should never happen.
268+
throw new InvalidOperationException($"Job dispatch process for {jobDispatch.JobId} has encountered unexpected error, the dispatch task is not able to be cancelled within 45 seconds.");
269+
}
270+
257271
return;
258272
}
259273

src/Test/L0/Listener/JobDispatcherL0.cs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,78 @@ public async void DispatchesOneTimeJobRequest()
745745
}
746746
}
747747

748+
// Regression test for https://github.com/actions/runner/issues/4357.
749+
//
750+
// For run-service jobs, the previous EnsureDispatchFinished implementation
751+
// cancelled the running worker and immediately returned without awaiting
752+
// the worker process exit. This let the new Worker spawn while the old
753+
// Worker was still running TempDirectoryManager.CleanupTempDirectory(),
754+
// which then wiped _runner_file_commands/* out from under the new job.
755+
//
756+
// EnsureDispatchFinished must now await the previous WorkerDispatch task
757+
// before returning so the new dispatch only proceeds once the previous
758+
// worker (and its temp cleanup) has finished.
759+
[Fact]
760+
[Trait("Level", "L0")]
761+
[Trait("Category", "Runner")]
762+
public async void EnsureDispatchFinishedAwaitsPreviousWorkerForRunServiceJob()
763+
{
764+
using (var hc = new TestHostContext(this))
765+
{
766+
hc.SetSingleton<IConfigurationStore>(_configurationStore.Object);
767+
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
768+
_configurationStore.Setup(x => x.GetSettings()).Returns(new RunnerSettings { PoolId = 1 });
769+
770+
var jobDispatcher = new JobDispatcher();
771+
jobDispatcher.Initialize(hc);
772+
EnableRunServiceJobForJobDispatcher(jobDispatcher);
773+
774+
// Build a previous-job WorkerDispatcher whose worker process simulates
775+
// a slow shutdown (representing TempDirectoryManager.CleanupTempDirectory
776+
// running). We control completion via a TaskCompletionSource.
777+
var workerDispatcherType = typeof(JobDispatcher).GetNestedType("WorkerDispatcher", BindingFlags.NonPublic);
778+
Assert.NotNull(workerDispatcherType);
779+
var ctor = workerDispatcherType.GetConstructor(
780+
BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance,
781+
null,
782+
new[] { typeof(Guid), typeof(long) },
783+
null);
784+
Assert.NotNull(ctor);
785+
var prev = ctor.Invoke(new object[] { Guid.NewGuid(), (long)42 });
786+
787+
var workerExitTcs = new TaskCompletionSource<int>();
788+
workerDispatcherType.GetProperty("WorkerDispatch").SetValue(prev, workerExitTcs.Task);
789+
790+
var ensureDispatchFinishedMethod = typeof(JobDispatcher).GetMethod(
791+
"EnsureDispatchFinished",
792+
BindingFlags.NonPublic | BindingFlags.Instance);
793+
Assert.NotNull(ensureDispatchFinishedMethod);
794+
795+
var ensureTask = (Task)ensureDispatchFinishedMethod.Invoke(jobDispatcher, new object[] { prev, false });
796+
797+
// Cancellation should already have been requested on the previous worker.
798+
var cts = (CancellationTokenSource)workerDispatcherType
799+
.GetProperty("WorkerCancellationTokenSource")
800+
.GetValue(prev);
801+
Assert.True(cts.IsCancellationRequested,
802+
"EnsureDispatchFinished should cancel the previous worker for run-service jobs.");
803+
804+
// EnsureDispatchFinished MUST NOT complete until the previous worker exits.
805+
// Give the continuation a chance to run, then assert it's still pending.
806+
await Task.Delay(50);
807+
Assert.False(ensureTask.IsCompleted,
808+
"EnsureDispatchFinished must wait for the previous worker process to exit (including temp cleanup) before returning.");
809+
810+
// Simulate the previous worker (and its TempDirectoryManager cleanup) finishing.
811+
workerExitTcs.SetResult(0);
812+
813+
// Now EnsureDispatchFinished should complete in a timely fashion.
814+
var completed = await Task.WhenAny(ensureTask, Task.Delay(TimeSpan.FromSeconds(10)));
815+
Assert.Same(ensureTask, completed);
816+
await ensureTask;
817+
}
818+
}
819+
748820
private static void EnableRunServiceJobForJobDispatcher(JobDispatcher jobDispatcher)
749821
{
750822
// Set the value of the _isRunServiceJob field to true

0 commit comments

Comments
 (0)