diff --git a/app/jobs/runtime/service_operations_create_in_progress_cleanup.rb b/app/jobs/runtime/service_operations_create_in_progress_cleanup.rb
new file mode 100644
index 00000000000..d1e487f391f
--- /dev/null
+++ b/app/jobs/runtime/service_operations_create_in_progress_cleanup.rb
@@ -0,0 +1,123 @@
+module VCAP::CloudController
+  module Jobs
+    module Runtime
+      # Clock job that fails 'create' service operations still 'in progress' whose polling
+      # delayed job has permanently failed, then asks the broker to clean up any orphan.
+      class ServiceOperationsCreateInProgressCleanup < VCAP::CloudController::Jobs::CCJob
+        BATCH_SIZE = 10
+
+        # Runs the cleanup for service instances, service bindings and service keys in turn.
+        def perform
+          logger.info("Cleaning up 'create' type service operations stuck 'in progress'")
+          cleanup_operations(ServiceInstanceOperation, ServiceInstance, :service_instance_id, 'service_instance.create', :cleanup_failed_provision)
+          cleanup_operations(ServiceBindingOperation, ServiceBinding, :service_binding_id, 'service_bindings.create', :cleanup_failed_bind)
+          cleanup_operations(ServiceKeyOperation, ServiceKey, :service_key_id, 'service_keys.create', :cleanup_failed_key)
+        end
+
+        # A failed run is not retried; the clock schedules this job again on its own interval.
+        def max_attempts
+          1
+        end
+
+        private
+
+        def cleanup_operations(operation_model, instance_model, foreign_key, jobs_operation, orphan_mitigator_method)
+          # The explanation below uses service_instance_operations as the concrete example;
+          # the same logic applies to service_binding_operations and service_key_operations
+          # when invoked with their respective arguments.
+          #
+          # Find stuck service instance 'in progress' operations where the broker is still working
+          # but CC's polling job has permanently failed due to a transient error (e.g. brief db connection flip).
+          # Join path: service_instance_operations → service_instances → jobs → delayed_jobs.
+          #
+          # Filters:
+          # - service_instance_operations.state='in progress': the broker has not yet reported a final state
+          #   (succeeded or failed) that CC could successfully persist; if CC had received and saved a final
+          #   state from the broker, this column would already be 'succeeded' or 'failed' — not 'in progress'
+          # - service_instance_operations.type='create': scope to create operations only
+          # - service_instance_operations.created_at > CURRENT_TIMESTAMP - max_duration: operations beyond the max async polling window
+          #   are intentionally excluded — the broker has given up on them too, so they are out of scope for this cleanup
+          # - jobs.state IN (POLLING, FAILED): the pollable job has not reached COMPLETE (a successful job
+          #   would already be done and is out of scope); POLLING covers the case where the failure hook
+          #   itself couldn't write FAILED due to the DB flip
+          # - jobs.operation='service_instance.create': prevents matching update/delete jobs for the same
+          #   service instance that happen to share the same resource_guid
+          # - delayed_jobs.failed_at IS NOT NULL: the delayed job permanently failed (exhausted max_attempts);
+          #   jobs still alive or locked have failed_at=NULL and must not be touched
+          operation_table = operation_model.table_name
+          instance_table = instance_model.table_name
+
+          stuck = operation_model.
+                  join(instance_table, id: Sequel[operation_table][foreign_key]).
+                  join(:jobs, resource_guid: Sequel[instance_table][:guid]).
+                  join(:delayed_jobs, guid: Sequel[:jobs][:delayed_job_guid]).
+                  where(Sequel[operation_table][:state] => 'in progress').
+                  where(Sequel[operation_table][:type] => 'create').
+                  where(Sequel.lit("#{operation_table}.created_at > CURRENT_TIMESTAMP - INTERVAL '?' SECOND", default_maximum_duration_seconds.to_i)).
+                  where(Sequel[:jobs][:state] => [PollableJobModel::POLLING_STATE, PollableJobModel::FAILED_STATE]).
+                  where(Sequel[:jobs][:operation] => jobs_operation).
+                  exclude(Sequel[:delayed_jobs][:failed_at] => nil).
+                  select(
+                    Sequel[:jobs][:guid].as(:pollable_guid),
+                    Sequel[operation_table][:id].as(:op_id),
+                    Sequel[operation_table][foreign_key].as(:resource_id)
+                  ).
+                  order(Sequel[operation_table][:created_at]).
+                  limit(BATCH_SIZE)
+
+          stuck.each do |row|
+            mitigate_orphan(operation_model, instance_model, orphan_mitigator_method,
+                            row[:op_id], row[:resource_id], row[:pollable_guid])
+          end
+        end
+
+        def mitigate_orphan(operation_model, instance_model, orphan_mitigator_method, op_id, resource_id, pollable_guid)
+          # Mark the stuck create operation as failed, mark its pollable job as failed,
+          # and trigger broker-side orphan deprovisioning to clean up any resource the
+          # broker may have created.
+          operation_model.db.transaction do
+            # Lock the row and re-check its state; a row locked or already handled elsewhere is skipped.
+            operation = operation_model.where(id: op_id, state: 'in progress').for_update.skip_locked.first
+            return unless operation
+
+            instance = instance_model.first(id: resource_id)
+            return unless instance
+
+            instance_type = instance_model.to_s.split('::').last
+
+            logger.info(
+              "#{instance_type} #{instance.guid} create operation is stuck in 'in progress'. " \
+              "Setting operation's state to 'failed' and pollable job's state to 'FAILED'.",
+              instance_type: instance_type,
+              instance_guid: instance.guid,
+              operation_id: op_id,
+              pollable_job_guid: pollable_guid
+            )
+
+            operation.update(state: 'failed',
+                             description: "Operation was stuck in 'in progress' state. Set to 'failed' by cleanup job; orphan mitigation triggered.")
+            PollableJobModel.where(guid: pollable_guid).update(state: PollableJobModel::FAILED_STATE)
+            orphan_mitigator.public_send(orphan_mitigator_method, instance)
+          end
+        end
+
+        def orphan_mitigator
+          @orphan_mitigator ||= VCAP::Services::ServiceBrokers::V2::OrphanMitigator.new
+        end
+
+        # Max async polling window; configured in minutes, converted to seconds by the caller via #to_i.
+        def default_maximum_duration_seconds
+          Config.config.get(:broker_client_max_async_poll_duration_minutes).minutes
+        end
+
+        def logger
+          @logger ||= Steno.logger('cc.background.service-operations-in-progress-cleanup')
+        end
+
+        def job_name_in_configuration
+          :service_operations_create_in_progress_cleanup
+        end
+      end
+    end
+  end
+end
diff --git a/config/cloud_controller.yml b/config/cloud_controller.yml
index 0983c7f0673..fbf94806aff 100644
--- a/config/cloud_controller.yml
+++ b/config/cloud_controller.yml
@@ -55,6 +55,9 @@ pollable_jobs:
 service_operations_initial_cleanup:
   frequency_in_seconds: 300
 
+service_operations_create_in_progress_cleanup:
+  frequency_in_seconds: 300
+
 completed_tasks:
   cutoff_age_in_days: 31
 
@@ -369,7 +372,6 @@ diego_sync:
 pending_droplets:
   frequency_in_seconds: 300
   expiration_in_seconds: 42
-
 pending_builds:
   expiration_in_seconds: 42
   frequency_in_seconds: 300
diff --git a/db/migrations/20260505071445_add_jobs_operation_state_index.rb b/db/migrations/20260505071445_add_jobs_operation_state_index.rb
new file mode 100644
index 00000000000..aeefa7f731e
--- /dev/null
+++ b/db/migrations/20260505071445_add_jobs_operation_state_index.rb
@@ -0,0 +1,38 @@
+Sequel.migration do
+  no_transaction # required for concurrently option on postgres
+
+  up do
+    if database_type == :postgres
+      VCAP::Migration.with_concurrent_timeout(self) do
+        add_index :jobs, %i[operation state],
+                  name: :jobs_operation_state_index,
+                  where: "state IN ('POLLING', 'FAILED')",
+                  if_not_exists: true,
+                  concurrently: true
+      end
+    elsif database_type == :mysql
+      alter_table(:jobs) do
+        # rubocop:disable Sequel/ConcurrentIndex -- MySQL does not support concurrent index operations
+        add_index %i[operation state], name: :jobs_operation_state_index unless @db.indexes(:jobs).key?(:jobs_operation_state_index)
+        # rubocop:enable Sequel/ConcurrentIndex
+      end
+    end
+  end
+
+  down do
+    if database_type == :postgres
+      VCAP::Migration.with_concurrent_timeout(self) do
+        drop_index :jobs, %i[operation state],
+                   name: :jobs_operation_state_index,
+                   if_exists: true,
+                   concurrently: true
+      end
+    elsif database_type == :mysql
+      alter_table(:jobs) do
+        # rubocop:disable Sequel/ConcurrentIndex
+        drop_index %i[operation state], name: :jobs_operation_state_index if @db.indexes(:jobs).key?(:jobs_operation_state_index)
+        # rubocop:enable Sequel/ConcurrentIndex
+      end
+    end
+  end
+end
diff --git a/lib/cloud_controller/clock/scheduler.rb b/lib/cloud_controller/clock/scheduler.rb
index 388b5db11d5..3ee2451c9b9 100644
--- a/lib/cloud_controller/clock/scheduler.rb
+++ b/lib/cloud_controller/clock/scheduler.rb
@@ -24,7 +24,8 @@ class Scheduler
       { name: 'pending_droplets', class: Jobs::Runtime::PendingDropletCleanup },
       { name: 'pending_builds', class: Jobs::Runtime::PendingBuildCleanup },
       { name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup },
-      { name: 'service_operations_initial_cleanup', class: Jobs::Runtime::ServiceOperationsInitialCleanup }
+      { name: 'service_operations_initial_cleanup', class: Jobs::Runtime::ServiceOperationsInitialCleanup },
+      { name: 'service_operations_create_in_progress_cleanup', class: Jobs::Runtime::ServiceOperationsCreateInProgressCleanup }
     ].freeze
 
     def initialize(config)
diff --git a/lib/cloud_controller/config_schemas/clock_schema.rb b/lib/cloud_controller/config_schemas/clock_schema.rb
index 81976bbf551..743172e6178 100644
--- a/lib/cloud_controller/config_schemas/clock_schema.rb
+++ b/lib/cloud_controller/config_schemas/clock_schema.rb
@@ -34,6 +34,9 @@ class ClockSchema < VCAP::Config
             completed_tasks: {
               cutoff_age_in_days: Integer
             },
+            service_operations_create_in_progress_cleanup: {
+              frequency_in_seconds: Integer
+            },
             default_health_check_timeout: Integer,
 
             uaa: {
diff --git a/lib/cloud_controller/jobs.rb b/lib/cloud_controller/jobs.rb
index 9f39f53d152..5dfb1f71d30 100644
--- a/lib/cloud_controller/jobs.rb
+++ b/lib/cloud_controller/jobs.rb
@@ -25,6 +25,7 @@
 require 'jobs/runtime/expired_blob_cleanup'
 require 'jobs/runtime/expired_orphaned_blob_cleanup'
 require 'jobs/runtime/expired_resource_cleanup'
+require 'jobs/runtime/service_operations_create_in_progress_cleanup'
 require 'jobs/runtime/failed_jobs_cleanup'
 require 'jobs/runtime/service_operations_initial_cleanup'
 require 'jobs/runtime/legacy_jobs'
diff --git a/lib/tasks/jobs.rake b/lib/tasks/jobs.rake
index 86c33fe9208..df59d9a48eb 100644
--- a/lib/tasks/jobs.rake
+++ b/lib/tasks/jobs.rake
@@ -49,6 +49,7 @@ namespace :jobs do
       'audit_events',
       'failed_jobs',
       'service_operations_initial_cleanup',
+      'service_operations_create_in_progress_cleanup',
       'service_usage_events',
       'completed_tasks',
       'expired_blob_cleanup',
diff --git a/spec/migrations/20260505071445_add_jobs_operation_state_index_spec.rb b/spec/migrations/20260505071445_add_jobs_operation_state_index_spec.rb
new file mode 100644
index 00000000000..162c1e51b00
--- /dev/null
+++ b/spec/migrations/20260505071445_add_jobs_operation_state_index_spec.rb
@@ -0,0 +1,37 @@
+require 'spec_helper'
+require 'migrations/helpers/migration_shared_context'
+
+RSpec.describe 'migration to add operation_state_index on jobs table', isolation: :truncation, type: :migration do
+  include_context 'migration' do
+    let(:migration_filename) { '20260505071445_add_jobs_operation_state_index.rb' }
+  end
+
+  def operation_state_index_present?
+    if db.database_type == :postgres
+      db.fetch("SELECT 1 FROM pg_indexes WHERE tablename = 'jobs' AND indexname = 'jobs_operation_state_index'").any?
+    else
+      db.indexes(:jobs).key?(:jobs_operation_state_index)
+    end
+  end
+
+  describe 'jobs table' do
+    it 'adds index and handles idempotency gracefully' do
+      # Test up migration
+      expect(operation_state_index_present?).to be_falsey
+      expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index, allow_missing_migration_files: true) }.not_to raise_error
+      expect(operation_state_index_present?).to be_truthy
+
+      # Test up migration idempotency
+      expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index, allow_missing_migration_files: true) }.not_to raise_error
+      expect(operation_state_index_present?).to be_truthy
+
+      # Test down migration
+      expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index - 1, allow_missing_migration_files: true) }.not_to raise_error
+      expect(operation_state_index_present?).to be_falsey
+
+      # Test down migration idempotency
+      expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index - 1, allow_missing_migration_files: true) }.not_to raise_error
+      expect(operation_state_index_present?).to be_falsey
+    end
+  end
+end
diff --git a/spec/unit/jobs/runtime/service_operations_create_in_progress_cleanup_spec.rb b/spec/unit/jobs/runtime/service_operations_create_in_progress_cleanup_spec.rb
new file mode 100644
index 00000000000..736bd1bd164
--- /dev/null
+++ b/spec/unit/jobs/runtime/service_operations_create_in_progress_cleanup_spec.rb
@@ -0,0 +1,258 @@
+require 'spec_helper'
+
+module VCAP::CloudController
+  module Jobs::Runtime
+    RSpec.describe ServiceOperationsCreateInProgressCleanup, job_context: :worker do
+      subject(:job) { ServiceOperationsCreateInProgressCleanup.new }
+
+      let(:fake_logger) { instance_double(Steno::Logger, info: nil, warn: nil) }
+      let(:fake_mitigator) { instance_double(VCAP::Services::ServiceBrokers::V2::OrphanMitigator) }
+      let(:max_poll_duration_minutes) { 60 }
+
+      before do
+        allow(Steno).to receive(:logger).and_return(fake_logger)
+        TestConfig.override(broker_client_max_async_poll_duration_minutes: max_poll_duration_minutes)
+        allow(VCAP::Services::ServiceBrokers::V2::OrphanMitigator).to receive(:new).and_return(fake_mitigator)
+        allow(fake_mitigator).to receive(:cleanup_failed_provision)
+        allow(fake_mitigator).to receive(:cleanup_failed_bind)
+        allow(fake_mitigator).to receive(:cleanup_failed_key)
+      end
+
+      # Builds a fully stuck scenario for ServiceInstance create that the job should pick up and mitigate.
+      # All filter conditions are satisfied: sio is in progress/create/within cutoff,
+      # pjob is FAILED with operation=service_instance.create, delayed_job has failed_at set.
+      # Override individual parameters to break a single filter and test exclusion.
+      def prepare_stuck_service_instance(
+        service_instance_state: 'in progress',
+        service_instance_type: 'create',
+        service_instance_created_at: Time.now,
+        pollable_job_state: PollableJobModel::FAILED_STATE,
+        delayed_job_failed_at: Time.now
+      )
+        service_instance = ManagedServiceInstance.make
+
+        ServiceInstanceOperation.make(
+          service_instance_id: service_instance.id,
+          type: service_instance_type,
+          state: service_instance_state,
+          created_at: service_instance_created_at
+        )
+
+        dj = Delayed::Job.create!(
+          guid: SecureRandom.uuid,
+          handler: 'fake',
+          run_at: Time.now,
+          failed_at: delayed_job_failed_at,
+          queue: 'cc-generic'
+        )
+
+        pjob = PollableJobModel.make(
+          state: pollable_job_state,
+          operation: 'service_instance.create',
+          resource_guid: service_instance.guid,
+          resource_type: 'service_instances',
+          delayed_job_guid: dj.guid
+        )
+
+        { service_instance: service_instance, pjob: pjob, delayed_job: dj }
+      end
+
+      it { is_expected.to be_a_valid_job }
+
+      describe '#perform' do
+        context 'when sio state is not in progress' do
+          it 'does not mitigate when state is succeeded' do
+            scenario = prepare_stuck_service_instance(service_instance_state: 'succeeded')
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('succeeded')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+
+          it 'does not mitigate when state is failed' do
+            scenario = prepare_stuck_service_instance(service_instance_state: 'failed')
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('failed')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when sio type is not create' do
+          it 'does not mitigate' do
+            scenario = prepare_stuck_service_instance(service_instance_type: 'delete')
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('in progress')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when sio created_at is beyond the max polling window' do
+          it 'does not mitigate' do
+            scenario = prepare_stuck_service_instance(service_instance_created_at: Time.now - (max_poll_duration_minutes + 1).minutes)
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('in progress')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when delayed_job.failed_at is nil (job still running or locked)' do
+          it 'does not mitigate' do
+            scenario = prepare_stuck_service_instance(delayed_job_failed_at: nil)
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('in progress')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when pollable job state is COMPLETE' do
+          it 'does not mitigate' do
+            scenario = prepare_stuck_service_instance(pollable_job_state: PollableJobModel::COMPLETE_STATE)
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('in progress')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when pollable job state is PROCESSING' do
+          it 'does not mitigate' do
+            scenario = prepare_stuck_service_instance(pollable_job_state: PollableJobModel::PROCESSING_STATE)
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('in progress')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when pollable job operation is not service_instance.create' do
+          it 'does not mitigate' do
+            scenario = prepare_stuck_service_instance
+            scenario[:pjob].update(operation: 'service_instance.update')
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('in progress')
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when a service instance create job is stuck with state FAILED' do
+          it 'sets operation to failed, pollable job to FAILED, and triggers orphan mitigation' do
+            scenario = prepare_stuck_service_instance
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('failed')
+            expect(scenario[:pjob].reload.state).to eq(PollableJobModel::FAILED_STATE)
+            expect(fake_mitigator).to have_received(:cleanup_failed_provision).with(scenario[:service_instance])
+          end
+        end
+
+        context 'when a service instance create job is stuck with state POLLING (DB flip before failure hook)' do
+          it 'sets operation to failed, pollable job to FAILED, and triggers orphan mitigation' do
+            scenario = prepare_stuck_service_instance(pollable_job_state: PollableJobModel::POLLING_STATE)
+            job.perform
+            expect(scenario[:service_instance].last_operation.reload.state).to eq('failed')
+            expect(scenario[:pjob].reload.state).to eq(PollableJobModel::FAILED_STATE)
+            expect(fake_mitigator).to have_received(:cleanup_failed_provision).with(scenario[:service_instance])
+          end
+        end
+
+        context 'when there are multiple stuck jobs within the batch size' do
+          it 'mitigates each one' do
+            3.times { prepare_stuck_service_instance }
+            job.perform
+            expect(ServiceInstanceOperation.where(state: 'failed').count).to eq(3)
+          end
+        end
+
+        context 'when there are more stuck jobs than the batch size' do
+          it 'processes only up to BATCH_SIZE jobs per run' do
+            (ServiceOperationsCreateInProgressCleanup::BATCH_SIZE + 1).times { prepare_stuck_service_instance }
+            job.perform
+            expect(ServiceInstanceOperation.where(state: 'failed').count).to eq(ServiceOperationsCreateInProgressCleanup::BATCH_SIZE)
+          end
+        end
+
+        context 'when a service binding create job is stuck' do
+          it 'sets operation to failed, pollable job to FAILED, and triggers orphan mitigation' do
+            service_binding = ServiceBinding.make
+            ServiceBindingOperation.make(
+              service_binding_id: service_binding.id,
+              type: 'create',
+              state: 'in progress'
+            )
+            dj = Delayed::Job.create!(guid: SecureRandom.uuid, handler: 'fake', run_at: Time.now, failed_at: Time.now, queue: 'cc-generic')
+            pjob = PollableJobModel.make(
+              state: PollableJobModel::FAILED_STATE,
+              operation: 'service_bindings.create',
+              resource_guid: service_binding.guid,
+              resource_type: 'service_bindings',
+              delayed_job_guid: dj.guid
+            )
+
+            job.perform
+
+            expect(service_binding.last_operation.reload.state).to eq('failed')
+            expect(pjob.reload.state).to eq(PollableJobModel::FAILED_STATE)
+            expect(fake_mitigator).to have_received(:cleanup_failed_bind).with(service_binding)
+          end
+        end
+
+        context 'when a service key create job is stuck' do
+          it 'sets operation to failed, pollable job to FAILED, and triggers orphan mitigation' do
+            service_key = ServiceKey.make
+            ServiceKeyOperation.make(
+              service_key_id: service_key.id,
+              type: 'create',
+              state: 'in progress'
+            )
+            dj = Delayed::Job.create!(guid: SecureRandom.uuid, handler: 'fake', run_at: Time.now, failed_at: Time.now, queue: 'cc-generic')
+            pjob = PollableJobModel.make(
+              state: PollableJobModel::FAILED_STATE,
+              operation: 'service_keys.create',
+              resource_guid: service_key.guid,
+              resource_type: 'service_keys',
+              delayed_job_guid: dj.guid
+            )
+
+            job.perform
+
+            expect(service_key.last_operation.reload.state).to eq('failed')
+            expect(pjob.reload.state).to eq(PollableJobModel::FAILED_STATE)
+            expect(fake_mitigator).to have_received(:cleanup_failed_key).with(service_key)
+          end
+        end
+      end
+
+      describe '#mitigate_orphan' do
+        context 'when another process already mitigated (skip_locked returns nil)' do
+          it 'does nothing' do
+            scenario = prepare_stuck_service_instance
+
+            expect do
+              job.send(:mitigate_orphan, ServiceInstanceOperation, ServiceInstance,
+                       :cleanup_failed_provision, -1, scenario[:service_instance].id, scenario[:pjob].guid)
+            end.not_to raise_error
+            expect(fake_mitigator).not_to have_received(:cleanup_failed_provision)
+          end
+        end
+
+        context 'when the operation is stuck in progress' do
+          it 'sets the operation state from in progress to failed' do
+            scenario = prepare_stuck_service_instance
+            op = scenario[:service_instance].last_operation
+
+            expect do
+              job.send(:mitigate_orphan, ServiceInstanceOperation, ServiceInstance,
+                       :cleanup_failed_provision, op.id, scenario[:service_instance].id, scenario[:pjob].guid)
+            end.to change { op.reload.state }.from('in progress').to('failed')
+          end
+
+          it 'sets the pollable job state to FAILED' do
+            scenario = prepare_stuck_service_instance(pollable_job_state: PollableJobModel::POLLING_STATE)
+            op = scenario[:service_instance].last_operation
+
+            expect do
+              job.send(:mitigate_orphan, ServiceInstanceOperation, ServiceInstance,
+                       :cleanup_failed_provision, op.id, scenario[:service_instance].id, scenario[:pjob].guid)
+            end.to change { scenario[:pjob].reload.state }.from(PollableJobModel::POLLING_STATE).to(PollableJobModel::FAILED_STATE)
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb
index 20d0e49b5bd..3f53ff51788 100644
--- a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb
+++ b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb
@@ -21,6 +21,7 @@ module VCAP::CloudController
           failed_jobs: { frequency_in_seconds: 400, cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10 },
           pollable_jobs: { cutoff_age_in_days: 2 },
           service_operations_initial_cleanup: { frequency_in_seconds: 600 },
+          service_operations_create_in_progress_cleanup: { frequency_in_seconds: 600 },
           service_usage_events: { cutoff_age_in_days: 5 },
           completed_tasks: { cutoff_age_in_days: 6 },
           pending_droplets: { frequency_in_seconds: 300, expiration_in_seconds: 600 },
@@ -161,6 +162,12 @@ module VCAP::CloudController
           expect(block.call).to be_instance_of(Jobs::Runtime::ServiceOperationsInitialCleanup)
         end
 
+        expect(clock).to receive(:schedule_frequent_worker_job) do |args, &block|
+          expect(args).to eql(name: 'service_operations_create_in_progress_cleanup', interval: 600)
+          expect(Jobs::Runtime::ServiceOperationsCreateInProgressCleanup).to receive(:new).and_call_original
+          expect(block.call).to be_instance_of(Jobs::Runtime::ServiceOperationsCreateInProgressCleanup)
+        end
+
         schedule.start
       end
 