Fix race condition in shutdown of pipelines

Prior to this a single worker could slurp down multiple shutdown messages, this prevents that from happening by using a flag that can't be overconsumed. Fixes elastic#9285
nugend · Mar 26, 2018 · cc72641 · cc72641
1 parent fcce86a
commit cc72641
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 10 deletions.
diff --git a/logstash-core/lib/logstash/pipeline.rb b/logstash-core/lib/logstash/pipeline.rb
@@ -195,6 +195,7 @@ def initialize(pipeline_config, namespaced_metric = nil, agent = nil)
     @running = Concurrent::AtomicBoolean.new(false)
     @flushing = Concurrent::AtomicReference.new(false)
     @outputs_registered = Concurrent::AtomicBoolean.new(false)
+    @worker_shutdown = java.util.concurrent.atomic.AtomicBoolean.new(false)
   end # def initialize
 
   def ready?
@@ -410,13 +411,10 @@ def dlq_enabled?
   # Main body of what a worker thread does
   # Repeatedly takes batches off the queue, filters, then outputs them
   def worker_loop(batch_size, batch_delay)
-    shutdown_requested = false
-
     @filter_queue_client.set_batch_dimensions(batch_size, batch_delay)
     output_events_map = Hash.new { |h, k| h[k] = [] }
     while true
       signal = @signal_queue.poll || NO_SIGNAL
-      shutdown_requested |= signal.shutdown? # latch on shutdown signal
 
       batch = @filter_queue_client.read_batch.to_java # metrics are started in read_batch
       batch_size = batch.filteredSize
@@ -430,7 +428,7 @@ def worker_loop(batch_size, batch_delay)
         @filter_queue_client.close_batch(batch)
       end
       # keep break at end of loop, after the read_batch operation, some pipeline specs rely on this "final read_batch" before shutdown.
-      break if (shutdown_requested && !draining_queue?)
+      break if (@worker_shutdown.get && !draining_queue?)
     end
 
     # we are shutting down, queue is drained if it was required, now  perform a final flush.
@@ -575,11 +573,8 @@ def stop_inputs
   # tell the worker threads to stop and then block until they've fully stopped
   # This also stops all filter and output plugins
   def shutdown_workers
-    # Each worker thread will receive this exactly once!
-    @worker_threads.each do |t|
-      @logger.debug("Pushing shutdown", default_logging_keys(:thread => t.inspect))
-      @signal_queue.put(SHUTDOWN)
-    end
+    @logger.debug("Setting shutdown", default_logging_keys)
+    @worker_shutdown.set(true)
 
     @worker_threads.each do |t|
       @logger.debug("Shutdown waiting for worker thread" , default_logging_keys(:thread => t.inspect))

diff --git a/logstash-core/spec/logstash/pipeline_spec.rb b/logstash-core/spec/logstash/pipeline_spec.rb
@@ -22,6 +22,15 @@ def close
   end
 end
 
+# This input runs long enough that a flush should occur
+class DummyFlushEnablingInput < DummyInput
+  def run(queue)
+    while !stop?
+      sleep 1
+    end
+  end
+end
+
 class DummyInputGenerator < LogStash::Inputs::Base
   config_name "dummyinputgenerator"
   milestone 2
@@ -650,7 +659,7 @@ class TestPipeline < LogStash::Pipeline
 
     before do
       allow(::LogStash::Outputs::DummyOutput).to receive(:new).with(any_args).and_return(output)
-      allow(LogStash::Plugin).to receive(:lookup).with("input", "dummy_input").and_return(DummyInput)
+      allow(LogStash::Plugin).to receive(:lookup).with("input", "dummy_input").and_return(DummyFlushEnablingInput)
       allow(LogStash::Plugin).to receive(:lookup).with("filter", "dummy_flushing_filter").and_return(DummyFlushingFilterPeriodic)
       allow(LogStash::Plugin).to receive(:lookup).with("output", "dummy_output").and_return(::LogStash::Outputs::DummyOutput)
       allow(LogStash::Plugin).to receive(:lookup).with("codec", "plain").and_return(LogStash::Codecs::Plain)