diff --git a/include/readdy/common/thread/ctpl.h b/include/readdy/common/thread/ctpl.h
index ddf4b4279..ba9b638d1 100644
--- a/include/readdy/common/thread/ctpl.h
+++ b/include/readdy/common/thread/ctpl.h
@@ -66,11 +66,18 @@ class Queue {
         return true;
     }
 
+    void popAll() {  // Discards every element currently stored in the queue.
+        std::lock_guard<std::mutex> guard(this->mutex);  // hold the queue mutex for the whole drain
+        for (; !this->q.empty(); this->q.pop()) {}  // pop from the front until nothing is left
+    }
+
     bool empty() {
         std::unique_lock<std::mutex> lock(this->mutex);
         return this->q.empty();
     }
 
+    Queue() = default;
+
 private:
     std::queue<T> q;
     std::mutex mutex;
@@ -100,6 +107,14 @@ class thread_pool {
 
     std::thread &get_thread(int i) { return *this->threads[i]; }
 
+    void resize_wait(std::size_t n) {  // Restarts the pool with n threads: stop, reset state flags, then resize.
+        stop(true);  // NOTE(review): presumably blocks until queued work is done and the workers have exited - confirm in stop()
+        nWaiting = 0;  // clear the waiting-thread counter before new workers are created
+        isStop = false;  // NOTE(review): presumably resize() does nothing while isStop/isDone are set - confirm
+        isDone = false;
+        resize(n);  // n is a std::size_t here; TODO confirm the parameter type of resize() to rule out narrowing
+    }
+
     // change the number of threads in the pool
     // should be called from one thread, otherwise be careful to not interleave, also with this->stop()
     // nThreads must be >= 0
@@ -143,11 +158,10 @@ class thread_pool {
     std::function<void(int)> pop() {
         std::function<void(int id)> *_f = nullptr;
         this->q.pop(_f);
-        std::unique_ptr<std::function<void(int id)>> func(
-                _f); // at return, delete the function even if an exception occurred
+        // take ownership of the popped pointer so it is deleted on return, even if the copy below throws
+        std::unique_ptr<std::function<void(int id)>> func(_f);
         std::function<void(int)> f;
-        if (_f)
-            f = *_f;
+        if (_f) f = *_f;  // copy the task out before func deletes it; _f is still nullptr if q.pop did not assign it
         return f;
     }
 
@@ -243,9 +257,9 @@ class thread_pool {
     }
 
     thread_pool(const thread_pool &) = delete;// = delete;
-    thread_pool(thread_pool &&) = delete;// = delete;
+    thread_pool(thread_pool &&) = delete;
     thread_pool &operator=(const thread_pool &) = delete;// = delete;
-    thread_pool &operator=(thread_pool &&) = delete;// = delete;
+    thread_pool &operator=(thread_pool &&) = delete;
 private:
 
 
diff --git a/kernels/cpu/include/readdy/kernel/cpu/CPUKernel.h b/kernels/cpu/include/readdy/kernel/cpu/CPUKernel.h
index be35f0f8e..9670d3c59 100644
--- a/kernels/cpu/include/readdy/kernel/cpu/CPUKernel.h
+++ b/kernels/cpu/include/readdy/kernel/cpu/CPUKernel.h
@@ -79,7 +79,7 @@ class CPUKernel : public readdy::model::Kernel {
     };
 
     void setNThreads(std::uint32_t n) {
-        _pool.resize(n);
+        _pool.resize_wait(n);  // assuming _pool is the ctpl thread_pool: resize_wait calls stop(true) and resets the pool flags before resizing
     };
 
     std::size_t getNThreads() {
diff --git a/wrappers/python/src/python/readdy/examples/cytosolic_reactions.py b/wrappers/python/src/python/readdy/examples/cytosolic_reactions.py
index 3dc4c8d86..3c658c34d 100644
--- a/wrappers/python/src/python/readdy/examples/cytosolic_reactions.py
+++ b/wrappers/python/src/python/readdy/examples/cytosolic_reactions.py
@@ -41,7 +41,8 @@ def traverse_performance_tree(tree):
 
 def perform(kernel="SingleCPU", n_particles_a=2357, force_constant=10., file_suffix="", full_simulation=False,
             debug_run=False, n_threads=-1):
-    print("kernel {}, n_particles_a {}, force_constant {}".format(kernel, n_particles_a, force_constant))
+    print("kernel {}, n_particles_a {}, force_constant {}, threads {}"
+          .format(kernel, n_particles_a, force_constant, n_threads))
     n_particles_b = n_particles_a
     n_particles_c = 0
     desired_a_density = 2357. / 1e6  # number of particles per nanometer**3
@@ -163,6 +164,12 @@ def perform(kernel="SingleCPU", n_particles_a=2357, force_constant=10., file_suf
     data.update(result)
 
     os.unlink(simulation.output_file)
+
+    del simulation
+    del system
+
+    import gc
+    gc.collect()
     return data
 
 
@@ -181,11 +188,11 @@ def perform(kernel="SingleCPU", n_particles_a=2357, force_constant=10., file_suf
     t1 = time.perf_counter()
     n_cores = multiprocessing.cpu_count()
     perf_results = []
-    for kernel in ["SingleCPU", "CPU", "CPU_Legacy"]:
+    for kernel in ["CPU", "SingleCPU", "CPU_Legacy"]:
             for n_particles_a in [200, 400, 700, 1000, 2357, 5000, 10000, 15000, 20000, 30000]:
                 for force_constant in [10.]:
                     if kernel != "SingleCPU":
-                        for n_threads in range(1, 4*n_cores+1):
+                        for n_threads in [1,2,3,4,5,6,7,8,10,16,24,32]:
                             data = perform(
                                 kernel=kernel,
                                 n_particles_a=n_particles_a,
@@ -195,6 +202,7 @@ def perform(kernel="SingleCPU", n_particles_a=2357, force_constant=10., file_suf
                                 debug_run=debug_run,
                                 n_threads=n_threads)
                             perf_results.append(data)
+                            time.sleep(.5)
                     else:
                         data = perform(
                             kernel=kernel,