Re-work process-graph to use lock-free queues

This removes the trigger_mutex, which was used to serialize access to the work-queue during concurrent processing.
2019-07-10 19:11:07 +02:00
parent 929ecf622b
commit b7369f421f
4 changed files with 353 additions and 322 deletions
--- a/libs/ardour/ardour/graph.h
+++ b/libs/ardour/ardour/graph.h
@@ -1,46 +1,43 @@
 /*
-    Copyright (C) 2010 Paul Davis
-    Copyright (C) 2017 Robin Gareus <robin@gareus.org>
-    Author: Torben Hohn
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
+ * Copyright (C) 2010 Paul Davis
+ * Copyright (C) 2017-2019 Robin Gareus <robin@gareus.org>
+ * incl. some work from Torben Hohn
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */

 #ifndef __ardour_graph_h__
 #define __ardour_graph_h__

 #include <list>
 #include <set>
-#include <vector>
 #include <string>
+#include <vector>

 #include <boost/shared_ptr.hpp>

-#include <glib.h>
-
+#include "pbd/mpmc_queue.h"
 #include "pbd/semutils.h"

-#include "ardour/libardour_visibility.h"
-#include "ardour/types.h"
 #include "ardour/audio_backend.h"
+#include "ardour/libardour_visibility.h"
 #include "ardour/session_handle.h"
+#include "ardour/types.h"

 namespace ARDOUR
 {
-
 class GraphNode;
 class Graph;

@@ -50,27 +47,27 @@ class GraphEdges;

 typedef boost::shared_ptr<GraphNode> node_ptr_t;

-typedef std::list< node_ptr_t > node_list_t;
-typedef std::set< node_ptr_t > node_set_t;
+typedef std::list<node_ptr_t> node_list_t;
+typedef std::set<node_ptr_t>  node_set_t;

 class LIBARDOUR_API Graph : public SessionHandleRef
 {
 public:
-	Graph (Session & session);
+	Graph (Session& session);

-	void trigger (GraphNode * n);
-	void rechain (boost::shared_ptr<RouteList>, GraphEdges const &);
+	void trigger (GraphNode* n);
+	void rechain (boost::shared_ptr<RouteList>, GraphEdges const&);

 	void dump (int chain);
-	void dec_ref();
+	void reached_terminal_node ();

-	void helper_thread();
+	void helper_thread ();

 	int process_routes (pframes_t nframes, samplepos_t start_sample, samplepos_t end_sample, bool& need_butler);

-	int routes_no_roll (pframes_t nframes, samplepos_t start_sample, samplepos_t end_sample, bool non_rt_pending );
+	int routes_no_roll (pframes_t nframes, samplepos_t start_sample, samplepos_t end_sample, bool non_rt_pending);

-	void process_one_route (Route * route);
+	void process_one_route (Route* route);

 	void clear_other_chain ();

@@ -80,36 +77,40 @@ protected:
 	virtual void session_going_away ();

 private:
-	volatile bool _threads_active;
-
 	void reset_thread_list ();
 	void drop_threads ();
-	void restart_cycle();
-	bool run_one();
-	void main_thread();
-	void prep();
+	void run_one ();
+	void main_thread ();
+	void prep ();

 	node_list_t _nodes_rt[2];
-
 	node_list_t _init_trigger_list[2];

-	std::vector<GraphNode *> _trigger_queue;
-	pthread_mutex_t          _trigger_mutex;
+	PBD::MPMCQueue<GraphNode*> _trigger_queue;      ///< nodes that can be processed
+	volatile guint             _trigger_queue_size; ///< number of entries in trigger-queue

+	/** Start worker threads */
 	PBD::Semaphore _execution_sem;

+	/** The number of processing threads that are asleep */
+	volatile guint _idle_thread_cnt;
+
 	/** Signalled to start a run of the graph for a process callback */
 	PBD::Semaphore _callback_start_sem;
 	PBD::Semaphore _callback_done_sem;

-	/** The number of processing threads that are asleep */
-	volatile gint _execution_tokens;
 	/** The number of unprocessed nodes that do not feed any other node; updated during processing */
-	volatile gint _finished_refcount;
-	/** The initial number of nodes that do not feed any other node (for each chain) */
-	volatile gint _init_finished_refcount[2];
+	volatile guint _terminal_refcnt;

-	bool _graph_empty;
+	/** The initial number of nodes that do not feed any other node (for each chain) */
+	guint _n_terminal_nodes[2];
+	bool  _graph_empty;
+
+	/* number of background worker threads >= 0 */
+	volatile guint _n_workers;
+
+	/* flag to terminate background threads */
+	volatile gint _terminate;

 	/* chain swapping */
 	Glib::Threads::Mutex _swap_mutex;
@@ -132,7 +133,7 @@ private:

 	/* engine / thread connection */
 	PBD::ScopedConnectionList engine_connections;
-	void engine_stopped ();
+	void                      engine_stopped ();
 };

 } // namespace
--- a/libs/ardour/ardour/graphnode.h
+++ b/libs/ardour/ardour/graphnode.h
@@ -17,7 +17,6 @@

 */

-
 #ifndef __ardour_graphnode_h__
 #define __ardour_graphnode_h__

@@ -29,40 +28,48 @@

 namespace ARDOUR
 {
-
 class Graph;
 class GraphNode;

 typedef boost::shared_ptr<GraphNode> node_ptr_t;
-typedef std::set< node_ptr_t > node_set_t;
-typedef std::list< node_ptr_t > node_list_t;
+typedef std::set<node_ptr_t>         node_set_t;
+typedef std::list<node_ptr_t>        node_list_t;

-/** A node on our processing graph, ie a Route */
-class LIBARDOUR_API GraphNode
+class LIBARDOUR_API GraphActivision
 {
-    public:
-	GraphNode( boost::shared_ptr<Graph> Graph );
-	virtual ~GraphNode();
-
-	void prep( int chain );
-	void dec_ref();
-	void finish( int chain );
-
-	virtual void process();
-
-    private:
+protected:
 	friend class Graph;
-
 	/** Nodes that we directly feed */
-	node_set_t  _activation_set[2];
-
-	boost::shared_ptr<Graph> _graph;
-
-	gint _refcount;
+	node_set_t _activation_set[2];
 	/** The number of nodes that we directly feed us (one count for each chain) */
 	gint _init_refcount[2];
 };

+/** A node on our processing graph, ie a Route */
+class LIBARDOUR_API GraphNode : public GraphActivision
+{
+public:
+	GraphNode (boost::shared_ptr<Graph> Graph);
+	virtual ~GraphNode ();
+
+	void prep (int chain);
+	void trigger ();
+
+	void
+	run (int chain)
+	{
+		process ();
+		finish (chain);
+	}
+
+private:
+	void finish (int chain);
+	void process ();
+
+	boost::shared_ptr<Graph> _graph;
+
+	gint _refcount;
+};
 }

 #endif
--- a/libs/ardour/graph.cc
+++ b/libs/ardour/graph.cc
@@ -1,36 +1,37 @@
 /*
-  Copyright (C) 2010 Paul Davis
-  Author: Torben Hohn
+ * Copyright (C) 2010 Paul Davis
+ * Copyright (C) 2017-2019 Robin Gareus <robin@gareus.org>
+ * incl. some work from Torben Hohn
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */

-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-*/
-#include <stdio.h>
 #include <cmath>
+#include <stdio.h>

 #include "pbd/compose.h"
 #include "pbd/debug_rt_alloc.h"
 #include "pbd/pthread_utils.h"

+#include "ardour/audioengine.h"
 #include "ardour/debug.h"
 #include "ardour/graph.h"
-#include "ardour/types.h"
-#include "ardour/session.h"
-#include "ardour/route.h"
 #include "ardour/process_thread.h"
-#include "ardour/audioengine.h"
+#include "ardour/route.h"
+#include "ardour/session.h"
+#include "ardour/types.h"

 #include "pbd/i18n.h"

@@ -43,44 +44,43 @@ static Graph* graph = 0;

 extern "C" {

-int alloc_allowed ()
+int
+alloc_allowed ()
 {
 	return !graph->in_process_thread ();
 }
-
 }
 #endif

-Graph::Graph (Session & session)
+#define g_atomic_uint_get(x) static_cast<guint> (g_atomic_int_get (x))
+
+Graph::Graph (Session& session)
 	: SessionHandleRef (session)
-	, _threads_active (false)
 	, _execution_sem ("graph_execution", 0)
 	, _callback_start_sem ("graph_start", 0)
 	, _callback_done_sem ("graph_done", 0)
+	, _graph_empty (true)
+	, _current_chain (0)
+	, _pending_chain (0)
+	, _setup_chain (0)
 {
-	pthread_mutex_init( &_trigger_mutex, NULL);
+	g_atomic_int_set (&_terminal_refcnt, 0);
+	g_atomic_int_set (&_terminate, 0);
+	g_atomic_int_set (&_n_workers, 0);
+	g_atomic_int_set (&_idle_thread_cnt, 0);
+	g_atomic_int_set (&_trigger_queue_size, 0);

-	/* XXX: rather hacky `fix' to stop _trigger_queue.push_back() allocating
-	 * memory in the RT thread.
-	 */
-	_trigger_queue.reserve (8192);
+	/* pre-allocate memory */
+	_trigger_queue.reserve (1024);

-	_execution_tokens = 0;
-
-	_current_chain = 0;
-	_pending_chain = 0;
-	_setup_chain   = 1;
-	_graph_empty = true;
-
-
-	ARDOUR::AudioEngine::instance()->Running.connect_same_thread (engine_connections, boost::bind (&Graph::reset_thread_list, this));
-	ARDOUR::AudioEngine::instance()->Stopped.connect_same_thread (engine_connections, boost::bind (&Graph::engine_stopped, this));
-	ARDOUR::AudioEngine::instance()->Halted.connect_same_thread (engine_connections, boost::bind (&Graph::engine_stopped, this));
+	ARDOUR::AudioEngine::instance ()->Running.connect_same_thread (engine_connections, boost::bind (&Graph::reset_thread_list, this));
+	ARDOUR::AudioEngine::instance ()->Stopped.connect_same_thread (engine_connections, boost::bind (&Graph::engine_stopped, this));
+	ARDOUR::AudioEngine::instance ()->Halted.connect_same_thread (engine_connections, boost::bind (&Graph::engine_stopped, this));

 	reset_thread_list ();

 #ifdef DEBUG_RT_ALLOC
-	graph = this;
+	graph             = this;
 	pbd_alloc_allowed = &::alloc_allowed;
 #endif
 }
@@ -89,9 +89,9 @@ void
 Graph::engine_stopped ()
 {
 #ifndef NDEBUG
-	cerr << "Graph::engine_stopped. n_thread: " << AudioEngine::instance()->process_thread_count() << endl;
+	cerr << "Graph::engine_stopped. n_thread: " << AudioEngine::instance ()->process_thread_count () << endl;
 #endif
-	if (AudioEngine::instance()->process_thread_count() != 0) {
+	if (AudioEngine::instance ()->process_thread_count () != 0) {
 		drop_threads ();
 	}
 }
@@ -101,73 +101,84 @@ void
 Graph::reset_thread_list ()
 {
 	uint32_t num_threads = how_many_dsp_threads ();
+	guint    n_workers   = g_atomic_uint_get (&_n_workers);

 	/* For now, we shouldn't be using the graph code if we only have 1 DSP thread */
 	assert (num_threads > 1);
+	assert (AudioEngine::instance ()->process_thread_count () == n_workers);

 	/* don't bother doing anything here if we already have the right
 	 * number of threads.
 	 */

-	if (AudioEngine::instance()->process_thread_count() == num_threads) {
+	if (AudioEngine::instance ()->process_thread_count () == num_threads) {
 		return;
 	}

-	Glib::Threads::Mutex::Lock lm (_session.engine().process_lock());
+	Glib::Threads::Mutex::Lock lm (_session.engine ().process_lock ());

-	if (AudioEngine::instance()->process_thread_count() != 0) {
+	if (n_workers > 0) {
 		drop_threads ();
 	}

-	_threads_active = true;
+	/* Allow threads to run */
+	g_atomic_int_set (&_terminate, 0);

-	if (AudioEngine::instance()->create_process_thread (boost::bind (&Graph::main_thread, this)) != 0) {
+	if (AudioEngine::instance ()->create_process_thread (boost::bind (&Graph::main_thread, this)) != 0) {
 		throw failed_constructor ();
 	}

 	for (uint32_t i = 1; i < num_threads; ++i) {
-		if (AudioEngine::instance()->create_process_thread (boost::bind (&Graph::helper_thread, this))) {
+		if (AudioEngine::instance ()->create_process_thread (boost::bind (&Graph::helper_thread, this))) {
 			throw failed_constructor ();
 		}
 	}
+
+	while (g_atomic_uint_get (&_n_workers) + 1 != num_threads) {
+		sched_yield ();
+	}
 }

 void
-Graph::session_going_away()
+Graph::session_going_away ()
 {
 	drop_threads ();

 	// now drop all references on the nodes.
-	_nodes_rt[0].clear();
-	_nodes_rt[1].clear();
-	_init_trigger_list[0].clear();
-	_init_trigger_list[1].clear();
-	_trigger_queue.clear();
+	_nodes_rt[0].clear ();
+	_nodes_rt[1].clear ();
+	_init_trigger_list[0].clear ();
+	_init_trigger_list[1].clear ();
+	g_atomic_int_set (&_trigger_queue_size, 0);
+	_trigger_queue.clear ();
 }

 void
 Graph::drop_threads ()
 {
 	Glib::Threads::Mutex::Lock ls (_swap_mutex);
-	_threads_active = false;

-	uint32_t thread_count = AudioEngine::instance()->process_thread_count ();
+	/* Flag threads to terminate */
+	g_atomic_int_set (&_terminate, 1);

-	for (unsigned int i=0; i < thread_count; i++) {
-		pthread_mutex_lock (&_trigger_mutex);
+	/* Wake-up sleeping threads */
+	guint tc = g_atomic_uint_get (&_idle_thread_cnt);
+	assert (tc == g_atomic_uint_get (&_n_workers));
+	for (guint i = 0; i < tc; ++i) {
 		_execution_sem.signal ();
-		pthread_mutex_unlock (&_trigger_mutex);
 	}

-	pthread_mutex_lock (&_trigger_mutex);
+	/* and the main thread */
 	_callback_start_sem.signal ();
-	pthread_mutex_unlock (&_trigger_mutex);

-	AudioEngine::instance()->join_process_threads ();
+	/* join process threads */
+	AudioEngine::instance ()->join_process_threads ();
+
+	g_atomic_int_set (&_n_workers, 0);
+	g_atomic_int_set (&_idle_thread_cnt, 0);

 	/* signal main process thread if it's waiting for an already terminated thread */
 	_callback_done_sem.signal ();
-	_execution_tokens = 0;

 	/* reset semaphores.
 	 * This is somewhat ugly, yet if a thread is killed (e.g jackd terminates
@@ -177,7 +188,7 @@ Graph::drop_threads ()
 	int d1 = _execution_sem.reset ();
 	int d2 = _callback_start_sem.reset ();
 	int d3 = _callback_done_sem.reset ();
-	cerr << "Graph::drop_threads() sema-counts: " << d1 << ", " << d2<< ", " << d3 << endl;
+	cerr << "Graph::drop_threads() sema-counts: " << d1 << ", " << d2 << ", " << d3 << endl;
 #else
 	_execution_sem.reset ();
 	_callback_start_sem.reset ();
@@ -185,6 +196,7 @@ Graph::drop_threads ()
 #endif
 }

+/* special case route removal -- called from Session::remove_routes */
 void
 Graph::clear_other_chain ()
 {
@@ -192,9 +204,8 @@ Graph::clear_other_chain ()

 	while (1) {
 		if (_setup_chain != _pending_chain) {
-
-			for (node_list_t::iterator ni=_nodes_rt[_setup_chain].begin(); ni!=_nodes_rt[_setup_chain].end(); ni++) {
-				(*ni)->_activation_set[_setup_chain].clear();
+			for (node_list_t::iterator ni = _nodes_rt[_setup_chain].begin (); ni != _nodes_rt[_setup_chain].end (); ++ni) {
+				(*ni)->_activation_set[_setup_chain].clear ();
 			}

 			_nodes_rt[_setup_chain].clear ();
@@ -209,98 +220,107 @@ Graph::clear_other_chain ()
 }

 void
-Graph::prep()
+Graph::prep ()
 {
-	node_list_t::iterator i;
-	int chain;
-
-	if (_swap_mutex.trylock()) {
-		// we got the swap mutex.
-		if (_current_chain != _pending_chain)
-		{
-			// printf ("chain swap ! %d -> %d\n", _current_chain, _pending_chain);
-			_setup_chain = _current_chain;
+	if (_swap_mutex.trylock ()) {
+		/* swap mutex acquired */
+		if (_current_chain != _pending_chain) {
+			/* use new chain */
+			_setup_chain   = _current_chain;
 			_current_chain = _pending_chain;
+			/* ensure that all nodes can be queued */
+			_trigger_queue.reserve (_nodes_rt[_current_chain].size ());
+			assert (g_atomic_uint_get (&_trigger_queue_size) == 0);
 			_cleanup_cond.signal ();
 		}
 		_swap_mutex.unlock ();
 	}

-	chain = _current_chain;
-
 	_graph_empty = true;
-	for (i=_nodes_rt[chain].begin(); i!=_nodes_rt[chain].end(); i++) {
-		(*i)->prep( chain);
+
+	int chain = _current_chain;
+
+	node_list_t::iterator i;
+	for (i = _nodes_rt[chain].begin (); i != _nodes_rt[chain].end (); ++i) {
+		(*i)->prep (chain);
 		_graph_empty = false;
 	}
-	_finished_refcount = _init_finished_refcount[chain];
+
+	assert (_graph_empty != (_n_terminal_nodes[chain] > 0));
+
+	g_atomic_int_set (&_terminal_refcnt, _n_terminal_nodes[chain]);

 	/* Trigger the initial nodes for processing, which are the ones at the `input' end */
-	pthread_mutex_lock (&_trigger_mutex);
-	for (i=_init_trigger_list[chain].begin(); i!=_init_trigger_list[chain].end(); i++) {
-		/* don't use ::trigger here, as we have already locked the mutex */
+	for (i = _init_trigger_list[chain].begin (); i != _init_trigger_list[chain].end (); i++) {
+		g_atomic_int_inc (&_trigger_queue_size);
 		_trigger_queue.push_back (i->get ());
 	}
-	pthread_mutex_unlock (&_trigger_mutex);
 }

 void
 Graph::trigger (GraphNode* n)
 {
-	pthread_mutex_lock (&_trigger_mutex);
+	g_atomic_int_inc (&_trigger_queue_size);
 	_trigger_queue.push_back (n);
-	pthread_mutex_unlock (&_trigger_mutex);
 }

 /** Called when a node at the `output' end of the chain (ie one that has no-one to feed)
 *  is finished.
 */
 void
-Graph::dec_ref()
+Graph::reached_terminal_node ()
 {
-	if (g_atomic_int_dec_and_test (const_cast<gint*> (&_finished_refcount))) {
+	if (g_atomic_int_dec_and_test (&_terminal_refcnt)) {
+	again:

 		/* We have run all the nodes that are at the `output' end of
 		 * the graph, so there is nothing more to do this time around.
 		 */
+		assert (g_atomic_uint_get (&_trigger_queue_size) == 0);

-		restart_cycle ();
+		/* Notify caller */
+		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 cycle done.\n", pthread_name ()));
+
+		_callback_done_sem.signal ();
+
+		/* Ensure that all background threads are idle.
+		 * When freewheeling there may be an immediate restart:
+		 * If there are more threads than CPU cores, some worker-
+		 * threads may only be "on the way" to become idle.
+		 */
+		guint n_workers = g_atomic_uint_get (&_n_workers);
+		while (g_atomic_uint_get (&_idle_thread_cnt) != n_workers) {
+			sched_yield ();
+		}
+
+		/* Block until a process callback triggers us */
+		_callback_start_sem.wait ();
+
+		if (g_atomic_int_get (&_terminate)) {
+			return;
+		}
+
+		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 prepare new cycle.\n", pthread_name ()));
+
+		/* Prepare next cycle:
+		 *  - Reset terminal reference count
+		 *  - queue initial nodes
+		 */
+		prep ();
+
+		if (_graph_empty && !g_atomic_int_get (&_terminate)) {
+			goto again;
+		}
+		/* .. continue in worker-thread */
 	}
 }

-void
-Graph::restart_cycle()
-{
-	// we are through. wakeup our caller.
-	DEBUG_TRACE(DEBUG::ProcessThreads, string_compose ("%1 cycle done.\n", pthread_name()));
-
-again:
-	_callback_done_sem.signal ();
-
-	/* Block until the a process callback triggers us */
-	_callback_start_sem.wait();
-
-	if (!_threads_active) {
-		return;
-	}
-
-	DEBUG_TRACE(DEBUG::ProcessThreads, string_compose ("%1 prepare new cycle.\n", pthread_name()));
-	prep ();
-
-	if (_graph_empty && _threads_active) {
-		goto again;
-	}
-
-	// returning will restart the cycle.
-	// starting with waking up the others.
-}
-
 /** Rechain our stuff using a list of routes (which can be in any order) and
 *  a directed graph of their interconnections, which is guaranteed to be
 *  acyclic.
 */
 void
-Graph::rechain (boost::shared_ptr<RouteList> routelist, GraphEdges const & edges)
+Graph::rechain (boost::shared_ptr<RouteList> routelist, GraphEdges const& edges)
 {
 	Glib::Threads::Mutex::Lock ls (_swap_mutex);

@@ -310,26 +330,25 @@ Graph::rechain (boost::shared_ptr<RouteList> routelist, GraphEdges const & edges
 	/* This will become the number of nodes that do not feed any other node;
 	 * once we have processed this number of those nodes, we have finished.
 	 */
-	_init_finished_refcount[chain] = 0;
+	_n_terminal_nodes[chain] = 0;

 	/* This will become a list of nodes that are not fed by another node, ie
 	 * those at the `input' end.
 	 */
-	_init_trigger_list[chain].clear();
+	_init_trigger_list[chain].clear ();

-	_nodes_rt[chain].clear();
+	_nodes_rt[chain].clear ();

 	/* Clear things out, and make _nodes_rt[chain] a copy of routelist */
-	for (RouteList::iterator ri=routelist->begin(); ri!=routelist->end(); ri++) {
+	for (RouteList::iterator ri = routelist->begin (); ri != routelist->end (); ri++) {
 		(*ri)->_init_refcount[chain] = 0;
-		(*ri)->_activation_set[chain].clear();
+		(*ri)->_activation_set[chain].clear ();
 		_nodes_rt[chain].push_back (*ri);
 	}

 	// now add refs for the connections.

-	for (node_list_t::iterator ni = _nodes_rt[chain].begin(); ni != _nodes_rt[chain].end(); ni++) {
-
+	for (node_list_t::iterator ni = _nodes_rt[chain].begin (); ni != _nodes_rt[chain].end (); ni++) {
 		boost::shared_ptr<Route> r = boost::dynamic_pointer_cast<Route> (*ni);

 		/* The routes that are directly fed by r */
@@ -339,7 +358,7 @@ Graph::rechain (boost::shared_ptr<RouteList> routelist, GraphEdges const & edges
 		bool const has_output = !fed_from_r.empty ();

 		/* Set up r's activation set */
-		for (set<GraphVertex>::iterator i = fed_from_r.begin(); i != fed_from_r.end(); ++i) {
+		for (set<GraphVertex>::iterator i = fed_from_r.begin (); i != fed_from_r.end (); ++i) {
 			r->_activation_set[chain].insert (*i);
 		}

@@ -347,7 +366,7 @@ Graph::rechain (boost::shared_ptr<RouteList> routelist, GraphEdges const & edges
 		bool const has_input = !edges.has_none_to (r);

 		/* Increment the refcount of any route that we directly feed */
-		for (node_set_t::iterator ai = r->_activation_set[chain].begin(); ai != r->_activation_set[chain].end(); ai++) {
+		for (node_set_t::iterator ai = r->_activation_set[chain].begin (); ai != r->_activation_set[chain].end (); ai++) {
 			(*ai)->_init_refcount[chain] += 1;
 		}

@@ -360,148 +379,147 @@ Graph::rechain (boost::shared_ptr<RouteList> routelist, GraphEdges const & edges
 			/* no output, so this is one of the nodes that we can count off to decide
 			 * if we've finished
 			 */
-			_init_finished_refcount[chain] += 1;
+			_n_terminal_nodes[chain] += 1;
 		}
 	}

 	_pending_chain = chain;
-	dump(chain);
+	dump (chain);
 }

-/** Called by both the main thread and all helpers.
- *  @return true to quit, false to carry on.
- */
-bool
-Graph::run_one()
+/** Called by both the main thread and all helpers. */
+void
+Graph::run_one ()
 {
-	GraphNode* to_run;
+	GraphNode* to_run = NULL;

-	pthread_mutex_lock (&_trigger_mutex);
-	if (_trigger_queue.size()) {
-		to_run = _trigger_queue.back();
-		_trigger_queue.pop_back();
-	} else {
-		to_run = 0;
+	if (g_atomic_int_get (&_terminate)) {
+		return;
 	}

-	/* the number of threads that are asleep */
-	int et = _execution_tokens;
-	/* the number of nodes that need to be run */
-	int ts = _trigger_queue.size();
+	if (_trigger_queue.pop_front (to_run)) {
+		/* Wake up idle threads, but at most as many as there's
+		 * work in the trigger queue that can be processed by
+		 * other threads.
+		 * This thread has not yet decreased _trigger_queue_size.
+		 */
+		guint idle_cnt   = g_atomic_uint_get (&_idle_thread_cnt);
+		guint work_avail = g_atomic_uint_get (&_trigger_queue_size);
+		guint wakeup     = std::min (idle_cnt + 1, work_avail);

-	/* hence how many threads to wake up */
-	int wakeup = min (et, ts);
-	/* update the number of threads that will still be sleeping */
-	_execution_tokens -= wakeup;
-
-	DEBUG_TRACE(DEBUG::ProcessThreads, string_compose ("%1 signals %2\n", pthread_name(), wakeup));
-
-	for (int i = 0; i < wakeup; i++) {
-		_execution_sem.signal ();
+		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 signals %2 threads\n", pthread_name (), wakeup));
+		for (guint i = 1; i < wakeup; ++i) {
+			_execution_sem.signal ();
+		}
 	}

-	while (to_run == 0) {
-		_execution_tokens += 1;
-		pthread_mutex_unlock (&_trigger_mutex);
-		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 goes to sleep\n", pthread_name()));
+	while (!to_run) {
+		/* Wait for work, fall asleep */
+		g_atomic_int_inc (&_idle_thread_cnt);
+		assert (g_atomic_uint_get (&_idle_thread_cnt) <= _n_workers);
+
+		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 goes to sleep\n", pthread_name ()));
 		_execution_sem.wait ();
-		if (!_threads_active) {
-			return true;
-		}
-		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 is awake\n", pthread_name()));
-		pthread_mutex_lock (&_trigger_mutex);
-		if (_trigger_queue.size()) {
-			to_run = _trigger_queue.back();
-			_trigger_queue.pop_back();
+
+		if (g_atomic_int_get (&_terminate)) {
+			return;
 		}
+
+		DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 is awake\n", pthread_name ()));
+
+		g_atomic_int_dec_and_test (&_idle_thread_cnt);
+
+		/* Try to find some work to do */
+		_trigger_queue.pop_front (to_run);
 	}
-	pthread_mutex_unlock (&_trigger_mutex);

-	to_run->process();
-	to_run->finish (_current_chain);
+	/* Process the graph-node */
+	g_atomic_int_dec_and_test (&_trigger_queue_size);
+	to_run->run (_current_chain);

-	DEBUG_TRACE(DEBUG::ProcessThreads, string_compose ("%1 has finished run_one()\n", pthread_name()));
-
-	return !_threads_active;
+	DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 has finished run_one()\n", pthread_name ()));
 }

 void
-Graph::helper_thread()
+Graph::helper_thread ()
 {
+	g_atomic_int_inc (&_n_workers);
+	guint id = g_atomic_uint_get (&_n_workers);
+
 	/* This is needed for ARDOUR::Session requests called from rt-processors
 	 * in particular Lua scripts may do cross-thread calls */
-	if (! SessionEvent::has_per_thread_pool ()) {
+	if (!SessionEvent::has_per_thread_pool ()) {
 		char name[64];
-		snprintf (name, 64, "RT-%p", this);
+		snprintf (name, 64, "RT-%u-%p", id, (void*)DEBUG_THREAD_SELF);
 		pthread_set_name (name);
 		SessionEvent::create_per_thread_pool (name, 64);
-		PBD::notify_event_loops_about_thread_creation (pthread_self(), name, 64);
+		PBD::notify_event_loops_about_thread_creation (pthread_self (), name, 64);
 	}

 	suspend_rt_malloc_checks ();
 	ProcessThread* pt = new ProcessThread ();
 	resume_rt_malloc_checks ();

-	pt->get_buffers();
+	pt->get_buffers ();

-	while(1) {
-		if (run_one()) {
-			break;
-		}
+	while (!g_atomic_int_get (&_terminate)) {
+		run_one ();
 	}

-	pt->drop_buffers();
+	pt->drop_buffers ();
 	delete pt;
 }

 /** Here's the main graph thread */
 void
-Graph::main_thread()
+Graph::main_thread ()
 {
+	/* first time setup */
+
 	suspend_rt_malloc_checks ();
 	ProcessThread* pt = new ProcessThread ();

 	/* This is needed for ARDOUR::Session requests called from rt-processors
 	 * in particular Lua scripts may do cross-thread calls */
-	if (! SessionEvent::has_per_thread_pool ()) {
+	if (!SessionEvent::has_per_thread_pool ()) {
 		char name[64];
-		snprintf (name, 64, "RT-main-%p", this);
+		snprintf (name, 64, "RT-main-%p", (void*)DEBUG_THREAD_SELF);
 		pthread_set_name (name);
 		SessionEvent::create_per_thread_pool (name, 64);
-		PBD::notify_event_loops_about_thread_creation (pthread_self(), name, 64);
+		PBD::notify_event_loops_about_thread_creation (pthread_self (), name, 64);
 	}
 	resume_rt_malloc_checks ();

-	pt->get_buffers();
+	pt->get_buffers ();

+	/* Wait for initial process callback */
 again:
 	_callback_start_sem.wait ();

-	DEBUG_TRACE(DEBUG::ProcessThreads, "main thread is awake\n");
+	DEBUG_TRACE (DEBUG::ProcessThreads, "main thread is awake\n");

-	if (!_threads_active) {
-		pt->drop_buffers();
+	if (g_atomic_int_get (&_terminate)) {
+		pt->drop_buffers ();
 		delete (pt);
 		return;
 	}

+	/* Bootstrap the trigger-list
+	 * (later this is done by Graph::reached_terminal_node) */
 	prep ();

-	if (_graph_empty && _threads_active) {
+	if (_graph_empty && !g_atomic_int_get (&_terminate)) {
 		_callback_done_sem.signal ();
-		DEBUG_TRACE(DEBUG::ProcessThreads, "main thread sees graph done, goes back to sleep\n");
+		DEBUG_TRACE (DEBUG::ProcessThreads, "main thread sees graph done, goes back to sleep\n");
 		goto again;
 	}

-	/* This loop will run forever */
-	while (1) {
-		DEBUG_TRACE(DEBUG::ProcessThreads, string_compose ("main thread (%1) runs one graph node\n", pthread_name ()));
-		if (run_one()) {
-			break;
-		}
+	/* After setup, the main-thread just becomes a normal worker */
+	while (!g_atomic_int_get (&_terminate)) {
+		run_one ();
 	}

-	pt->drop_buffers();
+	pt->drop_buffers ();
 	delete (pt);
 }

@@ -510,25 +528,25 @@ Graph::dump (int chain)
 {
 #ifndef NDEBUG
 	node_list_t::iterator ni;
-	node_set_t::iterator ai;
+	node_set_t::iterator  ai;

 	chain = _pending_chain;

 	DEBUG_TRACE (DEBUG::Graph, "--------------------------------------------Graph dump:\n");
-	for (ni=_nodes_rt[chain].begin(); ni!=_nodes_rt[chain].end(); ni++) {
-		boost::shared_ptr<Route> rp = boost::dynamic_pointer_cast<Route>( *ni);
-		DEBUG_TRACE (DEBUG::Graph, string_compose ("GraphNode: %1  refcount: %2\n", rp->name().c_str(), (*ni)->_init_refcount[chain]));
-		for (ai=(*ni)->_activation_set[chain].begin(); ai!=(*ni)->_activation_set[chain].end(); ai++) {
-			DEBUG_TRACE (DEBUG::Graph, string_compose ("  triggers: %1\n", boost::dynamic_pointer_cast<Route>(*ai)->name().c_str()));
+	for (ni = _nodes_rt[chain].begin (); ni != _nodes_rt[chain].end (); ni++) {
+		boost::shared_ptr<Route> rp = boost::dynamic_pointer_cast<Route> (*ni);
+		DEBUG_TRACE (DEBUG::Graph, string_compose ("GraphNode: %1  refcount: %2\n", rp->name ().c_str (), (*ni)->_init_refcount[chain]));
+		for (ai = (*ni)->_activation_set[chain].begin (); ai != (*ni)->_activation_set[chain].end (); ai++) {
+			DEBUG_TRACE (DEBUG::Graph, string_compose ("  triggers: %1\n", boost::dynamic_pointer_cast<Route> (*ai)->name ().c_str ()));
 		}
 	}

 	DEBUG_TRACE (DEBUG::Graph, "------------- trigger list:\n");
-	for (ni=_init_trigger_list[chain].begin(); ni!=_init_trigger_list[chain].end(); ni++) {
-		DEBUG_TRACE (DEBUG::Graph, string_compose ("GraphNode: %1  refcount: %2\n", boost::dynamic_pointer_cast<Route>(*ni)->name().c_str(), (*ni)->_init_refcount[chain]));
+	for (ni = _init_trigger_list[chain].begin (); ni != _init_trigger_list[chain].end (); ni++) {
+		DEBUG_TRACE (DEBUG::Graph, string_compose ("GraphNode: %1  refcount: %2\n", boost::dynamic_pointer_cast<Route> (*ni)->name ().c_str (), (*ni)->_init_refcount[chain]));
 	}

-	DEBUG_TRACE (DEBUG::Graph, string_compose ("final activation refcount: %1\n", _init_finished_refcount[chain]));
+	DEBUG_TRACE (DEBUG::Graph, string_compose ("final activation refcount: %1\n", _n_terminal_nodes[chain]));
 #endif
 }

@@ -537,17 +555,19 @@ Graph::process_routes (pframes_t nframes, samplepos_t start_sample, samplepos_t
 {
 	DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("graph execution from %1 to %2 = %3\n", start_sample, end_sample, nframes));

-	if (!_threads_active) return 0;
+	if (g_atomic_int_get (&_terminate)) {
+		return 0;
+	}

-	_process_nframes = nframes;
+	_process_nframes      = nframes;
 	_process_start_sample = start_sample;
-	_process_end_sample = end_sample;
+	_process_end_sample   = end_sample;

-	_process_noroll = false;
-	_process_retval = 0;
+	_process_noroll      = false;
+	_process_retval      = 0;
 	_process_need_butler = false;

-	DEBUG_TRACE(DEBUG::ProcessThreads, "wake graph for non-silent process\n");
+	DEBUG_TRACE (DEBUG::ProcessThreads, "wake graph for non-silent process\n");
 	_callback_start_sem.signal ();
 	_callback_done_sem.wait ();
 	DEBUG_TRACE (DEBUG::ProcessThreads, "graph execution complete\n");
@@ -562,18 +582,20 @@ Graph::routes_no_roll (pframes_t nframes, samplepos_t start_sample, samplepos_t
 {
 	DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("no-roll graph execution from %1 to %2 = %3\n", start_sample, end_sample, nframes));

-	if (!_threads_active) return 0;
+	if (g_atomic_int_get (&_terminate)) {
+		return 0;
+	}

-	_process_nframes = nframes;
-	_process_start_sample = start_sample;
-	_process_end_sample = end_sample;
+	_process_nframes        = nframes;
+	_process_start_sample   = start_sample;
+	_process_end_sample     = end_sample;
 	_process_non_rt_pending = non_rt_pending;

-	_process_noroll = true;
-	_process_retval = 0;
+	_process_noroll      = true;
+	_process_retval      = 0;
 	_process_need_butler = false;

-	DEBUG_TRACE(DEBUG::ProcessThreads, "wake graph for no-roll process\n");
+	DEBUG_TRACE (DEBUG::ProcessThreads, "wake graph for no-roll process\n");
 	_callback_start_sem.signal ();
 	_callback_done_sem.wait ();
 	DEBUG_TRACE (DEBUG::ProcessThreads, "graph execution complete\n");
@@ -584,11 +606,11 @@ void
 Graph::process_one_route (Route* route)
 {
 	bool need_butler = false;
-	int retval;
+	int  retval;

 	assert (route);

-	DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 runs route %2\n", pthread_name(), route->name()));
+	DEBUG_TRACE (DEBUG::ProcessThreads, string_compose ("%1 runs route %2\n", pthread_name (), route->name ()));

 	if (_process_noroll) {
 		retval = route->no_roll (_process_nframes, _process_start_sample, _process_end_sample, _process_non_rt_pending);
@@ -608,5 +630,5 @@ Graph::process_one_route (Route* route)
 bool
 Graph::in_process_thread () const
 {
-	return AudioEngine::instance()->in_process_thread ();
+	return AudioEngine::instance ()->in_process_thread ();
 }
--- a/libs/ardour/graphnode.cc
+++ b/libs/ardour/graphnode.cc
@@ -25,11 +25,11 @@
 using namespace ARDOUR;

 GraphNode::GraphNode (boost::shared_ptr<Graph> graph)
-	: _graph(graph)
+	: _graph (graph)
 {
 }

-GraphNode::~GraphNode()
+GraphNode::~GraphNode ()
 {
 }

@@ -37,19 +37,20 @@ void
 GraphNode::prep (int chain)
 {
 	/* This is the number of nodes that directly feed us */
-	_refcount = _init_refcount[chain];
+	g_atomic_int_set (&_refcount, _init_refcount[chain]);
 }

-/** Called by another node to tell us that one of the nodes that feed us
- *  has been processed.
- */
+/** Called by an upstream node, when it has completed processing */
 void
-GraphNode::dec_ref()
+GraphNode::trigger ()
 {
+	/* check if we can run */
 	if (g_atomic_int_dec_and_test (&_refcount)) {
-		/* All the nodes that feed us are done, so we can queue this node
-		 * for processing.
-		 */
+#if 0 // TODO optimize: remove prep()
+		/* reset reference count for next cycle */
+		g_atomic_int_set (&_refcount, _init_refcount[chain]);
+#endif
+		/* All nodes that feed this node have completed, so this node can be processed now. */
 		_graph->trigger (this);
 	}
 }
@@ -58,23 +59,23 @@ void
 GraphNode::finish (int chain)
 {
 	node_set_t::iterator i;
-	bool feeds_somebody = false;
+	bool                 feeds = false;

-	/* Tell the nodes that we feed that we've finished */
-	for (i=_activation_set[chain].begin(); i!=_activation_set[chain].end(); i++) {
-		(*i)->dec_ref();
-		feeds_somebody = true;
+	/* Notify downstream nodes that depend on this node */
+	for (i = _activation_set[chain].begin (); i != _activation_set[chain].end (); ++i) {
+		(*i)->trigger ();
+		feeds = true;
 	}

-	if (!feeds_somebody) {
-		/* This node does not feed anybody, so decrement the graph's finished count */
-		_graph->dec_ref();
+	if (!feeds) {
+		/* This node is a terminal node that does not feed another node,
+		 * so notify the graph to decrement the finished count */
+		_graph->reached_terminal_node ();
 	}
 }

-
 void
-GraphNode::process()
+GraphNode::process ()
 {
-	_graph->process_one_route (dynamic_cast<Route *>(this));
+	_graph->process_one_route (dynamic_cast<Route*> (this));
 }