[PATCH 1/6] gdb: re-work parallel-for-selftests.c

Mirror of the gdb-patches mailing list
 help / color / mirror / Atom feed

* [PATCH 1/6] gdb: re-work parallel-for-selftests.c
@ 2025-05-05 20:15 Simon Marchi
  2025-05-05 20:15 ` [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter Simon Marchi
                   ` (6 more replies)
  0 siblings, 7 replies; 22+ messages in thread
From: Simon Marchi @ 2025-05-05 20:15 UTC (permalink / raw)
  To: gdb-patches; +Cc: Simon Marchi

From: Simon Marchi <simon.marchi@polymtl.ca>

I find this file difficult to work with and modify, due to how it uses
the preprocessor to include itself, to generate variations of the test
functions.  Change it to something a bit more C++-y, with a test
function that accepts a callback to invoke the foreach function under
test.

Change-Id: Ibf1e2907380a88a4f8e4b4b88df2b0dfd0e9b6c8
---
 gdb/unittests/parallel-for-selftests.c | 137 ++++++++++---------------
 1 file changed, 56 insertions(+), 81 deletions(-)

diff --git a/gdb/unittests/parallel-for-selftests.c b/gdb/unittests/parallel-for-selftests.c
index 841d914d75b0..c9a1689acfb0 100644
--- a/gdb/unittests/parallel-for-selftests.c
+++ b/gdb/unittests/parallel-for-selftests.c
@@ -17,13 +17,6 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
-/* This file is divided in two parts:
-   - FOR_EACH-undefined, and
-   - FOR_EACH-defined.
-   The former includes the latter, more than once, with different values for
-   FOR_EACH.  The FOR_EACH-defined part reads like a regular function.  */
-#ifndef FOR_EACH
-
 #include "gdbsupport/selftest.h"
 #include "gdbsupport/parallel-for.h"
 
@@ -49,37 +42,70 @@ struct save_restore_n_threads
   int n_threads;
 };
 
-/* Define test_par using TEST in the FOR_EACH-defined part.  */
-#define TEST test_par
-#define FOR_EACH gdb::parallel_for_each
-#include "parallel-for-selftests.c"
-#undef FOR_EACH
-#undef TEST
-
-/* Define test_seq using TEST in the FOR_EACH-defined part.  */
-#define TEST test_seq
-#define FOR_EACH gdb::sequential_for_each
-#include "parallel-for-selftests.c"
-#undef FOR_EACH
-#undef TEST
+using foreach_callback_t = gdb::function_view<void (int first, int last)>;
+using do_foreach_t = gdb::function_view<void (int first, int last,
+					     foreach_callback_t)>;
 
 static void
-test (int n_threads)
+test_one (int n_threads, do_foreach_t do_foreach)
 {
-  test_par (n_threads);
-  test_seq (n_threads);
+  save_restore_n_threads saver;
+  gdb::thread_pool::g_thread_pool->set_thread_count (n_threads);
+
+  {
+    constexpr int upper_bound = 1000;
+    std::atomic<int> counter (0);
+    do_foreach (0, upper_bound,
+		[&] (int start, int end) { counter += end - start; });
+    SELF_CHECK (counter == upper_bound);
+  }
+
+  {
+    std::atomic<int> counter (0);
+    do_foreach (0, 0, [&] (int start, int end) { counter += end - start; });
+    SELF_CHECK (counter == 0);
+  }
+
+  {
+    /* Check that if there are fewer tasks than threads, then we won't
+       end up with a null result.  */
+    std::vector<std::unique_ptr<int>> intresults;
+    std::atomic<bool> any_empty_tasks (false);
+
+    do_foreach (0, 1,
+		[&] (int start, int end)
+		  {
+		    if (start == end)
+		      any_empty_tasks = true;
+
+		    return std::make_unique<int> (end - start);
+		  });
+
+    SELF_CHECK (!any_empty_tasks);
+    SELF_CHECK (std::all_of (intresults.begin (), intresults.end (),
+			     [] (const std::unique_ptr<int> &entry)
+			       { return entry != nullptr; }));
+  }
 }
 
 static void
-test_n_threads ()
+test_parallel_for_each ()
 {
-  test (0);
-  test (1);
-  test (3);
+  const std::vector<do_foreach_t> for_each_functions
+    {
+      [] (int start, int end, foreach_callback_t callback)
+      { gdb::parallel_for_each (1, start, end, callback); },
+      [] (int start, int end, foreach_callback_t callback)
+      { gdb::sequential_for_each (1, start, end, callback);}
+    };
+
+  for (int n_threads : { 0, 1, 3 })
+    for (const auto &for_each_function : for_each_functions)
+      test_one (n_threads, for_each_function);
 }
 
-}
-}
+} /* namespace parallel_for */
+} /* namespace selftests */
 
 #endif /* CXX_STD_THREAD */
 
@@ -89,57 +115,6 @@ _initialize_parallel_for_selftests ()
 {
 #ifdef CXX_STD_THREAD
   selftests::register_test ("parallel_for",
-			    selftests::parallel_for::test_n_threads);
+			    selftests::parallel_for::test_parallel_for_each);
 #endif /* CXX_STD_THREAD */
 }
-
-#else /* FOR_EACH */
-
-static void
-TEST (int n_threads)
-{
-  save_restore_n_threads saver;
-  gdb::thread_pool::g_thread_pool->set_thread_count (n_threads);
-
-#define NUMBER 10000
-
-  std::atomic<int> counter (0);
-  FOR_EACH (1, 0, NUMBER,
-	    [&] (int start, int end)
-	    {
-	      counter += end - start;
-	    });
-  SELF_CHECK (counter == NUMBER);
-
-  counter = 0;
-  FOR_EACH (1, 0, 0,
-	    [&] (int start, int end)
-	    {
-	      counter += end - start;
-	    });
-  SELF_CHECK (counter == 0);
-
-#undef NUMBER
-
-  /* Check that if there are fewer tasks than threads, then we won't
-     end up with a null result.  */
-  std::vector<std::unique_ptr<int>> intresults;
-  std::atomic<bool> any_empty_tasks (false);
-
-  FOR_EACH (1, 0, 1,
-	    [&] (int start, int end)
-	      {
-		if (start == end)
-		  any_empty_tasks = true;
-		return std::make_unique<int> (end - start);
-	      });
-  SELF_CHECK (!any_empty_tasks);
-  SELF_CHECK (std::all_of (intresults.begin (),
-			   intresults.end (),
-			   [] (const std::unique_ptr<int> &entry)
-			     {
-			       return entry != nullptr;
-			     }));
-}
-
-#endif /* FOR_EACH */

base-commit: f3d834df28734ebc82931808960fe63401127c06
-- 
2.49.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
@ 2025-05-05 20:15 ` Simon Marchi
  2025-06-13 17:55   ` Tom Tromey
  2025-05-05 20:15 ` [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each Simon Marchi
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-05-05 20:15 UTC (permalink / raw)
  To: gdb-patches; +Cc: Simon Marchi

From: Simon Marchi <simon.marchi@polymtl.ca>

This value will likely never change at runtime, so we might as well make
it a template parameter.  This has the "advantage" of being able to
remove the unnecessary param from gdb::sequential_for_each.

Change-Id: Ia172ab8e08964e30d4e3378a95ccfa782abce674
---
 gdb/minsyms.c                          |  2 +-
 gdb/unittests/parallel-for-selftests.c |  4 ++--
 gdbsupport/parallel-for.h              | 10 ++++------
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/gdb/minsyms.c b/gdb/minsyms.c
index 124d96d90b56..4a6459a6f2d2 100644
--- a/gdb/minsyms.c
+++ b/gdb/minsyms.c
@@ -1479,7 +1479,7 @@ minimal_symbol_reader::install ()
 
       msymbols = m_objfile->per_bfd->msymbols.get ();
       /* Arbitrarily require at least 10 elements in a thread.  */
-      gdb::parallel_for_each (10, &msymbols[0], &msymbols[mcount],
+      gdb::parallel_for_each<10> (&msymbols[0], &msymbols[mcount],
 	 [&] (minimal_symbol *start, minimal_symbol *end)
 	 {
 	   scoped_time_it time_it ("minsyms install worker");
diff --git a/gdb/unittests/parallel-for-selftests.c b/gdb/unittests/parallel-for-selftests.c
index c9a1689acfb0..f3d85a85704c 100644
--- a/gdb/unittests/parallel-for-selftests.c
+++ b/gdb/unittests/parallel-for-selftests.c
@@ -94,9 +94,9 @@ test_parallel_for_each ()
   const std::vector<do_foreach_t> for_each_functions
     {
       [] (int start, int end, foreach_callback_t callback)
-      { gdb::parallel_for_each (1, start, end, callback); },
+      { gdb::parallel_for_each<1> (start, end, callback); },
       [] (int start, int end, foreach_callback_t callback)
-      { gdb::sequential_for_each (1, start, end, callback);}
+      { gdb::sequential_for_each (start, end, callback);}
     };
 
   for (int n_threads : { 0, 1, 3 })
diff --git a/gdbsupport/parallel-for.h b/gdbsupport/parallel-for.h
index c485c36d4859..b74c8068cf2c 100644
--- a/gdbsupport/parallel-for.h
+++ b/gdbsupport/parallel-for.h
@@ -40,10 +40,9 @@ namespace gdb
    at least N elements processed per thread.  Setting N to 0 is not
    allowed.  */
 
-template<class RandomIt, class RangeFunction>
+template<std::size_t n, class RandomIt, class RangeFunction>
 void
-parallel_for_each (unsigned n, RandomIt first, RandomIt last,
-		   RangeFunction callback)
+parallel_for_each (RandomIt first, RandomIt last, RangeFunction callback)
 {
   /* If enabled, print debug info about how the work is distributed across
      the threads.  */
@@ -73,7 +72,7 @@ parallel_for_each (unsigned n, RandomIt first, RandomIt last,
   if (parallel_for_each_debug)
     {
       debug_printf (_("Parallel for: n_elements: %zu\n"), n_elements);
-      debug_printf (_("Parallel for: minimum elements per thread: %u\n"), n);
+      debug_printf (_("Parallel for: minimum elements per thread: %zu\n"), n);
       debug_printf (_("Parallel for: elts_per_thread: %zu\n"), elts_per_thread);
     }
 
@@ -141,8 +140,7 @@ parallel_for_each (unsigned n, RandomIt first, RandomIt last,
 
 template<class RandomIt, class RangeFunction>
 void
-sequential_for_each (unsigned n, RandomIt first, RandomIt last,
-		     RangeFunction callback)
+sequential_for_each (RandomIt first, RandomIt last, RangeFunction callback)
 {
   callback (first, last);
 }
-- 
2.49.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
  2025-05-05 20:15 ` [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter Simon Marchi
@ 2025-05-05 20:15 ` Simon Marchi
  2025-06-13 18:29   ` Tom Tromey
  2025-05-05 20:15 ` [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h Simon Marchi
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-05-05 20:15 UTC (permalink / raw)
  To: gdb-patches; +Cc: Simon Marchi

From: Simon Marchi <simon.marchi@polymtl.ca>

gdb::parallel_for_each uses static partitioning of the workload, meaning
that each worker thread receives a similar number of work items.  Change
it to use dynamic partitioning, where worker threads pull work items
from a shared work queue when they need to.

Note that gdb::parallel_for_each is currently only used for processing
minimal symbols in GDB.  I am looking at improving the startup
performance of GDB, where the minimal symbol process is one step.

With static partitioning, there is a risk of workload imbalance if some
threads receive "easier" work than others.  Some threads sit still while
others finish working on their share of the work.  This is not
desirable, because the gdb::parallel_for_each takes as long as the
slowest thread takes.

When loading a file with a lot of minimal symbols (~600k) in GDB, with
"maint set per-command time on", I observe some imbalance:

    Time for "minsyms install worker": wall 0.732, user 0.550, sys 0.041, user+sys 0.591, 80.7 % CPU
    Time for "minsyms install worker": wall 0.881, user 0.722, sys 0.071, user+sys 0.793, 90.0 % CPU
    Time for "minsyms install worker": wall 2.107, user 1.804, sys 0.147, user+sys 1.951, 92.6 % CPU
    Time for "minsyms install worker": wall 2.351, user 2.003, sys 0.151, user+sys 2.154, 91.6 % CPU
    Time for "minsyms install worker": wall 2.611, user 2.322, sys 0.235, user+sys 2.557, 97.9 % CPU
    Time for "minsyms install worker": wall 3.074, user 2.729, sys 0.203, user+sys 2.932, 95.4 % CPU
    Time for "minsyms install worker": wall 3.486, user 3.074, sys 0.260, user+sys 3.334, 95.6 % CPU
    Time for "minsyms install worker": wall 3.927, user 3.475, sys 0.336, user+sys 3.811, 97.0 % CPU
                                              ^
                                          ----´

The fastest thread took 0.732 seconds to complete its work (and then sat
still), while the slowest took 3.927 seconds.  This means the
parallel_for_each took a bit less than 4 seconds.

Even if the number of minimal symbols assigned to each worker is the
same, I suppose that some symbols (e.g. those that need demangling) take
longer to process, which could explain the imbalance.

With this patch, things are much more balanced:

    Time for "minsym install worker": wall 2.807, user 2.222, sys 0.144, user+sys 2.366, 84.3 % CPU
    Time for "minsym install worker": wall 2.808, user 2.073, sys 0.131, user+sys 2.204, 78.5 % CPU
    Time for "minsym install worker": wall 2.804, user 1.994, sys 0.151, user+sys 2.145, 76.5 % CPU
    Time for "minsym install worker": wall 2.808, user 1.977, sys 0.135, user+sys 2.112, 75.2 % CPU
    Time for "minsym install worker": wall 2.808, user 2.061, sys 0.142, user+sys 2.203, 78.5 % CPU
    Time for "minsym install worker": wall 2.809, user 2.012, sys 0.146, user+sys 2.158, 76.8 % CPU
    Time for "minsym install worker": wall 2.809, user 2.178, sys 0.137, user+sys 2.315, 82.4 % CPU
    Time for "minsym install worker": wall 2.820, user 2.141, sys 0.170, user+sys 2.311, 82.0 % CPU
                                              ^
                                          ----´

In this version, the parallel_for_each took about 2.8 seconds,
representing a reduction of ~1.2 seconds for this step.  Not
life-changing, but it's still good I think.

Note that this patch helps when loading big programs.  My go-to test
program for this is telegram-desktop that I built from source.  For
small programs (including loading gdb itself), it makes no perceptible
difference.

Now the technical bits:

 - One impact that this change has on the minimal symbol processing
   specifically is that not all calls to compute_and_set_names (a
   critical region guarded by a mutex) are done at the end of each
   worker thread's task anymore.

   Before this patch, each thread would compute the names and hash values for
   all the minimal symbols it has been assigned, and then would call
   compute_and_set_names for all of them, while holding the mutex (thus
   preventing other threads from doing this same step).

   With the shared work queue approach, each thread grabs a batch of of
   minimal symbols, computes the names and hash values for them, and
   then calls compute_and_set_names (with the mutex held) for this batch
   only.  It then repeats that until the work queue is empty.

   There are therefore more small and spread out compute_and_set_names
   critical sections, instead of just one per worker thread at the end.
   Given that before this patch the work was not well balanced among worker
   threads, I guess that threads would enter that critical region at
   roughly different times, causing little contention.

   In the "with this patch" results, the CPU utilization numbers are not
   as good, suggesting that there is some contention.  But I don't know
   if it's contention due to the compute_and_set_names critical section
   or the shared work queue critical section.  That can be investigated
   later.  In any case, what ultimately counts is the wall time, which
   improves.

 - One choice I had to make was to decide how many work items (in this
   case minimal symbols) each worker should pop when getting work from
   the shared queue.  The general wisdom is that:

   - popping too few items, and the synchronization overhead becomes
     significant, and the total processing time increases
   - popping too many items, and we get some imbalance back, and the
     total processing time increases again

   I experimented using a dynamic batch size proportional to the number
   of remaining work items.  It worked well in some cases but not
   always.  So I decided to keep it simple, with a fixed batch size.
   That can always be tweaked later.

 - I want to still be able to use scoped_time_it to measure the time
   that each worker thread spent working on the task.  I find it really
   handy when measuring the performance impact of changes.

   Unfortunately, the current interface of gdb::parallel_for_each, which
   receives a simple callback, is not well-suited for that, once I
   introduce the dynamic partitioning.  The callback would get called
   once for each work item batch (multiple time for each worker thread),
   so it's not possible to maintain a per-worker thread object for the
   duration of the parallel for.

   To allow this, I changed gdb::parallel_for_each to receive a worker
   type as a template parameter.  Each worker thread creates one local
   instance of that type, and calls operator() on it for each work item
   batch.  By having a scoped_time_it object as a field of that worker,
   we can get the timings per worker thread.

   The drawbacks of this approach is that we must now define the
   parallel task in a separate class and manually capture any context we
   need as fields of that class.

Change-Id: Ibf1fea65c91f76a95b9ed8f706fd6fa5ef52d9cf
---
 gdb/minsyms.c                          | 131 ++++++++++++--------
 gdb/unittests/parallel-for-selftests.c |  20 ++-
 gdbsupport/parallel-for.h              | 162 ++++++++++++-------------
 3 files changed, 174 insertions(+), 139 deletions(-)

diff --git a/gdb/minsyms.c b/gdb/minsyms.c
index 4a6459a6f2d2..e353a8a9ba00 100644
--- a/gdb/minsyms.c
+++ b/gdb/minsyms.c
@@ -1392,6 +1392,81 @@ build_minimal_symbol_hash_tables
     }
 }
 
+/* gdb::parallel_for_each worker to compute minimal symbol names and hashes.  */
+
+class minimal_symbol_install_worker
+{
+public:
+  minimal_symbol_install_worker
+    (minimal_symbol *msymbols,
+     gdb::array_view<computed_hash_values> hash_values,
+     objfile_per_bfd_storage *per_bfd,
+     std::mutex &demangled_mutex)
+    : m_time_it ("minsym install worker"),
+      m_msymbols (msymbols),
+      m_hash_values (hash_values),
+      m_per_bfd (per_bfd),
+      m_demangled_mutex (demangled_mutex)
+  {}
+
+  void operator() (minimal_symbol *start, minimal_symbol *end) noexcept
+  {
+    for (minimal_symbol *msym = start; msym < end; ++msym)
+      {
+	size_t idx = msym - m_msymbols;
+	m_hash_values[idx].name_length = strlen (msym->linkage_name ());
+
+	if (!msym->name_set)
+	  {
+	    /* This will be freed later, by compute_and_set_names.  */
+	    gdb::unique_xmalloc_ptr<char> demangled_name
+	      = symbol_find_demangled_name (msym, msym->linkage_name ());
+	    msym->set_demangled_name (demangled_name.release (),
+				      &m_per_bfd->storage_obstack);
+	    msym->name_set = 1;
+	  }
+
+	/* This mangled_name_hash computation has to be outside of
+	   the name_set check, or compute_and_set_names below will
+	   be called with an invalid hash value.  */
+	m_hash_values[idx].mangled_name_hash
+	  = fast_hash (msym->linkage_name (), m_hash_values[idx].name_length);
+	m_hash_values[idx].minsym_hash = msymbol_hash (msym->linkage_name ());
+
+	/* We only use this hash code if the search name differs
+	   from the linkage name.  See the code in
+	   build_minimal_symbol_hash_tables.  */
+	if (msym->search_name () != msym->linkage_name ())
+	  m_hash_values[idx].minsym_demangled_hash
+	    = search_name_hash (msym->language (), msym->search_name ());
+      }
+
+    {
+      /* To limit how long we hold the lock, we only acquire it here
+	 and not while we demangle the names above.  */
+#if CXX_STD_THREAD
+      std::lock_guard<std::mutex> guard (m_demangled_mutex);
+#endif
+      for (minimal_symbol *msym = start; msym < end; ++msym)
+	{
+	  size_t idx = msym - m_msymbols;
+	  std::string_view name (msym->linkage_name (),
+				 m_hash_values[idx].name_length);
+	  hashval_t hashval = m_hash_values[idx].mangled_name_hash;
+
+	  msym->compute_and_set_names (name, false, m_per_bfd, hashval);
+	}
+    }
+  }
+
+private:
+  scoped_time_it m_time_it;
+  minimal_symbol *m_msymbols;
+  gdb::array_view<computed_hash_values> m_hash_values;
+  objfile_per_bfd_storage *m_per_bfd;
+  std::mutex &m_demangled_mutex;
+};
+
 /* Add the minimal symbols in the existing bunches to the objfile's official
    minimal symbol table.  In most cases there is no minimal symbol table yet
    for this objfile, and the existing bunches are used to create one.  Once
@@ -1478,59 +1553,11 @@ minimal_symbol_reader::install ()
       std::vector<computed_hash_values> hash_values (mcount);
 
       msymbols = m_objfile->per_bfd->msymbols.get ();
-      /* Arbitrarily require at least 10 elements in a thread.  */
-      gdb::parallel_for_each<10> (&msymbols[0], &msymbols[mcount],
-	 [&] (minimal_symbol *start, minimal_symbol *end)
-	 {
-	   scoped_time_it time_it ("minsyms install worker");
 
-	   for (minimal_symbol *msym = start; msym < end; ++msym)
-	     {
-	       size_t idx = msym - msymbols;
-	       hash_values[idx].name_length = strlen (msym->linkage_name ());
-	       if (!msym->name_set)
-		 {
-		   /* This will be freed later, by compute_and_set_names.  */
-		   gdb::unique_xmalloc_ptr<char> demangled_name
-		     = symbol_find_demangled_name (msym, msym->linkage_name ());
-		   msym->set_demangled_name
-		     (demangled_name.release (),
-		      &m_objfile->per_bfd->storage_obstack);
-		   msym->name_set = 1;
-		 }
-	       /* This mangled_name_hash computation has to be outside of
-		  the name_set check, or compute_and_set_names below will
-		  be called with an invalid hash value.  */
-	       hash_values[idx].mangled_name_hash
-		 = fast_hash (msym->linkage_name (),
-			      hash_values[idx].name_length);
-	       hash_values[idx].minsym_hash
-		 = msymbol_hash (msym->linkage_name ());
-	       /* We only use this hash code if the search name differs
-		  from the linkage name.  See the code in
-		  build_minimal_symbol_hash_tables.  */
-	       if (msym->search_name () != msym->linkage_name ())
-		 hash_values[idx].minsym_demangled_hash
-		   = search_name_hash (msym->language (), msym->search_name ());
-	     }
-	   {
-	     /* To limit how long we hold the lock, we only acquire it here
-		and not while we demangle the names above.  */
-#if CXX_STD_THREAD
-	     std::lock_guard<std::mutex> guard (demangled_mutex);
-#endif
-	     for (minimal_symbol *msym = start; msym < end; ++msym)
-	       {
-		 size_t idx = msym - msymbols;
-		 msym->compute_and_set_names
-		   (std::string_view (msym->linkage_name (),
-				      hash_values[idx].name_length),
-		    false,
-		    m_objfile->per_bfd,
-		    hash_values[idx].mangled_name_hash);
-	       }
-	   }
-	 });
+      gdb::parallel_for_each<1000, minimal_symbol *, minimal_symbol_install_worker>
+	(&msymbols[0], &msymbols[mcount], msymbols,
+	 gdb::array_view<computed_hash_values> (hash_values),
+	 m_objfile->per_bfd, demangled_mutex);
 
       build_minimal_symbol_hash_tables (m_objfile, hash_values);
     }
diff --git a/gdb/unittests/parallel-for-selftests.c b/gdb/unittests/parallel-for-selftests.c
index f3d85a85704c..6b6b557ba99e 100644
--- a/gdb/unittests/parallel-for-selftests.c
+++ b/gdb/unittests/parallel-for-selftests.c
@@ -91,12 +91,28 @@ test_one (int n_threads, do_foreach_t do_foreach)
 static void
 test_parallel_for_each ()
 {
+  struct test_worker
+  {
+    test_worker (foreach_callback_t callback)
+      : m_callback (callback)
+    {
+    }
+
+    void operator() (int first, int last) noexcept
+    {
+      return m_callback (first, last);
+    }
+
+  private:
+    foreach_callback_t m_callback;
+  };
+
   const std::vector<do_foreach_t> for_each_functions
     {
       [] (int start, int end, foreach_callback_t callback)
-      { gdb::parallel_for_each<1> (start, end, callback); },
+      { gdb::parallel_for_each<1, int, test_worker> (start, end, callback); },
       [] (int start, int end, foreach_callback_t callback)
-      { gdb::sequential_for_each (start, end, callback);}
+      { gdb::sequential_for_each<int, test_worker> (start, end, callback); },
     };
 
   for (int n_threads : { 0, 1, 3 })
diff --git a/gdbsupport/parallel-for.h b/gdbsupport/parallel-for.h
index b74c8068cf2c..bb41eb8700f0 100644
--- a/gdbsupport/parallel-for.h
+++ b/gdbsupport/parallel-for.h
@@ -21,115 +21,107 @@
 #define GDBSUPPORT_PARALLEL_FOR_H
 
 #include <algorithm>
-#include <type_traits>
+#include <tuple>
 #include "gdbsupport/thread-pool.h"
-#include "gdbsupport/function-view.h"
 
 namespace gdb
 {
 
-/* A very simple "parallel for".  This splits the range of iterators
-   into subranges, and then passes each subrange to the callback.  The
-   work may or may not be done in separate threads.
+/* A "parallel-for" implementation using a shared work queue.  Work items get
+   popped in batches of size up to BATCH_SIZE from the queue and handed out to
+   worker threads.
 
-   This approach was chosen over having the callback work on single
-   items because it makes it simple for the caller to do
-   once-per-subrange initialization and destruction.
+   Each worker thread instantiates an object of type Worker, forwarding ARGS to
+   its constructor.  The Worker object can be used to keep some per-worker
+   thread state.
 
-   The parameter N says how batching ought to be done -- there will be
-   at least N elements processed per thread.  Setting N to 0 is not
-   allowed.  */
+   Worker threads call Worker::operator() repeatedly until the queue is
+   empty.  */
 
-template<std::size_t n, class RandomIt, class RangeFunction>
+template<std::size_t batch_size, class RandomIt, class Worker,
+	 class... WorkerArgs>
 void
-parallel_for_each (RandomIt first, RandomIt last, RangeFunction callback)
+parallel_for_each (const RandomIt first, const RandomIt last,
+		   WorkerArgs &&...worker_args)
 {
   /* If enabled, print debug info about how the work is distributed across
      the threads.  */
   const bool parallel_for_each_debug = false;
 
-  size_t n_worker_threads = thread_pool::g_thread_pool->thread_count ();
-  size_t n_threads = n_worker_threads;
-  size_t n_elements = last - first;
-  size_t elts_per_thread = 0;
-  size_t elts_left_over = 0;
+  gdb_assert (first <= last);
 
-  if (n_threads > 1)
+  if (parallel_for_each_debug)
     {
-      /* Require that there should be at least N elements in a
-	 thread.  */
-      gdb_assert (n > 0);
-      if (n_elements / n_threads < n)
-	n_threads = std::max (n_elements / n, (size_t) 1);
-      elts_per_thread = n_elements / n_threads;
-      elts_left_over = n_elements % n_threads;
-      /* n_elements == n_threads * elts_per_thread + elts_left_over. */
+      debug_printf ("Parallel for: n elements: %zu\n",
+		    static_cast<std::size_t> (last - first));
+      debug_printf ("Parallel for: batch size: %zu\n", batch_size);
     }
 
-  size_t count = n_threads == 0 ? 0 : n_threads - 1;
+  const size_t n_worker_threads
+    = std::max<size_t> (thread_pool::g_thread_pool->thread_count (), 1);
   std::vector<gdb::future<void>> results;
 
-  if (parallel_for_each_debug)
-    {
-      debug_printf (_("Parallel for: n_elements: %zu\n"), n_elements);
-      debug_printf (_("Parallel for: minimum elements per thread: %zu\n"), n);
-      debug_printf (_("Parallel for: elts_per_thread: %zu\n"), elts_per_thread);
-    }
+  /* The next item to hand out.  */
+  std::atomic<RandomIt> next = first;
 
-  for (int i = 0; i < count; ++i)
-    {
-      RandomIt end;
-      end = first + elts_per_thread;
-      if (i < elts_left_over)
-	/* Distribute the leftovers over the worker threads, to avoid having
-	   to handle all of them in a single thread.  */
-	end++;
+  /* The worker thread task.
 
-      /* This case means we don't have enough elements to really
-	 distribute them.  Rather than ever submit a task that does
-	 nothing, we short-circuit here.  */
-      if (first == end)
-	end = last;
+     We need to capture args as a tuple, because it's not possible to capture
+     the parameter pack directly in C++17.  Once we migrate to C++20, the
+     capture can be simplified to:
 
-      if (end == last)
+       ... args = std::forward<Args>(args)
+
+     and `args` can be used as-is in the lambda.  */
+  auto args_tuple
+    = std::forward_as_tuple (std::forward<WorkerArgs> (worker_args)...);
+  auto task =
+    [&next, first, last, n_worker_threads, &args_tuple] () {
+      /* Instantiate the user-defined worker.  */
+      auto worker = std::make_from_tuple<Worker> (args_tuple);
+
+      for (;;)
 	{
-	  /* We're about to dispatch the last batch of elements, which
-	     we normally process in the main thread.  So just truncate
-	     the result list here.  This avoids submitting empty tasks
-	     to the thread pool.  */
-	  count = i;
-	  break;
+	  /* Grab a snapshot of NEXT.  */
+	  auto local_next = next.load ();
+	  gdb_assert (local_next <= last);
+
+	  /* Number of remaining items.  */
+	  auto n_remaining = last - local_next;
+	  gdb_assert (n_remaining >= 0);
+
+	  /* Are we done?  */
+	  if (n_remaining == 0)
+	    break;
+
+	  const auto this_batch_size
+	    = std::min<std::size_t> (batch_size, n_remaining);
+
+	  /* The range to process in this iteration.  */
+	  const auto this_batch_first = local_next;
+	  const auto this_batch_last = local_next + this_batch_size;
+
+	  /* Update NEXT.  If the current value of NEXT doesn't match
+	     LOCAL_NEXT, it means another thread updated it concurrently,
+	     restart.  */
+	  if (!next.compare_exchange_weak (local_next, this_batch_last))
+	    continue;
+
+	  if (parallel_for_each_debug)
+	    debug_printf ("Processing %zu items, range [%zu, %zu[\n",
+			  this_batch_size,
+			  static_cast<size_t> (this_batch_first - first),
+			  static_cast<size_t> (this_batch_last - first));
+
+	  worker (this_batch_first, this_batch_last);
 	}
+    };
 
-      if (parallel_for_each_debug)
-	{
-	  debug_printf (_("Parallel for: elements on worker thread %i\t: %zu"),
-			i, (size_t)(end - first));
-	  debug_printf (_("\n"));
-	}
-      results.push_back (gdb::thread_pool::g_thread_pool->post_task ([=] ()
-        {
-	  return callback (first, end);
-	}));
-      first = end;
-    }
-
-  for (int i = count; i < n_worker_threads; ++i)
-    if (parallel_for_each_debug)
-      {
-	debug_printf (_("Parallel for: elements on worker thread %i\t: 0"), i);
-	debug_printf (_("\n"));
-      }
-
-  /* Process all the remaining elements in the main thread.  */
-  if (parallel_for_each_debug)
-    {
-      debug_printf (_("Parallel for: elements on main thread\t\t: %zu"),
-		    (size_t)(last - first));
-      debug_printf (_("\n"));
-    }
-  callback (first, last);
+  /* Start N_WORKER_THREADS tasks.  */
+  for (int i = 0; i < n_worker_threads; ++i)
+    results.push_back (gdb::thread_pool::g_thread_pool->post_task (task));
 
+  /* Wait for all of them to be finished.  */
   for (auto &fut : results)
     fut.get ();
 }
@@ -138,11 +130,11 @@ parallel_for_each (RandomIt first, RandomIt last, RangeFunction callback)
    when debugging multi-threading behavior, and you want to limit
    multi-threading in a fine-grained way.  */
 
-template<class RandomIt, class RangeFunction>
+template<class RandomIt, class Worker, class... WorkerArgs>
 void
-sequential_for_each (RandomIt first, RandomIt last, RangeFunction callback)
+sequential_for_each (RandomIt first, RandomIt last, WorkerArgs &&...worker_args)
 {
-  callback (first, last);
+  Worker (std::forward<WorkerArgs> (worker_args)...) (first, last);
 }
 
 }
-- 
2.49.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
  2025-05-05 20:15 ` [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter Simon Marchi
  2025-05-05 20:15 ` [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each Simon Marchi
@ 2025-05-05 20:15 ` Simon Marchi
  2025-06-13 18:33   ` Tom Tromey
  2025-05-05 20:15 ` [PATCH 5/6] gdbsupport: add async parallel_for_each version Simon Marchi
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-05-05 20:15 UTC (permalink / raw)
  To: gdb-patches; +Cc: Simon Marchi

In preparation for a following patch that will re-use the shared work
queue algorithm, move it to a separate class.

Change-Id: Id05cf8898a5d162048fa8fa056fbf7e0441bfb78
---
 gdbsupport/parallel-for.h | 45 +++++--------------
 gdbsupport/work-queue.h   | 95 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 33 deletions(-)
 create mode 100644 gdbsupport/work-queue.h

diff --git a/gdbsupport/parallel-for.h b/gdbsupport/parallel-for.h
index bb41eb8700f0..c5a15d3235a7 100644
--- a/gdbsupport/parallel-for.h
+++ b/gdbsupport/parallel-for.h
@@ -23,6 +23,7 @@
 #include <algorithm>
 #include <tuple>
 #include "gdbsupport/thread-pool.h"
+#include "gdbsupport/work-queue.h"
 
 namespace gdb
 {
@@ -57,12 +58,8 @@ parallel_for_each (const RandomIt first, const RandomIt last,
       debug_printf ("Parallel for: batch size: %zu\n", batch_size);
     }
 
-  const size_t n_worker_threads
-    = std::max<size_t> (thread_pool::g_thread_pool->thread_count (), 1);
   std::vector<gdb::future<void>> results;
-
-  /* The next item to hand out.  */
-  std::atomic<RandomIt> next = first;
+  work_queue<RandomIt, batch_size> queue (first, last);
 
   /* The worker thread task.
 
@@ -75,49 +72,31 @@ parallel_for_each (const RandomIt first, const RandomIt last,
      and `args` can be used as-is in the lambda.  */
   auto args_tuple
     = std::forward_as_tuple (std::forward<WorkerArgs> (worker_args)...);
-  auto task =
-    [&next, first, last, n_worker_threads, &args_tuple] () {
+  auto task = [&queue, first, &args_tuple] () {
       /* Instantiate the user-defined worker.  */
       auto worker = std::make_from_tuple<Worker> (args_tuple);
 
       for (;;)
 	{
-	  /* Grab a snapshot of NEXT.  */
-	  auto local_next = next.load ();
-	  gdb_assert (local_next <= last);
+	  const auto [batch_first, batch_last] = queue.pop_batch ();
 
-	  /* Number of remaining items.  */
-	  auto n_remaining = last - local_next;
-	  gdb_assert (n_remaining >= 0);
-
-	  /* Are we done?  */
-	  if (n_remaining == 0)
+	  if (batch_first == batch_last)
 	    break;
 
-	  const auto this_batch_size
-	    = std::min<std::size_t> (batch_size, n_remaining);
-
-	  /* The range to process in this iteration.  */
-	  const auto this_batch_first = local_next;
-	  const auto this_batch_last = local_next + this_batch_size;
-
-	  /* Update NEXT.  If the current value of NEXT doesn't match
-	     LOCAL_NEXT, it means another thread updated it concurrently,
-	     restart.  */
-	  if (!next.compare_exchange_weak (local_next, this_batch_last))
-	    continue;
-
 	  if (parallel_for_each_debug)
 	    debug_printf ("Processing %zu items, range [%zu, %zu[\n",
-			  this_batch_size,
-			  static_cast<size_t> (this_batch_first - first),
-			  static_cast<size_t> (this_batch_last - first));
+			  static_cast<std::size_t> (batch_last - batch_first),
+			  static_cast<std::size_t> (batch_first - first),
+			  static_cast<std::size_t> (batch_last - first));
 
-	  worker (this_batch_first, this_batch_last);
+	  worker (batch_first, batch_last);
 	}
     };
 
   /* Start N_WORKER_THREADS tasks.  */
+  const size_t n_worker_threads
+    = std::max<size_t> (thread_pool::g_thread_pool->thread_count (), 1);
+
   for (int i = 0; i < n_worker_threads; ++i)
     results.push_back (gdb::thread_pool::g_thread_pool->post_task (task));
 
diff --git a/gdbsupport/work-queue.h b/gdbsupport/work-queue.h
new file mode 100644
index 000000000000..f84b29a4f523
--- /dev/null
+++ b/gdbsupport/work-queue.h
@@ -0,0 +1,95 @@
+/* Synchronized work queue.
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef GDBSUPPORT_WORK_QUEUE_H
+#define GDBSUPPORT_WORK_QUEUE_H
+
+namespace gdb
+{
+
+/* Implementation of a thread-safe work queue.
+
+   The work items are specified by two iterators of type RandomIt.
+
+   BATCH_SIZE is the number of work items to pop in a batch.  */
+
+template<typename RandomIt, std::size_t batch_size>
+class work_queue
+{
+public:
+  /* The work items are specified by the range `[first, last[`.  */
+  work_queue (RandomIt first, RandomIt last) noexcept
+    : m_next (first),
+      m_last (last)
+  {
+    gdb_assert (first <= last);
+  }
+
+  DISABLE_COPY_AND_ASSIGN (work_queue);
+
+  /* Pop a batch of work items.
+
+     The return value is a pair containing iterators to the first and last
+     (exclusive) work items.  */
+  std::pair<RandomIt, RandomIt> pop_batch () noexcept
+  {
+    for (;;)
+      {
+	/* Grab a snapshot of M_NEXT.  */
+	auto next = m_next.load ();
+	gdb_assert (next <= m_last);
+
+	/* The number of items remaining in the queue.  */
+	const auto n_remaining = static_cast<std::size_t> (m_last - next);
+
+	/* Are we done?  */
+	if (n_remaining == 0)
+	  return std::make_pair (m_last, m_last);
+
+	/* The batch size is proportional to the number of items remaining in
+	   the queue.  We do this to try to stike a balance, avoiding
+	   synchronization overhead when there are many items to process at the
+	   start, and avoiding workload imbalance when there are few items to
+	   process at the end.  */
+	const auto this_batch_size = std::min (batch_size, n_remaining);
+
+	/* The range of items in this batch.  */
+	const auto this_batch_first = next;
+	const auto this_batch_last = next + this_batch_size;
+
+	/* Update M_NEXT.  If the current value of M_NEXT doesn't match NEXT, it
+	   means another thread updated it concurrently, restart.  */
+	if (!m_next.compare_exchange_weak (next, this_batch_last))
+	  continue;
+
+	return std::make_pair (this_batch_first, this_batch_last);
+      }
+  }
+
+private:
+  /* The next work item to hand out.  */
+  std::atomic<RandomIt> m_next;
+
+  /* The end of the work item range.  */
+  RandomIt m_last;
+};
+
+} /* namespace gdb */
+
+#endif /* GDBSUPPORT_WORK_QUEUE_H */
-- 
2.49.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 5/6] gdbsupport: add async parallel_for_each version
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
                   ` (2 preceding siblings ...)
  2025-05-05 20:15 ` [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h Simon Marchi
@ 2025-05-05 20:15 ` Simon Marchi
  2025-06-13 18:39   ` Tom Tromey
  2025-05-05 20:15 ` [PATCH 6/6] gdb/dwarf: use dynamic partitioning for DWARF CU indexing Simon Marchi
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-05-05 20:15 UTC (permalink / raw)
  To: gdb-patches; +Cc: Simon Marchi

From: Simon Marchi <simon.marchi@polymtl.ca>

I would like to use gdb::parallel_for_each to implement the parallelism
of the DWARF unit indexing.  However, the existing implementation of
gdb::parallel_for_each is blocking, which doesn't work with the model
used by the DWARF indexer, which is asynchronous and callback-based.
Add an asynchronouys version of gdb::parallel_for_each that will be
suitable for this task.

This new version accepts a callback that is invoked when the parallel
for each is complete.

This function uses the same strategy as gdb::task_group to invoke the
"done" callback: worker threads have a shared_ptr reference to some
object.  The last worker thread to drop its reference causes the object
to be deleted, which invokes the callback.

Unlike for the sync version of gdb::parallel_for_each, it's not possible
to keep any state in the calling thread's stack, because that disappears
immediately after starting the workers.  So all the state is kept in
that same shared object.

There is a limitation that the sync version doesn't have, regarding the
arguments you can pass to the worker objects: it's not possibly to rely
on references.  There are more details in a comment in the code.

It would be possible to implement the sync version of
gdb::parallel_for_each on top of the async version, but I decided not to
do it to avoid the unnecessary dynamic allocation of the shared object,
and to avoid adding the limitations on passing references I mentioned
just above.  But if we judge that it would be an acceptable cost to
avoid the duplication, we could do it.

Add a self test for the new function.

Change-Id: I6173defb1e09856d137c1aa05ad51cbf521ea0b0
---
 gdb/unittests/parallel-for-selftests.c |  23 ++++
 gdbsupport/parallel-for.h              | 141 ++++++++++++++++++++++++-
 2 files changed, 159 insertions(+), 5 deletions(-)

diff --git a/gdb/unittests/parallel-for-selftests.c b/gdb/unittests/parallel-for-selftests.c
index 6b6b557ba99e..2f9313e32858 100644
--- a/gdb/unittests/parallel-for-selftests.c
+++ b/gdb/unittests/parallel-for-selftests.c
@@ -109,8 +109,31 @@ test_parallel_for_each ()
 
   const std::vector<do_foreach_t> for_each_functions
     {
+      /* Test gdb::parallel_for_each.  */
       [] (int start, int end, foreach_callback_t callback)
       { gdb::parallel_for_each<1, int, test_worker> (start, end, callback); },
+
+      /* Test gdb::parallel_for_each_async.  */
+      [] (int start, int end, foreach_callback_t callback)
+      {
+	bool done_flag = false;
+	std::condition_variable cv;
+	std::mutex mtx;
+
+	gdb::parallel_for_each_async<1, int, test_worker> (start, end,
+	  [&mtx, &done_flag, &cv] ()
+	    {
+	      std::lock_guard<std::mutex> lock (mtx);
+	      done_flag = true;
+	      cv.notify_one();
+	    },  callback);
+
+	/* Wait for the async parallel-for to complete.  */
+	std::unique_lock<std::mutex> lock (mtx);
+	cv.wait (lock, [&done_flag] () { return done_flag; });
+      },
+
+      /* Test gdb::sequential_for_each.  */
       [] (int start, int end, foreach_callback_t callback)
       { gdb::sequential_for_each<int, test_worker> (start, end, callback); },
     };
diff --git a/gdbsupport/parallel-for.h b/gdbsupport/parallel-for.h
index c5a15d3235a7..090eeaaec672 100644
--- a/gdbsupport/parallel-for.h
+++ b/gdbsupport/parallel-for.h
@@ -25,6 +25,10 @@
 #include "gdbsupport/thread-pool.h"
 #include "gdbsupport/work-queue.h"
 
+/* If enabled, print debug info about the inner workings of the parallel for
+   each functions.  */
+const bool parallel_for_each_debug = false;
+
 namespace gdb
 {
 
@@ -37,7 +41,10 @@ namespace gdb
    thread state.
 
    Worker threads call Worker::operator() repeatedly until the queue is
-   empty.  */
+   empty.
+
+   This function is synchronous, meaning that it blocks and returns once the
+   processing is complete.  */
 
 template<std::size_t batch_size, class RandomIt, class Worker,
 	 class... WorkerArgs>
@@ -45,10 +52,6 @@ void
 parallel_for_each (const RandomIt first, const RandomIt last,
 		   WorkerArgs &&...worker_args)
 {
-  /* If enabled, print debug info about how the work is distributed across
-     the threads.  */
-  const bool parallel_for_each_debug = false;
-
   gdb_assert (first <= last);
 
   if (parallel_for_each_debug)
@@ -116,6 +119,134 @@ sequential_for_each (RandomIt first, RandomIt last, WorkerArgs &&...worker_args)
   Worker (std::forward<WorkerArgs> (worker_args)...) (first, last);
 }
 
+namespace detail
+{
+
+/* Type to hold the state shared between threads of
+   gdb::parallel_for_each_async.  */
+
+template<std::size_t min_batch_size, typename RandomIt, typename... WorkerArgs>
+struct pfea_state
+{
+  pfea_state (RandomIt first, RandomIt last, std::function<void ()> &&done,
+	      WorkerArgs &&...worker_args)
+    : first (first),
+      last (last),
+      worker_args_tuple (std::forward_as_tuple<WorkerArgs>
+			 (std::forward<WorkerArgs> (worker_args))...),
+      queue (first, last),
+      m_done (std::move (done))
+  {}
+
+  DISABLE_COPY_AND_ASSIGN (pfea_state);
+
+  /* This gets called by the last worker thread that drops its reference on
+     the shared state, thus when the processing is complete.  */
+  ~pfea_state ()
+  {
+    if (m_done)
+      m_done ();
+  }
+
+  /* The interval to process.  */
+  const RandomIt first, last;
+
+  /* Tuple of arguments to pass when constructing the user's worker object.
+
+     Use std::decay_t to avoid storing references to the caller's local
+     variables.  If we didn't use it and the caller passed an lvalue `foo *`,
+     we would store it as a reference to `foo *`, thus storing a reference to
+     the caller's local variable.
+
+     The downside is that it's not possible to pass arguments by reference,
+     callers need to pass pointers or std::reference_wrappers.  */
+  std::tuple<std::decay_t<WorkerArgs>...> worker_args_tuple;
+
+  /* Work queue that worker threads pull work items from.  */
+  work_queue<RandomIt, min_batch_size> queue;
+
+private:
+  /* Callable called when the parallel-for is done.  */
+  std::function<void ()> m_done;
+};
+
+} /* namespace detail */
+
+/* A "parallel-for" implementation using a shared work queue.  Work items get
+   popped in batches from the queue and handed out to worker threads.
+
+   Batch sizes are proportional to the number of remaining items in the queue,
+   but always greater or equal to MIN_BATCH_SIZE.
+
+   The DONE callback is invoked when processing is done.
+
+   Each worker thread instantiates an object of type Worker, forwarding ARGS to
+   its constructor.  The Worker object can be used to keep some per-worker
+   thread state.  This version does not support passing references as arguments
+   to the worker.  Use std::reference_wrapper or pointers instead.
+
+   Worker threads call Worker::operator() repeatedly until the queue is
+   empty.
+
+   This function is asynchronous.  An arbitrary worker thread will call the DONE
+   callback when processing is done.  */
+
+template<std::size_t min_batch_size, class RandomIt, class Worker,
+	 class... WorkerArgs>
+void
+parallel_for_each_async (const RandomIt first, const RandomIt last,
+			 std::function<void ()> &&done,
+			 WorkerArgs &&...worker_args)
+{
+  gdb_assert (first <= last);
+
+  if (parallel_for_each_debug)
+    {
+      debug_printf ("Parallel for: n elements: %zu\n",
+		    static_cast<std::size_t> (last - first));
+      debug_printf ("Parallel for: min batch size: %zu\n", min_batch_size);
+    }
+
+  const size_t n_worker_threads
+    = std::max<size_t> (thread_pool::g_thread_pool->thread_count (), 1);
+
+  /* The state shared between all worker threads.  All worker threads get a
+     reference on the shared pointer through the lambda below.  The last worker
+     thread to drop its reference will cause this object to be destroyed, which
+     will call the DONE callback.  */
+  using state_t = detail::pfea_state<min_batch_size, RandomIt, WorkerArgs...>;
+  auto state
+    = std::make_shared<state_t> (first, last, std::move (done),
+				 std::forward<WorkerArgs> (worker_args)...);
+
+  /* The worker thread task.  */
+  auto task = [state] ()
+    {
+      /* Instantiate the user-defined worker.  */
+      auto worker = std::make_from_tuple<Worker> (state->worker_args_tuple);
+
+      for (;;)
+	{
+	  const auto [batch_first, batch_last] = state->queue.pop_batch ();
+
+	  if (batch_first == batch_last)
+	    break;
+
+	  if (parallel_for_each_debug)
+	    debug_printf ("Processing %zu items, range [%zu, %zu[\n",
+			  static_cast<std::size_t> (batch_last - batch_first),
+			  static_cast<std::size_t> (batch_first - state->first),
+			  static_cast<std::size_t> (batch_last - state->first));
+
+	  worker (batch_first, batch_last);
+	}
+    };
+
+  /* Start N_WORKER_THREADS tasks.  */
+  for (int i = 0; i < n_worker_threads; ++i)
+    gdb::thread_pool::g_thread_pool->post_task (task);
+}
+
 }
 
 #endif /* GDBSUPPORT_PARALLEL_FOR_H */
-- 
2.49.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 6/6] gdb/dwarf: use dynamic partitioning for DWARF CU indexing
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
                   ` (3 preceding siblings ...)
  2025-05-05 20:15 ` [PATCH 5/6] gdbsupport: add async parallel_for_each version Simon Marchi
@ 2025-05-05 20:15 ` Simon Marchi
  2025-05-27 14:44 ` [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
  2025-06-13 17:48 ` Tom Tromey
  6 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-05-05 20:15 UTC (permalink / raw)
  To: gdb-patches; +Cc: Simon Marchi

From: Simon Marchi <simon.marchi@polymtl.ca>

The DWARF indexer splits the work statically based on the unit sizes,
attempting to give each worker thread about the same amount of bytes to
process.  This works relatively well with standard compilation.  For
instance, I see this when loading a big program:

    Time for "DWARF indexing worker": wall 0.503, user 0.419, sys 0.077, user+sys 0.496, 98.6 % CPU
    Time for "DWARF indexing worker": wall 0.540, user 0.450, sys 0.081, user+sys 0.531, 98.3 % CPU
    Time for "DWARF indexing worker": wall 0.573, user 0.480, sys 0.087, user+sys 0.567, 99.0 % CPU
    Time for "DWARF indexing worker": wall 0.577, user 0.480, sys 0.090, user+sys 0.570, 98.8 % CPU
    Time for "DWARF indexing worker": wall 0.580, user 0.473, sys 0.097, user+sys 0.570, 98.3 % CPU
    Time for "DWARF indexing worker": wall 0.587, user 0.487, sys 0.082, user+sys 0.569, 96.9 % CPU
    Time for "DWARF indexing worker": wall 0.592, user 0.496, sys 0.081, user+sys 0.577, 97.5 % CPU
    Time for "DWARF indexing worker": wall 0.599, user 0.499, sys 0.086, user+sys 0.585, 97.7 % CPU

The "wall" times of all threads are relatively close, meaning the
workload balance is not too bad.  However, when compiling with DWO files
(-gsplit-dwarf), it's not as good:

    Time for "DWARF indexing worker": wall 0.201, user 0.125, sys 0.053, user+sys 0.178, 88.6 % CPU
    Time for "DWARF indexing worker": wall 0.232, user 0.158, sys 0.056, user+sys 0.214, 92.2 % CPU
    Time for "DWARF indexing worker": wall 0.413, user 0.311, sys 0.081, user+sys 0.392, 94.9 % CPU
    Time for "DWARF indexing worker": wall 0.523, user 0.400, sys 0.101, user+sys 0.501, 95.8 % CPU
    Time for "DWARF indexing worker": wall 0.669, user 0.526, sys 0.125, user+sys 0.651, 97.3 % CPU
    Time for "DWARF indexing worker": wall 0.860, user 0.699, sys 0.138, user+sys 0.837, 97.3 % CPU
    Time for "DWARF indexing worker": wall 0.968, user 0.782, sys 0.169, user+sys 0.951, 98.2 % CPU
    Time for "DWARF indexing worker": wall 1.107, user 0.916, sys 0.166, user+sys 1.082, 97.7 % CPU

Note how the wall times vary from 0.2 to 1.1 seconds.  This is
undesirable, because the time to do that indexing step takes as long as
the slowest worker thread takes.

With DWO files, the split of the workload by size doesn't work, because
it is done using the size of the skeleton units in the main file, which
is not representative of how much DWARF is contained in each DWO file.

I haven't tried it, but a similar problem could occur with cross-unit
imports, which can happen with dwz or LTO.  You could have a small unit
that imports a lot from other units, in which case the size of the units
is not representative of the work to accomplish.

To try to improve this situation, change the DWARF indexer to use
dynamic partitioning, using gdb::parallel_for_each_async.  With this,
each worker thread pops one unit at a time from a shared work queue to
process it, until the queue is empty.  That should result in a more
balance workload split.  I chose 1 as the minimum batch size here,
because I judged that indexing one CU was a big enough piece of work
compared to the synchronization overhead of the queue.  That can always
be tweaked later if someone wants to do more tests.

With the standard compilation case, the balance is a bit better, but the
the time is similar as before:

    Time for "DWARF indexing worker": wall 0.571, user 0.490, sys 0.071, user+sys 0.561, 98.2 % CPU
    Time for "DWARF indexing worker": wall 0.571, user 0.463, sys 0.097, user+sys 0.560, 98.1 % CPU
    Time for "DWARF indexing worker": wall 0.571, user 0.498, sys 0.066, user+sys 0.564, 98.8 % CPU
    Time for "DWARF indexing worker": wall 0.572, user 0.473, sys 0.087, user+sys 0.560, 97.9 % CPU
    Time for "DWARF indexing worker": wall 0.569, user 0.477, sys 0.084, user+sys 0.561, 98.6 % CPU
    Time for "DWARF indexing worker": wall 0.572, user 0.476, sys 0.079, user+sys 0.555, 97.0 % CPU
    Time for "DWARF indexing worker": wall 0.572, user 0.482, sys 0.079, user+sys 0.561, 98.1 % CPU
    Time for "DWARF indexing worker": wall 0.582, user 0.484, sys 0.083, user+sys 0.567, 97.4 % CPU

However, for the DWO case, it's much better:

    Time for "DWARF indexing worker": wall 0.686, user 0.496, sys 0.108, user+sys 0.604, 88.0 % CPU
    Time for "DWARF indexing worker": wall 0.686, user 0.513, sys 0.094, user+sys 0.607, 88.5 % CPU
    Time for "DWARF indexing worker": wall 0.686, user 0.480, sys 0.120, user+sys 0.600, 87.5 % CPU
    Time for "DWARF indexing worker": wall 0.686, user 0.500, sys 0.117, user+sys 0.617, 89.9 % CPU
    Time for "DWARF indexing worker": wall 0.687, user 0.476, sys 0.133, user+sys 0.609, 88.6 % CPU
    Time for "DWARF indexing worker": wall 0.687, user 0.509, sys 0.104, user+sys 0.613, 89.2 % CPU
    Time for "DWARF indexing worker": wall 0.687, user 0.507, sys 0.109, user+sys 0.616, 89.7 % CPU
    Time for "DWARF indexing worker": wall 0.688, user 0.497, sys 0.104, user+sys 0.601, 87.4 % CPU

The state previously kept in as local variables in
cooked_index_worker_debug_info::process_units becomes fields of the new
parallel worker object.

The work previously done for each unit in
cooked_index_worker_debug_info::process_units becomes the operator() of
the new parallel owrker object.

The work previously done at the end of
cooked_index_worker_debug_info::process_units (including calling
bfd_thread_cleanup) becomes the destructor of the new parallel worker
object.

The "done" callback of gdb::task_group becomes the "done" callback of
gdb::parallel_for_each_async.

Change-Id: I5dc5cf8793abe9ebe2659e78da38ffc94289e5f2
---
 gdb/dwarf2/cooked-index-worker.h |   6 ++
 gdb/dwarf2/read.c                | 136 +++++++++++--------------------
 2 files changed, 54 insertions(+), 88 deletions(-)

diff --git a/gdb/dwarf2/cooked-index-worker.h b/gdb/dwarf2/cooked-index-worker.h
index df5c31d572df..0d8932503ab9 100644
--- a/gdb/dwarf2/cooked-index-worker.h
+++ b/gdb/dwarf2/cooked-index-worker.h
@@ -268,8 +268,14 @@ class cooked_index_worker
 
   /* The per-objfile object.  */
   dwarf2_per_objfile *m_per_objfile;
+
   /* Result of each worker task.  */
   std::vector<cooked_index_worker_result> m_results;
+
+  /* Mutex to synchronize access to M_RESULTS when workers append their
+     result.  */
+  std::mutex m_results_mutex;
+
   /* Any warnings emitted.  For the time being at least, this only
      needed in do_reading, not in every worker.  Note that
      deferred_warnings uses gdb_stderr in its constructor, and this
diff --git a/gdb/dwarf2/read.c b/gdb/dwarf2/read.c
index f90b22781e0d..efba56ff1ec0 100644
--- a/gdb/dwarf2/read.c
+++ b/gdb/dwarf2/read.c
@@ -51,6 +51,7 @@
 #include "elf-bfd.h"
 #include "event-top.h"
 #include "exceptions.h"
+#include "gdbsupport/parallel-for.h"
 #include "gdbsupport/task-group.h"
 #include "maint.h"
 #include "symtab.h"
@@ -3244,12 +3245,6 @@ class cooked_index_worker_debug_info : public cooked_index_worker
   /* An iterator for the comp units.  */
   using unit_iterator = std::vector<dwarf2_per_cu_up>::iterator;
 
-  /* Process a batch of CUs.  This may be called multiple times in
-     separate threads.  TASK_NUMBER indicates which task this is --
-     the result is stored in that slot of M_RESULTS.  */
-  void process_units (size_t task_number, unit_iterator first,
-		      unit_iterator end);
-
   /* Process unit THIS_CU.  */
   void process_unit (dwarf2_per_cu *this_cu, dwarf2_per_objfile *per_objfile,
 		     cooked_index_worker_result *storage);
@@ -3467,36 +3462,6 @@ cooked_index_worker_debug_info::process_skeletonless_type_units
 	process_skeletonless_type_unit (unit, per_objfile, storage);
 }
 
-void
-cooked_index_worker_debug_info::process_units (size_t task_number,
-					       unit_iterator first,
-					       unit_iterator end)
-{
-  SCOPE_EXIT { bfd_thread_cleanup (); };
-
-  /* Ensure that complaints are handled correctly.  */
-  complaint_interceptor complaint_handler;
-
-  std::vector<gdb_exception> errors;
-  cooked_index_worker_result thread_storage;
-  for (auto inner = first; inner != end; ++inner)
-    {
-      dwarf2_per_cu *per_cu = inner->get ();
-
-      try
-	{
-	  process_unit (per_cu, m_per_objfile, &thread_storage);
-	}
-      catch (gdb_exception &except)
-	{
-	  thread_storage.note_error (std::move (except));
-	}
-    }
-
-  thread_storage.done_reading (complaint_handler.release ());
-  m_results[task_number] = std::move (thread_storage);
-}
-
 void
 cooked_index_worker_debug_info::done_reading ()
 {
@@ -3522,62 +3487,57 @@ cooked_index_worker_debug_info::do_reading ()
 			       m_index_storage.get_addrmap (),
 			       &m_warnings);
 
-  /* We want to balance the load between the worker threads.  This is
-     done by using the size of each CU as a rough estimate of how
-     difficult it will be to operate on.  This isn't ideal -- for
-     example if dwz is used, the early CUs will all tend to be
-     "included" and won't be parsed independently.  However, this
-     heuristic works well for typical compiler output.  */
-
-  size_t total_size = 0;
-  for (const auto &per_cu : per_bfd->all_units)
-    total_size += per_cu->length ();
-
-  /* How many worker threads we plan to use.  We may not actually use
-     this many.  We use 1 as the minimum to avoid division by zero,
-     and anyway in the N==0 case the work will be done
-     synchronously.  */
-  const size_t n_worker_threads
-    = std::max (gdb::thread_pool::g_thread_pool->thread_count (), (size_t) 1);
-
-  /* How much effort should be put into each worker.  */
-  const size_t size_per_thread
-    = std::max (total_size / n_worker_threads, (size_t) 1);
-
-  /* Work is done in a task group.  */
-  gdb::task_group workers ([this] ()
+  /* The task for prallel workers that index units.  */
+  struct parallel_indexing_worker
   {
-    this->done_reading ();
-  });
-
-  auto end = per_bfd->all_units.end ();
-  size_t task_count = 0;
-  for (auto iter = per_bfd->all_units.begin (); iter != end; )
+    explicit parallel_indexing_worker (cooked_index_worker_debug_info *parent)
+      : m_scoped_time_it ("DWARF indexing worker"),
+	m_parent (parent)
     {
-      auto last = iter;
-      /* Put all remaining CUs into the last task.  */
-      if (task_count == n_worker_threads - 1)
-	last = end;
-      else
-	{
-	  size_t chunk_size = 0;
-	  for (; last != end && chunk_size < size_per_thread; ++last)
-	    chunk_size += (*last)->length ();
-	}
-
-      gdb_assert (iter != last);
-      workers.add_task ([this, task_count, iter, last] ()
-	{
-	  scoped_time_it time_it ("DWARF indexing worker");
-	  process_units (task_count, iter, last);
-	});
-
-      ++task_count;
-      iter = last;
     }
 
-  m_results.resize (task_count);
-  workers.start ();
+    DISABLE_COPY_AND_ASSIGN (parallel_indexing_worker);
+
+    ~parallel_indexing_worker ()
+    {
+      bfd_thread_cleanup ();
+
+      m_thread_storage.done_reading (m_complaint_handler.release ());
+
+      std::lock_guard<std::mutex> lock (m_parent->m_results_mutex);
+      m_parent->m_results.emplace_back (std::move (m_thread_storage));
+    }
+
+    void operator() (dwarf2_per_cu_up *first, dwarf2_per_cu_up *last)
+    {
+      for (auto &it : iterator_range (first, last))
+	{
+	  try
+	    {
+	      m_parent->process_unit (it.get (), m_parent->m_per_objfile,
+				      &m_thread_storage);
+	    }
+	  catch (gdb_exception &except)
+	    {
+	      m_thread_storage.note_error (std::move (except));
+	    }
+	}
+    }
+
+  private:
+    scoped_time_it m_scoped_time_it;
+
+    complaint_interceptor m_complaint_handler;
+    std::vector<gdb_exception> m_errors;
+    cooked_index_worker_result m_thread_storage;
+
+    cooked_index_worker_debug_info *m_parent;
+  };
+
+  gdb::parallel_for_each_async<1, dwarf2_per_cu_up *, parallel_indexing_worker>
+    (per_bfd->all_units.data (),
+     per_bfd->all_units.data () + per_bfd->all_units.size (),
+     [this] () { this->done_reading (); }, this);
 }
 
 static void
-- 
2.49.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/6] gdb: re-work parallel-for-selftests.c
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
                   ` (4 preceding siblings ...)
  2025-05-05 20:15 ` [PATCH 6/6] gdb/dwarf: use dynamic partitioning for DWARF CU indexing Simon Marchi
@ 2025-05-27 14:44 ` Simon Marchi
  2025-06-13 17:48 ` Tom Tromey
  6 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-05-27 14:44 UTC (permalink / raw)
  To: Simon Marchi, gdb-patches



On 2025-05-05 16:15, Simon Marchi wrote:
> From: Simon Marchi <simon.marchi@polymtl.ca>
> 
> I find this file difficult to work with and modify, due to how it uses
> the preprocessor to include itself, to generate variations of the test
> functions.  Change it to something a bit more C++-y, with a test
> function that accepts a callback to invoke the foreach function under
> test.
> 
> Change-Id: Ibf1e2907380a88a4f8e4b4b88df2b0dfd0e9b6c8

Ping.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/6] gdb: re-work parallel-for-selftests.c
  2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
                   ` (5 preceding siblings ...)
  2025-05-27 14:44 ` [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
@ 2025-06-13 17:48 ` Tom Tromey
  2025-06-13 18:38   ` Simon Marchi
  6 siblings, 1 reply; 22+ messages in thread
From: Tom Tromey @ 2025-06-13 17:48 UTC (permalink / raw)
  To: Simon Marchi; +Cc: gdb-patches, Simon Marchi

>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:

Simon> From: Simon Marchi <simon.marchi@polymtl.ca>
Simon> I find this file difficult to work with and modify, due to how it uses
Simon> the preprocessor to include itself, to generate variations of the test
Simon> functions.  Change it to something a bit more C++-y, with a test
Simon> function that accepts a callback to invoke the foreach function under
Simon> test.

Ok.
Approved-By: Tom Tromey <tom@tromey.com>

Tom

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter
  2025-05-05 20:15 ` [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter Simon Marchi
@ 2025-06-13 17:55   ` Tom Tromey
  2025-06-13 18:43     ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Tromey @ 2025-06-13 17:55 UTC (permalink / raw)
  To: Simon Marchi; +Cc: gdb-patches, Simon Marchi

>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:

Simon> From: Simon Marchi <simon.marchi@polymtl.ca>
Simon> This value will likely never change at runtime, so we might as well make
Simon> it a template parameter.  This has the "advantage" of being able to
Simon> remove the unnecessary param from gdb::sequential_for_each.

I don't think this really adds much, but also there's no good reason to
object to it.
Approved-By: Tom Tromey <tom@tromey.com>

Tom

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-05-05 20:15 ` [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each Simon Marchi
@ 2025-06-13 18:29   ` Tom Tromey
  2025-06-13 19:22     ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Tromey @ 2025-06-13 18:29 UTC (permalink / raw)
  To: Simon Marchi; +Cc: gdb-patches, Simon Marchi

>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:

Simon>  - I want to still be able to use scoped_time_it to measure the time
Simon>    that each worker thread spent working on the task.  I find it really
Simon>    handy when measuring the performance impact of changes.

Simon>    Unfortunately, the current interface of gdb::parallel_for_each, which
Simon>    receives a simple callback, is not well-suited for that, once I
Simon>    introduce the dynamic partitioning.  The callback would get called
Simon>    once for each work item batch (multiple time for each worker thread),
Simon>    so it's not possible to maintain a per-worker thread object for the
Simon>    duration of the parallel for.

Simon>    To allow this, I changed gdb::parallel_for_each to receive a worker
Simon>    type as a template parameter.  Each worker thread creates one local
Simon>    instance of that type, and calls operator() on it for each work item
Simon>    batch.  By having a scoped_time_it object as a field of that worker,
Simon>    we can get the timings per worker thread.

Simon>    The drawbacks of this approach is that we must now define the
Simon>    parallel task in a separate class and manually capture any context we
Simon>    need as fields of that class.

This seems like a real step backward to me.  It's basically replicating
lambdas by hand.

Why can't parallel_for_each just do this itself?  It could take an
argument naming the task to pass to the scoped_time_it constructor.  The
'task' lambda could instantiate the timer.

Simon> +  /* The next item to hand out.  */
Simon> +  std::atomic<RandomIt> next = first;

This is maybe a bit worrying since IIUC std::atomic only works for
trivially copyable types.  But this doesn't have to be a general purpose
facility, it only has to work for gdb's needs.

Tom

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h
  2025-05-05 20:15 ` [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h Simon Marchi
@ 2025-06-13 18:33   ` Tom Tromey
  2025-06-13 19:24     ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Tromey @ 2025-06-13 18:33 UTC (permalink / raw)
  To: Simon Marchi; +Cc: gdb-patches

>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:

Simon> In preparation for a following patch that will re-use the shared work
Simon> queue algorithm, move it to a separate class.

Patch 3 said:

+parallel_for_each (const RandomIt first, const RandomIt last,
+		   WorkerArgs &&...worker_args)

Simon> +  /* The work items are specified by the range `[first, last[`.  */
Simon> +  work_queue (RandomIt first, RandomIt last) noexcept
Simon> +    : m_next (first),
Simon> +      m_last (last)

So I was wondering why not 'const RandomIt' here.
And then 'const RandomIt m_last;'

Tom

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/6] gdb: re-work parallel-for-selftests.c
  2025-06-13 17:48 ` Tom Tromey
@ 2025-06-13 18:38   ` Simon Marchi
  0 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 18:38 UTC (permalink / raw)
  To: Tom Tromey, Simon Marchi; +Cc: gdb-patches

On 6/13/25 1:48 PM, Tom Tromey wrote:
>>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:
> 
> Simon> From: Simon Marchi <simon.marchi@polymtl.ca>
> Simon> I find this file difficult to work with and modify, due to how it uses
> Simon> the preprocessor to include itself, to generate variations of the test
> Simon> functions.  Change it to something a bit more C++-y, with a test
> Simon> function that accepts a callback to invoke the foreach function under
> Simon> test.
> 
> Ok.
> Approved-By: Tom Tromey <tom@tromey.com>
> 
> Tom

Thanks, I push this patch.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/6] gdbsupport: add async parallel_for_each version
  2025-05-05 20:15 ` [PATCH 5/6] gdbsupport: add async parallel_for_each version Simon Marchi
@ 2025-06-13 18:39   ` Tom Tromey
  2025-06-13 19:29     ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Tromey @ 2025-06-13 18:39 UTC (permalink / raw)
  To: Simon Marchi; +Cc: gdb-patches, Simon Marchi

>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:

Simon> +/* If enabled, print debug info about the inner workings of the parallel for
Simon> +   each functions.  */
Simon> +const bool parallel_for_each_debug = false;

Defining a global in a header makes me itch a little.
I feel like we've had linker problems with this somewhere?
Anyway couldn't this be constexpr?

Tom

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter
  2025-06-13 17:55   ` Tom Tromey
@ 2025-06-13 18:43     ` Simon Marchi
  0 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 18:43 UTC (permalink / raw)
  To: Tom Tromey; +Cc: gdb-patches, Simon Marchi

On 6/13/25 1:55 PM, Tom Tromey wrote:
>>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:
> 
> Simon> From: Simon Marchi <simon.marchi@polymtl.ca>
> Simon> This value will likely never change at runtime, so we might as well make
> Simon> it a template parameter.  This has the "advantage" of being able to
> Simon> remove the unnecessary param from gdb::sequential_for_each.
> 
> I don't think this really adds much, but also there's no good reason to
> object to it.
> Approved-By: Tom Tromey <tom@tromey.com>
> 
> Tom

Yeah, it doesn't change much, but it doesn't hurt.  I pushed this one
too.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-06-13 18:29   ` Tom Tromey
@ 2025-06-13 19:22     ` Simon Marchi
  2025-06-13 19:56       ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 19:22 UTC (permalink / raw)
  To: Tom Tromey, Simon Marchi; +Cc: gdb-patches

On 6/13/25 2:29 PM, Tom Tromey wrote:
>>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:
> 
> Simon>  - I want to still be able to use scoped_time_it to measure the time
> Simon>    that each worker thread spent working on the task.  I find it really
> Simon>    handy when measuring the performance impact of changes.
> 
> Simon>    Unfortunately, the current interface of gdb::parallel_for_each, which
> Simon>    receives a simple callback, is not well-suited for that, once I
> Simon>    introduce the dynamic partitioning.  The callback would get called
> Simon>    once for each work item batch (multiple time for each worker thread),
> Simon>    so it's not possible to maintain a per-worker thread object for the
> Simon>    duration of the parallel for.
> 
> Simon>    To allow this, I changed gdb::parallel_for_each to receive a worker
> Simon>    type as a template parameter.  Each worker thread creates one local
> Simon>    instance of that type, and calls operator() on it for each work item
> Simon>    batch.  By having a scoped_time_it object as a field of that worker,
> Simon>    we can get the timings per worker thread.
> 
> Simon>    The drawbacks of this approach is that we must now define the
> Simon>    parallel task in a separate class and manually capture any context we
> Simon>    need as fields of that class.
> 
> This seems like a real step backward to me.  It's basically replicating
> lambdas by hand.
> 
> Why can't parallel_for_each just do this itself?  It could take an
> argument naming the task to pass to the scoped_time_it constructor.  The
> 'task' lambda could instantiate the timer.

This would require scoped_time_it to be in gdbsupport.  For the moment,
scoped_time_it is defined in gdb, because it was easier to do so at the
time, but it could be moved to gdbsupport.

In the patch that makes the DWARF reader use
gdb::parallel_for_each_async, we keep more per-worker state, the
`cooked_index_worker_result` where each worker threads puts what it
computes.  We would need a way to make that work with the lambda.

One way to do this would be to pass to the lambda the index (0 to N-1)
of the worker thread, and the lambda could use it to index into the
vector of cooked_index_worker_result, which would have been prepared by
the caller of gdb::parallel_for_each_async.

The last detail would be: how do to some one-time work at the end of a
parallel for-each worker's life.  For example, in the DWARF indexer,
each worker does:

 - call bfd_thread_cleanup
 - move the complaints to the thread storage
 - move the thread storage to the results vector

How could this work, given that the lambda is invoked many times for
each worker?

We could make gdb::parallel_for_each accept a second lambda, "finalize",
which would be called at the end of a worker's life.

Another idea I had (but didn't try) was to make lambdas receive a
"magic" range object, like athis:

  [&] (gdb::dynamic_range<dwarf2_per_cu *> range)
    {
      for (dwarf2_per_cu *cu : range)
        process_unit (cu);
    }

"range" would get work items from the work queue in batches, but yield
one at a time.  It would reach its end when the work queue is empty.
It will be a bit of work to implement, but it would have the advantage
that any per-worker state could be right there in the lambda, like we
have today.

Any thoughts?

> Simon> +  /* The next item to hand out.  */
> Simon> +  std::atomic<RandomIt> next = first;
> 
> This is maybe a bit worrying since IIUC std::atomic only works for
> trivially copyable types.  But this doesn't have to be a general purpose
> facility, it only has to work for gdb's needs.

Yes, I hit this when trying to do:

  gdb::parallel_for_each (vector.begin (), vector.end (), ...);

By default, a vector iterator is small and trivial.  But when building
with -D_GLIBCXX_DEBUG, then it's no longer trivially copyable.  That's
why I used the less ideal:

  gdb::parallel_for_each (vector.data (),
                          vector.data () + vector.size (), ...);

If you have a better idea to make this work, let me know.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h
  2025-06-13 18:33   ` Tom Tromey
@ 2025-06-13 19:24     ` Simon Marchi
  0 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 19:24 UTC (permalink / raw)
  To: Tom Tromey, Simon Marchi; +Cc: gdb-patches

On 6/13/25 2:33 PM, Tom Tromey wrote:
>>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:
> 
> Simon> In preparation for a following patch that will re-use the shared work
> Simon> queue algorithm, move it to a separate class.
> 
> Patch 3 said:
> 
> +parallel_for_each (const RandomIt first, const RandomIt last,
> +		   WorkerArgs &&...worker_args)
> 
> Simon> +  /* The work items are specified by the range `[first, last[`.  */
> Simon> +  work_queue (RandomIt first, RandomIt last) noexcept
> Simon> +    : m_next (first),
> Simon> +      m_last (last)
> 
> So I was wondering why not 'const RandomIt' here.
> And then 'const RandomIt m_last;'

Hmm, probably an oversight.  It works when I add the consts, so I'll
update this patch.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/6] gdbsupport: add async parallel_for_each version
  2025-06-13 18:39   ` Tom Tromey
@ 2025-06-13 19:29     ` Simon Marchi
  0 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 19:29 UTC (permalink / raw)
  To: Tom Tromey, Simon Marchi; +Cc: gdb-patches

On 6/13/25 2:39 PM, Tom Tromey wrote:
>>>>>> "Simon" == Simon Marchi <simon.marchi@efficios.com> writes:
> 
> Simon> +/* If enabled, print debug info about the inner workings of the parallel for
> Simon> +   each functions.  */
> Simon> +const bool parallel_for_each_debug = false;
> 
> Defining a global in a header makes me itch a little.
> I feel like we've had linker problems with this somewhere?
> Anyway couldn't this be constexpr?

I don't recall.  Doesn't the linker choose just one version of the
variable, ODR-style?

I can certainly make it constexpr, if that helps.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-06-13 19:22     ` Simon Marchi
@ 2025-06-13 19:56       ` Simon Marchi
  2025-06-13 20:12         ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 19:56 UTC (permalink / raw)
  To: Simon Marchi, Tom Tromey, Simon Marchi; +Cc: gdb-patches

On 6/13/25 3:22 PM, Simon Marchi wrote:
> Another idea I had (but didn't try) was to make lambdas receive a
> "magic" range object, like athis:
> 
>   [&] (gdb::dynamic_range<dwarf2_per_cu *> range)
>     {
>       for (dwarf2_per_cu *cu : range)
>         process_unit (cu);
>     }
> 
> "range" would get work items from the work queue in batches, but yield
> one at a time.  It would reach its end when the work queue is empty.
> It will be a bit of work to implement, but it would have the advantage
> that any per-worker state could be right there in the lambda, like we
> have today.

Ah, I remembered why I didn't do this.  It wouldn't play well with how
minimal_symbol_reader::install() currently uses gdb::parallel_for_each.
Currently, it receives one contiguous range [start,end).  After having
computed the demangled names and hashes for the whole range, each worker
locks a mutex and installs the names in a shared hash table.

In my patch, it's similar, but each worker installs the names in the
share hash table at the end of each small range it receives.

With my "magic" iterator idea, it wouldn't be clear when and how to
install the names in the shared hash table.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-06-13 19:56       ` Simon Marchi
@ 2025-06-13 20:12         ` Simon Marchi
  2025-06-26 19:27           ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-06-13 20:12 UTC (permalink / raw)
  To: Simon Marchi, Tom Tromey, Simon Marchi; +Cc: gdb-patches

On 6/13/25 3:56 PM, Simon Marchi wrote:
> On 6/13/25 3:22 PM, Simon Marchi wrote:
>> Another idea I had (but didn't try) was to make lambdas receive a
>> "magic" range object, like athis:
>>
>>   [&] (gdb::dynamic_range<dwarf2_per_cu *> range)
>>     {
>>       for (dwarf2_per_cu *cu : range)
>>         process_unit (cu);
>>     }
>>
>> "range" would get work items from the work queue in batches, but yield
>> one at a time.  It would reach its end when the work queue is empty.
>> It will be a bit of work to implement, but it would have the advantage
>> that any per-worker state could be right there in the lambda, like we
>> have today.
> 
> Ah, I remembered why I didn't do this.  It wouldn't play well with how
> minimal_symbol_reader::install() currently uses gdb::parallel_for_each.
> Currently, it receives one contiguous range [start,end).  After having
> computed the demangled names and hashes for the whole range, each worker
> locks a mutex and installs the names in a shared hash table.
> 
> In my patch, it's similar, but each worker installs the names in the
> share hash table at the end of each small range it receives.
> 
> With my "magic" iterator idea, it wouldn't be clear when and how to
> install the names in the shared hash table.

One way to make this work would be to make the iterator yield batches,
so you would have two levels of iteration:

  [&] (gdb::dynamic_range<minimal_symbol *, 1000> range)
    {
      for (gdb::batch<minimal_symbol *> batch : range)
        {
	  for (minimal_symbol *msym : batch)
	    {
	      // compute demangled name and hash
	    }

	  // lock mutex

	  for (minimal_symbol *msym : batch)
	    {
	      // install msym in the shared hash table
	    }
	}
    }

Let me know if you see an easier solution to this.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-06-13 20:12         ` Simon Marchi
@ 2025-06-26 19:27           ` Simon Marchi
  2025-07-03 19:23             ` Tom Tromey
  0 siblings, 1 reply; 22+ messages in thread
From: Simon Marchi @ 2025-06-26 19:27 UTC (permalink / raw)
  To: Simon Marchi, Tom Tromey, Simon Marchi; +Cc: gdb-patches



On 2025-06-13 16:12, Simon Marchi wrote:
> On 6/13/25 3:56 PM, Simon Marchi wrote:
>> On 6/13/25 3:22 PM, Simon Marchi wrote:
>>> Another idea I had (but didn't try) was to make lambdas receive a
>>> "magic" range object, like athis:
>>>
>>>   [&] (gdb::dynamic_range<dwarf2_per_cu *> range)
>>>     {
>>>       for (dwarf2_per_cu *cu : range)
>>>         process_unit (cu);
>>>     }
>>>
>>> "range" would get work items from the work queue in batches, but yield
>>> one at a time.  It would reach its end when the work queue is empty.
>>> It will be a bit of work to implement, but it would have the advantage
>>> that any per-worker state could be right there in the lambda, like we
>>> have today.
>>
>> Ah, I remembered why I didn't do this.  It wouldn't play well with how
>> minimal_symbol_reader::install() currently uses gdb::parallel_for_each.
>> Currently, it receives one contiguous range [start,end).  After having
>> computed the demangled names and hashes for the whole range, each worker
>> locks a mutex and installs the names in a shared hash table.
>>
>> In my patch, it's similar, but each worker installs the names in the
>> share hash table at the end of each small range it receives.
>>
>> With my "magic" iterator idea, it wouldn't be clear when and how to
>> install the names in the shared hash table.
> 
> One way to make this work would be to make the iterator yield batches,
> so you would have two levels of iteration:
> 
>   [&] (gdb::dynamic_range<minimal_symbol *, 1000> range)
>     {
>       for (gdb::batch<minimal_symbol *> batch : range)
>         {
> 	  for (minimal_symbol *msym : batch)
> 	    {
> 	      // compute demangled name and hash
> 	    }
> 
> 	  // lock mutex
> 
> 	  for (minimal_symbol *msym : batch)
> 	    {
> 	      // install msym in the shared hash table
> 	    }
> 	}
>     }
> 
> Let me know if you see an easier solution to this.

I have thought about this problem some more, and I discussed this with
my team.  I still think that the functor approach is the only one that
relatively cleanly meets the various requirements:

 - be able to run a per-worker initialize step
 - be able to keep some per-worker data for the duration of the for-each
 - be able to run a per-worker finalize step

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-06-26 19:27           ` Simon Marchi
@ 2025-07-03 19:23             ` Tom Tromey
  2025-07-03 19:36               ` Simon Marchi
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Tromey @ 2025-07-03 19:23 UTC (permalink / raw)
  To: Simon Marchi; +Cc: Simon Marchi, Tom Tromey, Simon Marchi, gdb-patches

>>>>> "Simon" == Simon Marchi <simark@simark.ca> writes:

Sorry for the delay on this.  My time is split these days and so I'm not
keeping up with gdb quite as frequently.

Simon> I have thought about this problem some more, and I discussed this with
Simon> my team.  I still think that the functor approach is the only one that
Simon> relatively cleanly meets the various requirements:

Simon>  - be able to run a per-worker initialize step
Simon>  - be able to keep some per-worker data for the duration of the for-each
Simon>  - be able to run a per-worker finalize step

IMO let's just go with your original plan.  It seems clear &
straightforward.  I don't like the wordiness so much but I tend to think
the other alternatives are less clear.

This seems like another area that coroutines would clear up.  I have
high hopes for those removing a lot of extraneous classes and turning
them into local variables.  No idea when we can adopt C++20 though.

Tom

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each
  2025-07-03 19:23             ` Tom Tromey
@ 2025-07-03 19:36               ` Simon Marchi
  0 siblings, 0 replies; 22+ messages in thread
From: Simon Marchi @ 2025-07-03 19:36 UTC (permalink / raw)
  To: Tom Tromey; +Cc: Simon Marchi, Simon Marchi, gdb-patches



On 2025-07-03 15:23, Tom Tromey wrote:
>>>>>> "Simon" == Simon Marchi <simark@simark.ca> writes:
> 
> Sorry for the delay on this.  My time is split these days and so I'm not
> keeping up with gdb quite as frequently.
> 
> Simon> I have thought about this problem some more, and I discussed this with
> Simon> my team.  I still think that the functor approach is the only one that
> Simon> relatively cleanly meets the various requirements:
> 
> Simon>  - be able to run a per-worker initialize step
> Simon>  - be able to keep some per-worker data for the duration of the for-each
> Simon>  - be able to run a per-worker finalize step
> 
> IMO let's just go with your original plan.  It seems clear &
> straightforward.  I don't like the wordiness so much but I tend to think
> the other alternatives are less clear.

It is a bit more chatty, but I find it easier to understand, actually.
All the code and data used by the worker thread is in that class, wells
separated from the "main" path.

I will send an updated v2 then.

> This seems like another area that coroutines would clear up.  I have
> high hopes for those removing a lot of extraneous classes and turning
> them into local variables.  No idea when we can adopt C++20 though.

I'd be curious to see how that would look like for our parallel
for-each.

Simon

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2025-07-03 19:37 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-05 20:15 [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
2025-05-05 20:15 ` [PATCH 2/6] gdbsupport: make gdb::parallel_for_each's n parameter a template parameter Simon Marchi
2025-06-13 17:55   ` Tom Tromey
2025-06-13 18:43     ` Simon Marchi
2025-05-05 20:15 ` [PATCH 3/6] gdbsupport: use dynamic partitioning in gdb::parallel_for_each Simon Marchi
2025-06-13 18:29   ` Tom Tromey
2025-06-13 19:22     ` Simon Marchi
2025-06-13 19:56       ` Simon Marchi
2025-06-13 20:12         ` Simon Marchi
2025-06-26 19:27           ` Simon Marchi
2025-07-03 19:23             ` Tom Tromey
2025-07-03 19:36               ` Simon Marchi
2025-05-05 20:15 ` [PATCH 4/6] gdbsupport: factor out work queue from parallel-for.h Simon Marchi
2025-06-13 18:33   ` Tom Tromey
2025-06-13 19:24     ` Simon Marchi
2025-05-05 20:15 ` [PATCH 5/6] gdbsupport: add async parallel_for_each version Simon Marchi
2025-06-13 18:39   ` Tom Tromey
2025-06-13 19:29     ` Simon Marchi
2025-05-05 20:15 ` [PATCH 6/6] gdb/dwarf: use dynamic partitioning for DWARF CU indexing Simon Marchi
2025-05-27 14:44 ` [PATCH 1/6] gdb: re-work parallel-for-selftests.c Simon Marchi
2025-06-13 17:48 ` Tom Tromey
2025-06-13 18:38   ` Simon Marchi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox