How does OpenMP start its threads in gcc?

    • The figure shows no lock contention (Lock Contention = 0 s), suggesting that PyTorch or MKL may bypass GOMP's global task lock and distribute work with atomic operations or pre-partitioned tasks instead.
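
As a point of reference for that claim, here is a minimal sketch of lock-free work distribution with C11 atomics: each worker claims the next chunk of iterations with a single atomic fetch-add, so no global task lock is ever taken. The names (next_chunk, worker) and the chunking scheme are illustrative, not taken from GOMP or MKL.

#include <stdatomic.h>
#include <stddef.h>

static atomic_size_t next_chunk;   /* next unclaimed iteration index */

/* Each pool thread runs this; `body` processes iterations [start, end).  */
static void
worker (size_t total, size_t chunk, void (*body) (size_t, size_t))
{
  for (;;)
    {
      size_t start = atomic_fetch_add (&next_chunk, chunk);
      if (start >= total)
        break;                       /* all iterations claimed */
      size_t end = start + chunk < total ? start + chunk : total;
      body (start, end);
    }
}
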
/* This structure is used to communicate across pthread_create.  */

struct gomp_thread_start_data
{
  void (*fn) (void *);
  void *fn_data;
  struct gomp_team_state ts;
  struct gomp_task *task;
  struct gomp_thread_pool *thread_pool;
  unsigned int place;
  unsigned int num_teams;
  unsigned int team_num;
  bool nested;
  pthread_t handle;
};
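
In libgomp, gomp_team_start fills in one such structure per new thread and hands it to pthread_create, whose single void * argument is how all of this state reaches the new thread. A minimal sketch of that hand-off pattern (start_data, worker_main and launch are illustrative names, not libgomp's):

#include <pthread.h>

/* Everything the new thread needs, packed behind pthread_create's
   single void * argument.  */
struct start_data
{
  void (*fn) (void *);   /* outlined body of the parallel region */
  void *fn_data;         /* its argument */
};

static void *
worker_main (void *xdata)
{
  struct start_data *data = xdata;   /* unpack on the far side */
  data->fn (data->fn_data);
  return NULL;
}

/* Caller side: fill the struct, then launch the thread with it.  */
static int
launch (struct start_data *data)
{
  pthread_t tid;
  return pthread_create (&tid, NULL, worker_main, data);
}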

/* This function is a pthread_create entry point.  This contains the idle
   loop in which a thread waits to be called up to become part of a team.  */

static void *
gomp_thread_start (void *xdata)
{
  struct gomp_thread_start_data *data = xdata;
  struct gomp_thread *thr;
  struct gomp_thread_pool *pool;
  void (*local_fn) (void *);
  void *local_data;

#if defined HAVE_TLS || defined USE_EMUTLS
  thr = &gomp_tls_data;
#else
  struct gomp_thread local_thr;
  thr = &local_thr;
#endif
  gomp_sem_init (&thr->release, 0);

  /* Extract what we need from data.  */
  local_fn = data->fn;
  local_data = data->fn_data;
  thr->thread_pool = data->thread_pool;
  thr->ts = data->ts;
  thr->task = data->task;
  thr->place = data->place;
  thr->num_teams = data->num_teams;
  thr->team_num = data->team_num;
#ifdef GOMP_NEEDS_THREAD_HANDLE
  thr->handle = data->handle;
#endif
#if !(defined HAVE_TLS || defined USE_EMUTLS)
  pthread_setspecific (gomp_tls_key, thr);
#endif

  thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;

  /* Make thread pool local. */
  pool = thr->thread_pool;

  if (data->nested)
    {
      struct gomp_team *team = thr->ts.team;
      struct gomp_task *task = thr->task;

      gomp_barrier_wait (&team->barrier);

      local_fn (local_data);
      gomp_team_barrier_wait_final (&team->barrier);
      gomp_finish_task (task);
      gomp_barrier_wait_last (&team->barrier);
    }
  else
    {
      pool->threads[thr->ts.team_id] = thr;

      gomp_simple_barrier_wait (&pool->threads_dock);
      do
        {
          struct gomp_team *team = thr->ts.team;
          struct gomp_task *task = thr->task;

          local_fn (local_data);
          gomp_team_barrier_wait_final (&team->barrier);
          gomp_finish_task (task);

          gomp_simple_barrier_wait (&pool->threads_dock);

          local_fn = thr->fn;
          local_data = thr->data;
          thr->fn = NULL;
        }
      while (local_fn);
    }

  gomp_sem_destroy (&thr->release);
  pthread_detach (pthread_self ());
  thr->thread_pool = NULL;
  thr->task = NULL;
  return NULL;
}
  • Applicable scenario: the else branch handles threads that belong to a top-level (non-nested) parallel region; these are the threads managed through the thread pool. The nested branch above instead runs its function exactly once between team barriers and exits without ever docking in the pool.
  • Flow

    1. Register the thread: store the current thread in the pool's threads array at index team_id.
    2. Initial barrier: call gomp_simple_barrier_wait and wait for every thread in the pool to reach the threads_dock barrier.
    3. Task-execution loop
      • Run local_fn(local_data), the parallel task assigned to this thread.
      • Call gomp_team_barrier_wait_final to wait for the whole team to finish.
      • Call gomp_finish_task to clean up the task state.
      • Call gomp_simple_barrier_wait again and dock until the pool hands out new work.
      • Reload local_fn and local_data from thr->fn and thr->data (where the pool publishes the next task).
      • If local_fn is non-NULL, the loop continues; otherwise the thread exits.
    • Thread-pool reuse: a non-nested thread does not exit once its task is done; it waits at the dock for the pool to assign new work. This is OpenMP's thread-pool optimization; a minimal stand-alone sketch of the dock-and-redeploy loop follows below.
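
The loop above is easy to model with plain pthreads. The sketch below is illustrative only: one pthread_barrier_t stands in for threads_dock, a single global fn/data pair stands in for the per-thread thr->fn / thr->data slots, and every name (dock, pool_fn, run_on_pool, ...) is made up, not libgomp's. The dock barrier is assumed to be initialized for NWORKERS + 1 participants (workers plus master).

#include <pthread.h>
#include <stddef.h>

static pthread_barrier_t dock;        /* plays the role of threads_dock */
static void (*pool_fn) (void *);      /* "thr->fn": next task, NULL = exit */
static void *pool_data;               /* "thr->data" */

static void *
pool_thread (void *arg)
{
  (void) arg;
  /* Initial dock: wait until the master publishes the first task.  */
  pthread_barrier_wait (&dock);
  void (*fn) (void *) = pool_fn;
  void *data = pool_data;
  while (fn)
    {
      fn (data);                        /* run the assigned task */
      pthread_barrier_wait (&dock);     /* completion: master resumes */
      pthread_barrier_wait (&dock);     /* dock until the next task */
      fn = pool_fn;
      data = pool_data;
    }
  return NULL;
}

/* Master side: publish work, release the dock, wait for completion.  */
static void
run_on_pool (void (*fn) (void *), void *data)
{
  pool_fn = fn;
  pool_data = data;
  pthread_barrier_wait (&dock);         /* release the docked workers */
  pthread_barrier_wait (&dock);         /* wait until they finish */
}

/* Publishing NULL makes every docked worker fall out of its loop.  */
static void
shutdown_pool (void)
{
  pool_fn = NULL;
  pthread_barrier_wait (&dock);
}

Because the master writes pool_fn before arriving at the dock barrier, the barrier's synchronization guarantees the workers see the new task after they leave it; libgomp relies on the same ordering through threads_dock. The barrier primitives themselves are what the rest of this section shows: gomp_simple_barrier_wait is a thin wrapper around gomp_barrier_wait, which in the generic POSIX configuration bottoms out in gomp_barrier_wait_end.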
static inline void
gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
{
  gomp_barrier_wait (&bar->bar);
}

void
gomp_barrier_wait (gomp_barrier_t *barrier)
{
  gomp_barrier_wait_end (barrier, gomp_barrier_wait_start (barrier));
}

void
gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
  unsigned int n;

  if (state & BAR_WAS_LAST)
    {
      /* Last arriver: wake every waiter once via sem1, then block on
         sem2 until the last of them has left.  mutex1 stays held, so
         no thread can start the next barrier cycle early.  */
      n = --bar->arrived;
      if (n > 0)
        {
          do
            gomp_sem_post (&bar->sem1);
          while (--n != 0);
          gomp_sem_wait (&bar->sem2);
        }
      gomp_mutex_unlock (&bar->mutex1);
    }
  else
    {
      /* Ordinary arriver: drop mutex1 and sleep until the last
         arriver posts sem1.  */
      gomp_mutex_unlock (&bar->mutex1);
      gomp_sem_wait (&bar->sem1);

#ifdef HAVE_SYNC_BUILTINS
      n = __sync_add_and_fetch (&bar->arrived, -1);
#else
      gomp_mutex_lock (&bar->mutex2);
      n = --bar->arrived;
      gomp_mutex_unlock (&bar->mutex2);
#endif

      /* The last thread to leave hands control back to the last
         arriver, which is still blocked on sem2.  */
      if (n == 0)
        gomp_sem_post (&bar->sem2);
    }
}
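
Reading the fallback path above: gomp_barrier_wait_start (not shown here) locks mutex1, increments arrived, and returns BAR_WAS_LAST to the final arriver. That last thread wakes each waiter by posting sem1 once per waiter, then blocks on sem2 until the final waiter to decrement arrived posts it; only then does it release mutex1, which is what keeps a new barrier cycle from starting early. Note this is the generic POSIX implementation (config/posix/bar.c); Linux builds normally get a futex-based barrier instead.

The same protocol can be written with plain POSIX primitives. The sketch below is illustrative (struct and function names are not libgomp's); the second mutex exists because mutex1 is still held by the last arriver while the waiters decrement the counter.

#include <pthread.h>
#include <semaphore.h>

struct sem_barrier
{
  pthread_mutex_t mutex1;   /* serializes arrivals */
  pthread_mutex_t mutex2;   /* guards `arrived` on the waiter side */
  sem_t sem1;               /* last arriver -> waiters */
  sem_t sem2;               /* last waiter -> last arriver */
  unsigned arrived;
  unsigned total;
};

void
sem_barrier_init (struct sem_barrier *bar, unsigned total)
{
  pthread_mutex_init (&bar->mutex1, NULL);
  pthread_mutex_init (&bar->mutex2, NULL);
  sem_init (&bar->sem1, 0, 0);
  sem_init (&bar->sem2, 0, 0);
  bar->arrived = 0;
  bar->total = total;
}

void
sem_barrier_wait (struct sem_barrier *bar)
{
  pthread_mutex_lock (&bar->mutex1);
  if (++bar->arrived == bar->total)
    {
      /* Last arriver: wake the others, wait for them to leave.  */
      unsigned n = --bar->arrived;
      if (n > 0)
        {
          do
            sem_post (&bar->sem1);
          while (--n != 0);
          sem_wait (&bar->sem2);
        }
      pthread_mutex_unlock (&bar->mutex1);
    }
  else
    {
      pthread_mutex_unlock (&bar->mutex1);
      sem_wait (&bar->sem1);
      pthread_mutex_lock (&bar->mutex2);
      unsigned n = --bar->arrived;
      pthread_mutex_unlock (&bar->mutex2);
      if (n == 0)
        sem_post (&bar->sem2);     /* last to leave reopens the barrier */
    }
}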