[LTP] [PATCH v3] Migrating the libhugetlbfs/testcases/alloc-instantiate-race.c test
Andrea Cervesato
andrea.cervesato@suse.com
Tue Feb 17 15:37:14 CET 2026
On Sun Sep 28, 2025 at 5:07 AM CEST, Samir Mulani wrote:
> This test is designed to detect a kernel allocation race introduced
> with hugepage demand-faulting. The problem is that no lock is held
> between allocating a hugepage and instantiating it in the
> pagetables or page cache index. In between the two, the (huge)
> page is cleared, so there's substantial time. Thus two processes
> can race instantiating the (same) last available hugepage - one
> will fail on the allocation, and thus cause an OOM fault even
> though the page it actually wants is being instantiated by the
> other racing process.
>
> Signed-off-by: Samir Mulani <samir@linux.ibm.com>
> ---
> v3: --Addressed the below requested changes
> 1. Added support to run test cases multiple times using the -iN or -IN option.
> 2. Removed unnecessary comments.
> 3. Set "shared" mode as the default for mmap when the user does not specify
> <shared/private> with the -m option.
> 4. Updated user-facing help messages to clarify how to use the -m option.
> ---
> runtest/hugetlb | 1 +
> testcases/kernel/mem/.gitignore | 1 +
> .../kernel/mem/hugetlb/hugemmap/hugemmap36.c | 300 ++++++++++++++++++
> 3 files changed, 302 insertions(+)
> create mode 100644 testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c
>
> diff --git a/runtest/hugetlb b/runtest/hugetlb
> index 0896d3c94..bd40a7a30 100644
> --- a/runtest/hugetlb
> +++ b/runtest/hugetlb
> @@ -36,6 +36,7 @@ hugemmap30 hugemmap30
> hugemmap31 hugemmap31
> hugemmap32 hugemmap32
> hugemmap34 hugemmap34
> +hugemmap36 hugemmap36
> hugemmap05_1 hugemmap05 -m
> hugemmap05_2 hugemmap05 -s
> hugemmap05_3 hugemmap05 -s -m
> diff --git a/testcases/kernel/mem/.gitignore b/testcases/kernel/mem/.gitignore
> index b4455de51..2ddef6bf1 100644
> --- a/testcases/kernel/mem/.gitignore
> +++ b/testcases/kernel/mem/.gitignore
> @@ -36,6 +36,7 @@
> /hugetlb/hugemmap/hugemmap31
> /hugetlb/hugemmap/hugemmap32
> /hugetlb/hugemmap/hugemmap34
> +/hugetlb/hugemmap/hugemmap36
> /hugetlb/hugeshmat/hugeshmat01
> /hugetlb/hugeshmat/hugeshmat02
> /hugetlb/hugeshmat/hugeshmat03
> diff --git a/testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c b/testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c
> new file mode 100644
> index 000000000..14ca6db52
> --- /dev/null
> +++ b/testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c
> @@ -0,0 +1,300 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2005-2006 IBM Corporation
> + * Author: David Gibson & Adam Litke
> + */
> +
> +/*\
> + * [Description]
The [Description] tag is not needed anymore; please drop it.
> + *
> + * This test is designed to detect a kernel allocation race introduced
> + * with hugepage demand-faulting. The problem is that no lock is held
> + * between allocating a hugepage and instantiating it in the
> + * pagetables or page cache index. In between the two, the (huge)
> + * page is cleared, so there's substantial time. Thus two processes
> + * can race instantiating the (same) last available hugepage - one
> + * will fail on the allocation, and thus cause an OOM fault even
> + * though the page it actually wants is being instantiated by the
> + * other racing process.
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <pthread.h>
> +#include "tst_safe_pthread.h"
> +#include "hugetlb.h"
> +#define SYSFS_CPU_ONLINE_FMT "/sys/devices/system/cpu/cpu%d/online"
> +#define MNTPOINT "hugetlbfs/"
> +unsigned long totpages;
Missing `static` qualifier. Also, this variable is used only inside run(),
so there's no need to declare it at file scope. Please verify the same for
the other file-scope variables as well.
> +static long hpage_size;
> +static char *str_op;
> +static int child1, child2, race_type, fd_sync, test_flag;
> +static pthread_t thread1, thread2;
> +static void *p_sync, *q_sync;
> +
> +struct racer_info {
> + void *p; /* instantiation address */
> + int cpu;
> + volatile int *mytrigger;
> + volatile int *othertrigger;
> + int status;
> +};
> +
> +static int one_racer(void *p, int cpu,
> + volatile int *mytrigger, volatile int *othertrigger)
> +{
> + volatile int *pi = p;
> + cpu_set_t *cpuset;
> + size_t mask_size;
> + int err;
> +
> + cpuset = CPU_ALLOC(cpu + 1);
> + if (!cpuset)
> + tst_brk(TBROK | TERRNO, "CPU_ALLOC() failed");
> +
> + mask_size = CPU_ALLOC_SIZE(cpu + 1);
> +
> + /* Split onto different CPUs to encourage the race */
> + CPU_ZERO_S(mask_size, cpuset);
> + CPU_SET_S(cpu, mask_size, cpuset);
> +
> + err = sched_setaffinity(getpid(), mask_size, cpuset);
> + if (err == -1)
> + tst_brk(TBROK | TERRNO, "sched_setaffinity() failed");
> +
> + /* Ready */
> + *mytrigger = 1;
> + /* Wait for the other trigger to be set */
> + while (!*othertrigger)
> + ;
This is a busy-wait loop that is going to create many issues. Please take a
look at the LTP checkpoint API (tst_checkpoint.h) if you need to synchronize
processes or threads.
> +
> + /* Set the shared value */
> + *pi = 1;
> +
> + CPU_FREE(cpuset);
> + return 0;
> +}
> +
> +static void proc_racer(void *p, int cpu,
> + volatile int *mytrigger, volatile int *othertrigger)
> +{
> + exit(one_racer(p, cpu, mytrigger, othertrigger));
> +}
> +
> +static void *thread_racer(void *info)
> +{
> + struct racer_info *ri = info;
> +ri->status = one_racer(ri->p, ri->cpu, ri->mytrigger, ri->othertrigger);
Wrong indentation.
> + return ri;
> +}
> +
> +void check_online_cpus(int online_cpus[], int nr_cpus_needed)
This should be a static function.
> +{
> + cpu_set_t cpuset;
> + int total_cpus, cpu_idx, i;
`i` can be declared inside the loop. Note also that the `int i` declared in
the first for loop below shadows this one.
> + // Initialize the CPU set
> + CPU_ZERO(&cpuset);
> +
> + for (int i = 0; i < CPU_SETSIZE; i++)
> + CPU_SET(i, &cpuset);
> + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) == -1)
> + tst_brk(TBROK | TERRNO, "sched_setaffinity() reset failed");
> + // Get the total number of configured CPUs
> + total_cpus = get_nprocs_conf();
> + // Get the CPU affinity mask of the calling process
> + if (sched_getaffinity(0, sizeof(cpu_set_t), &cpuset) == -1)
> + tst_brk(TBROK | TERRNO, "sched_getaffinity() failed");
> +
> + int available = CPU_COUNT(&cpuset);
Just use CPU_COUNT() directly in the tst_res(...) call; there's no need
for a separate variable here.
> +tst_res(TINFO, "Online CPUs needed: %d, available: %d", nr_cpus_needed, available);
Wrong indentation.
> +
> + // Check if there are enough online CPUs
> + if (CPU_COUNT(&cpuset) < nr_cpus_needed)
> + tst_brk(TBROK | TERRNO, "At least %d online CPUs are required ", nr_cpus_needed);
> +
> + cpu_idx = 0;
> + // Find the first `nr_cpus_needed` CPUs in the affinity mask
> + for (i = 0; i < total_cpus && cpu_idx < nr_cpus_needed; i++) {
> + if (CPU_ISSET(i, &cpuset))
> + online_cpus[cpu_idx++] = i;
> + }
> + if (cpu_idx < nr_cpus_needed)
> + tst_brk(TBROK | TERRNO, "Unable to find enough online CPUs");
> +}
> +
> +static void run_race(void *syncarea, int race_type)
> +{
> + volatile int *trigger1, *trigger2;
> + int fd;
> + void *p, *tret1, *tret2;
> + int status1, status2;
> + int online_cpus[2];
> +
> + check_online_cpus(online_cpus, 2);
> + memset(syncarea, 0, sizeof(*trigger1) + sizeof(*trigger2));
> + trigger1 = syncarea;
> + trigger2 = trigger1 + 1;
> +
> + /* Get a new file for the final page */
> + fd = tst_creat_unlinked(MNTPOINT, 0, 0600);
> + tst_res(TINFO, "Mapping final page.. ");
> +
> +
> + p = SAFE_MMAP(NULL, hpage_size, PROT_READ|PROT_WRITE, race_type, fd, 0);
> + if (race_type == MAP_SHARED) {
> + child1 = SAFE_FORK();
> + if (child1 == 0)
> + proc_racer(p, online_cpus[0], trigger1, trigger2);
> +
> + child2 = SAFE_FORK();
> + if (child2 == 0)
> + proc_racer(p, online_cpus[1], trigger2, trigger1);
> +
> + /* wait() calls */
> + SAFE_WAITPID(child1, &status1, 0);
> + tst_res(TINFO, "Child 1 status: %x", status1);
> +
> +
> + SAFE_WAITPID(child2, &status2, 0);
> + tst_res(TINFO, "Child 2 status: %x", status2);
> +
> + if (WIFSIGNALED(status1))
> + tst_res(TFAIL, "Child 1 killed by signal %s",
> + strsignal(WTERMSIG(status1)));
> + if (WIFSIGNALED(status2))
> + tst_res(TFAIL, "Child 2 killed by signal %s",
> + strsignal(WTERMSIG(status2)));
> + } else {
> + struct racer_info ri1 = {
> + .p = p,
> + .cpu = online_cpus[0],
> + .mytrigger = trigger1,
> + .othertrigger = trigger2,
> + .status = -1,
> + };
> + struct racer_info ri2 = {
> + .p = p,
> + .cpu = online_cpus[1],
> + .mytrigger = trigger2,
> + .othertrigger = trigger1,
> + .status = -1,
> + };
> + SAFE_PTHREAD_CREATE(&thread1, NULL, thread_racer, &ri1);
> + SAFE_PTHREAD_CREATE(&thread2, NULL, thread_racer, &ri2);
> + SAFE_PTHREAD_JOIN(thread1, &tret1);
> + if (tret1 != &ri1) {
> + test_flag = -1;
> + tst_res(TFAIL, "Thread 1 returned %p not %p, killed?\n",
> + tret1, &ri1);
> + }
> + SAFE_PTHREAD_JOIN(thread2, &tret2);
> +
> + if (tret2 != &ri2) {
> + test_flag = -1;
> + tst_res(TFAIL, "Thread 2 returned %p not %p, killed?\n",
> + tret2, &ri2);
> + }
> + status1 = ri1.status;
> + status2 = ri2.status;
> + }
> +
> + if (status1 != 0) {
> + test_flag = -1;
> + tst_res(TFAIL, "Racer 1 terminated with code %d", status1);
> + }
> +
> + if (status2 != 0) {
> + test_flag = -1;
> + tst_res(TFAIL, "Racer 2 terminated with code %d", status2);
> + }
> + if (test_flag != -1)
> + test_flag = 0;
> +
> + if (fd >= 0)
> + SAFE_CLOSE(fd);
> +
> + if (p != MAP_FAILED)
> + SAFE_MUNMAP(p, hpage_size);
> +
> + if (q_sync != MAP_FAILED) {
> + SAFE_MUNMAP(q_sync, getpagesize());
> + q_sync = NULL;
> + }
> +}
> +
> +static void run_test(void)
> +{
> + totpages = SAFE_READ_MEMINFO(MEMINFO_HPAGE_FREE);
> + hpage_size = tst_get_hugepage_size();
> +
> + /* Allocate all save one of the pages up front */
> + tst_res(TINFO, "instantiating.. ");
> +
> + fd_sync = tst_creat_unlinked(MNTPOINT, 0, 0600);
> + /* Get a shared normal page for synchronization */
> + q_sync = SAFE_MMAP(NULL, getpagesize(), PROT_READ|PROT_WRITE,
> + MAP_SHARED|MAP_ANONYMOUS, -1, 0);
> + tst_res(TINFO, "Mapping %ld/%ld pages.. ", totpages-1, totpages);
> + p_sync = SAFE_MMAP(NULL, (totpages-1)*hpage_size, PROT_READ|PROT_WRITE,
> + MAP_SHARED, fd_sync, 0);
> +
> + run_race(q_sync, race_type);
q_sync is handled in the wrong way: it is used (and unmapped) in run_race()
but initialized in run_test(). If run_test() fails before reaching run_race(),
q_sync could still be uninitialized when it is checked against MAP_FAILED.
Please review its usage.
Kind regards,
--
Andrea Cervesato
SUSE QE Automation Engineer Linux
andrea.cervesato@suse.com
More information about the ltp
mailing list