[LTP] [PATCH v3] Migrating the libhugetlbfs/testcases/alloc-instantiate-race.c test
Andrea Cervesato
andrea.cervesato@suse.com
Tue Feb 17 15:37:14 CET 2026
On Sun Sep 28, 2025 at 5:07 AM CEST, Samir Mulani wrote:
> This test is designed to detect a kernel allocation race introduced
> with hugepage demand-faulting. The problem is that no lock is held
> between allocating a hugepage and instantiating it in the
> pagetables or page cache index. In between the two, the (huge)
> page is cleared, so there's substantial time. Thus two processes
> can race instantiating the (same) last available hugepage - one
> will fail on the allocation, and thus cause an OOM fault even
> though the page it actually wants is being instantiated by the
> other racing process.
>
> Signed-off-by: Samir Mulani <samir@linux.ibm.com>
> ---
> v3: --Addressed the below requested changes
> 1. Added support to run test cases multiple times using the -iN or -IN option.
> 2. Removed unnecessary comments.
> 3. Set "shared" mode as the default for mmap when the user does not specify
> <shared/private> with the -m option.
> 4. Updated user-facing help messages to clarify how to use the -m option.
> ---
> runtest/hugetlb | 1 +
> testcases/kernel/mem/.gitignore | 1 +
> .../kernel/mem/hugetlb/hugemmap/hugemmap36.c | 300 ++++++++++++++++++
> 3 files changed, 302 insertions(+)
> create mode 100644 testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c
>
> diff --git a/runtest/hugetlb b/runtest/hugetlb
> index 0896d3c94..bd40a7a30 100644
> --- a/runtest/hugetlb
> +++ b/runtest/hugetlb
> @@ -36,6 +36,7 @@ hugemmap30 hugemmap30
> hugemmap31 hugemmap31
> hugemmap32 hugemmap32
> hugemmap34 hugemmap34
> +hugemmap36 hugemmap36
> hugemmap05_1 hugemmap05 -m
> hugemmap05_2 hugemmap05 -s
> hugemmap05_3 hugemmap05 -s -m
> diff --git a/testcases/kernel/mem/.gitignore b/testcases/kernel/mem/.gitignore
> index b4455de51..2ddef6bf1 100644
> --- a/testcases/kernel/mem/.gitignore
> +++ b/testcases/kernel/mem/.gitignore
> @@ -36,6 +36,7 @@
> /hugetlb/hugemmap/hugemmap31
> /hugetlb/hugemmap/hugemmap32
> /hugetlb/hugemmap/hugemmap34
> +/hugetlb/hugemmap/hugemmap36
> /hugetlb/hugeshmat/hugeshmat01
> /hugetlb/hugeshmat/hugeshmat02
> /hugetlb/hugeshmat/hugeshmat03
> diff --git a/testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c b/testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c
> new file mode 100644
> index 000000000..14ca6db52
> --- /dev/null
> +++ b/testcases/kernel/mem/hugetlb/hugemmap/hugemmap36.c
> @@ -0,0 +1,300 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2005-2006 IBM Corporation
> + * Author: David Gibson & Adam Litke
> + */
> +
> +/*\
> + * [Description]
The [Description] tag is not needed anymore; please drop it.
> + *
> + * This test is designed to detect a kernel allocation race introduced
> + * with hugepage demand-faulting. The problem is that no lock is held
> + * between allocating a hugepage and instantiating it in the
> + * pagetables or page cache index. In between the two, the (huge)
> + * page is cleared, so there's substantial time. Thus two processes
> + * can race instantiating the (same) last available hugepage - one
> + * will fail on the allocation, and thus cause an OOM fault even
> + * though the page it actually wants is being instantiated by the
> + * other racing process.
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <pthread.h>
> +#include "tst_safe_pthread.h"
> +#include "hugetlb.h"
> +#define SYSFS_CPU_ONLINE_FMT "/sys/devices/system/cpu/cpu%d/online"
> +#define MNTPOINT "hugetlbfs/"
> +unsigned long totpages;
Missing `static` qualifier. Also, this variable is used only inside run(),
so there's no need to declare it at file scope. Please verify the same for
the other file-scope variables as well.
> +static long hpage_size;
> +static char *str_op;
> +static int child1, child2, race_type, fd_sync, test_flag;
> +static pthread_t thread1, thread2;
> +static void *p_sync, *q_sync;
> +
> +struct racer_info {
> + void *p; /* instantiation address */
> + int cpu;
> + volatile int *mytrigger;
> + volatile int *othertrigger;
> + int status;
> +};
> +
> +static int one_racer(void *p, int cpu,
> + volatile int *mytrigger, volatile int *othertrigger)
> +{
> + volatile int *pi = p;
> + cpu_set_t *cpuset;
> + size_t mask_size;
> + int err;
> +
> + cpuset = CPU_ALLOC(cpu + 1);
> + if (!cpuset)
> + tst_brk(TBROK | TERRNO, "CPU_ALLOC() failed");
> +
> + mask_size = CPU_ALLOC_SIZE(cpu + 1);
> +
> + /* Split onto different CPUs to encourage the race */
> + CPU_ZERO_S(mask_size, cpuset);
> + CPU_SET_S(cpu, mask_size, cpuset);
> +
> + err = sched_setaffinity(getpid(), mask_size, cpuset);
> + if (err == -1)
> + tst_brk(TBROK | TERRNO, "sched_setaffinity() failed");
> +
> + /* Ready */
> + *mytrigger = 1;
> + /* Wait for the other trigger to be set */
> + while (!*othertrigger)
> + ;
This is a busy-wait loop that is going to create many issues. Please take a
look at the LTP checkpoint API (tst_checkpoint.h) if you need to synchronize
processes or threads.
> +
> + /* Set the shared value */
> + *pi = 1;
> +
> + CPU_FREE(cpuset);
> + return 0;
> +}
> +
> +static void proc_racer(void *p, int cpu,
> + volatile int *mytrigger, volatile int *othertrigger)
> +{
> + exit(one_racer(p, cpu, mytrigger, othertrigger));
> +}
> +
> +static void *thread_racer(void *info)
> +{
> + struct racer_info *ri = info;
> +ri->status = one_racer(ri->p, ri->cpu, ri->mytrigger, ri->othertrigger);
Wrong indentation.
> + return ri;
> +}
> +
> +void check_online_cpus(int online_cpus[], int nr_cpus_needed)
This should be a static function.
> +{
> + cpu_set_t cpuset;
> + int total_cpus, cpu_idx, i;
`i` can be declared inside the loop. Note also that the `int i` declared in
the first for loop below shadows this one.
> + // Initialize the CPU set
> + CPU_ZERO(&cpuset);
> +
> + for (int i = 0; i < CPU_SETSIZE; i++)
> + CPU_SET(i, &cpuset);
> + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) == -1)
> + tst_brk(TBROK | TERRNO, "sched_setaffinity() reset failed");
> + // Get the total number of configured CPUs
> + total_cpus = get_nprocs_conf();
> + // Get the CPU affinity mask of the calling process
> + if (sched_getaffinity(0, sizeof(cpu_set_t), &cpuset) == -1)
> + tst_brk(TBROK | TERRNO, "sched_getaffinity() failed");
> +
> + int available = CPU_COUNT(&cpuset);
Just use CPU_COUNT() directly in the tst_res(...) call; there's no need
for a separate variable here.
> +tst_res(TINFO, "Online CPUs needed: %d, available: %d", nr_cpus_needed, available);
Wrong indentation.
> +
> + // Check if there are enough online CPUs
> + if (CPU_COUNT(&cpuset) < nr_cpus_needed)
> + tst_brk(TBROK | TERRNO, "At least %d online CPUs are required ", nr_cpus_needed);
> +
> + cpu_idx = 0;
> + // Find the first `nr_cpus_needed` CPUs in the affinity mask
> + for (i = 0; i < total_cpus && cpu_idx < nr_cpus_needed; i++) {
> + if (CPU_ISSET(i, &cpuset))
> + online_cpus[cpu_idx++] = i;
> + }
> + if (cpu_idx < nr_cpus_needed)
> + tst_brk(TBROK | TERRNO, "Unable to find enough online CPUs");
> +}
> +
> +static void run_race(void *syncarea, int race_type)
> +{
> + volatile int *trigger1, *trigger2;
> + int fd;
> + void *p, *tret1, *tret2;
> + int status1, status2;
> + int online_cpus[2];
> +
> + check_online_cpus(online_cpus, 2);
> + memset(syncarea, 0, sizeof(*trigger1) + sizeof(*trigger2));
> + trigger1 = syncarea;
> + trigger2 = trigger1 + 1;
> +
> + /* Get a new file for the final page */
> + fd = tst_creat_unlinked(MNTPOINT, 0, 0600);
> + tst_res(TINFO, "Mapping final page.. ");
> +
> +
> + p = SAFE_MMAP(NULL, hpage_size, PROT_READ|PROT_WRITE, race_type, fd, 0);
> + if (race_type == MAP_SHARED) {
> + child1 = SAFE_FORK();
> + if (child1 == 0)
> + proc_racer(p, online_cpus[0], trigger1, trigger2);
> +
> + child2 = SAFE_FORK();
> + if (child2 == 0)
> + proc_racer(p, online_cpus[1], trigger2, trigger1);
> +
> + /* wait() calls */
> + SAFE_WAITPID(child1, &status1, 0);
> + tst_res(TINFO, "Child 1 status: %x", status1);
> +
> +
> + SAFE_WAITPID(child2, &status2, 0);
> + tst_res(TINFO, "Child 2 status: %x", status2);
> +
> + if (WIFSIGNALED(status1))
> + tst_res(TFAIL, "Child 1 killed by signal %s",
> + strsignal(WTERMSIG(status1)));
> + if (WIFSIGNALED(status2))
> + tst_res(TFAIL, "Child 2 killed by signal %s",
> + strsignal(WTERMSIG(status2)));
> + } else {
> + struct racer_info ri1 = {
> + .p = p,
> + .cpu = online_cpus[0],
> + .mytrigger = trigger1,
> + .othertrigger = trigger2,
> + .status = -1,
> + };
> + struct racer_info ri2 = {
> + .p = p,
> + .cpu = online_cpus[1],
> + .mytrigger = trigger2,
> + .othertrigger = trigger1,
> + .status = -1,
> + };
> + SAFE_PTHREAD_CREATE(&thread1, NULL, thread_racer, &ri1);
> + SAFE_PTHREAD_CREATE(&thread2, NULL, thread_racer, &ri2);
> + SAFE_PTHREAD_JOIN(thread1, &tret1);
> + if (tret1 != &ri1) {
> + test_flag = -1;
> + tst_res(TFAIL, "Thread 1 returned %p not %p, killed?\n",
> + tret1, &ri1);
> + }
> + SAFE_PTHREAD_JOIN(thread2, &tret2);
> +
> + if (tret2 != &ri2) {
> + test_flag = -1;
> + tst_res(TFAIL, "Thread 2 returned %p not %p, killed?\n",
> + tret2, &ri2);
> + }
> + status1 = ri1.status;
> + status2 = ri2.status;
> + }
> +
> + if (status1 != 0) {
> + test_flag = -1;
> + tst_res(TFAIL, "Racer 1 terminated with code %d", status1);
> + }
> +
> + if (status2 != 0) {
> + test_flag = -1;
> + tst_res(TFAIL, "Racer 2 terminated with code %d", status2);
> + }
> + if (test_flag != -1)
> + test_flag = 0;
> +
> + if (fd >= 0)
> + SAFE_CLOSE(fd);
> +
> + if (p != MAP_FAILED)
> + SAFE_MUNMAP(p, hpage_size);
> +
> + if (q_sync != MAP_FAILED) {
> + SAFE_MUNMAP(q_sync, getpagesize());
> + q_sync = NULL;
> + }
> +}
> +
> +static void run_test(void)
> +{
> + totpages = SAFE_READ_MEMINFO(MEMINFO_HPAGE_FREE);
> + hpage_size = tst_get_hugepage_size();
> +
> + /* Allocate all save one of the pages up front */
> + tst_res(TINFO, "instantiating.. ");
> +
> + fd_sync = tst_creat_unlinked(MNTPOINT, 0, 0600);
> + /* Get a shared normal page for synchronization */
> + q_sync = SAFE_MMAP(NULL, getpagesize(), PROT_READ|PROT_WRITE,
> + MAP_SHARED|MAP_ANONYMOUS, -1, 0);
> + tst_res(TINFO, "Mapping %ld/%ld pages.. ", totpages-1, totpages);
> + p_sync = SAFE_MMAP(NULL, (totpages-1)*hpage_size, PROT_READ|PROT_WRITE,
> + MAP_SHARED, fd_sync, 0);
> +
> + run_race(q_sync, race_type);
q_sync is handled in the wrong way: it is used (and unmapped) in run_race()
but initialized in run_test(). If run_test() fails before reaching run_race(),
q_sync could still be uninitialized when it is checked against MAP_FAILED.
Please review its usage.
Kind regards,
--
Andrea Cervesato
SUSE QE Automation Engineer Linux
andrea.cervesato@suse.com
More information about the ltp
mailing list