[LTP] [PATCH] [RFC] lib: shell: Fix timeout process races

Mon Sep 20 06:51:46 CEST 2021

Hi

Sorry for my late reply on this topic, I was on vacation.

On 9/17/2021 4:17 PM, Cyril Hrubis wrote:
> There were actually several races in the shell library timeout handling.
>
> This commit fixes hopefully all of them by:
>
> * Reimplementing the backgroud timer in C
I did that once, but at that point it was kinda rejected ;)
See https://lists.linux.it/pipermail/ltp/2021-May/022445.html
and https://lists.linux.it/pipermail/ltp/2021-May/022453.html

>
> * Making sure that the timer has started before we leave test setup
>
> The rewrite of the backround timer to C allows us to run all the timeout
> logic in a single process, which simplifies the whole problem greatly
> since previously we had chain of processes that estabilished signal
> handlers to kill it's descendants, which in the end had a few races in
> it.
>
> The race that caused the problems is, as far as I can tell, in the way
> how shell spawns it's children. I haven't checked the shell code, but I
> guess that when shell runs a process in bacground it does vfork() +
> exec() and because signals are ignored during the operation. If the
> SIGTERM arrives at that point it gets lost.
>
> That means that we created a race window in the shell library each time
> we started a new process. The rewrite to C simplifies the code but we
> still end up with a single place where this can happen and that is when
> we execute the tst_timeout_kill binary. This is now fixed in the shell
> library by waiting until the background process gets to a sleep state,
> which means that the proces has been executed and waiting for the
> timeout.
>
> After these fixes I haven't been able to reproduce the hang with:
>
> cat > debug.sh <<EOF
> #!/bin/sh
>
> TST_SETUP="setup"
> TST_TESTFUNC="do_test"
> . tst_test.sh
>
> setup()
> {
>          tst_brk TCONF "quit now!"
> }
>
> do_test()
> {
>          tst_res TPASS "pass :)"
> }
>
> tst_run
> EOF
>
> # while true; do ./debug.sh; done
>
> Reported-by: Petr Vorel <pvorel@suse.cz>
> Signed-off-by: Cyril Hrubis <chrubis@suse.cz>
> ---
>   testcases/lib/.gitignore         |  1 +
>   testcases/lib/Makefile           |  2 +-
>   testcases/lib/tst_test.sh        | 47 +++++-----------
>   testcases/lib/tst_timeout_kill.c | 93 ++++++++++++++++++++++++++++++++
>   4 files changed, 108 insertions(+), 35 deletions(-)
>   create mode 100644 testcases/lib/tst_timeout_kill.c
>
> diff --git a/testcases/lib/.gitignore b/testcases/lib/.gitignore
> index 5a0e8cba2..f266a3e6b 100644
> --- a/testcases/lib/.gitignore
> +++ b/testcases/lib/.gitignore
> @@ -14,3 +14,4 @@
>   /tst_sleep
>   /tst_supported_fs
>   /tst_hexdump
> +tst_timeout_kill
> diff --git a/testcases/lib/Makefile b/testcases/lib/Makefile
> index 179b47479..d6b4c7a91 100644
> --- a/testcases/lib/Makefile
> +++ b/testcases/lib/Makefile
> @@ -11,6 +11,6 @@ INSTALL_TARGETS		:= *.sh
>   MAKE_TARGETS		:= tst_sleep tst_random tst_checkpoint tst_rod tst_kvcmp\
>   			   tst_device tst_net_iface_prefix tst_net_ip_prefix tst_net_vars\
>   			   tst_getconf tst_supported_fs tst_check_drivers tst_get_unused_port\
> -			   tst_get_median tst_hexdump tst_get_free_pids
> +			   tst_get_median tst_hexdump tst_get_free_pids tst_timeout_kill
>   
>   include $(top_srcdir)/include/mk/generic_leaf_target.mk
> diff --git a/testcases/lib/tst_test.sh b/testcases/lib/tst_test.sh
> index 8f69b0551..2556b28f5 100644
> --- a/testcases/lib/tst_test.sh
> +++ b/testcases/lib/tst_test.sh
> @@ -474,27 +474,6 @@ _tst_multiply_timeout()
>   	return 0
>   }
>   
> -_tst_kill_test()
> -{
> -	local i=10
> -
> -	trap '' TERM
> -	tst_res TBROK "Test timed out, sending SIGTERM! If you are running on slow machine, try exporting LTP_TIMEOUT_MUL > 1"
> -	kill -TERM -$pid
> -	tst_sleep 100ms
> -
> -	while kill -0 $pid >/dev/null 2>&1 && [ $i -gt 0 ]; do
> -		tst_res TINFO "Test is still running, waiting ${i}s"
> -		sleep 1
> -		i=$((i-1))
> -	done
> -
> -	if kill -0 $pid >/dev/null 2>&1; then
> -		tst_res TBROK "Test still running, sending SIGKILL"
> -		kill -KILL -$pid
> -	fi
> -}
> -
>   _tst_cleanup_timer()
>   {
>   	if [ -n "$_tst_setup_timer_pid" ]; then
> @@ -503,18 +482,6 @@ _tst_cleanup_timer()
>   	fi
>   }
>   
> -_tst_timeout_process()
> -{
> -	local sleep_pid
> -
> -	sleep $sec &
> -	sleep_pid=$!
> -	trap "kill $sleep_pid; exit" TERM
> -	wait $sleep_pid
> -	trap - TERM
> -	_tst_kill_test
> -}
> -
>   _tst_setup_timer()
>   {
>   	TST_TIMEOUT=${TST_TIMEOUT:-300}
> @@ -539,9 +506,21 @@ _tst_setup_timer()
>   
>   	_tst_cleanup_timer
>   
> -	_tst_timeout_process &
> +	tst_timeout_kill $sec $pid &
>   
>   	_tst_setup_timer_pid=$!
> +
> +	while true; do
> +		local state
> +
> +		state=$(cut -d' ' -f3 "/proc/$_tst_setup_timer_pid/stat")
Hmm maybe we could use the checkpoint api here instead. Wouldn't this be 
more portable?
> +
> +		if [ "$state" = "S" ]; then
> +			break;
> +		fi
> +
> +		tst_sleep 1ms
> +	done
>   }
>   
>   tst_require_root()
> diff --git a/testcases/lib/tst_timeout_kill.c b/testcases/lib/tst_timeout_kill.c
> new file mode 100644
> index 000000000..6e97514f1
> --- /dev/null
> +++ b/testcases/lib/tst_timeout_kill.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (c) 2021 Cyril Hrubis <chrubis@suse.cz>
> + */
> +
> +#include <stdio.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <tst_res_flags.h>
> +#include <tst_ansi_color.h>
> +
> +static void print_help(const char *name)
> +{
> +	fprintf(stderr, "usage: %s timeout pid\n", name);
> +}
> +
> +static void print_msg(int type, const char *msg)
> +{
> +	const char *strtype = "";
> +
> +	switch (type) {
> +	case TBROK:
> +		strtype = "TBROK";
> +	break;
> +	case TINFO:
> +		strtype = "TINFO";
> +	break;
> +	}
> +
> +	if (tst_color_enabled(STDERR_FILENO)) {
> +		fprintf(stderr, "%s%s: %s%s\n", tst_ttype2color(type),
> +			strtype, ANSI_COLOR_RESET, msg);
> +	} else {
> +		fprintf(stderr, "%s: %s\n", strtype, msg);
> +	}
> +}
Shouldn't this be reused from the library instead of being reimplemented?
> +
> +int main(int argc, char *argv[])
> +{
> +	int timeout, pid, ret, i;
> +
> +	if (argc != 3) {
> +		print_help(argv[0]);
> +		return 1;
> +	}
> +
> +	timeout = atoi(argv[1]);
> +	pid = atoi(argv[2]);
> +
> +	if (timeout < 0) {
> +		fprintf(stderr, "Invalid timeout '%s'", argv[1]);
> +		print_help(argv[0]);
> +		return 1;
> +	}
> +
> +	if (pid <= 1) {
> +		fprintf(stderr, "Invalid pid '%s'", argv[2]);
> +		print_help(argv[0]);
> +		return 1;
> +	}
> +
> +	if (timeout)
> +		sleep(timeout);
> +
> +	print_msg(TBROK, "Test timed out, sending SIGTERM! If you are running on slow machine, try exporting LTP_TIMEOUT_MUL > 1");
> +
> +	ret = kill(-pid, SIGTERM);
> +	if (ret) {
> +		print_msg(TBROK, "kill() failed");
> +		return 1;
> +	}
> +
> +	usleep(100000);
> +
> +	i = 10;
> +
> +	while (!kill(-pid, 0) && i-- > 0) {
This was kill(pid, 0) in the original shell code. I am not sure if it is 
important
> +		print_msg(TINFO, "Test is still running...");
> +		sleep(1);
> +	}
> +
> +	if (!kill(-pid, 0)) {
Here as well
> +		print_msg(TBROK, "Test is still running, sending SIGKILL");
> +		ret = kill(-pid, SIGKILL);
> +		if (ret) {
> +			print_msg(TBROK, "kill() failed");
> +			return 1;
> +		}
> +	}
> +
> +	return 0;
> +}

Joerg