[LTP] [PATCH 2/2] [WORK-IN-PROGRESS] lib/tst_test: Dump stack for test processes stuck in kernel
Jan Stancek
jstancek@redhat.com
Thu Jun 28 15:05:25 CEST 2018
----- Original Message -----
> This commit adds a small helper library that finds all processes belonging
> to a given process group ID and dumps their kernel stacks.
>
> Example output:
>
> $ ./shmctl05
> tst_test.c:1015: INFO: Timeout per run is 0h 00m 10s
> Test timeouted, sending SIGKILL!
> tst_test.c:1059: TFAIL: Test process child stuck in the kernel!
> tst_find_pid.c:90: INFO: Pid 1272 stuck in kernel!
> Kernel stacktrace follows:
> [<ffffffffa3c12564>] __switch_to_asm+0x34/0x70
> [<ffffffffa3c12570>] __switch_to_asm+0x40/0x70
> [<ffffffffa3625761>] __switch_to+0x2c1/0x6e0
> [<ffffffffa393e194>] call_rwsem_down_read_failed+0x14/0x30
> [<ffffffffa3704802>] acct_collect+0x42/0x1a0
> [<ffffffffa367d36a>] do_exit+0x74a/0xaf0
> [<ffffffffa3c13d27>] rewind_stack_do_exit+0x17/0x20
> [<ffffffffffffffff>] 0xffffffffffffffff
> tst_test.c:1061: FAIL: Congratulation, likely test hit a kernel bug.
>
> TODO: The main test process uses a signal handler and alarm to call _exit if
> the child process that executes the actual test times out. We need to
> redesign this if we want to dump the stack in that case as well.
Hi,
What if we dropped _exit() from the signal handler and left all the
killing to the code added in patch 1/2 of this series?
The signal handler would then only note that we hit the timeout:
/*
 * Timeout signal handler: it only reports the timeout and records it in
 * timeout_hit. It deliberately does not call _exit(); acting on the
 * timeout is left to the code that polls timeout_hit.
 */
static void alarm_handler(int sig LTP_ATTRIBUTE_UNUSED)
{
	/* Tell the user first, then bump the counter the waiter checks. */
	WRITE_MSG("Test timed out!\n");
	timeout_hit++;
}
and fork_testrun() will be periodically checking for it:
/*
 * Poll for the child without blocking, so that the timeout flag set by
 * the signal handler is noticed between polls.
 *
 * Keep looping only while BOTH hold: the child has not changed state
 * (waitpid() with WNOHANG returned 0) AND no timeout has been recorded.
 * The loop therefore ends as soon as the child is reaped or the timeout
 * fires. Joining the two tests with || would keep polling after the
 * child had already been reaped, and the next waitpid() would fail
 * because there is no child left to wait for.
 */
do {
	usleep(10000);
	ret = SAFE_WAITPID(test_pid, &status, WNOHANG);
} while (ret == 0 && timeout_hit == 0);
// try to kill process group here
>
> Signed-off-by: Cyril Hrubis <chrubis@suse.cz>
> CC: Jan Stancek <jstancek@redhat.com>
> ---
> include/tst_dump_stacks.h | 25 +++++++++++
> lib/tst_dump_stacks.c | 108
> ++++++++++++++++++++++++++++++++++++++++++++++
> lib/tst_test.c | 3 +-
> 3 files changed, 135 insertions(+), 1 deletion(-)
> create mode 100644 include/tst_dump_stacks.h
> create mode 100644 lib/tst_dump_stacks.c
>
> diff --git a/include/tst_dump_stacks.h b/include/tst_dump_stacks.h
> new file mode 100644
> index 000000000..643cc58a8
> --- /dev/null
> +++ b/include/tst_dump_stacks.h
> @@ -0,0 +1,25 @@
> +/*
> + * Copyright (c) 2018 Cyril Hrubis <chrubis@suse.cz>
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef TST_DUMP_STACKS__
> +#define TST_DUMP_STACKS__
> +
> +void tst_dump_stacks_by_pgid(pid_t pgid);
> +
> +void tst_dump_stack_by_pid(pid_t pid);
> +
> +#endif /* TST_DUMP_STACKS__ */
> diff --git a/lib/tst_dump_stacks.c b/lib/tst_dump_stacks.c
> new file mode 100644
> index 000000000..aa97c6820
> --- /dev/null
> +++ b/lib/tst_dump_stacks.c
> @@ -0,0 +1,108 @@
> +/*
> + * Copyright (c) 2018 Cyril Hrubis <chrubis@suse.cz>
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <ctype.h>
> +#include <stdio.h>
> +
> +#define TST_NO_DEFAULT_MAIN 1
> +#include "tst_test.h"
> +
> +static void *process_search_init(void)
> +{
> + DIR *dir = SAFE_OPENDIR("/proc/");
> +
> + return dir;
> +}
> +
> +static int is_number(const char *str)
> +{
> + do {
> + if (!isdigit(*str))
> + return 0;
> + } while (*(++str));
> +
> + return 1;
> +}
> +
> +static int process_search_pgid_next(void *pid_search, pid_t pgid)
> +{
> + struct dirent *ent;
> + DIR *dir = pid_search;
> + char path[1024];
> + int ppgid, pid;
> + FILE *f;
> +
> + while ((ent = readdir(dir))) {
> + if (ent->d_type != DT_DIR)
> + continue;
> + if (!is_number(ent->d_name))
> + continue;
> +
> + snprintf(path, sizeof(path), "/proc/%s/stat", ent->d_name);
> +
> + f = fopen(path, "r");
> + if (!f)
> + continue;
> +
> + if (fscanf(f, "%i %*s %*c %*i %i", &pid, &ppgid) != 2) {
> + tst_res(TWARN, "fscanf(%s) failed!", ent->d_name);
> + fclose(f);
> + continue;
> + }
> +
> + fclose(f);
> +
> + if (ppgid == pgid)
> + break;
> + }
> +
> + if (ent)
> + return pid;
> +
> + closedir(dir);
> + return -1;
> +}
> +
> +void tst_dump_stack_by_pid(pid_t pid)
> +{
> + int fd, len;
> + char buf[512];
> + char path[1024];
> +
> + tst_res(TINFO, "Pid %i stuck in kernel!", pid);
> +
> + fprintf(stderr, "Kernel stacktrace follows:\n");
> + fflush(stderr);
> +
> + snprintf(path, sizeof(path), "/proc/%i/stack", pid);
> +
> + fd = SAFE_OPEN(path, O_RDONLY);
> +
> + while ((len = SAFE_READ(0, fd, buf, sizeof(buf))) > 0)
> + SAFE_WRITE(1, 2, buf, len);
> +
> + SAFE_CLOSE(fd);
> +}
> +
> +void tst_dump_stacks_by_pgid(pid_t pgid)
> +{
> + void *ps = process_search_init();
> + int pid;
> +
> + while ((pid = process_search_pgid_next(ps, pgid)) != -1)
> + tst_dump_stack_by_pid(pid);
> +}
> diff --git a/lib/tst_test.c b/lib/tst_test.c
> index 329168a24..d9476c02c 100644
> --- a/lib/tst_test.c
> +++ b/lib/tst_test.c
> @@ -1058,7 +1058,8 @@ static int fork_testrun(void)
> if (retries++ <= 14)
> continue;
>
> - tst_res(TFAIL, "Test process child stuck in the kernel!");
> + tst_res(TFAIL, "Test process child(ren) stuck in the kernel!");
> + tst_dump_stacks_by_pgid(test_pid);
> tst_brk(TFAIL, "Congratulation, likely test hit a kernel bug.");
> }
Looks good to me.
Regards,
Jan
More information about the ltp
mailing list