[LTP] [PATCH 1/2] read_all: Add worker timeout
Richard Palethorpe
rpalethorpe@suse.com
Tue Jul 12 14:46:16 CEST 2022
Kill and restart workers that take too long to read a file. The
default timeout is one second; a custom timeout, in milliseconds, can
be set with the new -t option.
This is to prevent a worker from blocking forever in a read. Currently
when this happens the whole test times out and any remaining files in
the worker's queue are not tested.
As a side effect we can now also set the timeout very low to cause
partial reads.
Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
Cc: Joerg Vehlow <lkml@jv-coder.de>
Cc: Li Wang <liwang@redhat.com>
---
testcases/kernel/fs/read_all/read_all.c | 83 ++++++++++++++++++++++++-
1 file changed, 82 insertions(+), 1 deletion(-)
diff --git a/testcases/kernel/fs/read_all/read_all.c b/testcases/kernel/fs/read_all/read_all.c
index a5b93b966..d8c68bd33 100644
--- a/testcases/kernel/fs/read_all/read_all.c
+++ b/testcases/kernel/fs/read_all/read_all.c
@@ -26,8 +26,10 @@
* an IPC stress test on systems with large numbers of weak cores. This can be
* overridden with the 'w' parameters.
*/
+#include <signal.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/wait.h>
#include <lapi/fnmatch.h>
#include <stdlib.h>
#include <stdio.h>
@@ -43,7 +45,9 @@
#include <pwd.h>
#include <grp.h>
+#include "tst_clocks.h"
#include "tst_test.h"
+#include "tst_timer.h"
#define QUEUE_SIZE 16384
#define BUFFER_SIZE 1024
@@ -60,6 +64,7 @@ struct queue {
struct worker {
pid_t pid;
struct queue *q;
+ int last_seen; /* CLOCK_MONOTONIC_RAW time in ms, refreshed by the worker at the top of each loop iteration */
};
enum dent_action {
@@ -80,6 +85,8 @@ static char *str_max_workers;
static long max_workers = 15;
static struct worker *workers;
static char *drop_privs;
+static char *str_worker_timeout;
+static int worker_timeout = 1000;
static char *blacklist[] = {
NULL, /* reserved for -e parameter */
@@ -227,10 +234,14 @@ static int worker_run(struct worker *self)
.sa_flags = 0,
};
struct queue *q = self->q;
+ struct timespec now;
sigaction(SIGTTIN, &term_sa, NULL);
while (1) {
+ tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+ tst_atomic_store(tst_timespec_to_ms(now), &self->last_seen);
+
if (!queue_pop(q, buf))
break;
@@ -270,11 +281,15 @@ static void spawn_workers(void)
{
int i;
struct worker *wa = workers;
+ struct timespec now;
+
+ tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
memset(workers, 0, worker_count * sizeof(*workers));
for (i = 0; i < worker_count; i++) {
wa[i].q = queue_init();
+ wa[i].last_seen = tst_timespec_to_ms(now);
wa[i].pid = SAFE_FORK();
if (!wa[i].pid) {
maybe_drop_privs();
@@ -283,9 +298,52 @@ static void spawn_workers(void)
}
}
+/*
+ * Kill a worker and fork a replacement that keeps using the same queue.
+ *
+ * Called from work_push_retry() when a worker has not updated last_seen
+ * for more than worker_timeout milliseconds. Breaks the test (TBROK) if
+ * the old worker can not be signalled or reaped, or if the queue
+ * semaphore can not be recreated.
+ */
+static void restart_worker(struct worker *const worker)
+{
+	int wstatus, ret, i, q_len;
+	struct timespec now;
+
+	if (kill(worker->pid, SIGKILL) == -1)
+		tst_brk(TBROK | TERRNO, "kill(%d, SIGKILL)", worker->pid);
+
+	/* Wait until the old worker is reaped before inspecting its queue */
+	ret = waitpid(worker->pid, &wstatus, 0);
+
+	if (ret != worker->pid) {
+		tst_brk(TBROK | TERRNO, "waitpid(%d, ...) = %d",
+			worker->pid, ret);
+	}
+
+	/* Make sure the queue length and semaphore match. There is a
+	 * race in queue_pop where the semaphore can be decremented
+	 * then the worker killed before updating q->front
+	 */
+	q_len = 0;
+	i = worker->q->front;
+	while (i != worker->q->back) {
+		/* Count zero bytes in [front, back); presumably one
+		 * terminator per queued entry - confirm in queue_push
+		 */
+		if (!worker->q->data[i])
+			q_len++;
+
+		i = (i + 1) % QUEUE_SIZE;
+	}
+
+	/* Recreate the semaphore with the recounted value */
+	ret = sem_destroy(&worker->q->sem);
+	if (ret == -1)
+		tst_brk(TBROK | TERRNO, "sem_destroy");
+	ret = sem_init(&worker->q->sem, 1, q_len);
+	if (ret == -1)
+		tst_brk(TBROK | TERRNO, "sem_init");
+
+	/* Start a fresh timeout window for the replacement worker */
+	tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+	worker->last_seen = tst_timespec_to_ms(now);
+	worker->pid = SAFE_FORK();
+
+	if (!worker->pid) {
+		/* Same child start-up as in spawn_workers(), so the -p
+		 * option also applies to restarted workers
+		 */
+		maybe_drop_privs();
+		exit(worker_run(worker));
+	}
+}
+
static void work_push_retry(int worker, const char *buf)
{
int i, ret, worker_min, worker_max, usleep_time = 100;
+ struct timespec now;
+ int elapsed;
if (worker < 0) {
/* pick any, try -worker first */
@@ -299,10 +357,25 @@ static void work_push_retry(int worker, const char *buf)
i = worker_min;
for (;;) {
- ret = queue_push(workers[i].q, buf);
+ struct worker *const w = workers + i;
+
+ ret = queue_push(w->q, buf);
if (ret == 1)
break;
+ tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+ elapsed =
+ tst_timespec_to_ms(now) - tst_atomic_load(&w->last_seen);
+
+ if (elapsed > worker_timeout) {
+ if (!quiet) {
+ tst_res(TINFO,
+ "Worker %d (%d) stuck for %dms, restarting it",
+ i, w->pid, elapsed);
+ }
+ restart_worker(w);
+ }
+
if (++i >= worker_max) {
i = worker_min;
if (usleep_time < 100000)
@@ -368,6 +441,12 @@ static void setup(void)
if (!worker_count)
worker_count = MIN(MAX(tst_ncpus() - 1, 1), max_workers);
workers = SAFE_MALLOC(worker_count * sizeof(*workers));
+
+	/* Parse -t; on failure report the -t string that was rejected */
+	if (tst_parse_int(str_worker_timeout, &worker_timeout, 1, INT_MAX)) {
+		tst_brk(TBROK,
+			"Invalid worker timeout (-t) argument: '%s'",
+			str_worker_timeout);
+	}
}
static void cleanup(void)
@@ -465,6 +544,8 @@ static struct tst_test test = {
"Count Override the worker count. Ignores (-w) and the processor count."},
{"p", &drop_privs,
"Drop privileges; switch to the nobody user."},
+ {"t:", &str_worker_timeout,
+ "Milliseconds a worker has to read a file before it is restarted"},
{}
},
.setup = setup,
--
2.36.1
More information about the ltp
mailing list