Add thread sanitizer run to daily CI (#13964)

Add thread sanitizer run to daily CI.

A few tests are skipped in tsan runs for two reasons:
* Stack trace producing tests (oom, `unit/moduleapi/crash`, etc) are
tagged `tsan:skip` because redis calls `backtrace()` in signal handler
which turns out to be signal-unsafe since it might allocate memory (e.g.
glibc 2.39 does it through a call to `_dl_map_object_deps()`).
* Few tests become flaky with thread sanitizer builds and don't finish
in expected deadlines because of the additional tsan overhead. Instead
of skipping those tests, this can be improved in the future by allowing
more iterations when waiting for tsan builds.

Deadlock detection is disabled for now because of tsan limitation where
max 64 locks can be taken at once.

There is one outstanding (false-positive?) race in jemalloc which is
suppressed in `tsan.sup`.

Fix a few races the thread sanitizer reported having to do with writes from
signal handlers. Since in multi-threaded setting signal handlers might
be called on any thread (modulo pthread_sigmask) while the main thread
is running, `volatile sig_atomic_t` type is not sufficient and atomics
are used instead.
This commit is contained in:
Slavomir Kaslev
2025-06-02 10:13:23 +03:00
committed by GitHub
parent 7f60945bc6
commit b7c6755b1b
20 changed files with 145 additions and 46 deletions

View File

@@ -697,6 +697,52 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: ./src/redis-server test all --accurate
test-sanitizer-thread:
runs-on: ubuntu-latest
if: |
(github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) &&
!contains(github.event.inputs.skipjobs, 'sanitizer')
timeout-minutes: 14400
strategy:
matrix:
compiler: [ gcc, clang ]
env:
CC: ${{ matrix.compiler }}
steps:
- name: prep
if: github.event_name == 'workflow_dispatch'
run: |
echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV
echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV
echo "skipjobs: ${{github.event.inputs.skipjobs}}"
echo "skiptests: ${{github.event.inputs.skiptests}}"
echo "test_args: ${{github.event.inputs.test_args}}"
echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}"
- uses: actions/checkout@v4
with:
repository: ${{ env.GITHUB_REPOSITORY }}
ref: ${{ env.GITHUB_HEAD_REF }}
- name: make
# TODO Investigate why jemalloc with clang TSan crash on start;
# with gcc TSan, jemalloc works modulo sentinel tests hanging.
run: make SANITIZER=thread USE_JEMALLOC=no REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS'
- name: testprep
run: |
sudo apt-get update
sudo apt-get install tcl8.6 tclx -y
- name: test
if: true && !contains(github.event.inputs.skiptests, 'redis')
run: ./runtest --tsan --clients 1 --config io-threads 4 --accurate --verbose --dump-logs ${{github.event.inputs.test_args}}
- name: module api test
if: true && !contains(github.event.inputs.skiptests, 'modules')
run: CFLAGS='-Werror' ./runtest-moduleapi --tsan --clients 1 --config io-threads 4 --accurate --verbose --dump-logs ${{github.event.inputs.test_args}}
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel --config io-threads 2 ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster --config io-threads 2 ${{github.event.inputs.cluster_test_args}}
test-centos-jemalloc:
runs-on: ubuntu-latest
if: |

View File

@@ -2501,7 +2501,7 @@ void removeSigSegvHandlers(void) {
}
void printCrashReport(void) {
server.crashing = 1;
atomicSet(server.crashing, 1);
/* Log INFO and CLIENT LIST */
logServerInfo();
@@ -2583,6 +2583,9 @@ void serverLogHexDump(int level, char *descr, void *value, size_t len) {
#include <sys/time.h>
void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) {
/* Save and restore errno to avoid spoiling its value as caught by
* WARNING: ThreadSanitizer: signal handler spoils errno */
int save_errno = errno;
#ifdef HAVE_BACKTRACE
ucontext_t *uc = (ucontext_t*) secret;
#else
@@ -2603,6 +2606,7 @@ void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) {
serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace().");
#endif
serverLogRawFromHandler(LL_WARNING,"--------\n");
errno = save_errno;
}
/* Schedule a SIGALRM delivery after the specified period in milliseconds.

View File

@@ -3111,6 +3111,12 @@ char *getClientSockname(client *c) {
return c->sockname;
}
/* Return non-zero when the server is producing a crash report.
 *
 * server.crashing is written from printCrashReport() (see the hunk above,
 * which changes the plain store to atomicSet()), and a crash may be
 * reported from a signal handler running on any thread while the main
 * thread is executing. The flag is therefore read with atomicGet()
 * rather than a plain load to avoid the data race flagged by the
 * thread sanitizer. */
static inline int isCrashing(void) {
int crashing;
atomicGet(server.crashing, crashing);
return crashing;
}
/* Concatenate a string representing the state of a client in a human
* readable format, into the sds string 's'. */
sds catClientInfoString(sds s, client *client) {
@@ -3120,7 +3126,7 @@ sds catClientInfoString(sds s, client *client) {
int paused = 0;
if (client->running_tid != IOTHREAD_MAIN_THREAD_ID &&
pthread_equal(server.main_thread_id, pthread_self()) &&
!server.crashing)
!isCrashing())
{
paused = 1;
pauseIOThread(client->running_tid);
@@ -3221,7 +3227,7 @@ sds getAllClientsInfoString(int type) {
/* Pause all IO threads to access data of clients safely, and pausing the
* specific IO thread will not repeatedly execute in catClientInfoString. */
int allpaused = 0;
if (server.io_threads_num > 1 && !server.crashing &&
if (server.io_threads_num > 1 && !isCrashing() &&
pthread_equal(server.main_thread_id, pthread_self()))
{
allpaused = 1;

View File

@@ -108,6 +108,12 @@ static inline int isCommandReusable(struct redisCommand *cmd, robj *commandArg)
* function of Redis may be called from other threads. */
void nolocks_localtime(struct tm *tmp, time_t t, time_t tz, int dst);
/* Return non-zero when a shutdown has been requested by a signal.
 *
 * server.shutdown_asap is set from sigShutdownHandler() via atomicSet()
 * (see the hunk below), and in a multi-threaded setting the signal
 * handler may run on any thread while the main thread is running. The
 * flag is therefore read with atomicGet() rather than a plain load to
 * avoid the data race flagged by the thread sanitizer. */
static inline int shouldShutdownAsap(void) {
int shutdown_asap;
atomicGet(server.shutdown_asap, shutdown_asap);
return shutdown_asap;
}
/* Low level logging. To use only for very big messages, otherwise
* serverLog() is to prefer. */
void serverLogRaw(int level, const char *msg) {
@@ -1484,11 +1490,13 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
/* We received a SIGTERM or SIGINT, shutting down here in a safe way, as it is
* not ok doing so inside the signal handler. */
if (server.shutdown_asap && !isShutdownInitiated()) {
if (shouldShutdownAsap() && !isShutdownInitiated()) {
int shutdownFlags = SHUTDOWN_NOFLAGS;
if (server.last_sig_received == SIGINT && server.shutdown_on_sigint)
int last_sig_received;
atomicGet(server.last_sig_received, last_sig_received);
if (last_sig_received == SIGINT && server.shutdown_on_sigint)
shutdownFlags = server.shutdown_on_sigint;
else if (server.last_sig_received == SIGTERM && server.shutdown_on_sigterm)
else if (last_sig_received == SIGTERM && server.shutdown_on_sigterm)
shutdownFlags = server.shutdown_on_sigterm;
if (prepareForShutdown(shutdownFlags) == C_OK) exit(0);
@@ -1730,11 +1738,11 @@ void whileBlockedCron(void) {
/* We received a SIGTERM during loading, shutting down here in a safe way,
* as it isn't ok doing so inside the signal handler. */
if (server.shutdown_asap && server.loading) {
if (shouldShutdownAsap() && server.loading) {
if (prepareForShutdown(SHUTDOWN_NOSAVE) == C_OK) exit(0);
serverLog(LL_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
server.shutdown_asap = 0;
server.last_sig_received = 0;
atomicSet(server.shutdown_asap, 0);
atomicSet(server.last_sig_received, 0);
}
}
@@ -4616,10 +4624,10 @@ int isReadyToShutdown(void) {
}
static void cancelShutdown(void) {
server.shutdown_asap = 0;
atomicSet(server.shutdown_asap, 0);
server.shutdown_flags = 0;
server.shutdown_mstime = 0;
server.last_sig_received = 0;
atomicSet(server.last_sig_received, 0);
replyToClientsBlockedOnShutdown();
unpauseActions(PAUSE_DURING_SHUTDOWN);
}
@@ -4628,10 +4636,10 @@ static void cancelShutdown(void) {
int abortShutdown(void) {
if (isShutdownInitiated()) {
cancelShutdown();
} else if (server.shutdown_asap) {
} else if (shouldShutdownAsap()) {
/* Signal handler has requested shutdown, but it hasn't been initiated
* yet. Just clear the flag. */
server.shutdown_asap = 0;
atomicSet(server.shutdown_asap, 0);
} else {
/* Shutdown neither initiated nor requested. */
return C_ERR;
@@ -6772,7 +6780,7 @@ static void sigShutdownHandler(int sig) {
* If we receive the signal the second time, we interpret this as
* the user really wanting to quit ASAP without waiting to persist
* on disk and without waiting for lagging replicas. */
if (server.shutdown_asap && sig == SIGINT) {
if (shouldShutdownAsap() && sig == SIGINT) {
serverLogRawFromHandler(LL_WARNING, "You insist... exiting now.");
rdbRemoveTempFile(getpid(), 1);
exit(1); /* Exit with an error since this was not a clean shutdown. */
@@ -6781,8 +6789,8 @@ static void sigShutdownHandler(int sig) {
}
serverLogRawFromHandler(LL_WARNING, msg);
server.shutdown_asap = 1;
server.last_sig_received = sig;
atomicSet(server.shutdown_asap, 1);
atomicSet(server.last_sig_received, sig);
}
void setupSignalHandlers(void) {

View File

@@ -1762,10 +1762,10 @@ struct redisServer {
rax *errors; /* Errors table */
int errors_enabled; /* If true, errorstats is enabled, and we will add new errors. */
unsigned int lruclock; /* Clock for LRU eviction */
volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */
volatile sig_atomic_t crashing; /* Server is crashing report. */
redisAtomic int shutdown_asap; /* Shutdown ordered by signal handler. */
redisAtomic int crashing; /* Server is crashing report. */
mstime_t shutdown_mstime; /* Timestamp to limit graceful shutdown. */
int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */
redisAtomic int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */
int shutdown_flags; /* Flags passed to prepareForShutdown(). */
int activerehashing; /* Incremental rehash in serverCron() */
int active_defrag_running; /* Active defragmentation running (holds current scan aggressiveness) */

View File

@@ -24,8 +24,8 @@ static const clock_t RUN_ON_THREADS_TIMEOUT = 2;
/*================================= Globals ================================= */
static run_on_thread_cb g_callback = NULL;
static volatile size_t g_tids_len = 0;
static redisAtomic run_on_thread_cb g_callback = NULL;
static redisAtomic size_t g_tids_len = 0;
static redisAtomic size_t g_num_threads_done = 0;
/* This flag is set while ThreadsManager_runOnThreads is running */
@@ -63,15 +63,15 @@ int ThreadsManager_runOnThreads(pid_t *tids, size_t tids_len, run_on_thread_cb c
}
/* Update g_callback */
g_callback = callback;
atomicSet(g_callback, callback);
/* Set g_tids_len */
g_tids_len = tids_len;
atomicSet(g_tids_len, tids_len);
/* set g_num_threads_done to 0 To handler the case where in the previous run we reached the timeout
and called ThreadsManager_cleanups before one or more threads were done and increased
(the already set to 0) g_num_threads_done */
g_num_threads_done = 0;
atomicSet(g_num_threads_done, 0);
/* Send signal to all the threads in tids */
pid_t pid = getpid();
@@ -103,7 +103,9 @@ static int test_and_start(void) {
__attribute__ ((noinline))
static void invoke_callback(int sig) {
UNUSED(sig);
run_on_thread_cb callback = g_callback;
run_on_thread_cb callback;
atomicGet(g_callback, callback);
if (callback) {
callback();
atomicIncr(g_num_threads_done, 1);
@@ -122,6 +124,7 @@ static void wait_threads(void) {
/* Wait until all threads are done to invoke the callback or until we reached the timeout */
size_t curr_done_count;
struct timespec curr_time;
size_t tids_len;
do {
struct timeval tv = {
@@ -132,7 +135,8 @@ static void wait_threads(void) {
select(0, NULL, NULL, NULL, &tv);
atomicGet(g_num_threads_done, curr_done_count);
clock_gettime(CLOCK_REALTIME, &curr_time);
} while (curr_done_count < g_tids_len &&
atomicGet(g_tids_len, tids_len);
} while (curr_done_count < tids_len &&
curr_time.tv_sec <= timeout_time.tv_sec);
if (curr_time.tv_sec > timeout_time.tv_sec) {
@@ -142,9 +146,9 @@ static void wait_threads(void) {
}
static void ThreadsManager_cleanups(void) {
g_callback = NULL;
g_tids_len = 0;
g_num_threads_done = 0;
atomicSet(g_callback, NULL);
atomicSet(g_tids_len, 0);
atomicSet(g_num_threads_done, 0);
/* Lastly, turn off g_in_progress */
atomicSet(g_in_progress, 0);

8
src/tsan.sup Normal file
View File

@@ -0,0 +1,8 @@
# collect_stacktrace_data() calls backtrace() from a signal handler but
# backtrace() is signal-unsafe since it might allocate memory, at least on
# glibc 2.39 it does so through a call to _dl_map_object_deps().
signal:collect_stacktrace_data
signal:printCrashReport
# TODO Investigate this race in jemalloc probably related to
# https://github.com/jemalloc/jemalloc/issues/2621
race:malloc_mutex_trylock_final

View File

@@ -39,6 +39,7 @@ The following compatibility and capability tags are currently used:
| `cluster:skip` | Not compatible with `--cluster-mode`. |
| `large-memory` | Test that requires more than 100mb |
| `tls:skip` | Not compatible with `--tls`. |
| `tsan:skip` | Not compatible with running under thread sanitizer. |
| `needs:repl` | Uses replication and needs to be able to `SYNC` from server. |
| `needs:debug` | Uses the `DEBUG` command or other debugging focused commands (like `OBJECT REFCOUNT`). |
| `needs:pfdebug` | Uses the `PFDEBUG` command. |

View File

@@ -208,7 +208,7 @@ tags {"aof external:skip"} {
}
}
start_server {overrides {appendonly {yes} appendfsync always}} {
start_server {tags {"tsan:skip"} overrides {appendonly {yes} appendfsync always}} {
test {AOF fsync always barrier issue} {
set rd [redis_deferring_client]
# Set a sleep when aof is flushed, so that we have a chance to look

View File

@@ -14,7 +14,7 @@ tags {"dump" "corruption" "external:skip"} {
# iterating as many as 2^61 hash table slots.
set arch_name [exec uname -m]
set run_oom_tests [expr {$arch_name == "x86_64" || $arch_name == "aarch64"}]
set run_oom_tests [expr {($arch_name == "x86_64" || $arch_name == "aarch64") && !$::tsan}]
set corrupt_payload_7445 "\x0E\x01\x1D\x1D\x00\x00\x00\x16\x00\x00\x00\x03\x00\x00\x04\x43\x43\x43\x43\x06\x04\x42\x42\x42\x42\x06\x3F\x41\x41\x41\x41\xFF\x09\x00\x88\xA5\xCA\xA8\xC5\x41\xF4\x35"
@@ -583,7 +583,7 @@ test {corrupt payload: fuzzer findings - OOM in dictExpand} {
assert_match "*Bad data format*" $err
r ping
}
}
} {} {tsan:skip}
}
@@ -670,7 +670,7 @@ test {corrupt payload: fuzzer findings - dict init to huge size} {
assert_match "*Bad data format*" $err
r ping
}
}
} {} {tsan:skip}
test {corrupt payload: fuzzer findings - huge string} {
start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no] ] {
@@ -680,7 +680,7 @@ test {corrupt payload: fuzzer findings - huge string} {
assert_match "*Bad data format*" $err
r ping
}
}
} {} {tsan:skip}
test {corrupt payload: fuzzer findings - stream PEL without consumer} {
start_server [list overrides [list loglevel verbose use-exit-on-panic yes crash-memcheck-enabled no] ] {

View File

@@ -641,7 +641,7 @@ start_server {tags {"repl external:skip"}} {
}
}
start_server {tags {"repl external:skip"}} {
start_server {tags {"repl external:skip tsan:skip"}} {
set replica [srv 0 client]
set replica_pid [srv 0 pid]

View File

@@ -880,7 +880,7 @@ proc compute_cpu_usage {start end} {
# test diskless rdb pipe with multiple replicas, which may drop half way
start_server {tags {"repl external:skip"} overrides {save ""}} {
start_server {tags {"repl external:skip tsan:skip"} overrides {save ""}} {
set master [srv 0 client]
$master config set repl-diskless-sync yes
$master config set repl-diskless-sync-delay 5
@@ -1152,7 +1152,7 @@ test "diskless replication read pipe cleanup" {
$master ping
}
}
} {} {external:skip}
} {} {external:skip tsan:skip}
test {replicaof right after disconnection} {
# this is a rare race condition that was reproduced sporadically by the psync2 unit.

View File

@@ -231,6 +231,11 @@ proc tags_acceptable {tags err_return} {
return 0
}
if {$::tsan && [lsearch $tags "tsan:skip"] >= 0} {
set err "Not supported under thread sanitizer"
return 0
}
if {$::tls && [lsearch $tags "tls:skip"] >= 0} {
set err "Not supported in tls mode"
return 0
@@ -294,7 +299,12 @@ proc spawn_server {config_file stdout stderr args} {
# ASAN_OPTIONS environment variable is for address sanitizer. If a test
# tries to allocate huge memory area and expects allocator to return
# NULL, address sanitizer throws an error without this setting.
set pid [exec /usr/bin/env ASAN_OPTIONS=allocator_may_return_null=1 MSAN_OPTIONS=allocator_may_return_null=1 {*}$cmd >> $stdout 2>> $stderr &]
set env [list \
"ASAN_OPTIONS=allocator_may_return_null=1" \
"MSAN_OPTIONS=allocator_may_return_null=1" \
"TSAN_OPTIONS=allocator_may_return_null=1,detect_deadlocks=0,suppressions=src/tsan.sup" \
]
set pid [exec /usr/bin/env {*}$env {*}$cmd >> $stdout 2>> $stderr &]
}
if {$::wait_server} {

View File

@@ -1202,6 +1202,12 @@ proc format_command {args} {
# Returns whether or not the system supports stack traces
proc system_backtrace_supported {} {
# Thread sanitizer reports backtrace_symbols_fd() as
# signal-unsafe since it allocates memory
if {$::tsan} {
return 0
}
set system_name [string tolower [exec uname -s]]
if {$system_name eq {darwin}} {
return 1

View File

@@ -44,6 +44,7 @@ set ::baseport 21111; # initial port for spawned redis servers
set ::portcount 8000; # we don't wanna use more than 10000 to avoid collision with cluster bus ports
set ::traceleaks 0
set ::valgrind 0
set ::tsan 0
set ::durable 0
set ::tls 0
set ::tls_module 0
@@ -538,6 +539,7 @@ proc send_data_packet {fd status data {elapsed 0}} {
proc print_help_screen {} {
puts [join {
"--valgrind Run the test over valgrind."
"--tsan Run the test with thread sanitizer."
"--durable suppress test crashes and keep running"
"--stack-logging Enable OSX leaks/malloc stack logging."
"--accurate Run slow randomized tests for more iterations."
@@ -611,6 +613,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
incr j
} elseif {$opt eq {--valgrind}} {
set ::valgrind 1
} elseif {$opt eq {--tsan}} {
set ::tsan 1
} elseif {$opt eq {--stack-logging}} {
if {[string match {*Darwin*} [exec uname -a]]} {
set ::stack_logging 1

View File

@@ -480,7 +480,9 @@ start_server {} {
# Make sure we have only half of our clients now
wait_for_condition 200 100 {
[llength [regexp -all -inline {name=client} [r client list]]] == $client_count / 2
([lindex [r config get io-threads] 1] == 1) ?
([llength [regexp -all -inline {name=client} [r client list]]] == $client_count / 2) :
([llength [regexp -all -inline {name=client} [r client list]]] <= $client_count / 2)
} else {
fail "Failed to evict clients"
}

View File

@@ -421,7 +421,7 @@ start_server {tags {"maxmemory external:skip"}} {
} {4098}
}
start_server {tags {"maxmemory external:skip"}} {
start_server {tags {"maxmemory external:skip tsan:skip"}} {
test {client tracking don't cause eviction feedback loop} {
r config set latency-tracking no
r config set maxmemory 0

View File

@@ -931,11 +931,11 @@ run_solo {defrag} {
}
}
start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" loglevel notice}} {
start_cluster 1 0 {tags {"defrag external:skip tsan:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" loglevel notice}} {
test_active_defrag "cluster"
}
start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" loglevel notice}} {
start_server {tags {"defrag external:skip tsan:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" loglevel notice}} {
test_active_defrag "standalone"
}
} ;# run_solo

View File

@@ -4,7 +4,7 @@ set testmodule [file normalize tests/modules/crash.so]
set backtrace_supported [system_backtrace_supported]
# Valgrind will complain that the process terminated by a signal, skip it.
if {!$::valgrind} {
if {!$::valgrind && !$::tsan} {
start_server {tags {"modules"}} {
r module load $testmodule assert
test {Test module crash when info crashes with an assertion } {

View File

@@ -34,7 +34,7 @@ start_server {tags {"modules"} overrides {{save ""}}} {
assert_morethan [getInfoProperty $info defragtest_datatype_raw_defragged] 0
assert_morethan [getInfoProperty $info defragtest_defrag_started] 0
assert_morethan [getInfoProperty $info defragtest_defrag_ended] 0
}
} {} {tsan:skip}
test {Module defrag: late defrag with cursor works} {
r config set activedefrag no
@@ -64,7 +64,7 @@ start_server {tags {"modules"} overrides {{save ""}}} {
assert_morethan [getInfoProperty $info defragtest_datatype_raw_defragged] 0
assert_morethan [getInfoProperty $info defragtest_defrag_started] 0
assert_morethan [getInfoProperty $info defragtest_defrag_ended] 0
}
} {} {tsan:skip}
test {Module defrag: global defrag works} {
r config set activedefrag no
@@ -94,6 +94,6 @@ start_server {tags {"modules"} overrides {{save ""}}} {
assert_morethan [getInfoProperty $info defragtest_defrag_ended] 0
assert_morethan [getInfoProperty $info defragtest_global_dicts_resumes] [getInfoProperty $info defragtest_defrag_ended]
assert_morethan [getInfoProperty $info defragtest_global_subdicts_resumes] [getInfoProperty $info defragtest_defrag_ended]
}
} {} {tsan:skip}
}
}