mirror of
https://github.com/redis/redis.git
synced 2026-04-21 03:01:35 -04:00
This PR does 2 main things:
1) Add warning for suspected slow system clocksource setting. This is Linux specific.
2) Add a `--check-system` argument to redis which runs all system checks and prints a report.
## System checks
Add a command line option `--check-system` which runs all known system checks and provides
a report to stdout of which systems checks have failed with details on how to reconfigure the
system for optimized redis performance.
The `--check-system` mode exits with an appropriate error code after running all the checks.
## Slow clocksource details
We check the system's clocksource performance by running `clock_gettime()` in a loop and then
checking how much time was spent in a system call (via `getrusage()`). If we spend more than
10% of the time in the kernel then we print a warning. I verified that using the slow clock sources:
`acpi_pm` (~90% in the kernel on my laptop) and `xen` (~30% in the kernel on an ec2 `m4.large`)
we get this warning.
The check runs 5 system ticks so we can detect time spent in kernel at 20% jumps (0%,20%,40%...).
Anything more accurate will require the test to run longer. Typically 5 ticks are 50ms. This means
running the test on startup will delay startup by 50ms. To avoid this we make sure the test is only
executed in the `--check-system` mode.
For a quick startup check, we specifically warn if we see the system is using the `xen` clocksource
which we know has bad performance and isn't recommended (at least on ec2). In such a case the
user should manually run redis with `--check-system` to force the slower clocksource test described
above.
## Other changes in the PR
* All the system checks are now implemented as functions in _syscheck.c_.
They are implemented using a standard interface (see details in _syscheck.c_).
To do this I moved the checking functions `linuxOvercommitMemoryValue()`,
`THPIsEnabled()`, `linuxMadvFreeForkBugCheck()` out of _server.c_ and _latency.c_
and into the new _syscheck.c_. When moving these functions I made sure they don't
depend on other functionality provided in _server.c_ and made them use a standard
"check functions" interface. Specifically:
* I removed all logging out of `linuxMadvFreeForkBugCheck()`. In case there's some
unexpected error during the check, it aborts as before, but without any logging.
It returns the code 0, meaning the check did not complete.
* All these functions now return 1 on success, -1 on failure, 0 in case the check itself
cannot be completed.
* The `linuxMadvFreeForkBugCheck()` function now internally calls `exit()` and not
`exitFromChild()` because the latter is only available in _server.c_ and I wanted to
remove that dependency. This isn't an issue because we don't need to worry about the
child process created by the test doing anything related to the rdb/aof files which
is why `exitFromChild()` was created.
* This also fixes parsing of other /proc/\<pid\>/stat fields to correctly handle spaces
in the process name and be more robust in general. Note that before this fix the rss
info in `INFO memory` was corrupt in case of spaces in the process name. To
recreate just rename `redis-server` to `redis server`, start it, and run `INFO memory`.
93 lines
3.8 KiB
C
93 lines
3.8 KiB
C
/* latency.h -- latency monitor API header file
|
|
* See latency.c for more information.
|
|
*
|
|
* ----------------------------------------------------------------------------
|
|
*
|
|
* Copyright (c) 2014, Salvatore Sanfilippo <antirez at gmail dot com>
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Redis nor the names of its contributors may be used
|
|
* to endorse or promote products derived from this software without
|
|
* specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef __LATENCY_H
|
|
#define __LATENCY_H
|
|
|
|
#define LATENCY_TS_LEN 160 /* History length for every monitored event: the number of samples kept per time series. */
|
|
|
|
/* Representation of a latency sample: the sampling time and the latency
 * observed in milliseconds. */
struct latencySample {
    int32_t time; /* We don't use time_t to force 4 bytes usage everywhere. */
    uint32_t latency; /* Latency in milliseconds. */
};
|
|
|
|
/* The latency time series for a given event. */
|
|
struct latencyTimeSeries {
|
|
int idx; /* Index of the next sample to store. */
|
|
uint32_t max; /* Max latency observed for this event. */
|
|
struct latencySample samples[LATENCY_TS_LEN]; /* Latest history. */
|
|
};
|
|
|
|
/* Latency statistics structure. */
struct latencyStats {
    uint32_t all_time_high; /* Absolute max observed since latest reset. */
    uint32_t avg;           /* Average of current samples. */
    uint32_t min;           /* Min of current samples. */
    uint32_t max;           /* Max of current samples. */
    uint32_t mad;           /* Mean absolute deviation. */
    uint32_t samples;       /* Number of non-zero samples. */
    time_t period;          /* Number of seconds since first event and now. */
};
|
|
|
|
void latencyMonitorInit(void);
|
|
void latencyAddSample(const char *event, mstime_t latency);
|
|
|
|
/* Latency monitoring macros. */
|
|
|
|
/* Start monitoring an event. We just set the current time.
 *
 * 'var' must be an assignable lvalue: it receives mstime() when
 * server.latency_monitor_threshold is non-zero, and 0 otherwise.
 *
 * Note: this expands to a bare if/else statement, so invoke it as a
 * statement of its own (not as the unbraced body of another if/else). */
#define latencyStartMonitor(var) if (server.latency_monitor_threshold) { \
    var = mstime(); \
} else { \
    var = 0; \
}
|
|
|
|
/* End monitoring an event, compute the difference with the current time
 * to check the amount of time elapsed.
 *
 * On entry 'var' holds the value stored by latencyStartMonitor(); on exit it
 * holds mstime() minus that value. When server.latency_monitor_threshold is
 * zero, 'var' is left untouched.
 *
 * Note: this expands to a bare if statement, so invoke it as a statement of
 * its own (not as the unbraced body of another if/else). */
#define latencyEndMonitor(var) if (server.latency_monitor_threshold) { \
    var = mstime() - var; \
}
|
|
|
|
/* Add the sample only if the elapsed time is >= to the configured threshold.
 *
 * Nothing is recorded when server.latency_monitor_threshold is zero.
 * 'var' is evaluated more than once, so pass an expression without side
 * effects.
 *
 * Note: the expansion is an unbraced if statement that already ends with a
 * semicolon; invoke it as a statement of its own (not as the unbraced body
 * of another if/else). */
#define latencyAddSampleIfNeeded(event,var) \
    if (server.latency_monitor_threshold && \
        (var) >= server.latency_monitor_threshold) \
          latencyAddSample((event),(var));
|
|
|
|
/* Remove time from a nested event.
 *
 * Adds 'nested_var' to 'event_var'. This is an addition on purpose: between
 * latencyStartMonitor() and latencyEndMonitor() the event variable holds the
 * start timestamp, so moving it forward by the nested duration shrinks the
 * elapsed time that latencyEndMonitor() later computes as mstime() - var.
 * NOTE(review): this presumes the macro is used before latencyEndMonitor()
 * runs on 'event_var' -- confirm against callers.
 *
 * The expansion already ends with a semicolon. */
#define latencyRemoveNestedEvent(event_var,nested_var) \
    (event_var) += (nested_var);
|
|
|
|
#endif /* __LATENCY_H */
|