daemon: fix a crash bug "FATAL: exception not rethrown"
This is caused by pthread_cancel effectively throwing a
not-specifically-identifiable C++ exception into the targeted thread,
which, if it is not rethrown, terminates the process entirely.
This is rather "impolite" behaviour, we would say. But thread
cancellation is *always* busted, and we should simply not use it where
unnecessary. It's particularly unnecessary when what we *actually* need
it for is, err, interrupting a poll(2).
That can in turn be achieved by simply listening to more stuff in the
poll, namely, a pipe, which we send a character to when needing to
stop the thread.
While looking at this code, we also investigated whether any of the
poll() madness is required, or was even *ever* required. Curiously we
found in the XNU kernel source code that the thing about needing to
listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM
should not have made any difference at all. We've left a FIXME to look
into that further because what's written here is super janky.
94d3b45284/bsd/kern/sys_generic.c (L1751-L1758)
This is the crash on some Hydra machines:
Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)):
0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so
7 0x00007f56b990ac9d in std:🧵:_State_impl<std:🧵:_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so
8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6
9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6
As for testing, we've started a daemon with this change and verified it
deals with HUPs correctly on x86_64-linux, but I don't think we can
easily test the destructor behaviour without whatever Hydra was
doing that broke.
Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
This commit is contained in:
parent
a8f443d960
commit
b3fb8d9822
1 changed files with 42 additions and 6 deletions
|
@ -10,6 +10,8 @@
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
|
||||||
|
#include "error.hh"
|
||||||
|
#include "file-descriptor.hh"
|
||||||
#include "signals.hh"
|
#include "signals.hh"
|
||||||
|
|
||||||
namespace nix {
|
namespace nix {
|
||||||
|
@ -19,19 +21,36 @@ class MonitorFdHup
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
std::thread thread;
|
std::thread thread;
|
||||||
|
/**
|
||||||
|
* Pipe used to interrupt the poll()ing in the monitoring thread.
|
||||||
|
*/
|
||||||
|
Pipe terminatePipe;
|
||||||
|
std::atomic_bool quit = false;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
MonitorFdHup(int fd)
|
MonitorFdHup(int fd)
|
||||||
{
|
{
|
||||||
thread = std::thread([fd]() {
|
terminatePipe.create();
|
||||||
while (true) {
|
auto &quit_ = this->quit;
|
||||||
|
int terminateFd = terminatePipe.readSide.get();
|
||||||
|
thread = std::thread([fd, terminateFd, &quit_]() {
|
||||||
|
while (!quit_) {
|
||||||
/* Wait indefinitely until a POLLHUP occurs. */
|
/* Wait indefinitely until a POLLHUP occurs. */
|
||||||
struct pollfd fds[1];
|
struct pollfd fds[2];
|
||||||
fds[0].fd = fd;
|
fds[0].fd = fd;
|
||||||
/* Polling for no specific events (i.e. just waiting
|
/* Polling for no specific events (i.e. just waiting
|
||||||
for an error/hangup) doesn't work on macOS
|
for an error/hangup) doesn't work on macOS
|
||||||
anymore. So wait for read events and ignore
|
anymore. So wait for read events and ignore
|
||||||
them. */
|
them. */
|
||||||
|
// FIXME(jade): we have looked at the XNU kernel code and as
|
||||||
|
// far as we can tell, the above is bogus. It should be the
|
||||||
|
// case that the previous version of this and the current
|
||||||
|
// version are identical: waiting for POLLHUP and POLLRDNORM in
|
||||||
|
// the kernel *should* be identical.
|
||||||
|
// https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758
|
||||||
|
//
|
||||||
|
// So, this needs actual testing and we need to figure out if
|
||||||
|
// this is actually bogus.
|
||||||
fds[0].events =
|
fds[0].events =
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
POLLRDNORM
|
POLLRDNORM
|
||||||
|
@ -39,8 +58,18 @@ public:
|
||||||
0
|
0
|
||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
auto count = poll(fds, 1, -1);
|
fds[1].fd = terminateFd;
|
||||||
if (count == -1) abort(); // can't happen
|
fds[1].events = POLLIN;
|
||||||
|
|
||||||
|
auto count = poll(fds, 2, -1);
|
||||||
|
if (count == -1) {
|
||||||
|
if (errno == EINTR || errno == EAGAIN) {
|
||||||
|
// These are best dealt with by just trying again.
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
throw SysError("in MonitorFdHup poll()");
|
||||||
|
}
|
||||||
|
}
|
||||||
/* This shouldn't happen, but can on macOS due to a bug.
|
/* This shouldn't happen, but can on macOS due to a bug.
|
||||||
See rdar://37550628.
|
See rdar://37550628.
|
||||||
|
|
||||||
|
@ -53,6 +82,11 @@ public:
|
||||||
triggerInterrupt();
|
triggerInterrupt();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
// No reason to actually look at the pipe FD, we just need it
|
||||||
|
// to be able to get woken.
|
||||||
|
if (quit_) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
/* This will only happen on macOS. We sleep a bit to
|
/* This will only happen on macOS. We sleep a bit to
|
||||||
avoid waking up too often if the client is sending
|
avoid waking up too often if the client is sending
|
||||||
input. */
|
input. */
|
||||||
|
@ -63,7 +97,9 @@ public:
|
||||||
|
|
||||||
~MonitorFdHup()
|
~MonitorFdHup()
|
||||||
{
|
{
|
||||||
pthread_cancel(thread.native_handle());
|
quit = true;
|
||||||
|
// Poke the thread out of its poll wait
|
||||||
|
writeFull(terminatePipe.writeSide.get(), "*", false);
|
||||||
thread.join();
|
thread.join();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue