From 7d7508fb7ab5df1664262324f471d717585f1f8e Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sun, 23 Mar 2025 18:00:36 -0400 Subject: [PATCH 1/7] `monitor-fd.hh`: Format It's a pretty small diff, so let's just start formatting before we make other changes. (cherry picked from commit 041394b741ade095210a396d6a3ab3218d86e1c1) --- maintainers/flake-module.nix | 1 - src/libutil/unix/monitor-fd.hh | 11 +++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/maintainers/flake-module.nix b/maintainers/flake-module.nix index f18e9b41e..4c75df246 100644 --- a/maintainers/flake-module.nix +++ b/maintainers/flake-module.nix @@ -396,7 +396,6 @@ ''^src/libutil/types\.hh$'' ''^src/libutil/unix/file-descriptor\.cc$'' ''^src/libutil/unix/file-path\.cc$'' - ''^src/libutil/unix/monitor-fd\.hh$'' ''^src/libutil/unix/processes\.cc$'' ''^src/libutil/unix/signals-impl\.hh$'' ''^src/libutil/unix/signals\.cc$'' diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index b6610feff..cfbf10d5a 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -14,7 +14,6 @@ namespace nix { - class MonitorFdHup { private: @@ -33,11 +32,11 @@ public: anymore. So wait for read events and ignore them. */ fds[0].events = - #ifdef __APPLE__ +#ifdef __APPLE__ POLLRDNORM - #else +#else 0 - #endif +#endif ; auto count = poll(fds, 1, -1); if (count == -1) @@ -50,7 +49,8 @@ public: coordination with the main thread if spinning proves too harmful. */ - if (count == 0) continue; + if (count == 0) + continue; if (fds[0].revents & POLLHUP) { unix::triggerInterrupt(); break; @@ -70,5 +70,4 @@ public: } }; - } From 709e228589caa6b0644f1d27450833c985814d12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Fri, 21 Mar 2025 16:23:31 +0100 Subject: [PATCH 2/7] `MonitorFdHup`: raise explicit SysError rather unreachable Syscalls can fail for many reasons and we don't want to loose the errno and error context. (cherry picked from commit 8e0bc2c3a858118fa9f4c2532d43b71b39b0adc1) --- src/libutil/unix/monitor-fd.hh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index cfbf10d5a..0829c1309 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -39,8 +39,11 @@ public: #endif ; auto count = poll(fds, 1, -1); - if (count == -1) - unreachable(); + if (count == -1) { + if (errno == EINTR || errno == EAGAIN) + continue; + throw SysError("failed to poll() in MonitorFdHup"); + } /* This shouldn't happen, but can on macOS due to a bug. See rdar://37550628. From 1a461baee1b1a568aeac081e64a435e37878025f Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sun, 23 Mar 2025 17:58:50 -0400 Subject: [PATCH 3/7] `MonitorFdHup`: Cleanup a bit with designated initializers (cherry picked from commit d028bb4c4af2b502af21768eeae41e851dde74be) --- src/libutil/unix/monitor-fd.hh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index 0829c1309..235a7db3c 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -25,19 +25,22 @@ public: thread = std::thread([fd]() { while (true) { /* Wait indefinitely until a POLLHUP occurs. */ - struct pollfd fds[1]; - fds[0].fd = fd; + struct pollfd fds[1] = { + { + .fd = fd, + .events = /* Polling for no specific events (i.e. just waiting for an error/hangup) doesn't work on macOS anymore. So wait for read events and ignore them. */ - fds[0].events = #ifdef __APPLE__ - POLLRDNORM + POLLRDNORM, #else - 0 + 0, #endif - ; + }, + }; + auto count = poll(fds, 1, -1); if (count == -1) { if (errno == EINTR || errno == EAGAIN) From df18c9b2ed34c53f7533d49cb30791b4f153e280 Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sun, 23 Mar 2025 18:21:20 -0400 Subject: [PATCH 4/7] `MonitorFdHup`: introduce a `num_fds` variable Better than just putting `1` in multiple spots. (cherry picked from commit cb95791198019a5eb8996c4bc47b2ed10cf1ec41) --- src/libutil/unix/monitor-fd.hh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index 235a7db3c..ca1770342 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -25,7 +25,8 @@ public: thread = std::thread([fd]() { while (true) { /* Wait indefinitely until a POLLHUP occurs. */ - struct pollfd fds[1] = { + constexpr size_t num_fds = 1; + struct pollfd fds[num_fds] = { { .fd = fd, .events = @@ -41,7 +42,7 @@ public: }, }; - auto count = poll(fds, 1, -1); + auto count = poll(fds, num_fds, -1); if (count == -1) { if (errno == EINTR || errno == EAGAIN) continue; From ea19cb2f5002449ae9fa4dfbbafe722bf5577646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Baylac=20Jacqu=C3=A9?= Date: Tue, 12 Sep 2023 13:38:29 +0200 Subject: [PATCH 5/7] `MonitorFdHup`: replace `pthread_cancel` trick with a notification pipe On https://github.com/NixOS/nix/issues/8946, we faced a surprising behaviour wrt. exception when using pthread_cancel. In a nutshell when a thread is inside a catch block and it's getting pthread_cancel by another one, then the original exception is bubbled up and crashes the process. We now poll on the notification pipe from the thread and exit when the main thread closes its end. This solution does not exhibit surprising behaviour wrt. exceptions. Co-authored-by: Mic92 Fixes https://github.com/NixOS/nix/issues/8946 See also Lix https://gerrit.lix.systems/c/lix/+/1605 which is very similar by coincidence. Pulled a comment from that. (cherry picked from commit 1c636284a3f4c39dcab88c804a2c96a729c47b85) --- src/libutil-tests/monitorfdhup.cc | 18 +++++++++++++ src/libutil/unix/monitor-fd.hh | 42 +++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 11 deletions(-) create mode 100644 src/libutil-tests/monitorfdhup.cc diff --git a/src/libutil-tests/monitorfdhup.cc b/src/libutil-tests/monitorfdhup.cc new file mode 100644 index 000000000..01ecb92d9 --- /dev/null +++ b/src/libutil-tests/monitorfdhup.cc @@ -0,0 +1,18 @@ +#include "util.hh" +#include "monitor-fd.hh" + +#include +#include + +namespace nix { +TEST(MonitorFdHup, shouldNotBlock) +{ + Pipe p; + p.create(); + { + // when monitor gets destroyed it should cancel the + // background thread and do not block + MonitorFdHup monitor(p.readSide.get()); + } +} +} diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index ca1770342..d6ec47f49 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -18,27 +18,45 @@ class MonitorFdHup { private: std::thread thread; + Pipe notifyPipe; public: MonitorFdHup(int fd) { - thread = std::thread([fd]() { + notifyPipe.create(); + thread = std::thread([this, fd]() { while (true) { - /* Wait indefinitely until a POLLHUP occurs. */ - constexpr size_t num_fds = 1; - struct pollfd fds[num_fds] = { - { - .fd = fd, - .events = /* Polling for no specific events (i.e. just waiting for an error/hangup) doesn't work on macOS anymore. So wait for read events and ignore them. */ + // FIXME(jade): we have looked at the XNU kernel code and as + // far as we can tell, the above is bogus. It should be the + // case that the previous version of this and the current + // version are identical: waiting for POLLHUP and POLLRDNORM in + // the kernel *should* be identical. + // https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 + // + // So, this needs actual testing and we need to figure out if + // this is actually bogus. + short hangup_events = #ifdef __APPLE__ - POLLRDNORM, + POLLRDNORM #else - 0, + 0 #endif + ; + + /* Wait indefinitely until a POLLHUP occurs. */ + constexpr size_t num_fds = 2; + struct pollfd fds[num_fds] = { + { + .fd = fd, + .events = hangup_events, + }, + { + .fd = notifyPipe.readSide.get(), + .events = hangup_events, }, }; @@ -48,7 +66,6 @@ public: continue; throw SysError("failed to poll() in MonitorFdHup"); } - /* This shouldn't happen, but can on macOS due to a bug. See rdar://37550628. @@ -62,6 +79,9 @@ public: unix::triggerInterrupt(); break; } + if (fds[1].revents & POLLHUP) { + break; + } /* This will only happen on macOS. We sleep a bit to avoid waking up too often if the client is sending input. */ @@ -72,7 +92,7 @@ public: ~MonitorFdHup() { - pthread_cancel(thread.native_handle()); + close(notifyPipe.writeSide.get()); thread.join(); } }; From 27f29ff6edf875d344fb8fb8f4f2df20505ab3fc Mon Sep 17 00:00:00 2001 From: Jade Lovelace Date: Sat, 13 Jul 2024 00:27:09 +0200 Subject: [PATCH 6/7] daemon: remove workaround for macOS kernel bug that seems fixed This was filed as https://github.com/nixos/nix/issues/7584, but as far as I can tell, the previous solution of POLLHUP works just fine on macOS 14. I've also tested on an ancient machine with macOS 10.15.7, which also has POLLHUP work correctly. It's possible this might regress some older versions of macOS that have a kernel bug, but I went looking through the history on the sources and didn't find anything that looked terribly convincingly like a bug fix between 2020 and today. If such a broken version exists, it seems pretty reasonable to suggest simply updating the OS. Change-Id: I178a038baa000f927ea2cbc4587d69d8ab786843 Based off of commit 69e2ee5b25752ba5fd8644cef56fb9d627ca4a64. Ericson2314 added additional other information. (cherry picked from commit 9b3352c3c8c6719bab787acca993ee3f36bf73da) --- src/libutil/unix/monitor-fd.hh | 47 +++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index d6ec47f49..334506146 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -26,22 +26,38 @@ public: notifyPipe.create(); thread = std::thread([this, fd]() { while (true) { - /* Polling for no specific events (i.e. just waiting - for an error/hangup) doesn't work on macOS - anymore. So wait for read events and ignore - them. */ - // FIXME(jade): we have looked at the XNU kernel code and as - // far as we can tell, the above is bogus. It should be the - // case that the previous version of this and the current - // version are identical: waiting for POLLHUP and POLLRDNORM in - // the kernel *should* be identical. + // There is a POSIX violation on macOS: you have to listen for + // at least POLLHUP to receive HUP events for a FD. POSIX says + // this is not so, and you should just receive them regardless. + // However, as of our testing on macOS 14.5, the events do not + // get delivered if in the all-bits-unset case, but do get + // delivered if `POLLHUP` is set. + // + // This bug filed as rdar://37537852 + // (https://openradar.appspot.com/37537852). + // + // macOS's own man page + // (https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/poll.2.html) + // additionally says that `POLLHUP` is ignored as an input. It + // seems the likely order of events here was + // + // 1. macOS did not follow the POSIX spec + // + // 2. Somebody ninja-fixed this other spec violation to make + // sure `POLLHUP` was not forgotten about, even though they + // "fixed" this issue in a spec-non-compliant way. Whatever, + // we'll use the fix. + // + // Relevant code, current version, which shows the : // https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 // - // So, this needs actual testing and we need to figure out if - // this is actually bogus. + // The `POLLHUP` detection was added in + // https://github.com/apple-oss-distributions/xnu/commit/e13b1fa57645afc8a7b2e7d868fe9845c6b08c40#diff-a5aa0b0e7f4d866ca417f60702689fc797e9cdfe33b601b05ccf43086c35d395R1468 + // That means added in 2007 or earlier. Should be good enough + // for us. short hangup_events = #ifdef __APPLE__ - POLLRDNORM + POLLHUP #else 0 #endif @@ -82,9 +98,10 @@ public: if (fds[1].revents & POLLHUP) { break; } - /* This will only happen on macOS. We sleep a bit to - avoid waking up too often if the client is sending - input. */ + // On macOS, it is possible (although not observed on macOS + // 14.5) that in some limited cases on buggy kernel versions, + // all the non-POLLHUP events for the socket get delivered. + // Sleeping avoids pointlessly spinning a thread on those. sleep(1); } }); From 490e7c0984be6ad749aa93fcb9d5a9b0b5356593 Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sun, 23 Mar 2025 19:11:17 -0400 Subject: [PATCH 7/7] `MonitorFdHup`: Don't sleep anymore After the previous commit it should not be necessary. Furthermore, if we *do* sleep, we'll exacerbate a race condition (in conjunction with getting rid of the thread cancellation) that will cause test failures. (cherry picked from commit 49f486d8e088d4633872dfef342fe9fac4f83b6d) --- src/libutil/unix/monitor-fd.hh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/libutil/unix/monitor-fd.hh b/src/libutil/unix/monitor-fd.hh index 334506146..d59832452 100644 --- a/src/libutil/unix/monitor-fd.hh +++ b/src/libutil/unix/monitor-fd.hh @@ -98,11 +98,24 @@ public: if (fds[1].revents & POLLHUP) { break; } - // On macOS, it is possible (although not observed on macOS - // 14.5) that in some limited cases on buggy kernel versions, - // all the non-POLLHUP events for the socket get delivered. - // Sleeping avoids pointlessly spinning a thread on those. - sleep(1); + // On macOS, (jade thinks that) it is possible (although not + // observed on macOS 14.5) that in some limited cases on buggy + // kernel versions, all the non-POLLHUP events for the socket + // get delivered. + // + // We could sleep to avoid pointlessly spinning a thread on + // those, but this opens up a different problem, which is that + // if do sleep, it will be longer before the daemon fork for a + // client exits. Imagine a sequential shell script, running Nix + // commands, each of which talk to the daemon. If the previous + // command registered a temp root, exits, and then the next + // command issues a delete request before the temp root is + // cleaned up, that delete request might fail. + // + // Not sleeping doesn't actually fix the race condition --- we + // would need to block on the old connections' tempt roots being + // cleaned up in in the new connection --- but it does make it + // much less likely. } }); };