Dev news

Commit b002acacc1 for qemu.org

commit b002acacc1d72521351501fa0af81d146106c9ed
Author: Hanna Czenczek <hreitz@redhat.com>
Date:   Mon Dec 15 15:15:40 2025 +0100

    Revert "nvme: Fix coroutine waking"

    This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.

    Said commit changed the replay_bh_schedule_oneshot_event() in
    nvme_rw_cb() to aio_co_wake(), allowing the request coroutine to be
    entered directly (instead of only being scheduled for later execution).
    This can cause the device to become stalled like so:

    It is possible that after completion the request coroutine goes on to
    submit another request without yielding, e.g. a flush after a write to
    emulate FUA.  This will likely cause a nested nvme_process_completion()
    call because nvme_rw_cb() itself is called from there.

    (After submitting a request, we invoke nvme_process_completion() through
    defer_call(); but the fact that nvme_process_completion() ran in the
    first place indicates that we are not in a call-deferring section, so
    defer_call() will call nvme_process_completion() immediately.)

    If this inner nvme_process_completion() loop then processes any
    completions, it will write the final completion queue (CQ) head index to
    the CQ head doorbell, and subsequently execution will return to the
    outer nvme_process_completion() loop.  Even if this loop now finds no
    further completions, it still processed at least one completion before,
    or it would not have called the nvme_rw_cb() which led to nesting.
    Therefore, it will now write the exact same CQ head index value to the
    doorbell, which effectively is an unrecoverable error[1].

    Therefore, nesting of nvme_process_completion() does not work at this
    point.  Reverting said commit removes the nesting (by scheduling the
    request coroutine instead of entering it immediately), and so fixes the
    stall.

    On the downside, reverting said commit breaks multiqueue for nvme, but
    better to have single-queue working than neither.  For 11.0, we will
    have a solution that makes both work.

    A side note: There is a comment in nvme_process_completion() above
    qemu_bh_schedule() that claims nesting works, as long as it is done
    through the completion_bh.  I am quite sure that is not true, for two
    reasons:
    - The problem described above, which is even worse when going through
      nvme_process_completion_bh() because that function unconditionally
      writes to the CQ head doorbell,
    - nvme_process_completion_bh() never takes q->lock, so
      nvme_process_completion() unlocking it will likely abort.

    Given the lack of reports of such aborts, I believe that completion_bh
    simply is unused in practice.

    [1] See the NVMe Base Specification revision 2.3, page 180, figure 152:
        “Invalid Doorbell Write Value: A host attempted to write an invalid
        doorbell value. Some possible causes of this error are: [...] the
        value written is the same as the previously written doorbell value.”

        To even be notified of this error, we would need to send an
        Asynchronous Event Request to the admin queue (p. 178ff), which we
        don’t do, and then to handle it, we would need to delete and
        recreate the queue (p. 88, section 3.3.1.2 Queue Usage).

    Cc: qemu-stable@nongnu.org
    Reported-by: Lukáš Doktor <ldoktor@redhat.com>
    Tested-by: Lukáš Doktor <ldoktor@redhat.com>
    Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
    Message-id: 20251215141540.88915-1-hreitz@redhat.com
    Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

diff --git a/block/nvme.c b/block/nvme.c
index 919e14cef9..c3d3b99d1f 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -1200,36 +1200,26 @@ fail:

 typedef struct {
     Coroutine *co;
-    bool skip_yield;
     int ret;
+    AioContext *ctx;
 } NVMeCoData;

+static void nvme_rw_cb_bh(void *opaque)
+{
+    NVMeCoData *data = opaque;
+    qemu_coroutine_enter(data->co);
+}
+
 /* Put into NVMeRequest.cb, so runs in the BDS's main AioContext */
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NVMeCoData *data = opaque;
-
     data->ret = ret;
-
-    if (data->co == qemu_coroutine_self()) {
-        /*
-         * Fast path: We are inside of the request coroutine (through
-         * nvme_submit_command, nvme_deferred_fn, nvme_process_completion).
-         * We can set data->skip_yield here to keep the coroutine from
-         * yielding, and then we don't need to schedule a BH to wake it.
-         */
-        data->skip_yield = true;
-    } else {
-        /*
-         * Safe to call: The case where we run in the request coroutine is
-         * handled above, so we must be independent of it; and without
-         * skip_yield set, the coroutine will yield.
-         * No need to release NVMeQueuePair.lock (we are called without it
-         * held).  (Note: If we enter the coroutine here, @data will
-         * probably be dangling once aio_co_wake() returns.)
-         */
-        aio_co_wake(data->co);
+    if (!data->co) {
+        /* The rw coroutine hasn't yielded, don't try to enter. */
+        return;
     }
+    replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
 }

 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
@@ -1253,7 +1243,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
         .cdw12 = cpu_to_le32(cdw12),
     };
     NVMeCoData data = {
-        .co = qemu_coroutine_self(),
+        .ctx = bdrv_get_aio_context(bs),
         .ret = -EINPROGRESS,
     };

@@ -1270,7 +1260,9 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
         return r;
     }
     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
-    if (!data.skip_yield) {
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
         qemu_coroutine_yield();
     }

@@ -1366,7 +1358,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
         .nsid = cpu_to_le32(s->nsid),
     };
     NVMeCoData data = {
-        .co = qemu_coroutine_self(),
+        .ctx = bdrv_get_aio_context(bs),
         .ret = -EINPROGRESS,
     };

@@ -1374,7 +1366,9 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
     req = nvme_get_free_req(ioq);
     assert(req);
     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
-    if (!data.skip_yield) {
+
+    data.co = qemu_coroutine_self();
+    if (data.ret == -EINPROGRESS) {
         qemu_coroutine_yield();
     }

@@ -1415,7 +1409,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     };

     NVMeCoData data = {
-        .co = qemu_coroutine_self(),
+        .ctx = bdrv_get_aio_context(bs),
         .ret = -EINPROGRESS,
     };

@@ -1435,7 +1429,9 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     assert(req);

     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
-    if (!data.skip_yield) {
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
         qemu_coroutine_yield();
     }

@@ -1463,7 +1459,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     };

     NVMeCoData data = {
-        .co = qemu_coroutine_self(),
+        .ctx = bdrv_get_aio_context(bs),
         .ret = -EINPROGRESS,
     };

@@ -1508,7 +1504,9 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     trace_nvme_dsm(s, offset, bytes);

     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
-    if (!data.skip_yield) {
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
         qemu_coroutine_yield();
     }