Dev news

Commit be3573124e63 for kernel

commit be3573124e630736d2d39650b12f5ef220b47ac1
Author: Jens Axboe <axboe@kernel.dk>
Date:   Tue Feb 10 10:00:44 2026 -0700

    io_uring/bpf_filter: pass in expected filter payload size

    It's quite possible that the payloads attached to some opcodes, like
    IORING_OP_OPENAT/OPENAT2 or IORING_OP_SOCKET, can change over time.
    For example, on the openat/openat2 side, the struct open_how argument
    is extensible, and could be extended in the future to allow further
    arguments to be passed in.

    Allow registration of a cBPF filter to give the size of the filter
    payload as seen by userspace. If that filter is for an opcode that
    takes extra payload data, allow it if the application payload
    expectation is the same size as the kernel's. If that is the case, the
    kernel supports filtering on the payload that the application expects.
    If the size differs, the behavior depends on the
    IO_URING_BPF_FILTER_SZ_STRICT flag:

    1) If IO_URING_BPF_FILTER_SZ_STRICT is set and the size expectation
       differs, fail the attempt to load the filter.

    2) If IO_URING_BPF_FILTER_SZ_STRICT isn't set, allow the filter if
       the userspace pdu size is smaller than what the kernel offers.

    3) Regardless of IO_URING_BPF_FILTER_SZ_STRICT, fail loading the filter
       if the userspace pdu size is bigger than what the kernel supports.

    An attempt to load a filter that fails due to sizing will error with
    -EMSGSIZE. For that error, the registration struct will have
    filter->pdu_size populated with the pdu size that the kernel uses.

    Reported-by: Christian Brauner <brauner@kernel.org>
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h
index 220351b81bc0..1b461d792a7b 100644
--- a/include/uapi/linux/io_uring/bpf_filter.h
+++ b/include/uapi/linux/io_uring/bpf_filter.h
@@ -35,13 +35,19 @@ enum {
 	 * If set, any currently unset opcode will have a deny filter attached
 	 */
 	IO_URING_BPF_FILTER_DENY_REST	= 1,
+	/*
+	 * If set, if kernel and application don't agree on pdu_size for
+	 * the given opcode, fail the registration of the filter.
+	 */
+	IO_URING_BPF_FILTER_SZ_STRICT	= 2,
 };

 struct io_uring_bpf_filter {
 	__u32	opcode;		/* io_uring opcode to filter */
 	__u32	flags;
 	__u32	filter_len;	/* number of BPF instructions */
-	__u32	resv;
+	__u8	pdu_size;	/* expected pdu size for opcode */
+	__u8	resv[3];
 	__u64	filter_ptr;	/* pointer to BPF filter */
 	__u64	resv2[5];
 };
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 8ac7d06de122..28a23e92ee81 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -308,36 +308,69 @@ static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
 	return ERR_PTR(-EBUSY);
 }

-#define IO_URING_BPF_FILTER_FLAGS	IO_URING_BPF_FILTER_DENY_REST
+#define IO_URING_BPF_FILTER_FLAGS	(IO_URING_BPF_FILTER_DENY_REST | \
+					 IO_URING_BPF_FILTER_SZ_STRICT)

-int io_register_bpf_filter(struct io_restriction *res,
-			   struct io_uring_bpf __user *arg)
+static int io_bpf_filter_import(struct io_uring_bpf *reg,
+				struct io_uring_bpf __user *arg)
 {
-	struct io_bpf_filters *filters, *old_filters = NULL;
-	struct io_bpf_filter *filter, *old_filter;
-	struct io_uring_bpf reg;
-	struct bpf_prog *prog;
-	struct sock_fprog fprog;
+	const struct io_issue_def *def;
 	int ret;

-	if (copy_from_user(&reg, arg, sizeof(reg)))
+	if (copy_from_user(reg, arg, sizeof(*reg)))
 		return -EFAULT;
-	if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
+	if (reg->cmd_type != IO_URING_BPF_CMD_FILTER)
 		return -EINVAL;
-	if (reg.cmd_flags || reg.resv)
+	if (reg->cmd_flags || reg->resv)
 		return -EINVAL;

-	if (reg.filter.opcode >= IORING_OP_LAST)
+	if (reg->filter.opcode >= IORING_OP_LAST)
 		return -EINVAL;
-	if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
+	if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
 		return -EINVAL;
-	if (reg.filter.resv)
+	if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv)))
 		return -EINVAL;
-	if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2)))
+	if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2)))
 		return -EINVAL;
-	if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS)
+	if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS)
 		return -EINVAL;

+	/* Verify filter size */
+	def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)];
+
+	/* same size, always ok */
+	ret = 0;
+	if (reg->filter.pdu_size == def->filter_pdu_size)
+		;
+	/* size differs, fail in strict mode */
+	else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT)
+		ret = -EMSGSIZE;
+	/* userspace filter is bigger, always disallow */
+	else if (reg->filter.pdu_size > def->filter_pdu_size)
+		ret = -EMSGSIZE;
+
+	/* copy back kernel filter size */
+	reg->filter.pdu_size = def->filter_pdu_size;
+	if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter)))
+		return -EFAULT;
+
+	return ret;
+}
+
+int io_register_bpf_filter(struct io_restriction *res,
+			   struct io_uring_bpf __user *arg)
+{
+	struct io_bpf_filters *filters, *old_filters = NULL;
+	struct io_bpf_filter *filter, *old_filter;
+	struct io_uring_bpf reg;
+	struct bpf_prog *prog;
+	struct sock_fprog fprog;
+	int ret;
+
+	ret = io_bpf_filter_import(&reg, arg);
+	if (ret)
+		return ret;
+
 	fprog.len = reg.filter.filter_len;
 	fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);