Compare commits

..

87 Commits

Author SHA1 Message Date
Andrii Nakryiko
85d9be97eb sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   6f0b824a61f212e9707ff68abcabfdfa4724b811
Checkpoint bpf-next commit: 08a7491843224f8b96518fbe70d9e48163046054
Baseline bpf commit:        1d528e794f3db5d32279123a89957c44c4406a09
Checkpoint bpf commit:      22cc16c04b7893d8fc22810599f49a305d600b9e

Donglin Peng (4):
  libbpf: Add BTF permutation support for type reordering
  libbpf: Optimize type lookup with binary search for sorted BTF
  libbpf: Verify BTF sorting
  btf: Refactor the code by calling str_is_empty

Emil Tsalapatis (2):
  libbpf: Turn relo_core->sym_off unsigned
  libbpf: Move arena globals to the end of the arena

Ihor Solodrai (1):
  bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS

Leon Hwang (2):
  bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags
  libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps

Matt Bobrowski (1):
  bpf: add new BPF_CGROUP_ITER_CHILDREN control option

Menglong Dong (2):
  bpf: add fsession support
  libbpf: add fsession support

Thomas Gleixner (1):
  treewide: Update email address

Thomas Weißschuh (1):
  vfs: use UAPI types for new struct delegation definition

Varun R Mallya (1):
  libbpf: Fix OOB read in btf_dump_get_bitfield_value

 include/uapi/linux/bpf.h        |  11 ++
 include/uapi/linux/fcntl.h      |  10 +-
 include/uapi/linux/perf_event.h |   2 +-
 src/bpf.c                       |   1 +
 src/bpf.h                       |   8 +
 src/bpf_helpers.h               |   6 +-
 src/btf.c                       | 276 +++++++++++++++++++++++++++-----
 src/btf.h                       |  42 +++++
 src/btf_dump.c                  |   9 ++
 src/libbpf.c                    |  64 +++++---
 src/libbpf.h                    |  21 +--
 src/libbpf.map                  |   1 +
 12 files changed, 369 insertions(+), 82 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2026-01-29 14:10:19 -08:00
Andrii Nakryiko
fddf93d20b sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2026-01-29 14:10:19 -08:00
Matt Bobrowski
ed6bb65cf1 bpf: add new BPF_CGROUP_ITER_CHILDREN control option
Currently, the BPF cgroup iterator supports walking descendants in
either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order
(BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive
depth-first search (DFS) of the hierarchy. In scenarios where a BPF
program may need to inspect only the direct children of a given parent
cgroup, a full DFS is unnecessarily expensive.

This patch introduces a new BPF cgroup iterator control option,
BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal
to the immediate children of a specified parent cgroup, allowing for
more targeted and efficient iteration, particularly when exhaustive
depth-first search (DFS) traversal is not required.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Menglong Dong
5ee8863eaf libbpf: add fsession support
Add BPF_TRACE_FSESSION to libbpf.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-9-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Menglong Dong
adde4f55b7 bpf: add fsession support
The fsession is something similar to a kprobe session. It allows
attaching a single BPF program to both the entry and the exit of the target
functions.

Introduce the struct bpf_fsession_link, which allows to add the link to
both the fentry and fexit progs_hlist of the trampoline.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Co-developed-by: Leon Hwang <leon.hwang@linux.dev>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Ihor Solodrai
977b1f820c bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS
Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument,
and remove bpf_stream_vprintk_impl from the kernel.

Update the selftests to use the new API with implicit argument.

bpf_stream_vprintk macro is changed to use the new bpf_stream_vprintk
kfunc, and the extern definition of bpf_stream_vprintk_impl is
replaced accordingly.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Thomas Gleixner
5d02120e10 treewide: Update email address
In a vain attempt to consolidate the email zoo switch everything to the
kernel.org account.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2026-01-29 14:10:19 -08:00
Donglin Peng
8a090ef1e5 btf: Refactor the code by calling str_is_empty
Call the str_is_empty function to clarify the code. No functional
changes are introduced.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-12-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Donglin Peng
ad9c763445 libbpf: Verify BTF sorting
This patch checks whether the BTF is sorted by name in ascending
order. If sorted, binary search will be used when looking up types.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-6-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Donglin Peng
1c96b72cb0 libbpf: Optimize type lookup with binary search for sorted BTF
This patch introduces binary search optimization for BTF type lookups
when the BTF instance contains sorted types.

The optimization significantly improves performance when searching for
types in large BTF instances with sorted types. For unsorted BTF, the
implementation falls back to the original linear search.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-5-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Donglin Peng
b7c6c02b5f libbpf: Add BTF permutation support for type reordering
Introduce btf__permute() API to allow in-place rearrangement of BTF types.
This function reorganizes BTF type order according to a provided array of
type IDs, updating all type references to maintain consistency.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-2-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Varun R Mallya
2c5038dcf4 libbpf: Fix OOB read in btf_dump_get_bitfield_value
When dumping bitfield data, btf_dump_get_bitfield_value() reads data
based on the underlying type's size (t->size). However, it does not
verify that the provided data buffer (data_sz) is large enough to
contain these bytes.

If btf_dump__dump_type_data() is called with a buffer smaller than
the type's size, this leads to an out-of-bounds read. This was
confirmed by AddressSanitizer in the linked issue.

Fix this by ensuring we do not read past the provided data_sz limit.

Fixes: a1d3cc3c5eca ("libbpf: Avoid use of __int128 in typed dump display")
Reported-by: Harrison Green <harrisonmichaelgreen@gmail.com>
Suggested-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Varun R Mallya <varunrmallya@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260106233527.163487-1-varunrmallya@gmail.com

Closes: https://github.com/libbpf/libbpf/issues/928
2026-01-29 14:10:19 -08:00
Leon Hwang
dc8673b28b libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps
Add libbpf support for the BPF_F_CPU flag for percpu maps by embedding the
cpu info into the high 32 bits of:

1. **flags**: bpf_map_lookup_elem_flags(), bpf_map__lookup_elem(),
   bpf_map_update_elem() and bpf_map__update_elem()
2. **opts->elem_flags**: bpf_map_lookup_batch() and
   bpf_map_update_batch()

And the flag can be BPF_F_ALL_CPUS, but cannot be
'BPF_F_CPU | BPF_F_ALL_CPUS'.

Behavior:

* If the flag is BPF_F_ALL_CPUS, the update is applied across all CPUs.
* If the flag is BPF_F_CPU, it updates value only to the specified CPU.
* If the flag is BPF_F_CPU, lookup value only from the specified CPU.
* lookup does not support BPF_F_ALL_CPUS.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-7-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Leon Hwang
a6d7ceaaeb bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags
Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for
following APIs:

* 'map_lookup_elem()'
* 'map_update_elem()'
* 'generic_map_lookup_batch()'
* 'generic_map_update_batch()'

And, get the correct value size for these APIs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Thomas Weißschuh
e64e125ef6 vfs: use UAPI types for new struct delegation definition
Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.

Use the fixed-width integer types provided by the UAPI headers instead.

Fixes: 1602bad16d7d ("vfs: expose delegation support to userland")
Fixes: 4be9e04ebf75 ("vfs: add needed headers for new struct delegation definition")
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20251203-uapi-fcntl-v1-1-490c67bf3425@linutronix.de
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2026-01-29 14:10:19 -08:00
Emil Tsalapatis
9dd6fda504 libbpf: Move arena globals to the end of the arena
Arena globals are currently placed at the beginning of the arena
by libbpf. This is convenient, but prevents users from reserving
guard pages in the beginning of the arena to identify NULL pointer
dereferences. Adjust the load logic to place the globals at the
end of the arena instead.

Also modify bpftool to set the arena pointer in the program's BPF
skeleton to point to the globals. Users now call bpf_map__initial_value()
to find the beginning of the arena mapping and use the arena pointer
in the skeleton to determine which part of the mapping holds the
arena globals and which part is free.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-5-emil@etsalapatis.com
2026-01-29 14:10:19 -08:00
Emil Tsalapatis
2c7fe6ec5d libbpf: Turn relo_core->sym_off unsigned
The symbols' relocation offsets in BPF are stored in an int field,
but cannot actually be negative. When in the next patch libbpf relocates
globals to the end of the arena, it is also possible to have valid
offsets > 2GiB that are used to calculate the final relo offsets.
Avoid accidentally interpreting large offsets as negative by turning
the sym_off field unsigned.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-4-emil@etsalapatis.com
2026-01-29 14:10:19 -08:00
Andrii Nakryiko
160423d498 ci: denylist flaky 'bpf_cookie/perf_event' selftest
It keeps failing. It relies on perf_events so not super reliable.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2026-01-26 13:12:58 -08:00
Andrii Nakryiko
afb8b17bc5 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   f8c67d8550ee69ce684c7015b2c8c63cda24bbfb
Checkpoint bpf-next commit: 6f0b824a61f212e9707ff68abcabfdfa4724b811
Baseline bpf commit:        e427054ae7bc8b1268cf1989381a43885795616f
Checkpoint bpf commit:      1d528e794f3db5d32279123a89957c44c4406a09

Alan Maguire (1):
  libbpf: Add debug messaging in dedup equivalence/identity matching

Amery Hung (2):
  bpf: Support associating BPF program with struct_ops
  libbpf: Add support for associating BPF program with struct_ops

Asbjørn Sloth Tønnesen (1):
  tools: ynl-gen: add regeneration comment

Heiko Carstens (1):
  tools: Remove s390 compat support

James Clark (1):
  perf: Add perf_event_attr::config4

Jeff Layton (2):
  vfs: expose delegation support to userland
  vfs: add needed headers for new struct delegation definition

Jianyun Gao (1):
  libbpf: Fix some incorrect @param descriptions in the comment of
    libbpf.h

Kuniyuki Iwashima (1):
  bpf: Introduce SK_BPF_BYPASS_PROT_MEM.

Mikhail Gavrilov (1):
  libbpf: Fix -Wdiscarded-qualifiers under C23

Paul Houssel (1):
  libbpf: Fix BTF dedup to support recursive typedef definitions

Peter Zijlstra (1):
  perf: Support deferred user unwind

Samiullah Khawaja (1):
  net: Extend NAPI threaded polling to allow kthread based busy polling

 include/uapi/linux/bpf.h        |  19 ++++++
 include/uapi/linux/fcntl.h      |  16 +++++
 include/uapi/linux/netdev.h     |   2 +
 include/uapi/linux/perf_event.h |  23 +++++++-
 src/bpf.c                       |  19 ++++++
 src/bpf.h                       |  21 +++++++
 src/btf.c                       | 100 +++++++++++++++++++++++++-------
 src/libbpf.c                    |  42 +++++++++++---
 src/libbpf.h                    |  43 ++++++++++----
 src/libbpf.map                  |   2 +
 src/usdt.c                      |   2 -
 11 files changed, 247 insertions(+), 42 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-12-16 09:52:07 -08:00
Mikhail Gavrilov
fda2bfcb7a libbpf: Fix -Wdiscarded-qualifiers under C23
glibc ≥ 2.42 (GCC 15) defaults to -std=gnu23, which promotes
-Wdiscarded-qualifiers to an error.

In C23, strstr() and strchr() return "const char *".

Change variable types to const char * where the pointers are never
modified (res, sym_sfx, next_path).

Suggested-by: Florian Weimer <fweimer@redhat.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Link: https://lore.kernel.org/r/20251206092825.1471385-1-mikhail.v.gavrilov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-12-16 09:52:07 -08:00
Amery Hung
5635185147 libbpf: Add support for associating BPF program with struct_ops
Add low-level wrapper and libbpf API for BPF_PROG_ASSOC_STRUCT_OPS
command in the bpf() syscall.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-4-ameryhung@gmail.com
2025-12-16 09:52:07 -08:00
Amery Hung
1a41b12b4f bpf: Support associating BPF program with struct_ops
Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating
a BPF program with a struct_ops map. This command takes a file
descriptor of a struct_ops map and a BPF program and sets
prog->aux->st_ops_assoc to the kdata of the struct_ops map.

The command does not accept a struct_ops program nor a non-struct_ops
map. Programs of a struct_ops map are automatically associated with the
map during map update. If a program is shared between two struct_ops
maps, prog->aux->st_ops_assoc will be poisoned to indicate that the
associated struct_ops is ambiguous. The pointer, once poisoned, cannot
be reset since we have lost track of associated struct_ops. For other
program types, the associated struct_ops map, once set, cannot be
changed later. This restriction may be lifted in the future if there is
a use case.

A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve
the associated struct_ops pointer. The returned pointer, if not NULL, is
guaranteed to be valid and point to a fully updated struct_ops struct.
For a struct_ops program reused in multiple struct_ops maps, the return
value will be NULL.

prog->aux->st_ops_assoc is protected by bumping the refcount for
non-struct_ops programs and RCU for struct_ops programs. Since it would
be inefficient to track programs associated with a struct_ops map, every
non-struct_ops program will bump the refcount of the map to make sure
st_ops_assoc stays valid. For a struct_ops program, it is protected by
RCU as map_free will wait for an RCU grace period before disassociating
the program with the map. The helper must be called in BPF program
context or RCU read-side critical section.

struct_ops implementers should note that the struct_ops returned may not
be initialized nor attached yet. The struct_ops implementer will be
responsible for tracking and checking the state of the associated
struct_ops map if the use case expects an initialized or attached
struct_ops.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com
2025-12-16 09:52:07 -08:00
Alan Maguire
8ffe064aed libbpf: Add debug messaging in dedup equivalence/identity matching
We have seen a number of issues like [1]; failures to deduplicate
key kernel data structures like task_struct.  These are often hard
to debug from pahole even with verbose output, especially when
identity/equivalence checks fail deep in a nested struct comparison.

Here we add debug messages of the form

libbpf: STRUCT 'task_struct' size=2560 vlen=194 cand_id[54222] canon_id[102820] shallow-equal but not equiv for field#23 'sched_class': 0

These will be emitted during dedup from pahole when --verbose/-V
is specified.  This greatly helps identify exactly where dedup
failures are experienced.

[1] https://lore.kernel.org/bpf/b8e8b560-bce5-414b-846d-0da6d22a9983@oracle.com/

Changes since v1:

- updated debug messages to refer to shallow-equal, added ids (Andrii)

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203191507.55565-1-alan.maguire@oracle.com
2025-12-16 09:52:07 -08:00
Asbjørn Sloth Tønnesen
7ac4e3a670 tools: ynl-gen: add regeneration comment
Add a comment on regeneration to the generated files.

The comment is placed after the YNL-GEN line[1], as to not interfere
with ynl-regen.sh's detection logic.

[1] and after the optional YNL-ARG line.

Link: https://lore.kernel.org/r/aR5m174O7pklKrMR@zx2c4.com/
Suggested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251120174429.390574-3-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-16 09:52:07 -08:00
Samiullah Khawaja
cd173d0ea3 net: Extend NAPI threaded polling to allow kthread based busy polling
Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, enable
NAPI_STATE_THREADED also.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, the
NAPI_STATE_IN_BUSY_POLL will be unset, napi_complete_done unsets the
NAPI_STATE_SCHED_THREADED bit also, which in turn will make the kthread
go to sleep.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-16 09:52:07 -08:00
Kuniyuki Iwashima
3fe0a72123 bpf: Introduce SK_BPF_BYPASS_PROT_MEM.
If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

This is easily controlled by net.core.bypass_prot_mem sysctl, but it
lacks flexibility.

Let's support flagging (and clearing) sk->sk_bypass_prot_mem via
bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook.

  int val = 1;

  bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
                 &val, sizeof(val));

As with net.core.bypass_prot_mem, this is inherited to child sockets,
and BPF always takes precedence over sysctl at socket(2) and accept(2).

SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE
and not supported on other hooks for some reasons:

  1. UDP charges memory under sk->sk_receive_queue.lock instead
     of lock_sock()

  2. Modifying the flag after skb is charged to sk requires such
     adjustment during bpf_setsockopt() and complicates the logic
     unnecessarily

We can support other hooks later if a real use case justifies that.

Most changes are inline and hard to trace, but a microbenchmark on
__sk_mem_raise_allocated() during neper/tcp_stream showed that more
samples completed faster with sk->sk_bypass_prot_mem == 1.  This will
be more visible under tcp_mem pressure (but it's not a fair comparison).

  # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; }
    kretprobe:__sk_mem_raise_allocated /@start[tid]/
    { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }'
  # tcp_stream -6 -F 1000 -N -T 256

Without bpf prog:

  [128, 256)          3846 |                                                    |
  [256, 512)       1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@     |
  [1K, 2K)          198207 |@@@@@@                                              |
  [2K, 4K)           31199 |@                                                   |

With bpf prog in the next patch:
  (must be attached before tcp_stream)
  # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create
  # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test

  [128, 256)          6413 |                                                    |
  [256, 512)       1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                      |
  [1K, 2K)          117031 |@@@@                                                |
  [2K, 4K)           11773 |                                                    |

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com
2025-12-16 09:52:07 -08:00
Jianyun Gao
f561c42074 libbpf: Fix some incorrect @param descriptions in the comment of libbpf.h
Fix up some of the missing or incorrect @param descriptions for libbpf public
APIs in libbpf.h.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251118033025.11804-1-jianyungao89@gmail.com
2025-12-16 09:52:07 -08:00
Paul Houssel
370271441c libbpf: Fix BTF dedup to support recursive typedef definitions
Handle recursive typedefs in BTF deduplication

Pahole fails to encode BTF for some Go projects (e.g. Kubernetes and
Podman) due to recursive type definitions that create reference loops
not representable in C. These recursive typedefs trigger a failure in
the BTF deduplication algorithm.

This patch extends btf_dedup_ref_type() to properly handle potential
recursion for BTF_KIND_TYPEDEF, similar to how recursion is already
handled for BTF_KIND_STRUCT. This allows pahole to successfully
generate BTF for Go binaries using recursive types without impacting
existing C-based workflows.

Suggested-by: Tristan d'Audibert <tristan.daudibert@gmail.com>
Co-developed-by: Martin Horth <martin.horth@telecom-sudparis.eu>
Co-developed-by: Ouail Derghal <ouail.derghal@imt-atlantique.fr>
Co-developed-by: Guilhem Jazeron <guilhem.jazeron@inria.fr>
Co-developed-by: Ludovic Paillat <ludovic.paillat@inria.fr>
Co-developed-by: Robin Theveniaut <robin.theveniaut@irit.fr>
Signed-off-by: Martin Horth <martin.horth@telecom-sudparis.eu>
Signed-off-by: Ouail Derghal <ouail.derghal@imt-atlantique.fr>
Signed-off-by: Guilhem Jazeron <guilhem.jazeron@inria.fr>
Signed-off-by: Ludovic Paillat <ludovic.paillat@inria.fr>
Signed-off-by: Robin Theveniaut <robin.theveniaut@irit.fr>
Signed-off-by: Paul Houssel <paul.houssel@orange.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/bf00857b1e06f282aac12f6834de7396a7547ba6.1763037045.git.paul.houssel@orange.com
2025-12-16 09:52:07 -08:00
James Clark
8cc0f2c095 perf: Add perf_event_attr::config4
Arm FEAT_SPE_FDS adds the ability to filter on the data source of a
packet using another 64-bits of event filtering control. As the existing
perf_event_attr::configN fields are all used up for SPE PMU, an
additional field is needed. Add a new 'config4' field.

Reviewed-by: Leo Yan <leo.yan@arm.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Will Deacon <will@kernel.org>
2025-12-16 09:52:07 -08:00
Heiko Carstens
9905b35d8a tools: Remove s390 compat support
Remove s390 compat support from everything within tools, since s390 compat
support will be removed from the kernel.

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Weißschuh <linux@weissschuh.net> # tools/nolibc selftests/nolibc
Reviewed-by: Thomas Weißschuh <linux@weissschuh.net> # selftests/vDSO
Acked-by: Alexei Starovoitov <ast@kernel.org> # bpf bits
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
2025-12-16 09:52:07 -08:00
Peter Zijlstra
8d178bd7b6 perf: Support deferred user unwind
Add support for deferred userspace unwind to perf.

Where perf currently relies on in-place stack unwinding; from NMI
context and all that. This moves the userspace part of the unwind to
right before the return-to-userspace.

This has two distinct benefits, the biggest is that it moves the
unwind to a faultable context. It becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. And secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

After which will come a single entry, denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
2025-12-16 09:52:07 -08:00
Jeff Layton
530f40421a vfs: add needed headers for new struct delegation definition
The definition of struct delegation uses stdint.h integer types. Add the
necessary headers to ensure that always works.

Fixes: 1602bad16d7d ("vfs: expose delegation support to userland")
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-12-16 09:52:07 -08:00
Jeff Layton
4f10610ae5 vfs: expose delegation support to userland
Now that support for recallable directory delegations is available,
expose this functionality to userland with new F_SETDELEG and F_GETDELEG
commands for fcntl().

Note that this also allows userland to request a FL_DELEG type lease on
files too. Userland applications that do will get signalled when there
are metadata changes in addition to just data changes (which is a
limitation of FL_LEASE leases).

These commands accept a new "struct delegation" argument that contains a
flags field for future expansion.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-17-52f3feebb2f2@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-12-16 09:52:07 -08:00
Andrii Nakryiko
d65dbb412d sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   de7342228b7343774d6a9981c2ddbfb5e201044b
Checkpoint bpf-next commit: f8c67d8550ee69ce684c7015b2c8c63cda24bbfb
Baseline bpf commit:        4d920ed684392ae064af62957d6f5a90312dfaf6
Checkpoint bpf commit:      e427054ae7bc8b1268cf1989381a43885795616f

Alan Maguire (1):
  libbpf: Fix parsing of multi-split BTF

Andrii Nakryiko (1):
  libbpf: Fix powerpc's stack register definition in bpf_tracing.h

Anton Protopopov (4):
  libbpf: fix formatting of bpf_object__append_subprog_code
  bpf, x86: add new map type: instructions array
  libbpf: Recognize insn_array map type
  libbpf: support llvm-generated indirect jumps

Donald Hunter (1):
  docs/bpf: Add missing BPF k/uprobe program types to docs

Jianyun Gao (4):
  libbpf: Optimize the redundant code in the
    bpf_object__init_user_btf_maps() function.
  libbpf: Fix the incorrect reference to the memlock_rlim variable in
    the comment.
  libbpf: Complete the missing @param and @return tags in btf.h
  libbpf: Update the comment to remove the reference to the deprecated
    interface bpf_program__load().

Mykyta Yatsenko (2):
  bpf: widen dynptr size/offset to 64 bit
  bpf: add _impl suffix for bpf_stream_vprintk() kfunc

Xu Kuohai (1):
  bpf: Add overwrite mode for BPF ring buffer

 docs/program_types.rst   |  18 +++
 include/uapi/linux/bpf.h |  33 ++++-
 src/bpf.c                |   2 +-
 src/bpf_helpers.h        |  28 ++--
 src/bpf_tracing.h        |   2 +-
 src/btf.c                |   4 +-
 src/btf.h                |   8 ++
 src/libbpf.c             | 296 +++++++++++++++++++++++++++++++++++----
 src/libbpf_internal.h    |   2 +
 src/libbpf_probes.c      |   4 +
 src/linker.c             |   3 +
 11 files changed, 353 insertions(+), 47 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-11-07 14:00:07 -08:00
Andrii Nakryiko
befbf010d7 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-11-07 14:00:07 -08:00
Mykyta Yatsenko
a00b10df8c bpf: add _impl suffix for bpf_stream_vprintk() kfunc
Rename bpf_stream_vprintk() to bpf_stream_vprintk_impl().

This makes bpf_stream_vprintk() follow the already established "_impl"
suffix-based naming convention for kfuncs with the bpf_prog_aux
argument provided by the verifier implicitly. This convention will be
taken advantage of with the upcoming KF_IMPLICIT_ARGS feature to
preserve backwards compatibility to BPF programs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20251104-implv2-v3-2-4772b9ae0e06@meta.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-11-07 14:00:07 -08:00
Anton Protopopov
24a89cb35d libbpf: support llvm-generated indirect jumps
For the v4 instruction set LLVM is allowed to generate indirect jumps for
switch statements and for 'goto *rX' assembly. Every such jump will
be accompanied by the necessary metadata, e.g. (`llvm-objdump -Sr ...`):

       0:       r2 = 0x0 ll
                0000000000000030:  R_BPF_64_64  BPF.JT.0.0

Here BPF.JT.0.0 is a symbol residing in the .jumptables section:

    Symbol table:
       4: 0000000000000000   240 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0

The -bpf-min-jump-table-entries llvm option may be used to control the
minimal size of a switch which will be converted to indirect jumps.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-11-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Anton Protopopov
349b78117b libbpf: Recognize insn_array map type
Teach libbpf about the existence of the new instruction array map.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-4-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Anton Protopopov
9d159773c5 bpf, x86: add new map type: instructions array
On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are
translated by the verifier into "xlated" BPF programs. During this
process the original instructions offsets might be adjusted and/or
individual instructions might be replaced by new sets of instructions,
or deleted.

Add a new BPF map type which is aimed to keep track of how, for a
given program, the original instructions were relocated during the
verification. Also, besides keeping track of the original -> xlated
mapping, make the x86 JIT build the xlated -> jitted mapping for every
instruction listed in an instruction array. This is required for every
future application of instruction arrays: static keys, indirect jumps
and indirect calls.

A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with u32
keys and values of size 8. The values have different semantics for
userspace and for BPF space. For userspace a value consists of two
u32 values – xlated and jitted offsets. For BPF side the value is
a real pointer to a jitted instruction.

On map creation/initialization, before loading the program, each
element of the map should be initialized to point to an instruction
offset within the program. Before the program load such maps should
be made frozen. After the program verification xlated and jitted
offsets can be read via the bpf(2) syscall.

If a tracked instruction is removed by the verifier, then the xlated
offset is set to (u32)-1 which is considered to be too big for a valid
BPF program offset.

One such map can, obviously, be used to track one and only one BPF
program.  If the verification process was unsuccessful, then the same
map can be re-used to verify the program with a different log level.
However, if the program was loaded fine, then such a map, being
frozen in any case, can't be reused by other programs even after the
program release.

Example. Consider the following original and xlated programs:

    Original prog:                      Xlated prog:

     0:  r1 = 0x0                        0: r1 = 0
     1:  *(u32 *)(r10 - 0x4) = r1        1: *(u32 *)(r10 -4) = r1
     2:  r2 = r10                        2: r2 = r10
     3:  r2 += -0x4                      3: r2 += -4
     4:  r1 = 0x0 ll                     4: r1 = map[id:88]
     6:  call 0x1                        6: r1 += 272
                                         7: r0 = *(u32 *)(r2 +0)
                                         8: if r0 >= 0x1 goto pc+3
                                         9: r0 <<= 3
                                        10: r0 += r1
                                        11: goto pc+1
                                        12: r0 = 0
     7:  r6 = r0                        13: r6 = r0
     8:  if r6 == 0x0 goto +0x2         14: if r6 == 0x0 goto pc+4
     9:  call 0x76                      15: r0 = 0xffffffff8d2079c0
                                        17: r0 = *(u64 *)(r0 +0)
    10:  *(u64 *)(r6 + 0x0) = r0        18: *(u64 *)(r6 +0) = r0
    11:  r0 = 0x0                       19: r0 = 0x0
    12:  exit                           20: exit

An instruction array map, containing, e.g., instructions [0,4,7,12]
will be translated by the verifier to [0,4,13,20]. A map with
index 5 (the middle of a 16-byte instruction) or indexes greater than 12
(outside the program boundaries) would be rejected.

The functionality provided by this patch will be extended in subsequent
patches to implement BPF Static Keys, indirect jumps, and indirect calls.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Alan Maguire
0e14a12a1d libbpf: Fix parsing of multi-split BTF
When creating multi-split BTF we correctly set the start string offset
to be the size of the base string section plus the base BTF start
string offset; the latter is needed for multi-split BTF since the
offset is non-zero there.

Unfortunately the BTF parsing case needed that logic and it was
missed.

Fixes: 4e29128a9ace ("libbpf/btf: Fix string handling to support multi-split BTF")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251104203309.318429-2-alan.maguire@oracle.com
2025-11-07 14:00:07 -08:00
Donald Hunter
813fbe13ab docs/bpf: Add missing BPF k/uprobe program types to docs
Update the table of program types in the libbpf docs with the missing
k/uprobe multi and session program types.

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251029180932.98038-1-donald.hunter@gmail.com
2025-11-07 14:00:07 -08:00
Jianyun Gao
fd00fd999f libbpf: Update the comment to remove the reference to the deprecated interface bpf_program__load().
Commit be2f2d1680df ("libbpf: Deprecate bpf_program__load() API") marked
bpf_program__load() as deprecated starting with libbpf v0.6. And later
in commit 146bf811f5ac ("libbpf: remove most other deprecated high-level
APIs") actually removed the bpf_program__load() implementation and
related old high-level APIs.

This patch updates the comment in bpf_program__set_attach_target() to
remove the reference to the deprecated interface bpf_program__load().

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251103120727.145965-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Jianyun Gao
f4b32db745 libbpf: Complete the missing @param and @return tags in btf.h
Complete the missing @param and @return tags in the Doxygen comments of
the btf.h file.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251103115836.144339-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Andrii Nakryiko
99bf90957a libbpf: Fix powerpc's stack register definition in bpf_tracing.h
retsnoop's build on powerpc (ppc64le) architecture ([0]) failed due to
wrong definition of PT_REGS_SP() macro. Looking at powerpc's
implementation of stack unwinding in perf_callchain_user_64() clearly
shows that stack pointer register is gpr[1].

Fix libbpf's definition of __PT_SP_REG for powerpc to fix all this.

  [0] https://kojipkgs.fedoraproject.org/work/tasks/1544/137921544/build.log

Fixes: 138d6153a139 ("samples/bpf: Enable powerpc support")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Naveen N Rao (AMD) <naveen@kernel.org>
Link: https://lore.kernel.org/r/20251020203643.989467-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Jianyun Gao
98b6e51fc6 libbpf: Fix the incorrect reference to the memlock_rlim variable in the comment.
The variable "memlock_rlim_max" referenced in the comment does not exist.
I think that the author probably meant the variable "memlock_rlim". So,
correct it.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251027032008.738944-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Jianyun Gao
bc3b400e06 libbpf: Optimize the redundant code in the bpf_object__init_user_btf_maps() function.
In the elf_sec_data() function, the input parameter 'scn' will be
evaluated. If it is NULL, then it will directly return NULL. Therefore,
the return value of the elf_sec_data() function already takes into
account the case where the input parameter scn is NULL. Therefore,
subsequently, the code only needs to check whether the return value of
the elf_sec_data() function is NULL.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20251024080802.642189-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Xu Kuohai
665ad8c7f7 bpf: Add overwrite mode for BPF ring buffer
When the BPF ring buffer is full, a new event cannot be recorded until one
or more old events are consumed to make enough space for it. In cases such
as fault diagnostics, where recent events are more useful than older ones,
this mechanism may lead to critical events being lost.

So add overwrite mode for BPF ring buffer to address it. In this mode, the
new event overwrites the oldest event when the buffer is full.

The basic idea is as follows:

1. producer_pos tracks the next position to record new event. When there
   is enough free space, producer_pos is simply advanced by producer to
   make space for the new event.

2. To avoid waiting for consumer when the buffer is full, a new variable,
   overwrite_pos, is introduced for producer. It points to the oldest event
   committed in the buffer. It is advanced by producer to discard one or more
   oldest events to make space for the new event when the buffer is full.

3. pending_pos tracks the oldest event to be committed. pending_pos is never
   passed by producer_pos, so multiple producers never write to the same
   position at the same time.

The following example diagrams show how it works in a 4096-byte ring buffer.

1. At first, {producer,overwrite,pending,consumer}_pos are all set to 0.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                                                                       |
   |                                                                       |
   |                                                                       |
   +-----------------------------------------------------------------------+
   ^
   |
   |
producer_pos = 0
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

2. Now reserve a 512-byte event A.

   There is enough free space, so A is allocated at offset 0. And producer_pos
   is advanced to 512, the end of A. Since A is not submitted, the BUSY bit is
   set.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                                                              |
   |   A    |                                                              |
   | [BUSY] |                                                              |
   +-----------------------------------------------------------------------+
   ^        ^
   |        |
   |        |
   |    producer_pos = 512
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

3. Reserve event B, size 1024.

   B is allocated at offset 512 with BUSY bit set, and producer_pos is advanced
   to the end of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                            |
   |   A    |        B        |                                            |
   | [BUSY] |      [BUSY]     |                                            |
   +-----------------------------------------------------------------------+
   ^                          ^
   |                          |
   |                          |
   |                   producer_pos = 1536
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

4. Reserve event C, size 2048.

   C is allocated at offset 1536, and producer_pos is advanced to 3584.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   | [BUSY] |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                                                              ^
   |                                                              |
   |                                                              |
   |                                                    producer_pos = 3584
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

5. Submit event A.

   The BUSY bit of A is cleared. B becomes the oldest event to be committed, so
   pending_pos is advanced to 512, the start of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^        ^                                                     ^
   |        |                                                     |
   |        |                                                     |
   |   pending_pos = 512                                  producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

6. Submit event B.

   The BUSY bit of B is cleared, and pending_pos is advanced to the start of C,
   which is now the oldest event to be committed.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |                 |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                          ^                                   ^
   |                          |                                   |
   |                          |                                   |
   |                     pending_pos = 1536               producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

7. Reserve event D, size 1536 (3 * 512).

   There are 2048 bytes not being written between producer_pos (currently 3584)
   and pending_pos, so D is allocated at offset 3584, and producer_pos is advanced
   by 1536 (from 3584 to 5120).

   Since event D will overwrite all bytes of event A and the first 512 bytes of
   event B, overwrite_pos is advanced to the start of event C, the oldest event
   that is not overwritten.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |      [BUSY]     |        |               [BUSY]              | [BUSY] |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   pending_pos = 1536
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |
consumer_pos = 0

8. Reserve event E, size 1024.

   Although there are 512 bytes not being written between producer_pos and
   pending_pos, E cannot be reserved, as it would overwrite the first 512
   bytes of event C, which is still being written.

9. Submit events C and D.

   pending_pos is advanced to the end of D.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |                 |        |                                   |        |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |             pending_pos=5120
   |
consumer_pos = 0

The performance data for overwrite mode will be provided in a follow-up
patch that adds overwrite-mode benchmarks.

A sample of performance data for non-overwrite mode, collected on an x86_64
CPU and an arm64 CPU, before and after this patch, is shown below. As we can
see, no obvious performance regression occurs.

- x86_64 (AMD EPYC 9654)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.623 ± 0.027M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.812 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  7.871 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.703 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.896 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.054 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.864 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.580 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.484 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.369 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.316 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.272 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.239 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.226 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.213 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.193 ± 0.001M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.845 ± 0.036M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.889 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  8.155 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.708 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.918 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.065 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.870 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.582 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.482 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.372 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.323 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.264 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.236 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.209 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.189 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.165 ± 0.002M/s (drops 0.000 ± 0.000M/s)

- arm64 (HiSilicon Kunpeng 920)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.310 ± 0.623M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.947 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.634 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  4.502 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.888 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.372 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.189 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.998 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 3.086 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.845 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.815 ± 0.008M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.771 ± 0.009M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.814 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.752 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.695 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.710 ± 0.006M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.283 ± 0.550M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.993 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.898 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  5.257 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.830 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.528 ± 0.013M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.265 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.990 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 2.929 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.898 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.818 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.789 ± 0.012M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.770 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.651 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.669 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.695 ± 0.009M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251018035738.4039621-2-xukuohai@huaweicloud.com
2025-11-07 14:00:07 -08:00
Mykyta Yatsenko
42995c95b9 bpf: widen dynptr size/offset to 64 bit
Dynptr currently caps size and offset at 24 bits, which isn’t sufficient
for file-backed use cases; even 32 bits can be limiting. Refactor dynptr
helpers/kfuncs to use 64-bit size and offset, ensuring consistency
across the APIs.

This change does not affect internals of xdp, skb or other dynptrs,
which continue to behave as before. Also it does not break binary
compatibility.

The widening enables large-file access support via dynptr, implemented
in the next patches.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Anton Protopopov
49c5e0eef4 libbpf: fix formatting of bpf_object__append_subprog_code
The commit 6c918709bd30 ("libbpf: Refactor bpf_object__reloc_code")
added the bpf_object__append_subprog_code() with incorrect indentations.
Use tabs instead. (This also makes a subsequent commit more readable.)

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251019202145.3944697-14-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Alain Knaff
30599e72bf include: add __poll_t typedef in include/linux/types.h for Android/Termux
On Android/Termux, linux/types.h is included (indirectly) by sys/epoll.h which
depends on this definition to be present.

Signed-off-by: Alain Knaff <github@misc.lka.org.lu>
2025-11-07 12:35:07 -08:00
Andrii Nakryiko
3d451d916f ci: drop tmp.master testing of pahole
It hasn't been updated for a long time, seems abandoned.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
02b3ec9ffc ci: denylist verif_scale_pyperf600 as it now fails with newer Clang
We get "The sequence of 8193 jumps is too complex." with newer Clang.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
c7f77de09d ci: update clang to v21 for test workflow
Update Clang version used.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
2719a398b0 libbpf: fix Github's Makefile for libbpf_utils.c
Drop removed str_error.o from the list of objects to build. Rename
libbpf_errno.o into libbpf_utils.o.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
7e9d669550 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   21aeabb68258ce17b91af113a768760b3a491d93
Checkpoint bpf-next commit: de7342228b7343774d6a9981c2ddbfb5e201044b
Baseline bpf commit:        27861fc720be2c39b861d8bdfb68287f54de6855
Checkpoint bpf commit:      4d920ed684392ae064af62957d6f5a90312dfaf6

Alasdair McWilliam (1):
  rtnetlink: add needed_{head,tail}room attributes

Andrii Nakryiko (5):
  libbpf: make libbpf_errno.c into more generic libbpf_utils.c
  libbpf: remove unused libbpf_strerror_r and STRERR_BUFSIZE
  libbpf: move libbpf_errstr() into libbpf_utils.c
  libbpf: move libbpf_sha256() implementation into libbpf_utils.c
  libbpf: remove linux/unaligned.h dependency for libbpf_sha256()

Christian Brauner (1):
  nsfs: support exhaustive file handles

D. Wythe (1):
  libbpf: Fix error when st-prefix_ops and ops from differ btf

Eric Biggers (2):
  libbpf: Replace AF_ALG with open coded SHA-256
  libbpf: Fix undefined behavior in {get,put}_unaligned_be32()

Hangbin Liu (1):
  bonding: add support for per-port LACP actor priority

Jakub Kicinski (1):
  uapi: wrap compiler_types.h in an ifdef instead of the implicit strip

Jiawei Zhao (2):
  libbpf: Fix USDT SIB argument handling causing unrecognized register
    error
  libbpf: Remove unused args in parse_usdt_note

KP Singh (7):
  bpf: Implement exclusive map creation
  libbpf: Implement SHA256 internal helper
  libbpf: Support exclusive map creation
  bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD
  bpf: Implement signature verification for BPF programs
  libbpf: Update light skeleton for signing
  libbpf: Embed and verify the metadata hash in the loader

Mykyta Yatsenko (1):
  bpf: bpf task work plumbing

Rong Tao (1):
  bpf: Finish constification of 1st parameter of bpf_d_path()

Tony Ambardar (1):
  libbpf: Fix missing #pragma in libbpf_utils.c

 include/uapi/linux/bpf.h     |  24 +++-
 include/uapi/linux/fcntl.h   |   1 +
 include/uapi/linux/if_link.h |   3 +
 include/uapi/linux/stddef.h  |   2 +
 src/bpf.c                    |   6 +-
 src/bpf.h                    |   5 +-
 src/bpf_gen_internal.h       |   2 +
 src/btf.c                    |   1 -
 src/btf_dump.c               |   1 -
 src/elf.c                    |   1 -
 src/features.c               |   1 -
 src/gen_loader.c             |  50 ++++++-
 src/libbpf.c                 | 108 ++++++++++++---
 src/libbpf.h                 |  25 +++-
 src/libbpf.map               |   3 +
 src/libbpf_errno.c           |  75 ----------
 src/libbpf_internal.h        |  19 +++
 src/libbpf_utils.c           | 256 +++++++++++++++++++++++++++++++++++
 src/linker.c                 |   1 -
 src/relo_core.c              |   1 -
 src/ringbuf.c                |   1 -
 src/skel_internal.h          |  76 ++++++++++-
 src/str_error.c              | 104 --------------
 src/str_error.h              |  19 ---
 src/usdt.bpf.h               |  44 +++++-
 src/usdt.c                   |  73 ++++++++--
 26 files changed, 650 insertions(+), 252 deletions(-)
 delete mode 100644 src/libbpf_errno.c
 create mode 100644 src/libbpf_utils.c
 delete mode 100644 src/str_error.c
 delete mode 100644 src/str_error.h

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
e4dc2acd35 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Eric Biggers
2b940bcde1 libbpf: Fix undefined behavior in {get,put}_unaligned_be32()
These violate aliasing rules and may be miscompiled unless
-fno-strict-aliasing is used.  Replace them with the standard memcpy()
solution.  Note that compilers know how to optimize this properly.

Fixes: 4a1c9e544b8d ("libbpf: remove linux/unaligned.h dependency for libbpf_sha256()")
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20251006012037.159295-1-ebiggers@kernel.org
2025-10-06 15:59:27 -07:00
Rong Tao
379ac32f2c bpf: Finish constification of 1st parameter of bpf_d_path()
The commit 1b8abbb12128 ("bpf...d_path(): constify path argument")
constified the first parameter of the bpf_d_path(), but failed to
update it in all places. Finish constification.

Otherwise the selftests fail to build:
.../selftests/bpf/bpf_experimental.h:222:12: error: conflicting types for 'bpf_path_d_path'
  222 | extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym;
      |            ^
.../selftests/bpf/tools/include/vmlinux.h:153922:12: note: previous declaration is here
 153922 | extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __weak __ksym;

Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument")
Signed-off-by: Rong Tao <rongtao@cestc.cn>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
Tony Ambardar
eca524d5a6 libbpf: Fix missing #pragma in libbpf_utils.c
The recent sha256 patch uses a GCC pragma to suppress compile errors for
a packed struct, but omits a needed pragma (see related link) and thus
still raises errors: (e.g. on GCC 12.3 armhf)

libbpf_utils.c:153:29: error: packed attribute causes inefficient alignment for ‘__val’ [-Werror=attributes]
  153 | struct __packed_u32 { __u32 __val; } __attribute__((packed));
      |                             ^~~~~

Resolve by adding the GCC diagnostic pragma to ignore "-Wattributes".

Link: https://lore.kernel.org/bpf/CAP-5=fXURWoZu2j6Y8xQy23i7=DfgThq3WC1RkGFBx-4moQKYQ@mail.gmail.com/

Fixes: 4a1c9e544b8d ("libbpf: remove linux/unaligned.h dependency for libbpf_sha256()")
Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
50d1b8e6b4 libbpf: remove linux/unaligned.h dependency for libbpf_sha256()
linux/unaligned.h include dependency is causing issues for libbpf's
Github mirror due to {get,put}_unaligned_be32() usage.

So get rid of it by implementing custom variants of those macros that
will work both in kernel and Github mirror repos.

Also switch round_up() to roundup(), as the former is not available in
Github mirror (and is just a subtly more specific variant of roundup()
anyways).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
84aad03545 libbpf: move libbpf_sha256() implementation into libbpf_utils.c
Move sha256 implementation out of already large and unwieldy libbpf.c
into libbpf_utils.c where we'll keep reusable helpers.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
6fcb2c1963 libbpf: move libbpf_errstr() into libbpf_utils.c
Get rid of str_err.{c,h} by moving implementation of libbpf_errstr()
into libbpf_utils.c and declarations into libbpf_internal.h.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
ce015f0184 libbpf: remove unused libbpf_strerror_r and STRERR_BUFSIZE
libbpf_strerror_r() is not exposed as public API and neither is it used
inside libbpf itself. Remove it altogether.

Same for STRERR_BUFSIZE, it's just an orphaned leftover constant which
we missed to clean up some time earlier.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
33021bb9dd libbpf: make libbpf_errno.c into more generic libbpf_utils.c
Libbpf is missing one convenient place to put common "utils"-like code
that is generic and usable from multiple places. Use libbpf_errno.c as
the base for more generic libbpf_utils.c.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Alasdair McWilliam
97fbf1d106 rtnetlink: add needed_{head,tail}room attributes
Various network interface types make use of needed_{head,tail}room values
to efficiently reserve buffer space for additional encapsulation headers,
such as VXLAN, Geneve, IPSec, etc. However, it is not currently possible
to query these values in a generic way.

Introduce ability to query the needed_{head,tail}room values of a network
device via rtnetlink, such that applications that may wish to use these
values can do so.

For example, Cilium agent iterates over present devices based on user config
(direct routing, vxlan, geneve, wireguard etc.) and in future will configure
netkit in order to expose the needed_{head,tail}room into K8s pods. See
b9ed315d3c4c ("netkit: Allow for configuring needed_{head,tail}room").

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alasdair McWilliam <alasdair@mcwilliam.dev>
Reviewed-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://patch.msgid.link/20250917095543.14039-1-alasdair@mcwilliam.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-06 15:59:27 -07:00
Hangbin Liu
017c96d6e1 bonding: add support for per-port LACP actor priority
Introduce a new netlink attribute 'actor_port_prio' to allow setting
the LACP actor port priority on a per-slave basis. This extends the
existing bonding infrastructure to support more granular control over
LACP negotiations.

The priority value is embedded in LACPDU packets and will be used by
subsequent patches to influence aggregator selection policies.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://patch.msgid.link/20250902064501.360822-2-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-06 15:59:27 -07:00
Jakub Kicinski
1f098bc568 uapi: wrap compiler_types.h in an ifdef instead of the implicit strip
The uAPI stddef header includes compiler_types.h, a kernel-only
header, to make sure that kernel definitions of annotations
like __counted_by() take precedence.

There is a hack in scripts/headers_install.sh which strips includes
of compiler.h and compiler_types.h when installing uAPI headers.
While explicit handling makes sense for compiler.h, which is included
all over the uAPI, compiler_types.h is only included by stddef.h
(within the uAPI, obviously it's included in kernel code a lot).

Remove the stripping from scripts/headers_install.sh and wrap
the include of compiler_types.h in #ifdef __KERNEL__ instead.
This should be equivalent functionally, but is easier to understand
to a casual reader of the code. It also makes it easier to work
with kernel headers directly from under tools/

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250825201828.2370083-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-06 15:59:27 -07:00
Eric Biggers
367798a9cf libbpf: Replace AF_ALG with open coded SHA-256
Reimplement libbpf_sha256() using some basic SHA-256 C code.  This
eliminates the newly-added dependency on AF_ALG, which is a problematic
UAPI that is not supported by all kernels.

Make libbpf_sha256() return void, since it can no longer fail.  This
simplifies some callers.  Also drop the unnecessary 'sha_out_sz'
parameter.  Finally, also fix the typo in "compute_sha_udpate_offsets".

Fixes: c297fe3e9f99 ("libbpf: Implement SHA256 internal helper")
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Link: https://lore.kernel.org/r/20250928003833.138407-1-ebiggers@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
D. Wythe
f9369ca839 libbpf: Fix error when st-prefix_ops and ops from differ btf
When a module registers a struct_ops, the struct_ops type and its
corresponding map_value type ("bpf_struct_ops_") may reside in different
btf objects, here are four possible cases:

+--------+---------------+-------------+---------------------------------+
|        |bpf_struct_ops_| xxx_ops     |                                 |
+--------+---------------+-------------+---------------------------------+
| case 0 | btf_vmlinux   | btf_vmlinux | be used and reg only in vmlinux |
+--------+---------------+-------------+---------------------------------+
| case 1 | btf_vmlinux   | mod_btf     | INVALID                         |
+--------+---------------+-------------+---------------------------------+
| case 2 | mod_btf       | btf_vmlinux | reg in mod but be used both in  |
|        |               |             | vmlinux and mod.                |
+--------+---------------+-------------+---------------------------------+
| case 3 | mod_btf       | mod_btf     | be used and reg only in mod     |
+--------+---------------+-------------+---------------------------------+

Currently we figure out the mod_btf by searching with the struct_ops type,
which makes it impossible to figure out the mod_btf when the struct_ops
type is in btf_vmlinux while its corresponding map_value type is in
mod_btf (case 2).

The fix is to use the corresponding map_value type ("bpf_struct_ops_")
as the lookup anchor instead of the struct_ops type to figure out the
`btf` and `mod_btf` via find_ksym_btf_id(), and then we can locate
the kern_type_id via btf__find_by_name_kind() with the `btf` we just
obtained from find_ksym_btf_id().

With this change the lookup obtains the correct btf and mod_btf for case 2,
preserves correct behavior for other valid cases, and still fails as
expected for the invalid scenario (case 1).

Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20250926071751.108293-1-alibuda@linux.alibaba.com
2025-10-06 15:59:27 -07:00
Mykyta Yatsenko
70619ad135 bpf: bpf task work plumbing
This patch adds necessary plumbing in verifier, syscall and maps to
support handling new kfunc bpf_task_work_schedule and kernel structure
bpf_task_work. The idea is similar to how we already handle bpf_wq and
bpf_timer.
verifier changes validate calls to bpf_task_work_schedule to make sure
it is safe and expected invariants hold.
btf part is required to detect bpf_task_work structure inside map value
and store its offset, which will be used in the next patch to calculate
key and value addresses.
arraymap and hashtab changes are needed to handle freeing of the
bpf_task_work: run code needed to deinitialize it, for example cancel
task_work callback if possible.
The use of bpf_task_work and proper implementation for kfuncs are
introduced in the next patch.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
56cc32b5e3 libbpf: Embed and verify the metadata hash in the loader
To fulfill the BPF signing contract, represented as Sig(I_loader ||
H_meta), the generated trusted loader program must verify the integrity
of the metadata. This signature cryptographically binds the loader's
instructions (I_loader) to a hash of the metadata (H_meta).

The verification process is embedded directly into the loader program.
Upon execution, the loader loads the runtime hash from struct bpf_map
i.e. BPF_PSEUDO_MAP_IDX and compares this runtime hash against an
expected hash value that has been hardcoded directly by
bpf_obj__gen_loader.

The load from bpf_map can be improved by calling
BPF_OBJ_GET_INFO_BY_FD from the kernel context after BPF_OBJ_GET_INFO_BY_FD
has been updated for being called from the kernel context.

The following instructions are generated:

    ld_imm64 r1, const_ptr_to_map // insn[0].src_reg == BPF_PSEUDO_MAP_IDX
    r2 = *(u64 *)(r1 + 0);
    ld_imm64 r3, sha256_of_map_part1 // constant precomputed by
bpftool (part of H_meta)
    if r2 != r3 goto out;

    r2 = *(u64 *)(r1 + 8);
    ld_imm64 r3, sha256_of_map_part2 // (part of H_meta)
    if r2 != r3 goto out;

    r2 = *(u64 *)(r1 + 16);
    ld_imm64 r3, sha256_of_map_part3 // (part of H_meta)
    if r2 != r3 goto out;

    r2 = *(u64 *)(r1 + 24);
    ld_imm64 r3, sha256_of_map_part4 // (part of H_meta)
    if r2 != r3 goto out;
    ...

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250921160120.9711-4-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
986d033976 libbpf: Update light skeleton for signing
* The metadata map is created as an exclusive map (with an
excl_prog_hash). This restricts map access exclusively to the signed
loader program, preventing tampering by other processes.

* The map is then frozen, making it read-only from userspace.

* BPF_OBJ_GET_INFO_BY_ID instructs the kernel to compute the hash of the
  metadata map (H') and store it in bpf_map->sha.

* The loader is then loaded with the signature which is then verified by
  the kernel.

Loading signed programs prebuilt into the kernel is not currently
supported. This can be supported by enabling BPF_OBJ_GET_INFO_BY_ID to be
called from the kernel.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250921160120.9711-3-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
decfae3a5d bpf: Implement signature verification for BPF programs
This patch extends the BPF_PROG_LOAD command by adding three new fields
to `union bpf_attr` in the user-space API:

  - signature: A pointer to the signature blob.
  - signature_size: The size of the signature blob.
  - keyring_id: The serial number of a loaded kernel keyring (e.g.,
    the user or session keyring) containing the trusted public keys.

When a BPF program is loaded with a signature, the kernel:

1.  Retrieves the trusted keyring using the provided `keyring_id`.
2.  Verifies the supplied signature against the BPF program's
    instruction buffer.
3.  If the signature is valid and was generated by a key in the trusted
    keyring, the program load proceeds.
4.  If no signature is provided, the load proceeds as before, allowing
    for backward compatibility. LSMs can choose to restrict unsigned
    programs and implement a security policy.
5.  If signature verification fails for any reason,
    the program is not loaded.

Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
a6bb074359 bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD
Currently only array maps are supported, but the implementation can be
extended for other maps and objects. The hash is memoized only for
exclusive and frozen maps as their content is stable until the exclusive
program modifies the map.

This is required for BPF signing, enabling a trusted loader program to
verify a map's integrity. The loader retrieves
the map's runtime hash from the kernel and compares it against an
expected hash computed at build time.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
59ed98f687 libbpf: Support exclusive map creation
Implement setters and getters that allow map to be registered as
exclusive to the specified program. The registration should be done
before the exclusive program is loaded.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-5-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
1202ada5c1 libbpf: Implement SHA256 internal helper
Use AF_ALG sockets to not have libbpf depend on OpenSSL. The helper is
used for the loader generation code to embed the metadata hash in the
loader program and also by the bpf_map__make_exclusive API to calculate
the hash of the program the map is exclusive to.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-4-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
8347a49c62 bpf: Implement exclusive map creation
Exclusive maps allow maps to only be accessed by a program with a
matching hash, which is specified in the excl_prog_hash attr.

For the signing use-case, this allows the trusted loader program
to load the map and verify its integrity.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
Jiawei Zhao
fcc06c3da4 libbpf: Remove unused args in parse_usdt_note
Remove unused 'elf' and 'path' parameters from parse_usdt_note function
signature. These parameters are not referenced within the function body
and only add unnecessary complexity.

The function only requires the note header, data buffer, offsets, and
output structure to perform USDT note parsing.

Update function declaration, definition, and the single call site in
collect_usdt_targets() to match the simplified signature.

This is a safe internal cleanup as parse_usdt_note is a static function.

Signed-off-by: Jiawei Zhao <phoenix500526@163.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250904030525.1932293-1-phoenix500526@163.com
2025-10-06 15:59:27 -07:00
Jiawei Zhao
9236e137e9 libbpf: Fix USDT SIB argument handling causing unrecognized register error
On x86-64, USDT arguments can be specified using Scale-Index-Base (SIB)
addressing, e.g. "1@-96(%rbp,%rax,8)". The current USDT implementation
in libbpf cannot parse this format, causing `bpf_program__attach_usdt()`
to fail with -ENOENT (unrecognized register).

This patch fixes this by implementing the necessary changes:
- add correct handling for SIB-addressed arguments in `bpf_usdt_arg`.
- add adaptive support to `__bpf_usdt_arg_type` and
  `__bpf_usdt_arg_spec` to represent SIB addressing parameters.

Signed-off-by: Jiawei Zhao <phoenix500526@163.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250827053128.1301287-2-phoenix500526@163.com
2025-10-06 15:59:27 -07:00
Christian Brauner
2d769c3bc5 nsfs: support exhaustive file handles
Pidfd file handles are exhaustive meaning they don't require a handle on
another pidfd to pass to open_by_handle_at() so it can derive the
filesystem to decode in. Instead it can be derived from the file
handle itself. The same is possible for namespace file handles.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
9705048c0e include: implement be{32,64}_to_cpu() and cpu_to_be{32,64}() macros
libbpf is now using above macros for libbpf_sha256() implementation,
make them available in Github repo.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
6920913226 include: add BPF_JMP_REG() macro implementation
libbpf's gen_loader is now using BPF_JMP_REG(), so add it to
include/linux/filter.h.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
519d65b564 sync: fix sync script
Add missing {} for find -exec invocation.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
3f077472ee sync: sync stddef.h UAPI header
It contains __struct_group() macro needed for pkt_cls.h UAPI header...

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 17:04:39 -07:00
Andrii Nakryiko
0c33cc07f1 sync: sync networking UAPI headers
Some of them were outdated, again due to originally using UAPI headers
from tools/ subdirectory in Linux repo.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 15:43:18 -07:00
Andrii Nakryiko
289e4a2160 sync: add back fcntl.h and openat2.h UAPI headers
They were removed during one of the syncs because Linux repo's tools/
versions of UAPI headers were removed (as they were not needed for perf
anymore). This is not right for libbpf, so add them back. And moving
forward, we'll sync them from Linux repo original UAPIs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 15:43:18 -07:00
Andrii Nakryiko
67901a67cb sync: update sync-kernel.sh to fetch original UAPI headers
Instead of UAPI headers copies from tools/ subdir in kernel repo, fetch
all the original UAPI headers straight from include/uapi/linux location.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 15:43:18 -07:00
48 changed files with 2617 additions and 475 deletions

View File

@@ -19,13 +19,8 @@ jobs:
- kernel: 'LATEST'
runs_on: 'ubuntu-24.04'
arch: 'x86_64'
llvm-version: '18'
llvm-version: '21'
pahole: 'master'
- kernel: 'LATEST'
runs_on: 'ubuntu-24.04'
arch: 'x86_64'
llvm-version: '18'
pahole: 'tmp.master'
name: Linux ${{ matrix.kernel }} llvm-${{ matrix.llvm-version }}
uses: ./.github/workflows/vmtest.yml
with:

View File

@@ -8,6 +8,7 @@ Dan Carpenter <error27@gmail.com> <dan.carpenter@oracle.com>
Geliang Tang <geliang@kernel.org> <geliang.tang@suse.com>
Herbert Xu <herbert@gondor.apana.org.au>
Jakub Kicinski <kuba@kernel.org> <jakub.kicinski@netronome.com>
Jean-Philippe Brucker <jpb@kernel.org> <jean-philippe@linaro.org>
Jesper Dangaard Brouer <hawk@kernel.org> <brouer@redhat.com>
Kees Cook <kees@kernel.org> <keescook@chromium.org>
Kuniyuki Iwashima <kuniyu@google.com> <kuniyu@amazon.co.jp>

View File

@@ -1 +1 @@
27861fc720be2c39b861d8bdfb68287f54de6855
22cc16c04b7893d8fc22810599f49a305d600b9e

View File

@@ -1 +1 @@
21aeabb68258ce17b91af113a768760b3a491d93
08a7491843224f8b96518fbe70d9e48163046054

View File

@@ -1,5 +1,6 @@
# TEMPORARY
btf_dump/btf_dump: syntax
bpf_cookie/perf_event
kprobe_multi_bench_attach
core_reloc/enum64val
core_reloc/size___diff_sz
@@ -13,3 +14,4 @@ tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46
migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation
migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation
verify_pkcs7_sig # keeps failing
verif_scale_pyperf600 # fails on newer Clangs

View File

@@ -100,10 +100,26 @@ described in more detail in the footnotes.
| | | ``uretprobe.s+`` [#uprobe]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``usdt+`` [#usdt]_ | |
+ + +----------------------------------+-----------+
| | | ``usdt.s+`` [#usdt]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
+ + +----------------------------------+-----------+
| | | ``kretprobe.multi+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_SESSION`` | ``kprobe.session+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_MULTI`` | ``uprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.multi.s+`` [#upmul]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi.s+`` [#upmul]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_SESSION`` | ``uprobe.session+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.session.s+`` [#upmul]_ | Yes |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
@@ -219,6 +235,8 @@ described in more detail in the footnotes.
non-negative integer.
.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``.
.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``.
.. [#upmul] The ``uprobe.multi`` attach format is ``uprobe.multi[.s]/<path>:<function-pattern>``
where ``function-pattern`` supports ``*`` and ``?`` wildcards.
.. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``.
.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern``
supports ``*`` and ``?`` wildcards. Valid characters for pattern are

View File

@@ -123,6 +123,14 @@
BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \
MAP_FD, VALUE_OFF)
#define BPF_JMP_REG(OP, DST, SRC, OFF) \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = 0 })
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_OP(OP) | BPF_K, \

View File

@@ -43,4 +43,18 @@
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define be32_to_cpu(x) __builtin_bswap32(x)
#define cpu_to_be32(x) __builtin_bswap32(x)
#define be64_to_cpu(x) __builtin_bswap64(x)
#define cpu_to_be64(x) __builtin_bswap64(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define be32_to_cpu(x) (x)
#define cpu_to_be32(x) (x)
#define be64_to_cpu(x) (x)
#define cpu_to_be64(x) (x)
#else
# error "__BYTE_ORDER__ undefined or invalid"
#endif
#endif

View File

@@ -30,4 +30,7 @@ struct list_head {
struct list_head *next, *prev;
};
/* needed on Android/Termux, where this linux/types.h is included by other include files */
typedef unsigned __bitwise __poll_t;
#endif

View File

@@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
/*
* Walks the immediate children of the specified parent
* cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
* BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
* the iterator does not include the specified parent as one of the
* returned iterator elements.
*/
BPF_CGROUP_ITER_CHILDREN,
};
union bpf_iter_link_info {
@@ -918,6 +926,16 @@ union bpf_iter_link_info {
* Number of bytes read from the stream on success, or -1 if an
* error occurred (in which case, *errno* is set appropriately).
*
* BPF_PROG_ASSOC_STRUCT_OPS
* Description
* Associate a BPF program with a struct_ops map. The struct_ops
* map is identified by *map_fd* and the BPF program is
* identified by *prog_fd*.
*
* Return
* 0 on success or -1 if an error occurred (in which case,
* *errno* is set appropriately).
*
* NOTES
* eBPF objects (maps and programs) can be shared between processes.
*
@@ -974,6 +992,7 @@ enum bpf_cmd {
BPF_PROG_BIND_MAP,
BPF_TOKEN_CREATE,
BPF_PROG_STREAM_READ_BY_FD,
BPF_PROG_ASSOC_STRUCT_OPS,
__MAX_BPF_CMD,
};
@@ -1026,6 +1045,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
BPF_MAP_TYPE_ARENA,
BPF_MAP_TYPE_INSN_ARRAY,
__MAX_BPF_MAP_TYPE
};
@@ -1133,6 +1153,7 @@ enum bpf_attach_type {
BPF_NETKIT_PEER,
BPF_TRACE_KPROBE_SESSION,
BPF_TRACE_UPROBE_SESSION,
BPF_TRACE_FSESSION,
__MAX_BPF_ATTACH_TYPE
};
@@ -1372,6 +1393,8 @@ enum {
BPF_NOEXIST = 1, /* create new element if it didn't exist */
BPF_EXIST = 2, /* update existing element */
BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */
BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */
};
/* flags for BPF_MAP_CREATE command */
@@ -1430,6 +1453,9 @@ enum {
/* Do not translate kernel bpf_arena pointers to user pointers */
BPF_F_NO_USER_CONV = (1U << 18),
/* Enable BPF ringbuf overwrite mode */
BPF_F_RB_OVERWRITE = (1U << 19),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1522,6 +1548,12 @@ union bpf_attr {
* If provided, map_flags should have BPF_F_TOKEN_FD flag set.
*/
__s32 map_token_fd;
/* Hash of the program that has exclusive access to the map.
*/
__aligned_u64 excl_prog_hash;
/* Size of the passed excl_prog_hash. */
__u32 excl_prog_hash_size;
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */
@@ -1605,6 +1637,16 @@ union bpf_attr {
* continuous.
*/
__u32 fd_array_cnt;
/* Pointer to a buffer containing the signature of the BPF
* program.
*/
__aligned_u64 signature;
/* Size of the signature buffer in bytes. */
__u32 signature_size;
/* ID of the kernel keyring to be used for signature
* verification.
*/
__s32 keyring_id;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1874,6 +1916,12 @@ union bpf_attr {
__u32 prog_fd;
} prog_stream_read;
struct {
__u32 map_fd;
__u32 prog_fd;
__u32 flags;
} prog_assoc_struct_ops;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -4875,7 +4923,7 @@ union bpf_attr {
*
* **-ENOENT** if the bpf_local_storage cannot be found.
*
* long bpf_d_path(struct path *path, char *buf, u32 sz)
* long bpf_d_path(const struct path *path, char *buf, u32 sz)
* Description
* Return full path for given **struct path** object, which
* needs to be the kernel BTF *path* object. The path is
@@ -5602,7 +5650,7 @@ union bpf_attr {
* Return
* *sk* if casting is valid, or **NULL** otherwise.
*
* long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
* long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr)
* Description
* Get a dynptr to local memory *data*.
*
@@ -5645,7 +5693,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
* long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5655,7 +5703,7 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
@@ -5676,7 +5724,7 @@ union bpf_attr {
* is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
* other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len)
* Description
* Get a pointer to the underlying dynptr data.
*
@@ -6215,6 +6263,7 @@ enum {
BPF_RB_RING_SIZE = 1,
BPF_RB_CONS_POS = 2,
BPF_RB_PROD_POS = 3,
BPF_RB_OVERWRITE_POS = 4,
};
/* BPF ring buffer constants */
@@ -6666,6 +6715,8 @@ struct bpf_map_info {
__u32 btf_value_type_id;
__u32 btf_vmlinux_id;
__u64 map_extra;
__aligned_u64 hash;
__u32 hash_size;
} __attribute__((aligned(8)));
struct bpf_btf_info {
@@ -7182,6 +7233,8 @@ enum {
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */
};
enum {
@@ -7418,6 +7471,10 @@ struct bpf_timer {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_task_work {
__u64 __opaque;
} __attribute__((aligned(8)));
struct bpf_wq {
__u64 __opaque[2];
} __attribute__((aligned(8)));
@@ -7623,4 +7680,24 @@ enum bpf_kfunc_flags {
BPF_F_PAD_ZEROS = (1ULL << 0),
};
/*
* Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type.
*
* Before the map is used the orig_off field should point to an
* instruction inside the program being loaded. The other fields
* must be set to 0.
*
* After the program is loaded, the xlated_off will be adjusted
* by the verifier to point to the index of the original instruction
* in the xlated program. If the instruction is deleted, it will
* be set to (u32)-1. The jitted_off will be set to the corresponding
* offset in the jitted image of the program.
*/
struct bpf_insn_array_value {
__u32 orig_off;
__u32 xlated_off;
__u32 jitted_off;
__u32 :32;
};
#endif /* __LINUX_BPF_H__ */

193
include/uapi/linux/fcntl.h Normal file
View File

@@ -0,0 +1,193 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _LINUX_FCNTL_H
#define _LINUX_FCNTL_H
#include <asm/fcntl.h>
#include <linux/openat2.h>
#include <linux/types.h>
#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0)
#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1)
/*
* Request notifications on a directory.
* See below for events that may be notified.
*/
#define F_NOTIFY (F_LINUX_SPECIFIC_BASE + 2)
#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3)
/* Was the file just created? */
#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4)
/*
* Cancel a blocking posix lock; internal use only until we expose an
* asynchronous lock api to userspace:
*/
#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5)
/* Create a file descriptor with FD_CLOEXEC set. */
#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6)
/*
* Set and get of pipe page size array
*/
#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
/*
* Set/Get seals
*/
#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
/*
* Types of seals
*/
#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
/* (1U << 31) is reserved for signed error codes */
/*
* Set/Get write life time hints. {GET,SET}_RW_HINT operate on the
* underlying inode, while {GET,SET}_FILE_RW_HINT operate only on
* the specific file.
*/
#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11)
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
*/
#define RWH_WRITE_LIFE_NOT_SET 0
#define RWH_WRITE_LIFE_NONE 1
#define RWH_WRITE_LIFE_SHORT 2
#define RWH_WRITE_LIFE_MEDIUM 3
#define RWH_WRITE_LIFE_LONG 4
#define RWH_WRITE_LIFE_EXTREME 5
/*
* The RWF_ spelling below is retained from the first versions of the
* patch set that introduced the feature, see commit
* v4.13-rc1~212^2~51.
*/
#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
/* Set/Get delegations */
#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15)
#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16)
/*
 * Argument structure for F_GETDELEG and F_SETDELEG.
 * d_flags and __pad must both be 0; d_type is one of F_RDLCK, F_WRLCK
 * or F_UNLCK.
 */
struct delegation {
__u32 d_flags; /* Must be 0 */
__u16 d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */
__u16 __pad; /* Must be 0 */
};
/*
* Types of directory notifications that may be requested.
*/
#define DN_ACCESS 0x00000001 /* File accessed */
#define DN_MODIFY 0x00000002 /* File modified */
#define DN_CREATE 0x00000004 /* File created */
#define DN_DELETE 0x00000008 /* File removed */
#define DN_RENAME 0x00000010 /* File renamed */
#define DN_ATTRIB 0x00000020 /* File changed attributes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
/* Reserved kernel ranges [-100], [-10000, -40000]. */
#define AT_FDCWD -100 /* Special value for dirfd used to
indicate openat should use the
current working directory. */
/*
* The concept of process and threads in userland and the kernel is a confusing
* one - within the kernel every thread is a 'task' with its own individual PID,
* however from userland's point of view threads are grouped by a single PID,
* which is that of the 'thread group leader', typically the first thread
* spawned.
*
* To cut the Gordian knot, for internal kernel usage, we refer to
* PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
* perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
* group leader...
*/
#define PIDFD_SELF_THREAD -10000 /* Current thread. */
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */
/* Reserved for per-syscall flags 0xff. */
#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic
links. */
/* Reserved for per-syscall flags 0x200 */
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount
traversal. */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative
pathname to operate on dirfd
directly. */
/*
* These flags are currently statx(2)-specific, but they could be made generic
* in the future and so they should not be used for other per-syscall flags.
*/
#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */
#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
/*
* Per-syscall flags for the *at(2) family of syscalls.
*
* These are flags that are so syscall-specific that a user passing these flags
* to the wrong syscall is so "clearly wrong" that we can safely call such
* usage "undefined behaviour".
*
* For example, the constants AT_REMOVEDIR and AT_EACCESS have the same value.
* AT_EACCESS is meaningful only to faccessat, while AT_REMOVEDIR is meaningful
* only to unlinkat. The two functions do completely different things and
* therefore, the flags can be allowed to overlap. For example, passing
* AT_REMOVEDIR to faccessat would be undefined behavior and thus treating it
* equivalent to AT_EACCESS is valid undefined behavior.
*
* Note for implementers: When picking a new per-syscall AT_* flag, try to
* reuse already existing flags first. This leaves us with as many unused bits
* as possible, so we can use them for generic bits in the future if necessary.
*/
/* Flags for renameat2(2) (must match legacy RENAME_* flags). */
#define AT_RENAME_NOREPLACE 0x0001
#define AT_RENAME_EXCHANGE 0x0002
#define AT_RENAME_WHITEOUT 0x0004
/* Flag for faccessat(2). */
#define AT_EACCESS 0x200 /* Test access permitted for
effective IDs, not real IDs. */
/* Flag for unlinkat(2). */
#define AT_REMOVEDIR 0x200 /* Remove directory instead of
unlinking file. */
/* Flags for name_to_handle_at(2). */
#define AT_HANDLE_FID 0x200 /* File handle is needed to compare
object identity and may not be
usable with open_by_handle_at(2). */
#define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */
#define AT_HANDLE_CONNECTABLE 0x002 /* Request a connectable file handle */
/* Flags for execveat(2). */
#define AT_EXECVE_CHECK 0x10000 /* Only perform a check if execution
would be allowed. */
#endif /* _LINUX_FCNTL_H */

View File

@@ -378,6 +378,9 @@ enum {
IFLA_GRO_IPV4_MAX_SIZE,
IFLA_DPLL_PIN,
IFLA_MAX_PACING_OFFLOAD_HORIZON,
IFLA_NETNS_IMMUTABLE,
IFLA_HEADROOM,
IFLA_TAILROOM,
__IFLA_MAX
};
@@ -1396,6 +1399,8 @@ enum {
IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
IFLA_VXLAN_LOCALBYPASS,
IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
IFLA_VXLAN_RESERVED_BITS,
IFLA_VXLAN_MC_ROUTE,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
@@ -1437,6 +1442,7 @@ enum {
IFLA_GENEVE_TTL_INHERIT,
IFLA_GENEVE_DF,
IFLA_GENEVE_INNER_PROTO_INHERIT,
IFLA_GENEVE_PORT_RANGE,
__IFLA_GENEVE_MAX
};
#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
@@ -1449,6 +1455,11 @@ enum ifla_geneve_df {
GENEVE_DF_MAX = __GENEVE_DF_END - 1,
};
/*
 * A pair of 16-bit bounds, both declared __be16.
 * NOTE(review): presumably the payload of IFLA_GENEVE_PORT_RANGE, with
 * low <= high expected - confirm against the geneve driver.
 */
struct ifla_geneve_port_range {
__be16 low; /* lower bound of the range */
__be16 high; /* upper bound of the range */
};
/* Bareudp section */
enum {
IFLA_BAREUDP_UNSPEC,
@@ -1526,6 +1537,7 @@ enum {
IFLA_BOND_MISSED_MAX,
IFLA_BOND_NS_IP6_TARGET,
IFLA_BOND_COUPLED_CONTROL,
IFLA_BOND_BROADCAST_NEIGH,
__IFLA_BOND_MAX,
};
@@ -1554,6 +1566,7 @@ enum {
IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE,
IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE,
IFLA_BOND_SLAVE_PRIO,
IFLA_BOND_SLAVE_ACTOR_PORT_PRIO,
__IFLA_BOND_SLAVE_MAX,
};
@@ -1960,6 +1973,7 @@ struct ifla_rmnet_flags {
enum {
IFLA_MCTP_UNSPEC,
IFLA_MCTP_NET,
IFLA_MCTP_PHYS_BINDING,
__IFLA_MCTP_MAX,
};
@@ -1977,4 +1991,19 @@ enum {
#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1)
/* OVPN section */
enum ovpn_mode {
OVPN_MODE_P2P,
OVPN_MODE_MP,
};
enum {
IFLA_OVPN_UNSPEC,
IFLA_OVPN_MODE,
__IFLA_OVPN_MAX,
};
#define IFLA_OVPN_MAX (__IFLA_OVPN_MAX - 1)
#endif /* _LINUX_IF_LINK_H */

View File

@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/netdev.yaml */
/* YNL-GEN uapi header */
/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#ifndef _LINUX_NETDEV_H
#define _LINUX_NETDEV_H
@@ -80,6 +81,7 @@ enum netdev_qstats_scope {
enum netdev_napi_threaded {
NETDEV_NAPI_THREADED_DISABLED,
NETDEV_NAPI_THREADED_ENABLED,
NETDEV_NAPI_THREADED_BUSY_POLL,
};
enum {

View File

@@ -2,7 +2,7 @@
#ifndef __LINUX_NETLINK_H
#define __LINUX_NETLINK_H
#include <linux/kernel.h>
#include <linux/const.h>
#include <linux/socket.h> /* for __kernel_sa_family_t */
#include <linux/types.h>
@@ -20,7 +20,7 @@
#define NETLINK_CONNECTOR 11
#define NETLINK_NETFILTER 12 /* netfilter subsystem */
#define NETLINK_IP6_FW 13
#define NETLINK_DNRTMSG 14 /* DECnet routing messages */
#define NETLINK_DNRTMSG 14 /* DECnet routing messages (obsolete) */
#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */
#define NETLINK_GENERIC 16
/* leave room for NETLINK_DM (DM Events) */
@@ -41,12 +41,20 @@ struct sockaddr_nl {
__u32 nl_groups; /* multicast groups mask */
};
/**
* struct nlmsghdr - fixed format metadata header of Netlink messages
* @nlmsg_len: Length of message including header
* @nlmsg_type: Message content type
* @nlmsg_flags: Additional flags
* @nlmsg_seq: Sequence number
* @nlmsg_pid: Sending process port ID
*/
struct nlmsghdr {
__u32 nlmsg_len; /* Length of message including header */
__u16 nlmsg_type; /* Message content */
__u16 nlmsg_flags; /* Additional flags */
__u32 nlmsg_seq; /* Sequence number */
__u32 nlmsg_pid; /* Sending process port ID */
__u32 nlmsg_len;
__u16 nlmsg_type;
__u16 nlmsg_flags;
__u32 nlmsg_seq;
__u32 nlmsg_pid;
};
/* Flags values */
@@ -54,7 +62,7 @@ struct nlmsghdr {
#define NLM_F_REQUEST 0x01 /* It is request message. */
#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */
#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */
#define NLM_F_ECHO 0x08 /* Echo this request */
#define NLM_F_ECHO 0x08 /* Receive resulting notifications */
#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */
#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */
@@ -72,6 +80,7 @@ struct nlmsghdr {
/* Modifiers to DELETE request */
#define NLM_F_NONREC 0x100 /* Do not delete recursively */
#define NLM_F_BULK 0x200 /* Delete multiple objects */
/* Flags for ACK message */
#define NLM_F_CAPPED 0x100 /* request was capped */
@@ -91,9 +100,10 @@ struct nlmsghdr {
#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN)
#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len))
#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0)))
#define NLMSG_DATA(nlh) ((void *)(((char *)nlh) + NLMSG_HDRLEN))
#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \
(struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len)))
(struct nlmsghdr *)(((char *)(nlh)) + \
NLMSG_ALIGN((nlh)->nlmsg_len)))
#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
(nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
(nlh)->nlmsg_len <= (len))
@@ -129,6 +139,11 @@ struct nlmsgerr {
* @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to
* be used - in the success case - to identify a created
* object or operation or similar (binary)
* @NLMSGERR_ATTR_POLICY: policy for a rejected attribute
* @NLMSGERR_ATTR_MISS_TYPE: type of a missing required attribute,
* %NLMSGERR_ATTR_MISS_NEST will not be present if the attribute was
* missing at the message level
* @NLMSGERR_ATTR_MISS_NEST: offset of the nest where attribute was missing
* @__NLMSGERR_ATTR_MAX: number of attributes
* @NLMSGERR_ATTR_MAX: highest attribute number
*/
@@ -137,6 +152,9 @@ enum nlmsgerr_attrs {
NLMSGERR_ATTR_MSG,
NLMSGERR_ATTR_OFFS,
NLMSGERR_ATTR_COOKIE,
NLMSGERR_ATTR_POLICY,
NLMSGERR_ATTR_MISS_TYPE,
NLMSGERR_ATTR_MISS_NEST,
__NLMSGERR_ATTR_MAX,
NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1
@@ -249,4 +267,117 @@ struct nla_bitfield32 {
__u32 selector;
};
/*
* policy descriptions - it's specific to each family how this is used
* Normally, it should be retrieved via a dump inside another attribute
* specifying where it applies.
*/
/**
* enum netlink_attribute_type - type of an attribute
* @NL_ATTR_TYPE_INVALID: unused
* @NL_ATTR_TYPE_FLAG: flag attribute (present/not present)
* @NL_ATTR_TYPE_U8: 8-bit unsigned attribute
* @NL_ATTR_TYPE_U16: 16-bit unsigned attribute
* @NL_ATTR_TYPE_U32: 32-bit unsigned attribute
* @NL_ATTR_TYPE_U64: 64-bit unsigned attribute
* @NL_ATTR_TYPE_S8: 8-bit signed attribute
* @NL_ATTR_TYPE_S16: 16-bit signed attribute
* @NL_ATTR_TYPE_S32: 32-bit signed attribute
* @NL_ATTR_TYPE_S64: 64-bit signed attribute
* @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified
* @NL_ATTR_TYPE_STRING: string, min/max length may be specified
* @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string,
* min/max length may be specified
* @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute
* consists of sub-attributes. The nested policy and maxtype
* inside may be specified.
* @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this
* attribute contains sub-attributes whose type is irrelevant
* (just used to separate the array entries) and each such array
* entry has attributes again, the policy for those inner ones
* and the corresponding maxtype may be specified.
* @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute
* @NL_ATTR_TYPE_SINT: 32-bit or 64-bit signed attribute, aligned to 4B
* @NL_ATTR_TYPE_UINT: 32-bit or 64-bit unsigned attribute, aligned to 4B
*/
enum netlink_attribute_type {
NL_ATTR_TYPE_INVALID,
NL_ATTR_TYPE_FLAG,
NL_ATTR_TYPE_U8,
NL_ATTR_TYPE_U16,
NL_ATTR_TYPE_U32,
NL_ATTR_TYPE_U64,
NL_ATTR_TYPE_S8,
NL_ATTR_TYPE_S16,
NL_ATTR_TYPE_S32,
NL_ATTR_TYPE_S64,
NL_ATTR_TYPE_BINARY,
NL_ATTR_TYPE_STRING,
NL_ATTR_TYPE_NUL_STRING,
NL_ATTR_TYPE_NESTED,
NL_ATTR_TYPE_NESTED_ARRAY,
NL_ATTR_TYPE_BITFIELD32,
NL_ATTR_TYPE_SINT,
NL_ATTR_TYPE_UINT,
};
/**
* enum netlink_policy_type_attr - policy type attributes
* @NL_POLICY_TYPE_ATTR_UNSPEC: unused
* @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute,
* &enum netlink_attribute_type (U32)
* @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed
* integers (S64)
* @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed
* integers (S64)
* @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned
* integers (U64)
* @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned
* integers (U64)
* @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary
* attributes, no minimum if not given (U32)
* @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary
* attributes, no maximum if not given (U32)
* @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and
* nested array types (U32)
* @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy
* attribute for nested and nested array types, this can
* in theory be < the size of the policy pointed to by
* the index, if limited inside the nesting (U32)
* @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the
* bitfield32 type (U32)
* @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64)
* @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment
*
* @__NL_POLICY_TYPE_ATTR_MAX: number of attributes
* @NL_POLICY_TYPE_ATTR_MAX: highest attribute number
*/
enum netlink_policy_type_attr {
NL_POLICY_TYPE_ATTR_UNSPEC,
NL_POLICY_TYPE_ATTR_TYPE,
NL_POLICY_TYPE_ATTR_MIN_VALUE_S,
NL_POLICY_TYPE_ATTR_MAX_VALUE_S,
NL_POLICY_TYPE_ATTR_MIN_VALUE_U,
NL_POLICY_TYPE_ATTR_MAX_VALUE_U,
NL_POLICY_TYPE_ATTR_MIN_LENGTH,
NL_POLICY_TYPE_ATTR_MAX_LENGTH,
NL_POLICY_TYPE_ATTR_POLICY_IDX,
NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE,
NL_POLICY_TYPE_ATTR_BITFIELD32_MASK,
NL_POLICY_TYPE_ATTR_PAD,
NL_POLICY_TYPE_ATTR_MASK,
/* keep last */
__NL_POLICY_TYPE_ATTR_MAX,
NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1
};
#endif /* __LINUX_NETLINK_H */

View File

@@ -0,0 +1,43 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _LINUX_OPENAT2_H
#define _LINUX_OPENAT2_H
#include <linux/types.h>
/*
* Arguments for how openat2(2) should open the target path. If only @flags and
* @mode are non-zero, then openat2(2) operates very similarly to openat(2).
*
* However, unlike openat(2), unknown or invalid bits in @flags result in
* -EINVAL rather than being silently ignored. @mode must be zero unless one of
* {O_CREAT, O_TMPFILE} are set.
*
* @flags: O_* flags.
* @mode: O_CREAT/O_TMPFILE file mode.
* @resolve: RESOLVE_* flags.
*/
struct open_how {
__u64 flags; /* O_* flags; unknown or invalid bits result in -EINVAL */
__u64 mode; /* O_CREAT/O_TMPFILE file mode; must be zero unless one of them is set */
__u64 resolve; /* RESOLVE_* flags */
};
/* how->resolve flags for openat2(2). */
#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings
(includes bind-mounts). */
#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style
"magic-links". */
#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks
(implies RESOLVE_NO_MAGICLINKS) */
#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like
"..", symlinks, and absolute
paths which escape the dirfd. */
#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".."
be scoped inside the dirfd
(similar to chroot(2)). */
#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be
completed through cached lookup. May
return -EAGAIN if that's not
possible. */
#endif /* _LINUX_OPENAT2_H */

View File

@@ -2,7 +2,7 @@
/*
* Performance events:
*
* Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
*
@@ -382,6 +382,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
#define PERF_ATTR_SIZE_VER9 144 /* add: config4 */
/*
* 'struct perf_event_attr' contains various attributes that define
@@ -463,7 +464,9 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
__reserved_1 : 26;
defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
__reserved_1 : 24;
union {
__u32 wakeup_events; /* wake up every n events */
@@ -543,6 +546,7 @@ struct perf_event_attr {
__u64 sig_data;
__u64 config3; /* extension of config2 */
__u64 config4; /* extension of config3 */
};
/*
@@ -1239,6 +1243,22 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
/*
* This user callchain capture was deferred until shortly before
* returning to user space. Previous samples would have kernel
* callchains only and they need to be stitched with this to make full
* callchains.
*
* struct {
* struct perf_event_header header;
* u64 cookie;
* u64 nr;
* u64 ips[nr];
* struct sample_id sample_id;
* };
*/
PERF_RECORD_CALLCHAIN_DEFERRED = 22,
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1269,6 +1289,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,

View File

@@ -16,9 +16,40 @@ enum {
TCA_ACT_STATS,
TCA_ACT_PAD,
TCA_ACT_COOKIE,
TCA_ACT_FLAGS,
TCA_ACT_HW_STATS,
TCA_ACT_USED_HW_STATS,
TCA_ACT_IN_HW_COUNT,
__TCA_ACT_MAX
};
/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */
#define TCA_ACT_FLAGS_NO_PERCPU_STATS (1 << 0) /* Don't use percpu allocator for
* actions stats.
*/
#define TCA_ACT_FLAGS_SKIP_HW (1 << 1) /* don't offload action to HW */
#define TCA_ACT_FLAGS_SKIP_SW (1 << 2) /* don't use action in SW */
/* tca HW stats type
* When user does not pass the attribute, he does not care.
* It is the same as if he would pass the attribute with
* all supported bits set.
* In case no bits are set, user is not interested in getting any HW statistics.
*/
#define TCA_ACT_HW_STATS_IMMEDIATE (1 << 0) /* Means that in dump, user
* gets the current HW stats
* state from the device
* queried at the dump time.
*/
#define TCA_ACT_HW_STATS_DELAYED (1 << 1) /* Means that in dump, user gets
* HW stats that might be out of date
* for some time, maybe couple of
* seconds. This is the case when
* driver polls stats updates
* periodically or when it gets async
* stats update from the device.
*/
#define TCA_ACT_MAX __TCA_ACT_MAX
#define TCA_OLD_COMPAT (TCA_ACT_MAX+1)
#define TCA_ACT_MAX_PRIO 32
@@ -63,12 +94,53 @@ enum {
#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN
/* These macros are put here for binary compatibility with userspace apps that
* make use of them. For kernel code and new userspace apps, use the TCA_ID_*
* versions.
*/
#define TCA_ACT_GACT 5
#define TCA_ACT_IPT 6 /* obsoleted, can be reused */
#define TCA_ACT_PEDIT 7
#define TCA_ACT_MIRRED 8
#define TCA_ACT_NAT 9
#define TCA_ACT_XT 10
#define TCA_ACT_SKBEDIT 11
#define TCA_ACT_VLAN 12
#define TCA_ACT_BPF 13
#define TCA_ACT_CONNMARK 14
#define TCA_ACT_SKBMOD 15
#define TCA_ACT_CSUM 16
#define TCA_ACT_TUNNEL_KEY 17
#define TCA_ACT_SIMP 22
#define TCA_ACT_IFE 25
#define TCA_ACT_SAMPLE 26
/* Action type identifiers*/
enum {
TCA_ID_UNSPEC=0,
TCA_ID_POLICE=1,
enum tca_id {
TCA_ID_UNSPEC = 0,
TCA_ID_POLICE = 1,
TCA_ID_GACT = TCA_ACT_GACT,
TCA_ID_IPT = TCA_ACT_IPT, /* Obsoleted, can be reused */
TCA_ID_PEDIT = TCA_ACT_PEDIT,
TCA_ID_MIRRED = TCA_ACT_MIRRED,
TCA_ID_NAT = TCA_ACT_NAT,
TCA_ID_XT = TCA_ACT_XT,
TCA_ID_SKBEDIT = TCA_ACT_SKBEDIT,
TCA_ID_VLAN = TCA_ACT_VLAN,
TCA_ID_BPF = TCA_ACT_BPF,
TCA_ID_CONNMARK = TCA_ACT_CONNMARK,
TCA_ID_SKBMOD = TCA_ACT_SKBMOD,
TCA_ID_CSUM = TCA_ACT_CSUM,
TCA_ID_TUNNEL_KEY = TCA_ACT_TUNNEL_KEY,
TCA_ID_SIMP = TCA_ACT_SIMP,
TCA_ID_IFE = TCA_ACT_IFE,
TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
TCA_ID_CTINFO,
TCA_ID_MPLS,
TCA_ID_CT,
TCA_ID_GATE,
/* other actions go here */
__TCA_ID_MAX=255
__TCA_ID_MAX = 255
};
#define TCA_ID_MAX __TCA_ID_MAX
@@ -120,6 +192,10 @@ enum {
TCA_POLICE_RESULT,
TCA_POLICE_TM,
TCA_POLICE_PAD,
TCA_POLICE_RATE64,
TCA_POLICE_PEAKRATE64,
TCA_POLICE_PKTRATE64,
TCA_POLICE_PKTBURST64,
__TCA_POLICE_MAX
#define TCA_POLICE_RESULT TCA_POLICE_RESULT
};
@@ -170,16 +246,19 @@ struct tc_u32_key {
};
struct tc_u32_sel {
unsigned char flags;
unsigned char offshift;
unsigned char nkeys;
/* New members MUST be added within the __struct_group() macro below. */
__struct_group(tc_u32_sel_hdr, hdr, /* no attrs */,
unsigned char flags;
unsigned char offshift;
unsigned char nkeys;
__be16 offmask;
__u16 off;
short offoff;
__be16 offmask;
__u16 off;
short offoff;
short hoff;
__be32 hmask;
short hoff;
__be32 hmask;
);
struct tc_u32_key keys[];
};
@@ -286,12 +365,19 @@ enum {
/* Basic filter */
struct tc_basic_pcnt {
__u64 rcnt;
__u64 rhit;
};
enum {
TCA_BASIC_UNSPEC,
TCA_BASIC_CLASSID,
TCA_BASIC_EMATCHES,
TCA_BASIC_ACT,
TCA_BASIC_POLICE,
TCA_BASIC_PCNT,
TCA_BASIC_PAD,
__TCA_BASIC_MAX
};
@@ -438,17 +524,79 @@ enum {
TCA_FLOWER_IN_HW_COUNT,
TCA_FLOWER_KEY_PORT_SRC_MIN, /* be16 */
TCA_FLOWER_KEY_PORT_SRC_MAX, /* be16 */
TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */
TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */
TCA_FLOWER_KEY_CT_STATE, /* u16 */
TCA_FLOWER_KEY_CT_STATE_MASK, /* u16 */
TCA_FLOWER_KEY_CT_ZONE, /* u16 */
TCA_FLOWER_KEY_CT_ZONE_MASK, /* u16 */
TCA_FLOWER_KEY_CT_MARK, /* u32 */
TCA_FLOWER_KEY_CT_MARK_MASK, /* u32 */
TCA_FLOWER_KEY_CT_LABELS, /* u128 */
TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */
TCA_FLOWER_KEY_MPLS_OPTS,
TCA_FLOWER_KEY_HASH, /* u32 */
TCA_FLOWER_KEY_HASH_MASK, /* u32 */
TCA_FLOWER_KEY_NUM_OF_VLANS, /* u8 */
TCA_FLOWER_KEY_PPPOE_SID, /* be16 */
TCA_FLOWER_KEY_PPP_PROTO, /* be16 */
TCA_FLOWER_KEY_L2TPV3_SID, /* be32 */
TCA_FLOWER_L2_MISS, /* u8 */
TCA_FLOWER_KEY_CFM, /* nested */
TCA_FLOWER_KEY_SPI, /* be32 */
TCA_FLOWER_KEY_SPI_MASK, /* be32 */
TCA_FLOWER_KEY_ENC_FLAGS, /* be32 */
TCA_FLOWER_KEY_ENC_FLAGS_MASK, /* be32 */
__TCA_FLOWER_MAX,
};
#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
enum {
TCA_FLOWER_KEY_CT_FLAGS_NEW = 1 << 0, /* Beginning of a new connection. */
TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */
TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */
TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */
TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */
TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */
__TCA_FLOWER_KEY_CT_FLAGS_MAX,
};
enum {
TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_GENEVE_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_VXLAN_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_ERSPAN_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_GTP, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_GTP_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_PFCP, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_PFCP_
* attributes
*/
__TCA_FLOWER_KEY_ENC_OPTS_MAX,
};
@@ -467,17 +615,105 @@ enum {
(__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
enum {
TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */
__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1)
enum {
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */
__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1)
enum {
TCA_FLOWER_KEY_ENC_OPT_GTP_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_GTP_QFI, /* u8 */
__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_GTP_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX - 1)
enum {
TCA_FLOWER_KEY_ENC_OPT_PFCP_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID, /* be64 */
__TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX - 1)
enum {
TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC,
TCA_FLOWER_KEY_MPLS_OPTS_LSE,
__TCA_FLOWER_KEY_MPLS_OPTS_MAX,
};
#define TCA_FLOWER_KEY_MPLS_OPTS_MAX (__TCA_FLOWER_KEY_MPLS_OPTS_MAX - 1)
enum {
TCA_FLOWER_KEY_MPLS_OPT_LSE_UNSPEC,
TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH,
TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL,
TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS,
TCA_FLOWER_KEY_MPLS_OPT_LSE_TC,
TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL,
__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX,
};
#define TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX \
(__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX - 1)
enum {
TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM = (1 << 2),
TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT = (1 << 3),
TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM = (1 << 4),
TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT = (1 << 5),
__TCA_FLOWER_KEY_FLAGS_MAX,
};
#define TCA_FLOWER_KEY_FLAGS_MAX (__TCA_FLOWER_KEY_FLAGS_MAX - 1)
enum {
TCA_FLOWER_KEY_CFM_OPT_UNSPEC,
TCA_FLOWER_KEY_CFM_MD_LEVEL,
TCA_FLOWER_KEY_CFM_OPCODE,
__TCA_FLOWER_KEY_CFM_OPT_MAX,
};
#define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1)
#define TCA_FLOWER_KEY_CFM_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1)
#define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */
/* Match-all classifier */
struct tc_matchall_pcnt {
__u64 rhit;
};
enum {
TCA_MATCHALL_UNSPEC,
TCA_MATCHALL_CLASSID,
TCA_MATCHALL_ACT,
TCA_MATCHALL_FLAGS,
TCA_MATCHALL_PCNT,
TCA_MATCHALL_PAD,
__TCA_MATCHALL_MAX,
};

View File

@@ -2,6 +2,7 @@
#ifndef __LINUX_PKT_SCHED_H
#define __LINUX_PKT_SCHED_H
#include <linux/const.h>
#include <linux/types.h>
/* Logical priority bands not depending on specific packet scheduler.
@@ -255,6 +256,9 @@ enum {
TCA_RED_PARMS,
TCA_RED_STAB,
TCA_RED_MAX_P,
TCA_RED_FLAGS, /* bitfield32 */
TCA_RED_EARLY_DROP_BLOCK, /* u32 */
TCA_RED_MARK_BLOCK, /* u32 */
__TCA_RED_MAX,
};
@@ -267,12 +271,28 @@ struct tc_red_qopt {
unsigned char Wlog; /* log(W) */
unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
unsigned char Scell_log; /* cell size for idle damping */
/* This field can be used for flags that a RED-like qdisc has
* historically supported. E.g. when configuring RED, it can be used for
* ECN, HARDDROP and ADAPTATIVE. For SFQ it can be used for ECN,
* HARDDROP. Etc. Because this field has not been validated, and is
* copied back on dump, any bits besides those to which a given qdisc
* has assigned a historical meaning need to be considered for free use
* by userspace tools.
*
* Any further flags need to be passed differently, e.g. through an
* attribute (such as TCA_RED_FLAGS above). Such attribute should allow
* passing both recent and historic flags in one value.
*/
unsigned char flags;
#define TC_RED_ECN 1
#define TC_RED_HARDDROP 2
#define TC_RED_ADAPTATIVE 4
#define TC_RED_NODROP 8
};
#define TC_RED_HISTORIC_FLAGS (TC_RED_ECN | TC_RED_HARDDROP | TC_RED_ADAPTATIVE)
struct tc_red_xstats {
__u32 early; /* Early drops */
__u32 pdrop; /* Drops due to queue limits */
@@ -474,6 +494,7 @@ enum {
TCA_NETEM_JITTER64,
TCA_NETEM_SLOT,
TCA_NETEM_SLOT_DIST,
TCA_NETEM_PRNG_SEED,
__TCA_NETEM_MAX,
};
@@ -590,6 +611,11 @@ enum {
#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
enum {
TC_FP_EXPRESS = 1,
TC_FP_PREEMPTIBLE = 2,
};
struct tc_mqprio_qopt {
__u8 num_tc;
__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -603,12 +629,23 @@ struct tc_mqprio_qopt {
#define TC_MQPRIO_F_MIN_RATE 0x4
#define TC_MQPRIO_F_MAX_RATE 0x8
enum {
TCA_MQPRIO_TC_ENTRY_UNSPEC,
TCA_MQPRIO_TC_ENTRY_INDEX, /* u32 */
TCA_MQPRIO_TC_ENTRY_FP, /* u32 */
/* add new constants above here */
__TCA_MQPRIO_TC_ENTRY_CNT,
TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
};
enum {
TCA_MQPRIO_UNSPEC,
TCA_MQPRIO_MODE,
TCA_MQPRIO_SHAPER,
TCA_MQPRIO_MIN_RATE64,
TCA_MQPRIO_MAX_RATE64,
TCA_MQPRIO_TC_ENTRY,
__TCA_MQPRIO_MAX,
};
@@ -698,6 +735,8 @@ struct tc_codel_xstats {
/* FQ_CODEL */
#define FQ_CODEL_QUANTUM_MAX (1 << 20)
enum {
TCA_FQ_CODEL_UNSPEC,
TCA_FQ_CODEL_TARGET,
@@ -709,6 +748,8 @@ enum {
TCA_FQ_CODEL_CE_THRESHOLD,
TCA_FQ_CODEL_DROP_BATCH_SIZE,
TCA_FQ_CODEL_MEMORY_LIMIT,
TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR,
TCA_FQ_CODEL_CE_THRESHOLD_MASK,
__TCA_FQ_CODEL_MAX
};
@@ -785,15 +826,30 @@ enum {
TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */
TCA_FQ_TIMER_SLACK, /* timer slack */
TCA_FQ_HORIZON, /* time horizon in us */
TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */
TCA_FQ_PRIOMAP, /* prio2band */
TCA_FQ_WEIGHTS, /* Weights for each band */
TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */
__TCA_FQ_MAX
};
#define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
#define FQ_BANDS 3
#define FQ_MIN_WEIGHT 16384
struct tc_fq_qd_stats {
__u64 gc_flows;
__u64 highprio_packets;
__u64 tcp_retrans;
__u64 highprio_packets; /* obsolete */
__u64 tcp_retrans; /* obsolete */
__u64 throttled;
__u64 flows_plimit;
__u64 pkts_too_long;
@@ -804,6 +860,12 @@ struct tc_fq_qd_stats {
__u32 throttled_flows;
__u32 unthrottle_latency_ns;
__u64 ce_mark; /* packets above ce_threshold */
__u64 horizon_drops;
__u64 horizon_caps;
__u64 fastpath_packets;
__u64 band_drops[FQ_BANDS];
__u32 band_pkt_count[FQ_BANDS];
__u32 pad;
};
/* Heavy-Hitter Filter */
@@ -841,19 +903,56 @@ enum {
TCA_PIE_BETA,
TCA_PIE_ECN,
TCA_PIE_BYTEMODE,
TCA_PIE_DQ_RATE_ESTIMATOR,
__TCA_PIE_MAX
};
#define TCA_PIE_MAX (__TCA_PIE_MAX - 1)
struct tc_pie_xstats {
__u32 prob; /* current probability */
__u32 delay; /* current delay in ms */
__u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */
__u32 packets_in; /* total number of packets enqueued */
__u32 dropped; /* packets dropped due to pie_action */
__u32 overlimit; /* dropped due to lack of space in queue */
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
__u64 prob; /* current probability */
__u32 delay; /* current delay in ms */
__u32 avg_dq_rate; /* current average dq_rate in
* bits/pie_time
*/
__u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */
__u32 packets_in; /* total number of packets enqueued */
__u32 dropped; /* packets dropped due to pie_action */
__u32 overlimit; /* dropped due to lack of space
* in queue
*/
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
};
/* FQ PIE */
enum {
TCA_FQ_PIE_UNSPEC,
TCA_FQ_PIE_LIMIT,
TCA_FQ_PIE_FLOWS,
TCA_FQ_PIE_TARGET,
TCA_FQ_PIE_TUPDATE,
TCA_FQ_PIE_ALPHA,
TCA_FQ_PIE_BETA,
TCA_FQ_PIE_QUANTUM,
TCA_FQ_PIE_MEMORY_LIMIT,
TCA_FQ_PIE_ECN_PROB,
TCA_FQ_PIE_ECN,
TCA_FQ_PIE_BYTEMODE,
TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
__TCA_FQ_PIE_MAX
};
#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1)
struct tc_fq_pie_xstats {
__u32 packets_in; /* total number of packets enqueued */
__u32 dropped; /* packets dropped due to fq_pie_action */
__u32 overlimit; /* dropped due to lack of space in queue */
__u32 overmemory; /* dropped due to lack of memory in queue */
__u32 ecn_mark; /* packets marked with ecn */
__u32 new_flow_count; /* count of new flows created by packets */
__u32 new_flows_len; /* count of flows in new list */
__u32 old_flows_len; /* count of flows in old list */
__u32 memory_usage; /* total memory across all queues */
};
/* CBS */
@@ -880,8 +979,9 @@ struct tc_etf_qopt {
__s32 delta;
__s32 clockid;
__u32 flags;
#define TC_ETF_DEADLINE_MODE_ON BIT(0)
#define TC_ETF_OFFLOAD_ON BIT(1)
#define TC_ETF_DEADLINE_MODE_ON _BITUL(0)
#define TC_ETF_OFFLOAD_ON _BITUL(1)
#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2)
};
enum {
@@ -913,6 +1013,7 @@ enum {
TCA_CAKE_INGRESS,
TCA_CAKE_ACK_FILTER,
TCA_CAKE_SPLIT_GSO,
TCA_CAKE_FWMARK,
__TCA_CAKE_MAX
};
#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1)
@@ -1039,6 +1140,40 @@ enum {
#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
/* The format for the admin sched (dump only):
* [TCA_TAPRIO_SCHED_ADMIN_SCHED]
* [TCA_TAPRIO_ATTR_SCHED_BASE_TIME]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL]
*/
#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST _BITUL(0)
#define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD _BITUL(1)
enum {
TCA_TAPRIO_TC_ENTRY_UNSPEC,
TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */
TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */
TCA_TAPRIO_TC_ENTRY_FP, /* u32 */
/* add new constants above here */
__TCA_TAPRIO_TC_ENTRY_CNT,
TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1)
};
enum {
TCA_TAPRIO_OFFLOAD_STATS_PAD = 1, /* u64 */
TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS, /* u64 */
TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS, /* u64 */
/* add new constants above here */
__TCA_TAPRIO_OFFLOAD_STATS_CNT,
TCA_TAPRIO_OFFLOAD_STATS_MAX = (__TCA_TAPRIO_OFFLOAD_STATS_CNT - 1)
};
enum {
TCA_TAPRIO_ATTR_UNSPEC,
TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
@@ -1047,9 +1182,101 @@ enum {
TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
TCA_TAPRIO_PAD,
TCA_TAPRIO_ATTR_PAD = TCA_TAPRIO_PAD,
TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
TCA_TAPRIO_ATTR_FLAGS, /* u32 */
TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */
__TCA_TAPRIO_ATTR_MAX,
};
#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
/* ETS */
#define TCQ_ETS_MAX_BANDS 16
enum {
TCA_ETS_UNSPEC,
TCA_ETS_NBANDS, /* u8 */
TCA_ETS_NSTRICT, /* u8 */
TCA_ETS_QUANTA, /* nested TCA_ETS_QUANTA_BAND */
TCA_ETS_QUANTA_BAND, /* u32 */
TCA_ETS_PRIOMAP, /* nested TCA_ETS_PRIOMAP_BAND */
TCA_ETS_PRIOMAP_BAND, /* u8 */
__TCA_ETS_MAX,
};
#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
/* DUALPI2 */
enum tc_dualpi2_drop_overload {
TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0,
TC_DUALPI2_DROP_OVERLOAD_DROP = 1,
__TCA_DUALPI2_DROP_OVERLOAD_MAX,
};
#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1)
enum tc_dualpi2_drop_early {
TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0,
TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1,
__TCA_DUALPI2_DROP_EARLY_MAX,
};
#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1)
enum tc_dualpi2_ecn_mask {
TC_DUALPI2_ECN_MASK_L4S_ECT = 1,
TC_DUALPI2_ECN_MASK_CLA_ECT = 2,
TC_DUALPI2_ECN_MASK_ANY_ECT = 3,
__TCA_DUALPI2_ECN_MASK_MAX,
};
#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1)
enum tc_dualpi2_split_gso {
TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0,
TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1,
__TCA_DUALPI2_SPLIT_GSO_MAX,
};
#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1)
enum {
TCA_DUALPI2_UNSPEC,
TCA_DUALPI2_LIMIT, /* Packets */
TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */
TCA_DUALPI2_TARGET, /* us */
TCA_DUALPI2_TUPDATE, /* us */
TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */
TCA_DUALPI2_BETA, /* Hz scaled up by 256 */
TCA_DUALPI2_STEP_THRESH_PKTS, /* Step threshold in packets */
TCA_DUALPI2_STEP_THRESH_US, /* Step threshold in microseconds */
TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */
TCA_DUALPI2_COUPLING, /* Coupling factor between queues */
TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */
TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */
TCA_DUALPI2_C_PROTECTION, /* Percentage */
TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */
TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */
TCA_DUALPI2_PAD,
__TCA_DUALPI2_MAX
};
#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1)
struct tc_dualpi2_xstats {
__u32 prob; /* current probability */
__u32 delay_c; /* current delay in C queue */
__u32 delay_l; /* current delay in L queue */
__u32 packets_in_c; /* number of packets enqueued in C queue */
__u32 packets_in_l; /* number of packets enqueued in L queue */
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
__u32 step_marks; /* ECN marks due to the step AQM */
__s32 credit; /* current c_protection credit */
__u32 memory_used; /* Memory used by both queues */
__u32 max_memory_used; /* Maximum used memory */
__u32 memory_limit; /* Memory limit of both queues */
};
#endif

View File

@@ -0,0 +1,81 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _LINUX_STDDEF_H
#define _LINUX_STDDEF_H
#ifdef __KERNEL__
#endif
#ifndef __always_inline
#define __always_inline inline
#endif
/* Not all C++ standards support type declarations inside an anonymous union */
#ifndef __cplusplus
#define __struct_group_tag(TAG) TAG
#else
#define __struct_group_tag(TAG)
#endif
/**
* __struct_group() - Create a mirrored named and anonymous struct
*
* @TAG: The tag name for the named sub-struct (usually empty)
* @NAME: The identifier name of the mirrored sub-struct
* @ATTRS: Any struct attributes (usually empty)
* @MEMBERS: The member declarations for the mirrored structs
*
* Used to create an anonymous union of two structs with identical layout
* and size: one anonymous and one named. The former's members can be used
* normally without sub-struct naming, and the latter can be used to
* reason about the start, end, and size of the group of struct members.
* The named struct can also be explicitly tagged for layer reuse (C only),
* as well as both having struct attributes appended.
*/
#define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \
union { \
struct { MEMBERS } ATTRS; \
struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \
} ATTRS
#ifdef __cplusplus
/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */
#define __DECLARE_FLEX_ARRAY(T, member) \
T member[0]
#else
/**
* __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union
*
* @TYPE: The type of each flexible array element
* @NAME: The name of the flexible array member
*
* In order to have a flexible array member in a union or alone in a
* struct, it needs to be wrapped in an anonymous struct with at least 1
* named member, but that member can be empty.
*/
#define __DECLARE_FLEX_ARRAY(TYPE, NAME) \
struct { \
struct { } __empty_ ## NAME; \
TYPE NAME[]; \
}
#endif
#ifndef __counted_by
#define __counted_by(m)
#endif
#ifndef __counted_by_le
#define __counted_by_le(m)
#endif
#ifndef __counted_by_be
#define __counted_by_be(m)
#endif
#ifdef __KERNEL__
#define __kernel_nonstring __nonstring
#else
#define __kernel_nonstring
#endif
#endif /* _LINUX_STDDEF_H */

View File

@@ -39,18 +39,19 @@ trap "cd ${WORKDIR}; exit" INT TERM EXIT
declare -A PATH_MAP
PATH_MAP=( \
[tools/lib/bpf]=src \
[tools/include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \
[tools/include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \
[tools/include/uapi/linux/btf.h]=include/uapi/linux/btf.h \
[tools/include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \
[tools/include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \
[tools/include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \
[tools/include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \
[tools/include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \
[tools/include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \
[tools/include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \
[tools/include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \
[include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \
[include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \
[include/uapi/linux/btf.h]=include/uapi/linux/btf.h \
[include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \
[include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \
[include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \
[include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \
[include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \
[include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \
[include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \
[include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \
[include/uapi/linux/perf_event.h]=include/uapi/linux/perf_event.h \
[include/uapi/linux/stddef.h]=include/uapi/linux/stddef.h \
[Documentation/bpf/libbpf]=docs \
)
@@ -63,7 +64,7 @@ LIBBPF_TREE_FILTER="mkdir -p __libbpf/include/uapi/linux __libbpf/include/tools
for p in "${!PATH_MAP[@]}"; do
LIBBPF_TREE_FILTER+="git mv -kf ${p} __libbpf/${PATH_MAP[${p}]} && "$'\\\n'
done
LIBBPF_TREE_FILTER+="find __libbpf/include/uapi/linux -type f -exec sed -i 's/_UAPI\(__\?LINUX\)/\1/' {} + && "$'\\\n'
LIBBPF_TREE_FILTER+="find __libbpf/include/uapi/linux -type f -exec sed -i -e 's/_UAPI\(__\?LINUX\)/\1/g' -e 's@^#include <linux/compiler_types.h>@@' {} + && "$'\\\n'
LIBBPF_TREE_FILTER+="git rm --ignore-unmatch -f __libbpf/src/{Makefile,Build,test_libbpf.c,.gitignore} >/dev/null"
cd_to()

View File

@@ -55,9 +55,9 @@ endif
OBJDIR ?= .
SHARED_OBJDIR := $(OBJDIR)/sharedobjs
STATIC_OBJDIR := $(OBJDIR)/staticobjs
OBJS := bpf.o btf.o libbpf.o libbpf_errno.o netlink.o \
nlattr.o str_error.o libbpf_probes.o bpf_prog_linfo.o \
btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \
OBJS := bpf.o btf.o libbpf.o libbpf_utils.o netlink.o \
nlattr.o libbpf_probes.o bpf_prog_linfo.o \
btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \
relo_core.o usdt.o zip.o elf.o features.o btf_iter.o btf_relocate.o
SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS))
STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS))

View File

@@ -154,7 +154,7 @@ int bump_rlimit_memlock(void)
memlock_bumped = true;
/* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */
/* zero memlock_rlim disables auto-bumping RLIMIT_MEMLOCK */
if (memlock_rlim == 0)
return 0;
@@ -172,7 +172,7 @@ int bpf_map_create(enum bpf_map_type map_type,
__u32 max_entries,
const struct bpf_map_create_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd);
const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
union bpf_attr attr;
int fd;
@@ -203,6 +203,8 @@ int bpf_map_create(enum bpf_map_type map_type,
attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0);
attr.map_token_fd = OPTS_GET(opts, token_fd, 0);
attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL));
attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0);
fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
return libbpf_err_errno(fd);
@@ -238,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, size_t insn_cnt,
struct bpf_prog_load_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt);
const size_t attr_sz = offsetofend(union bpf_attr, keyring_id);
void *finfo = NULL, *linfo = NULL;
const char *func_info, *line_info;
__u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd;
@@ -792,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd,
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
case BPF_TRACE_FSESSION:
case BPF_LSM_MAC:
attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0);
if (!OPTS_ZEROED(opts, tracing))
@@ -1395,3 +1398,22 @@ int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz);
return libbpf_err_errno(err);
}
/*
 * Issue the BPF_PROG_ASSOC_STRUCT_OPS command for the given program FD and
 * struct_ops map FD. Optional flags are taken from opts (0 if not provided).
 * Returns the result of the syscall passed through libbpf_err_errno(), or
 * -EINVAL (via libbpf_err()) if opts fails OPTS_VALID() validation.
 */
int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
			      struct bpf_prog_assoc_struct_ops_opts *opts)
{
	const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops);
	union bpf_attr attr;
	int ret;

	if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts))
		return libbpf_err(-EINVAL);

	/* only the first attr_sz bytes are handed to the kernel; zero them */
	memset(&attr, 0, attr_sz);
	attr.prog_assoc_struct_ops.prog_fd = prog_fd;
	attr.prog_assoc_struct_ops.map_fd = map_fd;
	attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0);

	ret = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz);
	return libbpf_err_errno(ret);
}

View File

@@ -54,9 +54,12 @@ struct bpf_map_create_opts {
__s32 value_type_btf_obj_fd;
__u32 token_fd;
const void *excl_prog_hash;
__u32 excl_prog_hash_size;
size_t :0;
};
#define bpf_map_create_opts__last_field token_fd
#define bpf_map_create_opts__last_field excl_prog_hash_size
LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
const char *map_name,
@@ -286,6 +289,14 @@ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch,
* Update spin_lock-ed map elements. This must be
* specified if the map value contains a spinlock.
*
* **BPF_F_CPU**
* For percpu maps, update the value on the specified CPU only. The CPU
* is encoded in the high 32 bits of **opts->elem_flags**.
*
* **BPF_F_ALL_CPUS**
* For percpu maps, update the value across all CPUs. This flag cannot
* be combined with **BPF_F_CPU**.
*
* @param fd BPF map file descriptor
* @param keys pointer to an array of *count* keys
* @param values pointer to an array of *count* values
@@ -730,6 +741,27 @@ struct bpf_prog_stream_read_opts {
LIBBPF_API int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
struct bpf_prog_stream_read_opts *opts);
struct bpf_prog_assoc_struct_ops_opts {
size_t sz;
__u32 flags;
size_t :0;
};
#define bpf_prog_assoc_struct_ops_opts__last_field flags
/**
* @brief **bpf_prog_assoc_struct_ops** associates a BPF program with a
* struct_ops map.
*
* @param prog_fd FD for the BPF program
* @param map_fd FD for the struct_ops map to be associated with the BPF program
* @param opts optional options, can be NULL
*
* @return 0 on success; negative error code, otherwise (errno is also set to
* the error code)
*/
LIBBPF_API int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
struct bpf_prog_assoc_struct_ops_opts *opts);
#ifdef __cplusplus
} /* extern "C" */
#endif

View File

@@ -4,6 +4,7 @@
#define __BPF_GEN_INTERNAL_H
#include "bpf.h"
#include "libbpf_internal.h"
struct ksym_relo_desc {
const char *name;
@@ -50,6 +51,7 @@ struct bpf_gen {
__u32 nr_ksyms;
int fd_array;
int nr_fd_array;
int hash_insn_offset[SHA256_DWORD_SIZE];
};
void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps);

View File

@@ -3558,7 +3558,7 @@ static int (* const bpf_inode_storage_delete)(void *map, void *inode) = (void *)
* including the trailing NUL character. On error, a negative
* value.
*/
static long (* const bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147;
static long (* const bpf_d_path)(const struct path *path, char *buf, __u32 sz) = (void *) 147;
/*
* bpf_copy_from_user
@@ -4484,7 +4484,7 @@ static struct mptcp_sock *(* const bpf_skc_to_mptcp_sock)(void *sk) = (void *) 1
* 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE,
* -EINVAL if flags is not 0.
*/
static long (* const bpf_dynptr_from_mem)(void *data, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197;
static long (* const bpf_dynptr_from_mem)(void *data, __u64 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197;
/*
* bpf_ringbuf_reserve_dynptr
@@ -4542,7 +4542,7 @@ static void (* const bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 f
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*/
static long (* const bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset, __u64 flags) = (void *) 201;
static long (* const bpf_dynptr_read)(void *dst, __u64 len, const struct bpf_dynptr *src, __u64 offset, __u64 flags) = (void *) 201;
/*
* bpf_dynptr_write
@@ -4567,7 +4567,7 @@ static long (* const bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dyn
* is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
* other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*/
static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202;
static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u64 offset, void *src, __u64 len, __u64 flags) = (void *) 202;
/*
* bpf_dynptr_data
@@ -4585,7 +4585,7 @@ static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offse
* read-only, if the dynptr is invalid, or if the offset and length
* is out of bounds.
*/
static void *(* const bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) = (void *) 203;
static void *(* const bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u64 offset, __u64 len) = (void *) 203;
/*
* bpf_tcp_raw_gen_syncookie_ipv4

View File

@@ -316,19 +316,19 @@ enum libbpf_tristate {
})
extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
__u32 len__sz, void *aux__prog) __weak __ksym;
__u32 len__sz) __weak __ksym;
#define bpf_stream_printk(stream_id, fmt, args...) \
({ \
static const char ___fmt[] = fmt; \
unsigned long long ___param[___bpf_narg(args)]; \
\
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \
\
bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param), NULL);\
#define bpf_stream_printk(stream_id, fmt, args...) \
({ \
static const char ___fmt[] = fmt; \
unsigned long long ___param[___bpf_narg(args)]; \
\
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \
\
bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \
})
/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args

View File

@@ -311,7 +311,7 @@ struct pt_regs___arm64 {
#define __PT_RET_REG regs[31]
#define __PT_FP_REG __unsupported__
#define __PT_RC_REG gpr[3]
#define __PT_SP_REG sp
#define __PT_SP_REG gpr[1]
#define __PT_IP_REG nip
#elif defined(bpf_target_sparc)

393
src/btf.c
View File

@@ -23,7 +23,6 @@
#include "libbpf_internal.h"
#include "hashmap.h"
#include "strset.h"
#include "str_error.h"
#define BTF_MAX_NR_TYPES 0x7fffffffU
#define BTF_MAX_STR_OFFSET 0x7fffffffU
@@ -93,6 +92,8 @@ struct btf {
* - for split BTF counts number of types added on top of base BTF.
*/
__u32 nr_types;
/* the start IDs of named types in sorted BTF */
int named_start_id;
/* if not NULL, points to the base BTF on top of which the current
* split BTF is based
*/
@@ -898,44 +899,103 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id)
return type_id;
}
__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
static void btf_check_sorted(struct btf *btf)
{
__u32 i, nr_types = btf__type_cnt(btf);
__u32 i, n, named_start_id = 0;
if (!strcmp(type_name, "void"))
n = btf__type_cnt(btf);
for (i = btf->start_id + 1; i < n; i++) {
struct btf_type *ta = btf_type_by_id(btf, i - 1);
struct btf_type *tb = btf_type_by_id(btf, i);
const char *na = btf__str_by_offset(btf, ta->name_off);
const char *nb = btf__str_by_offset(btf, tb->name_off);
if (strcmp(na, nb) > 0)
return;
if (named_start_id == 0 && na[0] != '\0')
named_start_id = i - 1;
if (named_start_id == 0 && nb[0] != '\0')
named_start_id = i;
}
if (named_start_id)
btf->named_start_id = named_start_id;
}
/*
 * Binary-search type IDs in [start_id, btf__type_cnt(btf) - 1] for the
 * lowest ID whose name compares greater than or equal to @name according
 * to strcmp(). If every probed name compares less than @name, or the range
 * is empty, btf__type_cnt(btf) is returned instead.
 *
 * The search only yields a meaningful lower bound if the types in the
 * searched range are ordered by name; this function does not verify that.
 */
static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name,
					   __s32 start_id)
{
	__s32 lo = start_id;
	__s32 hi = btf__type_cnt(btf) - 1;

	while (lo <= hi) {
		__s32 mid = lo + (hi - lo) / 2;
		const struct btf_type *t = btf_type_by_id(btf, mid);
		const char *cur = btf__str_by_offset(btf, t->name_off);

		if (strcmp(cur, name) < 0) {
			/* everything up to and including mid sorts before name */
			lo = mid + 1;
			continue;
		}

		/* cur >= name: mid is a candidate; done once the window is one wide */
		if (lo == hi)
			return hi;
		hi = mid;
	}

	return btf__type_cnt(btf);
}
static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id,
const char *type_name, __s32 kind)
{
__u32 nr_types = btf__type_cnt(btf);
const struct btf_type *t;
const char *tname;
__s32 id;
if (start_id < btf->start_id) {
id = btf_find_by_name_kind(btf->base_btf, start_id,
type_name, kind);
if (id >= 0)
return id;
start_id = btf->start_id;
}
if (kind == BTF_KIND_UNKN || strcmp(type_name, "void") == 0)
return 0;
for (i = 1; i < nr_types; i++) {
const struct btf_type *t = btf__type_by_id(btf, i);
const char *name = btf__name_by_offset(btf, t->name_off);
if (name && !strcmp(type_name, name))
return i;
if (btf->named_start_id > 0 && type_name[0]) {
start_id = max(start_id, btf->named_start_id);
id = btf_find_type_by_name_bsearch(btf, type_name, start_id);
for (; id < nr_types; id++) {
t = btf__type_by_id(btf, id);
tname = btf__str_by_offset(btf, t->name_off);
if (strcmp(tname, type_name) != 0)
return libbpf_err(-ENOENT);
if (kind < 0 || btf_kind(t) == kind)
return id;
}
} else {
for (id = start_id; id < nr_types; id++) {
t = btf_type_by_id(btf, id);
if (kind > 0 && btf_kind(t) != kind)
continue;
tname = btf__str_by_offset(btf, t->name_off);
if (strcmp(tname, type_name) == 0)
return id;
}
}
return libbpf_err(-ENOENT);
}
static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id,
const char *type_name, __u32 kind)
/* the kind value of -1 indicates that kind matching should be skipped */
__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
{
__u32 i, nr_types = btf__type_cnt(btf);
if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void"))
return 0;
for (i = start_id; i < nr_types; i++) {
const struct btf_type *t = btf__type_by_id(btf, i);
const char *name;
if (btf_kind(t) != kind)
continue;
name = btf__name_by_offset(btf, t->name_off);
if (name && !strcmp(type_name, name))
return i;
}
return libbpf_err(-ENOENT);
return btf_find_by_name_kind(btf, 1, type_name, -1);
}
__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name,
@@ -1007,6 +1067,7 @@ static struct btf *btf_new_empty(struct btf *base_btf)
btf->fd = -1;
btf->ptr_sz = sizeof(void *);
btf->swapped_endian = false;
btf->named_start_id = 0;
if (base_btf) {
btf->base_btf = base_btf;
@@ -1058,11 +1119,12 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b
btf->start_id = 1;
btf->start_str_off = 0;
btf->fd = -1;
btf->named_start_id = 0;
if (base_btf) {
btf->base_btf = base_btf;
btf->start_id = btf__type_cnt(base_btf);
btf->start_str_off = base_btf->hdr->str_len;
btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off;
}
if (is_mmap) {
@@ -1092,6 +1154,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b
err = err ?: btf_sanity_check(btf);
if (err)
goto done;
btf_check_sorted(btf);
done:
if (err) {
@@ -1716,6 +1779,7 @@ static void btf_invalidate_raw_data(struct btf *btf)
free(btf->raw_data_swapped);
btf->raw_data_swapped = NULL;
}
btf->named_start_id = 0;
}
/* Ensure BTF is ready to be modified (by splitting into a three memory
@@ -2070,7 +2134,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding
int sz, name_off;
/* non-empty name */
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
/* byte_sz must be power of 2 */
if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16)
@@ -2118,7 +2182,7 @@ int btf__add_float(struct btf *btf, const char *name, size_t byte_sz)
int sz, name_off;
/* non-empty name */
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
/* byte_sz must be one of the explicitly allowed values */
@@ -2173,7 +2237,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref
if (!t)
return libbpf_err(-ENOMEM);
if (name && name[0]) {
if (!str_is_empty(name)) {
name_off = btf__add_str(btf, name);
if (name_off < 0)
return name_off;
@@ -2250,7 +2314,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32
if (!t)
return libbpf_err(-ENOMEM);
if (name && name[0]) {
if (!str_is_empty(name)) {
name_off = btf__add_str(btf, name);
if (name_off < 0)
return name_off;
@@ -2351,7 +2415,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id,
if (!m)
return libbpf_err(-ENOMEM);
if (name && name[0]) {
if (!str_is_empty(name)) {
name_off = btf__add_str(btf, name);
if (name_off < 0)
return name_off;
@@ -2389,7 +2453,7 @@ static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz,
if (!t)
return libbpf_err(-ENOMEM);
if (name && name[0]) {
if (!str_is_empty(name)) {
name_off = btf__add_str(btf, name);
if (name_off < 0)
return name_off;
@@ -2447,7 +2511,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value)
return libbpf_err(-EINVAL);
/* non-empty name */
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
if (value < INT_MIN || value > UINT_MAX)
return libbpf_err(-E2BIG);
@@ -2524,7 +2588,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value)
return libbpf_err(-EINVAL);
/* non-empty name */
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
/* decompose and invalidate raw data */
@@ -2564,7 +2628,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value)
*/
int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
{
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
switch (fwd_kind) {
@@ -2600,7 +2664,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
*/
int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id)
{
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id, 0);
@@ -2652,7 +2716,7 @@ int btf__add_restrict(struct btf *btf, int ref_type_id)
*/
int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id)
{
if (!value || !value[0])
if (str_is_empty(value))
return libbpf_err(-EINVAL);
return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 0);
@@ -2669,7 +2733,7 @@ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id)
*/
int btf__add_type_attr(struct btf *btf, const char *value, int ref_type_id)
{
if (!value || !value[0])
if (str_is_empty(value))
return libbpf_err(-EINVAL);
return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 1);
@@ -2688,7 +2752,7 @@ int btf__add_func(struct btf *btf, const char *name,
{
int id;
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL &&
linkage != BTF_FUNC_EXTERN)
@@ -2774,7 +2838,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id)
if (!p)
return libbpf_err(-ENOMEM);
if (name && name[0]) {
if (!str_is_empty(name)) {
name_off = btf__add_str(btf, name);
if (name_off < 0)
return name_off;
@@ -2809,7 +2873,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id)
int sz, name_off;
/* non-empty name */
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED &&
linkage != BTF_VAR_GLOBAL_EXTERN)
@@ -2858,7 +2922,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz)
int sz, name_off;
/* non-empty name */
if (!name || !name[0])
if (str_is_empty(name))
return libbpf_err(-EINVAL);
if (btf_ensure_modifiable(btf))
@@ -2935,7 +2999,7 @@ static int btf_add_decl_tag(struct btf *btf, const char *value, int ref_type_id,
struct btf_type *t;
int sz, value_off;
if (!value || !value[0] || component_idx < -1)
if (str_is_empty(value) || component_idx < -1)
return libbpf_err(-EINVAL);
if (validate_type_id(ref_type_id))
@@ -3902,6 +3966,20 @@ err_out:
return err;
}
/*
 * Compute the signature hash of a TYPEDEF from its name offset and info
 * word only. The referenced type ID is deliberately left out: equivalence
 * of referenced types is established separately by the type graph
 * equivalence check.
 */
static long btf_hash_typedef(struct btf_type *t)
{
	return hash_combine(hash_combine(0, t->name_off), t->info);
}
static long btf_hash_common(struct btf_type *t)
{
long h;
@@ -3919,6 +3997,13 @@ static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2)
t1->size == t2->size;
}
/*
 * Shallow comparison of two TYPEDEFs: they match when both the name offset
 * and the info word are equal. The referenced type ID is not compared here.
 */
static bool btf_equal_typedef(struct btf_type *t1, struct btf_type *t2)
{
	if (t1->name_off != t2->name_off)
		return false;

	return t1->info == t2->info;
}
/* Calculate type signature hash of INT or TAG. */
static long btf_hash_int_decl_tag(struct btf_type *t)
{
@@ -4411,11 +4496,14 @@ static bool btf_dedup_identical_types(struct btf_dedup *d, __u32 id1, __u32 id2,
struct btf_type *t1, *t2;
int k1, k2;
recur:
if (depth <= 0)
return false;
t1 = btf_type_by_id(d->btf, id1);
t2 = btf_type_by_id(d->btf, id2);
if (depth <= 0) {
pr_debug("Reached depth limit for identical type comparison for '%s'/'%s'\n",
btf__name_by_offset(d->btf, t1->name_off),
btf__name_by_offset(d->btf, t2->name_off));
return false;
}
k1 = btf_kind(t1);
k2 = btf_kind(t2);
@@ -4477,8 +4565,16 @@ recur:
for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) {
if (m1->type == m2->type)
continue;
if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1))
if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) {
if (t1->name_off) {
pr_debug("%s '%s' size=%d vlen=%d id1[%u] id2[%u] shallow-equal but not identical for field#%d '%s'\n",
k1 == BTF_KIND_STRUCT ? "STRUCT" : "UNION",
btf__name_by_offset(d->btf, t1->name_off),
t1->size, btf_vlen(t1), id1, id2, i,
btf__name_by_offset(d->btf, m1->name_off));
}
return false;
}
}
return true;
}
@@ -4719,8 +4815,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
canon_m = btf_members(canon_type);
for (i = 0; i < vlen; i++) {
eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type);
if (eq <= 0)
if (eq <= 0) {
if (cand_type->name_off) {
pr_debug("%s '%s' size=%d vlen=%d cand_id[%u] canon_id[%u] shallow-equal but not equiv for field#%d '%s': %d\n",
cand_kind == BTF_KIND_STRUCT ? "STRUCT" : "UNION",
btf__name_by_offset(d->btf, cand_type->name_off),
cand_type->size, vlen, cand_id, canon_id, i,
btf__name_by_offset(d->btf, cand_m->name_off), eq);
}
return eq;
}
cand_m++;
canon_m++;
}
@@ -4845,13 +4949,30 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d)
}
}
/* Pick the hash routine by kind: TYPEDEF has its own, everything else hashes as a struct. */
static inline long btf_hash_by_kind(struct btf_type *t, __u16 kind)
{
	return kind == BTF_KIND_TYPEDEF ? btf_hash_typedef(t)
					: btf_hash_struct(t);
}
/*
 * Pick the equality routine by kind: TYPEDEF uses btf_equal_typedef(),
 * everything else uses the shallow struct comparison.
 */
static inline bool btf_equal_by_kind(struct btf_type *t1, struct btf_type *t2, __u16 kind)
{
	return kind == BTF_KIND_TYPEDEF ? btf_equal_typedef(t1, t2)
					: btf_shallow_equal_struct(t1, t2);
}
/*
* Deduplicate struct/union types.
* Deduplicate struct/union and typedef types.
*
* For each struct/union type its type signature hash is calculated, taking
* into account type's name, size, number, order and names of fields, but
* ignoring type ID's referenced from fields, because they might not be deduped
* completely until after reference types deduplication phase. This type hash
* completely until after reference types deduplication phase. For each typedef
* type, the hash is computed based on the type's name and size. This type hash
* is used to iterate over all potential canonical types, sharing same hash.
* For each canonical candidate we check whether type graphs that they form
* (through referenced types in fields and so on) are equivalent using algorithm
@@ -4883,18 +5004,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id)
t = btf_type_by_id(d->btf, type_id);
kind = btf_kind(t);
if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION)
if (kind != BTF_KIND_STRUCT &&
kind != BTF_KIND_UNION &&
kind != BTF_KIND_TYPEDEF)
return 0;
h = btf_hash_struct(t);
h = btf_hash_by_kind(t, kind);
for_each_dedup_cand(d, hash_entry, h) {
__u32 cand_id = hash_entry->value;
int eq;
/*
* Even though btf_dedup_is_equiv() checks for
* btf_shallow_equal_struct() internally when checking two
* structs (unions) for equivalence, we need to guard here
* btf_equal_by_kind() internally when checking two
* structs (unions) or typedefs for equivalence, we need to guard here
* from picking matching FWD type as a dedup candidate.
* This can happen due to hash collision. In such case just
* relying on btf_dedup_is_equiv() would lead to potentially
@@ -4902,7 +5025,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id)
* FWD and compatible STRUCT/UNION are considered equivalent.
*/
cand_type = btf_type_by_id(d->btf, cand_id);
if (!btf_shallow_equal_struct(t, cand_type))
if (!btf_equal_by_kind(t, cand_type, kind))
continue;
btf_dedup_clear_hypot_map(d);
@@ -4940,18 +5063,18 @@ static int btf_dedup_struct_types(struct btf_dedup *d)
/*
* Deduplicate reference type.
*
* Once all primitive and struct/union types got deduplicated, we can easily
* Once all primitive, struct/union and typedef types got deduplicated, we can easily
* deduplicate all other (reference) BTF types. This is done in two steps:
*
* 1. Resolve all referenced type IDs into their canonical type IDs. This
* resolution can be done either immediately for primitive or struct/union types
* (because they were deduped in previous two phases) or recursively for
* resolution can be done either immediately for primitive, struct/union, and typedef
* types (because they were deduped in previous two phases) or recursively for
* reference types. Recursion will always terminate at either primitive or
* struct/union type, at which point we can "unwind" chain of reference types
* one by one. There is no danger of encountering cycles because in C type
* system the only way to form type cycle is through struct/union, so any chain
* of reference types, even those taking part in a type cycle, will inevitably
* reach struct/union at some point.
* struct/union and typedef types, at which point we can "unwind" chain of reference
* types one by one. There is no danger of encountering cycles in C, as the only way to
* form a type cycle is through struct or union types. Go can form such cycles through
* typedef. Thus, any chain of reference types, even those taking part in a type cycle,
* will inevitably reach a struct/union or typedef type at some point.
*
* 2. Once all referenced type IDs are resolved into canonical ones, BTF type
* becomes "stable", in the sense that no further deduplication will cause
@@ -4983,7 +5106,6 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
case BTF_KIND_VOLATILE:
case BTF_KIND_RESTRICT:
case BTF_KIND_PTR:
case BTF_KIND_TYPEDEF:
case BTF_KIND_FUNC:
case BTF_KIND_TYPE_TAG:
ref_type_id = btf_dedup_ref_type(d, t->type);
@@ -5819,7 +5941,7 @@ void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
{
btf->base_btf = (struct btf *)base_btf;
btf->start_id = btf__type_cnt(base_btf);
btf->start_str_off = base_btf->hdr->str_len;
btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off;
}
int btf__relocate(struct btf *btf, const struct btf *base_btf)
@@ -5830,3 +5952,136 @@ int btf__relocate(struct btf *btf, const struct btf *base_btf)
btf->owns_base = false;
return libbpf_err(err);
}
/* Context handed to btf_permute_remap_type_id() while btf__permute() rewrites type IDs. */
struct btf_permute {
/* BTF object whose types are being reordered */
struct btf *btf;
/* caller-supplied table translating an original type ID into its new ID */
__u32 *id_map;
/* index of the first remappable entry in id_map: 1 for base BTF (slot 0 is VOID), 0 for split BTF */
__u32 start_offs;
};
/*
 * Type-ID visitor used during permutation: rewrites *type_id to the ID it
 * has after reordering. @ctx is the struct btf_permute describing the
 * permutation. Returns 0 on success and -EINVAL for an ID past the end of
 * the BTF.
 */
static int btf_permute_remap_type_id(__u32 *type_id, void *ctx)
{
	const struct btf_permute *perm = ctx;
	const struct btf *btf = perm->btf;
	__u32 old_id = *type_id;
	__u32 slot;

	/* IDs below start_id (base BTF types or VOID) are left untouched */
	if (old_id < btf->start_id)
		return 0;

	if (old_id >= btf__type_cnt(btf))
		return -EINVAL;

	slot = old_id - btf->start_id + perm->start_offs;
	*type_id = perm->id_map[slot];
	return 0;
}
/*
 * Reorder the types owned by @btf in place so that the type with original
 * ID X ends up with ID id_map[X] (base BTF) or id_map[X - start_id] (split
 * BTF), and rewrite every type ID reference accordingly.
 *
 * @btf:        BTF object to permute
 * @id_map:     original-ID -> new-ID table; for base BTF it covers VOID too
 *              and id_map[0] must be 0
 * @id_map_cnt: number of entries in @id_map; must equal the number of types
 *              being permuted (including VOID for base BTF)
 * @opts:       optional; may carry a btf_ext whose type IDs are remapped too
 *
 * Returns 0 on success. On failure returns a negative error via
 * libbpf_err(): -EINVAL for invalid options, a wrong count, a missing map,
 * an out-of-range target ID or two types mapped to the same ID; -ENOMEM on
 * allocation failure.
 */
int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt,
		 const struct btf_permute_opts *opts)
{
	struct btf_permute p;
	struct btf_ext *btf_ext;
	void *nt, *new_types = NULL;
	__u32 *order_map = NULL;
	int err = 0, i;
	__u32 n, id, start_offs = 0;

	if (!OPTS_VALID(opts, btf_permute_opts))
		return libbpf_err(-EINVAL);

	if (btf__base_btf(btf)) {
		n = btf->nr_types;
	} else {
		n = btf__type_cnt(btf);
		start_offs = 1;
	}

	/*
	 * Validate the element count and the pointer before dereferencing
	 * id_map, so a short or missing map is rejected instead of being read
	 * out of bounds.
	 */
	if (id_map_cnt != n)
		return libbpf_err(-EINVAL);
	if (id_map_cnt && !id_map)
		return libbpf_err(-EINVAL);

	/* for base BTF, slot 0 describes VOID, which has to stay at ID 0 */
	if (start_offs && (id_map_cnt == 0 || id_map[0] != 0))
		return libbpf_err(-EINVAL);

	/* record the sequence of types */
	order_map = calloc(id_map_cnt, sizeof(*id_map));
	if (!order_map) {
		err = -ENOMEM;
		goto done;
	}

	new_types = calloc(btf->hdr->type_len, 1);
	if (!new_types) {
		err = -ENOMEM;
		goto done;
	}

	if (btf_ensure_modifiable(btf)) {
		err = -ENOMEM;
		goto done;
	}

	/*
	 * Invert id_map: order_map[new position] = original type ID. A zero
	 * entry means "slot still free", which works because every stored
	 * original ID is non-zero.
	 */
	for (i = start_offs; i < id_map_cnt; i++) {
		id = id_map[i];
		if (id < btf->start_id || id >= btf__type_cnt(btf)) {
			err = -EINVAL;
			goto done;
		}
		id -= btf->start_id - start_offs;
		/* cannot be mapped to the same ID */
		if (order_map[id]) {
			err = -EINVAL;
			goto done;
		}
		order_map[id] = i + btf->start_id - start_offs;
	}

	p.btf = btf;
	p.id_map = id_map;
	p.start_offs = start_offs;

	/* copy types into the new buffer in their new order, fixing up IDs */
	nt = new_types;
	for (i = start_offs; i < id_map_cnt; i++) {
		struct btf_field_iter it;
		const struct btf_type *t;
		__u32 *type_id;
		int type_size;

		id = order_map[i];
		t = btf__type_by_id(btf, id);
		type_size = btf_type_size(t);
		memcpy(nt, t, type_size);

		/* fix up referenced IDs for BTF */
		err = btf_field_iter_init(&it, nt, BTF_FIELD_ITER_IDS);
		if (err)
			goto done;
		while ((type_id = btf_field_iter_next(&it))) {
			err = btf_permute_remap_type_id(type_id, &p);
			if (err)
				goto done;
		}

		nt += type_size;
	}

	/* fix up referenced IDs for btf_ext */
	btf_ext = OPTS_GET(opts, btf_ext, NULL);
	if (btf_ext) {
		err = btf_ext_visit_type_ids(btf_ext, btf_permute_remap_type_id, &p);
		if (err)
			goto done;
	}

	/* rebuild the per-type offset index for the new layout */
	for (nt = new_types, i = 0; i < id_map_cnt - start_offs; i++) {
		btf->type_offs[i] = nt - new_types;
		nt += btf_type_size(nt);
	}

	/* swap in the permuted type data */
	free(order_map);
	free(btf->types_data);
	btf->types_data = new_types;
	return 0;

done:
	free(order_map);
	free(new_types);
	return libbpf_err(err);
}

View File

@@ -94,6 +94,7 @@ LIBBPF_API struct btf *btf__new_empty(void);
* @brief **btf__new_empty_split()** creates an unpopulated BTF object from an
* ELF BTF section except with a base BTF on top of which split BTF should be
* based
* @param base_btf base BTF object
* @return new BTF object instance which has to be eventually freed with
* **btf__free()**
*
@@ -115,6 +116,10 @@ LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf);
* When that split BTF is loaded against a (possibly changed) base, this
* distilled base BTF will help update references to that (possibly changed)
* base BTF.
* @param src_btf source split BTF object
* @param new_base_btf pointer to where the new base BTF object pointer will be stored
* @param new_split_btf pointer to where the new split BTF object pointer will be stored
* @return 0 on success; negative error code, otherwise
*
* Both the new split and its associated new base BTF must be freed by
* the caller.
@@ -264,6 +269,9 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
* to base BTF kinds, and verify those references are compatible with
* *base_btf*; if they are, *btf* is adjusted such that is re-parented to
* *base_btf* and type ids and strings are adjusted to accommodate this.
* @param btf split BTF object to relocate
* @param base_btf base BTF object
* @return 0 on success; negative error code, otherwise
*
* If successful, 0 is returned and **btf** now has **base_btf** as its
* base.
@@ -273,6 +281,48 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
*/
LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf);
/* Optional arguments accepted by btf__permute(). */
struct btf_permute_opts {
/* presumably the size of this struct as filled in by the caller; checked via OPTS_VALID() -- TODO confirm */
size_t sz;
/* optional .BTF.ext info along the main BTF info */
struct btf_ext *btf_ext;
size_t :0;
};
/* names btf_ext as the last field of struct btf_permute_opts */
#define btf_permute_opts__last_field btf_ext
/**
* @brief **btf__permute()** rearranges BTF types in-place according to a specified ID mapping
* @param btf BTF object to permute
* @param id_map Array mapping original type IDs to new IDs
* @param id_map_cnt Number of elements in @id_map
* @param opts Optional parameters, including BTF extension data for reference updates
* @return 0 on success, negative error code on failure
*
* **btf__permute()** reorders BTF types based on the provided @id_map array,
* updating all internal type references to maintain consistency. The function
* operates in-place, modifying the BTF object directly.
*
* For **base BTF**:
* - @id_map must include all types from ID 0 to `btf__type_cnt(btf) - 1`
* - @id_map_cnt must be `btf__type_cnt(btf)`
* - Mapping is defined as `id_map[original_id] = new_id`
* - `id_map[0]` must be 0 (void type cannot be moved)
*
* For **split BTF**:
* - @id_map must include only split types (types added on top of the base BTF)
* - @id_map_cnt must be `btf__type_cnt(btf) - btf__type_cnt(btf__base_btf(btf))`
* - Mapping is defined as `id_map[original_id - start_id] = new_id`
* - `start_id` equals `btf__type_cnt(btf__base_btf(btf))`
*
* After permutation, all type references within the BTF data and optional
* BTF extension (if provided via @opts) are updated automatically.
*
* On error, returns a negative error code and sets errno:
* - `-EINVAL`: Invalid parameters or invalid ID mapping
* - `-ENOMEM`: Memory allocation failure
*/
LIBBPF_API int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt,
const struct btf_permute_opts *opts);
struct btf_dump;
struct btf_dump_opts {

View File

@@ -21,7 +21,6 @@
#include "hashmap.h"
#include "libbpf.h"
#include "libbpf_internal.h"
#include "str_error.h"
static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t";
static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1;
@@ -1763,9 +1762,18 @@ static int btf_dump_get_bitfield_value(struct btf_dump *d,
__u16 left_shift_bits, right_shift_bits;
const __u8 *bytes = data;
__u8 nr_copy_bits;
__u8 start_bit, nr_bytes;
__u64 num = 0;
int i;
/* Calculate how many bytes cover the bitfield */
start_bit = bits_offset % 8;
nr_bytes = (start_bit + bit_sz + 7) / 8;
/* Bound check */
if (data + nr_bytes > d->typed_dump->data_end)
return -E2BIG;
/* Maximum supported bitfield size is 64 bits */
if (t->size > 8) {
pr_warn("unexpected bitfield size %d\n", t->size);

View File

@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include "libbpf_internal.h"
#include "str_error.h"
/* A SHT_GNU_versym section holds 16-bit words. This bit is set if
* the symbol is hidden and can only be seen when referenced using an

View File

@@ -6,7 +6,6 @@
#include "libbpf.h"
#include "libbpf_common.h"
#include "libbpf_internal.h"
#include "str_error.h"
static inline __u64 ptr_to_u64(const void *ptr)
{

View File

@@ -4,6 +4,7 @@
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <asm/byteorder.h>
#include <linux/filter.h>
#include <sys/param.h>
#include "btf.h"
@@ -13,8 +14,6 @@
#include "hashmap.h"
#include "bpf_gen_internal.h"
#include "skel_internal.h"
#include <asm/byteorder.h>
#include "str_error.h"
#define MAX_USED_MAPS 64
#define MAX_USED_PROGS 32
@@ -110,6 +109,7 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in
static int add_data(struct bpf_gen *gen, const void *data, __u32 size);
static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off);
static void emit_signature_match(struct bpf_gen *gen);
void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps)
{
@@ -152,6 +152,8 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps
/* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */
emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7));
emit(gen, BPF_EXIT_INSN());
if (OPTS_GET(gen->opts, gen_hash, false))
emit_signature_match(gen);
}
static int add_data(struct bpf_gen *gen, const void *data, __u32 size)
@@ -368,6 +370,8 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off)
__emit_sys_close(gen);
}
static void compute_sha_update_offsets(struct bpf_gen *gen);
int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
{
int i;
@@ -394,6 +398,9 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
blob_fd_array_off(gen, i));
emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0));
emit(gen, BPF_EXIT_INSN());
if (OPTS_GET(gen->opts, gen_hash, false))
compute_sha_update_offsets(gen);
pr_debug("gen: finish %s\n", errstr(gen->error));
if (!gen->error) {
struct gen_loader_opts *opts = gen->opts;
@@ -446,6 +453,22 @@ void bpf_gen__free(struct bpf_gen *gen)
_val; \
})
static void compute_sha_update_offsets(struct bpf_gen *gen)
{
__u64 sha[SHA256_DWORD_SIZE];
__u64 sha_dw;
int i;
libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, (__u8 *)sha);
for (i = 0; i < SHA256_DWORD_SIZE; i++) {
struct bpf_insn *insn =
(struct bpf_insn *)(gen->insn_start + gen->hash_insn_offset[i]);
sha_dw = tgt_endian(sha[i]);
insn[0].imm = (__u32)sha_dw;
insn[1].imm = sha_dw >> 32;
}
}
void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data,
__u32 btf_raw_size)
{
@@ -557,6 +580,29 @@ void bpf_gen__map_create(struct bpf_gen *gen,
emit_sys_close_stack(gen, stack_off(inner_map_fd));
}
/*
 * Emit, for each of the SHA256_DWORD_SIZE 64-bit words, a sequence that
 * loads the word at offset i * 8 from the address in R1 into R2, loads a
 * placeholder 64-bit immediate into R3 (its instruction offset is saved in
 * gen->hash_insn_offset[i]), and on mismatch sets R7 = -EINVAL and jumps
 * using an offset computed relative to gen->cleanup_label. If that offset
 * does not fit a signed 16-bit field, gen->error is set to -ERANGE.
 */
static void emit_signature_match(struct bpf_gen *gen)
{
	__s64 jmp_off;
	int dw;

	for (dw = 0; dw < SHA256_DWORD_SIZE; dw++) {
		emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX,
						 0, 0, 0, 0));
		emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, dw * sizeof(__u64)));

		/* remember where the placeholder immediate lives */
		gen->hash_insn_offset[dw] = gen->insn_cur - gen->insn_start;
		emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_3, 0, 0, 0, 0, 0));

		jmp_off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1;
		if (!is_simm16(jmp_off)) {
			gen->error = -ERANGE;
			emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1));
			continue;
		}

		emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL));
		emit(gen, BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, jmp_off));
	}
}
void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name,
enum bpf_attach_type type)
{

View File

@@ -50,7 +50,6 @@
#include "libbpf.h"
#include "bpf.h"
#include "btf.h"
#include "str_error.h"
#include "libbpf_internal.h"
#include "hashmap.h"
#include "bpf_gen_internal.h"
@@ -116,6 +115,7 @@ static const char * const attach_type_name[] = {
[BPF_TRACE_FENTRY] = "trace_fentry",
[BPF_TRACE_FEXIT] = "trace_fexit",
[BPF_MODIFY_RETURN] = "modify_return",
[BPF_TRACE_FSESSION] = "trace_fsession",
[BPF_LSM_MAC] = "lsm_mac",
[BPF_LSM_CGROUP] = "lsm_cgroup",
[BPF_SK_LOOKUP] = "sk_lookup",
@@ -191,6 +191,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf",
[BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage",
[BPF_MAP_TYPE_ARENA] = "arena",
[BPF_MAP_TYPE_INSN_ARRAY] = "insn_array",
};
static const char * const prog_type_name[] = {
@@ -318,8 +319,6 @@ static void pr_perm_msg(int err)
buf);
}
#define STRERR_BUFSIZE 128
/* Copied from tools/perf/util/util.h */
#ifndef zfree
# define zfree(ptr) ({ free(*ptr); *ptr = NULL; })
@@ -372,6 +371,7 @@ enum reloc_type {
RELO_EXTERN_CALL,
RELO_SUBPROG_ADDR,
RELO_CORE,
RELO_INSN_ARRAY,
};
struct reloc_desc {
@@ -381,8 +381,17 @@ struct reloc_desc {
const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */
struct {
int map_idx;
int sym_off;
int ext_idx;
unsigned int sym_off;
/*
* The following two fields can be unionized, as the
* ext_idx field is used for extern symbols, and the
* sym_size is used for jump tables, which are never
* extern
*/
union {
int ext_idx;
int sym_size;
};
};
};
};
@@ -424,6 +433,11 @@ struct bpf_sec_def {
libbpf_prog_attach_fn_t prog_attach_fn;
};
struct bpf_light_subprog {
__u32 sec_insn_off;
__u32 sub_insn_off;
};
/*
* bpf_prog should be a better name but it has been used in
* linux/filter.h.
@@ -496,6 +510,10 @@ struct bpf_program {
__u32 line_info_rec_size;
__u32 line_info_cnt;
__u32 prog_flags;
__u8 hash[SHA256_DIGEST_LENGTH];
struct bpf_light_subprog *subprogs;
__u32 subprog_cnt;
};
struct bpf_struct_ops {
@@ -575,6 +593,7 @@ struct bpf_map {
bool autocreate;
bool autoattach;
__u64 map_extra;
struct bpf_program *excl_prog;
};
enum extern_type {
@@ -668,6 +687,7 @@ struct elf_state {
int symbols_shndx;
bool has_st_ops;
int arena_data_shndx;
int jumptables_data_shndx;
};
struct usdt_manager;
@@ -738,6 +758,17 @@ struct bpf_object {
int arena_map_idx;
void *arena_data;
size_t arena_data_sz;
size_t arena_data_off;
void *jumptables_data;
size_t jumptables_data_sz;
struct {
struct bpf_program *prog;
unsigned int sym_off;
int fd;
} *jumptable_maps;
size_t jumptable_map_cnt;
struct kern_feature_cache *feat_cache;
char *token_path;
@@ -765,6 +796,7 @@ void bpf_program__unload(struct bpf_program *prog)
zfree(&prog->func_info);
zfree(&prog->line_info);
zfree(&prog->subprogs);
}
static void bpf_program__exit(struct bpf_program *prog)
@@ -1013,35 +1045,33 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw,
const struct btf_member *kern_data_member;
struct btf *btf = NULL;
__s32 kern_vtype_id, kern_type_id;
char tname[256];
char tname[192], stname[256];
__u32 i;
snprintf(tname, sizeof(tname), "%.*s",
(int)bpf_core_essential_name_len(tname_raw), tname_raw);
kern_type_id = find_ksym_btf_id(obj, tname, BTF_KIND_STRUCT,
&btf, mod_btf);
if (kern_type_id < 0) {
pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n",
tname);
return kern_type_id;
}
kern_type = btf__type_by_id(btf, kern_type_id);
snprintf(stname, sizeof(stname), "%s%s", STRUCT_OPS_VALUE_PREFIX, tname);
/* Find the corresponding "map_value" type that will be used
* in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example,
* find "struct bpf_struct_ops_tcp_congestion_ops" from the
* btf_vmlinux.
/* Look for the corresponding "map_value" type that will be used
* in map_update(BPF_MAP_TYPE_STRUCT_OPS) first, figure out the btf
* and the mod_btf.
* For example, find "struct bpf_struct_ops_tcp_congestion_ops".
*/
kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX,
tname, BTF_KIND_STRUCT);
kern_vtype_id = find_ksym_btf_id(obj, stname, BTF_KIND_STRUCT, &btf, mod_btf);
if (kern_vtype_id < 0) {
pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n",
STRUCT_OPS_VALUE_PREFIX, tname);
pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", stname);
return kern_vtype_id;
}
kern_vtype = btf__type_by_id(btf, kern_vtype_id);
kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
if (kern_type_id < 0) {
pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", tname);
return kern_type_id;
}
kern_type = btf__type_by_id(btf, kern_type_id);
/* Find "struct tcp_congestion_ops" from
* struct bpf_struct_ops_tcp_congestion_ops {
* [ ... ]
@@ -1054,8 +1084,8 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw,
break;
}
if (i == btf_vlen(kern_vtype)) {
pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n",
tname, STRUCT_OPS_VALUE_PREFIX, tname);
pr_warn("struct_ops init_kern: struct %s data is not found in struct %s\n",
tname, stname);
return -EINVAL;
}
@@ -2875,7 +2905,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
var_extra = btf_var(var);
map_name = btf__name_by_offset(obj->btf, var->name_off);
if (map_name == NULL || map_name[0] == '\0') {
if (str_is_empty(map_name)) {
pr_warn("map #%d: empty name.\n", var_idx);
return -EINVAL;
}
@@ -2963,10 +2993,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
void *data, size_t data_sz)
{
const long page_sz = sysconf(_SC_PAGE_SIZE);
const size_t data_alloc_sz = roundup(data_sz, page_sz);
size_t mmap_sz;
mmap_sz = bpf_map_mmap_sz(map);
if (roundup(data_sz, page_sz) > mmap_sz) {
if (data_alloc_sz > mmap_sz) {
pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n",
sec_name, mmap_sz, data_sz);
return -E2BIG;
@@ -2978,6 +3009,9 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
memcpy(obj->arena_data, data, data_sz);
obj->arena_data_sz = data_sz;
/* place globals at the end of the arena */
obj->arena_data_off = mmap_sz - data_alloc_sz;
/* make bpf_map__init_value() work for ARENA maps */
map->mmaped = obj->arena_data;
@@ -2999,7 +3033,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx);
data = elf_sec_data(obj, scn);
if (!scn || !data) {
if (!data) {
pr_warn("elf: failed to get %s map definitions for %s\n",
MAPS_ELF_SEC, obj->path);
return -EINVAL;
@@ -3945,6 +3979,13 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
} else if (strcmp(name, ARENA_SEC) == 0) {
obj->efile.arena_data = data;
obj->efile.arena_data_shndx = idx;
} else if (strcmp(name, JUMPTABLES_SEC) == 0) {
obj->jumptables_data = malloc(data->d_size);
if (!obj->jumptables_data)
return -ENOMEM;
memcpy(obj->jumptables_data, data->d_buf, data->d_size);
obj->jumptables_data_sz = data->d_size;
obj->efile.jumptables_data_shndx = idx;
} else {
pr_info("elf: skipping unrecognized data section(%d) %s\n",
idx, name);
@@ -4241,7 +4282,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
if (!sym_is_extern(sym))
continue;
ext_name = elf_sym_str(obj, sym->st_name);
if (!ext_name || !ext_name[0])
if (str_is_empty(ext_name))
continue;
ext = obj->externs;
@@ -4485,6 +4526,44 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx)
}
}
static int bpf_prog_compute_hash(struct bpf_program *prog)
{
struct bpf_insn *purged;
int i, err = 0;
purged = calloc(prog->insns_cnt, BPF_INSN_SZ);
if (!purged)
return -ENOMEM;
/* If relocations have been done, the map_fd needs to be
* discarded for the digest calculation.
*/
for (i = 0; i < prog->insns_cnt; i++) {
purged[i] = prog->insns[i];
if (purged[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
(purged[i].src_reg == BPF_PSEUDO_MAP_FD ||
purged[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
purged[i].imm = 0;
i++;
if (i >= prog->insns_cnt ||
prog->insns[i].code != 0 ||
prog->insns[i].dst_reg != 0 ||
prog->insns[i].src_reg != 0 ||
prog->insns[i].off != 0) {
err = -EINVAL;
goto out;
}
purged[i] = prog->insns[i];
purged[i].imm = 0;
}
}
libbpf_sha256(purged, prog->insns_cnt * sizeof(struct bpf_insn),
prog->hash);
out:
free(purged);
return err;
}
static int bpf_program__record_reloc(struct bpf_program *prog,
struct reloc_desc *reloc_desc,
__u32 insn_idx, const char *sym_name,
@@ -4590,7 +4669,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
reloc_desc->type = RELO_DATA;
reloc_desc->insn_idx = insn_idx;
reloc_desc->map_idx = obj->arena_map_idx;
reloc_desc->sym_off = sym->st_value;
reloc_desc->sym_off = sym->st_value + obj->arena_data_off;
map = &obj->maps[obj->arena_map_idx];
pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n",
@@ -4599,6 +4678,16 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
return 0;
}
/* jump table data relocation */
if (shdr_idx == obj->efile.jumptables_data_shndx) {
reloc_desc->type = RELO_INSN_ARRAY;
reloc_desc->insn_idx = insn_idx;
reloc_desc->map_idx = -1;
reloc_desc->sym_off = sym->st_value;
reloc_desc->sym_size = sym->st_size;
return 0;
}
/* generic map reference relocation */
if (type == LIBBPF_MAP_UNSPEC) {
if (!bpf_object__shndx_is_maps(obj, shdr_idx)) {
@@ -5234,6 +5323,14 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
create_attr.token_fd = obj->token_fd;
if (obj->token_fd)
create_attr.map_flags |= BPF_F_TOKEN_FD;
if (map->excl_prog) {
err = bpf_prog_compute_hash(map->excl_prog);
if (err)
return err;
create_attr.excl_prog_hash = map->excl_prog->hash;
create_attr.excl_prog_hash_size = SHA256_DIGEST_LENGTH;
}
if (bpf_map__is_struct_ops(map)) {
create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
@@ -5533,7 +5630,8 @@ retry:
return err;
}
if (obj->arena_data) {
memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz);
memcpy(map->mmaped + obj->arena_data_off, obj->arena_data,
obj->arena_data_sz);
zfree(&obj->arena_data);
}
}
@@ -6101,6 +6199,157 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx,
insn->imm = POISON_CALL_KFUNC_BASE + ext_idx;
}
/*
 * Look up an already created jump table map for (@prog, @sym_off).
 * Returns its FD, or -ENOENT when no such map has been recorded in @obj.
 */
static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off)
{
	size_t idx = 0;

	while (idx < obj->jumptable_map_cnt) {
		/*
		 * The same offset may be used by two different programs (jump
		 * tables can be identical), but each program gets its own map,
		 * hence the match on both offset and program.
		 */
		if (obj->jumptable_maps[idx].prog == prog &&
		    obj->jumptable_maps[idx].sym_off == sym_off)
			return obj->jumptable_maps[idx].fd;
		idx++;
	}

	return -ENOENT;
}
/*
 * Append a (@prog, @sym_off, @map_fd) record to obj->jumptable_maps,
 * growing the array by one element.
 * Returns 0 on success or -ENOMEM if the array cannot be grown.
 */
static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off, int map_fd)
{
	size_t slot = obj->jumptable_map_cnt;
	void *grown;

	grown = libbpf_reallocarray(obj->jumptable_maps, slot + 1,
				    sizeof(obj->jumptable_maps[0]));
	if (!grown)
		return -ENOMEM;

	obj->jumptable_maps = grown;
	obj->jumptable_map_cnt = slot + 1;

	obj->jumptable_maps[slot].fd = map_fd;
	obj->jumptable_maps[slot].sym_off = sym_off;
	obj->jumptable_maps[slot].prog = prog;

	return 0;
}
/*
 * Scan prog->subprogs from the last entry backwards and return the index of
 * the first one whose sub_insn_off is not greater than @insn_idx.
 * Returns -1 when no entry qualifies.
 */
static int find_subprog_idx(struct bpf_program *prog, int insn_idx)
{
	int idx = prog->subprog_cnt - 1;

	while (idx >= 0) {
		if (insn_idx >= prog->subprogs[idx].sub_insn_off)
			return idx;
		idx--;
	}

	return -1;
}
/*
 * Create (or reuse) the BPF_MAP_TYPE_INSN_ARRAY map backing the jump table
 * referenced by @relo in @prog.
 *
 * The table occupies relo->sym_size bytes at offset relo->sym_off inside
 * obj->jumptables_data and consists of 8-byte entries. Each entry is
 * converted to an instruction offset, rebased from section-relative to
 * program-relative, and stored in the map. The map is then frozen and
 * remembered via add_jt_map() so that later relocations for the same
 * (prog, sym_off) pair reuse it.
 *
 * Returns the map FD on success or a negative error code.
 */
static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo)
{
	const __u32 jt_entry_size = 8;
	unsigned int sym_off = relo->sym_off;
	int jt_size = relo->sym_size;
	__u32 value_size = sizeof(struct bpf_insn_array_value);
	struct bpf_insn_array_value val = {};
	__u32 max_entries;
	int subprog_idx;
	int map_fd, err;
	__u64 insn_off;
	__u64 *jt;
	__u32 i;

	map_fd = find_jt_map(obj, prog, sym_off);
	if (map_fd >= 0)
		return map_fd;

	if (sym_off % jt_entry_size) {
		pr_warn("map '.jumptables': jumptable start %u should be multiple of %u\n",
			sym_off, jt_entry_size);
		return -EINVAL;
	}

	/*
	 * sym_size is stored in a signed int; a negative value would turn
	 * into a huge unsigned entry count and defeat the bounds check below.
	 */
	if (jt_size < 0) {
		pr_warn("map '.jumptables': invalid jumptable size %d\n", jt_size);
		return -EINVAL;
	}

	if (jt_size % jt_entry_size) {
		pr_warn("map '.jumptables': jumptable size %d should be multiple of %u\n",
			jt_size, jt_entry_size);
		return -EINVAL;
	}

	max_entries = jt_size / jt_entry_size;

	map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, ".jumptables",
				4, value_size, max_entries, NULL);
	if (map_fd < 0)
		return map_fd;

	if (!obj->jumptables_data) {
		pr_warn("map '.jumptables': ELF file is missing jump table data\n");
		err = -EINVAL;
		goto err_close;
	}

	/* written as a subtraction so that sym_off + jt_size cannot wrap */
	if (sym_off > obj->jumptables_data_sz ||
	    (size_t)jt_size > obj->jumptables_data_sz - sym_off) {
		pr_warn("map '.jumptables': jumptables_data size is %zu, trying to access %llu\n",
			obj->jumptables_data_sz,
			(unsigned long long)sym_off + (unsigned long long)jt_size);
		err = -EINVAL;
		goto err_close;
	}

	subprog_idx = -1; /* main program */
	if (relo->insn_idx < 0 || relo->insn_idx >= prog->insns_cnt) {
		pr_warn("map '.jumptables': invalid instruction index %d\n", relo->insn_idx);
		err = -EINVAL;
		goto err_close;
	}
	if (prog->subprogs)
		subprog_idx = find_subprog_idx(prog, relo->insn_idx);

	jt = (__u64 *)(obj->jumptables_data + sym_off);
	for (i = 0; i < max_entries; i++) {
		/*
		 * The offset should be made to be relative to the beginning of
		 * the main function, not the subfunction.
		 */
		insn_off = jt[i]/sizeof(struct bpf_insn);
		if (subprog_idx >= 0) {
			insn_off -= prog->subprogs[subprog_idx].sec_insn_off;
			insn_off += prog->subprogs[subprog_idx].sub_insn_off;
		} else {
			insn_off -= prog->sec_insn_off;
		}

		/*
		 * LLVM-generated jump tables contain u64 records, however
		 * should contain values that fit in u32. An underflow in the
		 * rebasing above also ends up here as a value above UINT32_MAX.
		 */
		if (insn_off > UINT32_MAX) {
			pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %u\n",
				(unsigned long long)jt[i], sym_off + i * jt_entry_size);
			err = -EINVAL;
			goto err_close;
		}

		val.orig_off = insn_off;
		err = bpf_map_update_elem(map_fd, &i, &val, 0);
		if (err)
			goto err_close;
	}

	err = bpf_map_freeze(map_fd);
	if (err)
		goto err_close;

	err = add_jt_map(obj, prog, sym_off, map_fd);
	if (err)
		goto err_close;

	return map_fd;

err_close:
	close(map_fd);
	return err;
}
/* Relocate data references within program code:
* - map references;
* - global variable references;
@@ -6192,6 +6441,20 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
case RELO_CORE:
/* will be handled by bpf_program_record_relos() */
break;
case RELO_INSN_ARRAY: {
int map_fd;
map_fd = create_jt_map(obj, prog, relo);
if (map_fd < 0) {
pr_warn("prog '%s': relo #%d: can't create jump table: sym_off %u\n",
prog->name, i, relo->sym_off);
return map_fd;
}
insn[0].src_reg = BPF_PSEUDO_MAP_VALUE;
insn->imm = map_fd;
insn->off = 0;
}
break;
default:
pr_warn("prog '%s': relo #%d: bad relo type %d\n",
prog->name, i, relo->type);
@@ -6389,36 +6652,62 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra
return 0;
}
/*
 * Record @subprog's section-relative and main-program-relative instruction
 * offsets as a new trailing entry of main_prog->subprogs.
 * Returns 0 on success or -ENOMEM if the array cannot be grown.
 */
static int save_subprog_offsets(struct bpf_program *main_prog, struct bpf_program *subprog)
{
	int slot = main_prog->subprog_cnt;
	void *grown;

	grown = libbpf_reallocarray(main_prog->subprogs, slot + 1,
				    sizeof(main_prog->subprogs[0]));
	if (!grown)
		return -ENOMEM;

	main_prog->subprogs = grown;
	main_prog->subprog_cnt++;

	main_prog->subprogs[slot].sub_insn_off = subprog->sub_insn_off;
	main_prog->subprogs[slot].sec_insn_off = subprog->sec_insn_off;

	return 0;
}
static int
bpf_object__append_subprog_code(struct bpf_object *obj, struct bpf_program *main_prog,
struct bpf_program *subprog)
{
struct bpf_insn *insns;
size_t new_cnt;
int err;
struct bpf_insn *insns;
size_t new_cnt;
int err;
subprog->sub_insn_off = main_prog->insns_cnt;
subprog->sub_insn_off = main_prog->insns_cnt;
new_cnt = main_prog->insns_cnt + subprog->insns_cnt;
insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns));
if (!insns) {
pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name);
return -ENOMEM;
}
main_prog->insns = insns;
main_prog->insns_cnt = new_cnt;
new_cnt = main_prog->insns_cnt + subprog->insns_cnt;
insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns));
if (!insns) {
pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name);
return -ENOMEM;
}
main_prog->insns = insns;
main_prog->insns_cnt = new_cnt;
memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns,
subprog->insns_cnt * sizeof(*insns));
memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns,
subprog->insns_cnt * sizeof(*insns));
pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n",
main_prog->name, subprog->insns_cnt, subprog->name);
pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n",
main_prog->name, subprog->insns_cnt, subprog->name);
/* The subprog insns are now appended. Append its relos too. */
err = append_subprog_relos(main_prog, subprog);
if (err)
return err;
return 0;
/* The subprog insns are now appended. Append its relos too. */
err = append_subprog_relos(main_prog, subprog);
if (err)
return err;
err = save_subprog_offsets(main_prog, subprog);
if (err) {
pr_warn("prog '%s': failed to add subprog offsets: %s\n",
main_prog->name, errstr(err));
return err;
}
return 0;
}
static int
@@ -8202,7 +8491,7 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type,
struct bpf_object *obj = ctx;
const struct btf_type *t;
struct extern_desc *ext;
char *res;
const char *res;
res = strstr(sym_name, ".llvm.");
if (sym_type == 'd' && res)
@@ -9185,6 +9474,13 @@ void bpf_object__close(struct bpf_object *obj)
zfree(&obj->arena_data);
zfree(&obj->jumptables_data);
obj->jumptables_data_sz = 0;
for (i = 0; i < obj->jumptable_map_cnt; i++)
close(obj->jumptable_maps[i].fd);
zfree(&obj->jumptable_maps);
free(obj);
}
@@ -9564,6 +9860,8 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("fsession+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace),
SEC_DEF("fsession.s+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace),
SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
@@ -10524,6 +10822,27 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd)
return 0;
}
/*
 * Register 'prog' as the exclusive program of 'map'.
 *
 * Rejected with -EINVAL (after a warning) when the map has already been
 * created, or when the map and the program do not belong to the same
 * bpf_object. On success the program is stored in map->excl_prog and 0 is
 * returned.
 */
int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog)
{
	int err = 0;

	if (map_is_created(map)) {
		pr_warn("exclusive programs must be set before map creation\n");
		err = -EINVAL;
	} else if (map->obj != prog->obj) {
		pr_warn("excl_prog and map must be from the same bpf object\n");
		err = -EINVAL;
	}

	if (err)
		return libbpf_err(err);

	map->excl_prog = prog;
	return 0;
}
/* Return the program stored in map->excl_prog (whatever it currently holds). */
struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map)
{
	struct bpf_program *excl = map->excl_prog;

	return excl;
}
static struct bpf_map *
__bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i)
{
@@ -10603,7 +10922,7 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name)
}
static int validate_map_op(const struct bpf_map *map, size_t key_sz,
size_t value_sz, bool check_value_sz)
size_t value_sz, bool check_value_sz, __u64 flags)
{
if (!map_is_created(map)) /* map is not yet created */
return -ENOENT;
@@ -10630,6 +10949,20 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz,
int num_cpu = libbpf_num_possible_cpus();
size_t elem_sz = roundup(map->def.value_size, 8);
if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) {
if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) {
pr_warn("map '%s': BPF_F_CPU and BPF_F_ALL_CPUS are mutually exclusive\n",
map->name);
return -EINVAL;
}
if (map->def.value_size != value_sz) {
pr_warn("map '%s': unexpected value size %zu provided for either BPF_F_CPU or BPF_F_ALL_CPUS, expected %u\n",
map->name, value_sz, map->def.value_size);
return -EINVAL;
}
break;
}
if (value_sz != num_cpu * elem_sz) {
pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n",
map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz);
@@ -10654,7 +10987,7 @@ int bpf_map__lookup_elem(const struct bpf_map *map,
{
int err;
err = validate_map_op(map, key_sz, value_sz, true);
err = validate_map_op(map, key_sz, value_sz, true, flags);
if (err)
return libbpf_err(err);
@@ -10667,7 +11000,7 @@ int bpf_map__update_elem(const struct bpf_map *map,
{
int err;
err = validate_map_op(map, key_sz, value_sz, true);
err = validate_map_op(map, key_sz, value_sz, true, flags);
if (err)
return libbpf_err(err);
@@ -10679,7 +11012,7 @@ int bpf_map__delete_elem(const struct bpf_map *map,
{
int err;
err = validate_map_op(map, key_sz, 0, false /* check_value_sz */);
err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, flags);
if (err)
return libbpf_err(err);
@@ -10692,7 +11025,7 @@ int bpf_map__lookup_and_delete_elem(const struct bpf_map *map,
{
int err;
err = validate_map_op(map, key_sz, value_sz, true);
err = validate_map_op(map, key_sz, value_sz, true, flags);
if (err)
return libbpf_err(err);
@@ -10704,7 +11037,7 @@ int bpf_map__get_next_key(const struct bpf_map *map,
{
int err;
err = validate_map_op(map, key_sz, 0, false /* check_value_sz */);
err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, 0);
if (err)
return libbpf_err(err);
@@ -11261,8 +11594,6 @@ static const char *arch_specific_syscall_pfx(void)
return "ia32";
#elif defined(__s390x__)
return "s390x";
#elif defined(__s390__)
return "s390";
#elif defined(__arm__)
return "arm";
#elif defined(__aarch64__)
@@ -11510,7 +11841,8 @@ static int avail_kallsyms_cb(unsigned long long sym_addr, char sym_type,
*
* [0] fb6a421fb615 ("kallsyms: Match symbols exactly with CONFIG_LTO_CLANG")
*/
char sym_trim[256], *psym_trim = sym_trim, *sym_sfx;
char sym_trim[256], *psym_trim = sym_trim;
const char *sym_sfx;
if (!(sym_sfx = strstr(sym_name, ".llvm.")))
return 0;
@@ -12049,8 +12381,6 @@ static const char *arch_specific_lib_paths(void)
return "/lib/i386-linux-gnu";
#elif defined(__s390x__)
return "/lib/s390x-linux-gnu";
#elif defined(__s390__)
return "/lib/s390-linux-gnu";
#elif defined(__arm__) && defined(__SOFTFP__)
return "/lib/arm-linux-gnueabi";
#elif defined(__arm__) && !defined(__SOFTFP__)
@@ -12095,7 +12425,7 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz)
if (!search_paths[i])
continue;
for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) {
char *next_path;
const char *next_path;
int seg_len;
if (s[0] == ':')
@@ -13794,8 +14124,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
return libbpf_err(-EINVAL);
if (attach_prog_fd && !attach_func_name) {
/* remember attach_prog_fd and let bpf_program__load() find
* BTF ID during the program load
/* Store attach_prog_fd. The BTF ID will be resolved later during
* the normal object/program load phase.
*/
prog->attach_prog_fd = attach_prog_fd;
return 0;
@@ -13827,6 +14157,37 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
return 0;
}
/*
 * Associate 'prog' with the struct_ops map 'map'.
 *
 * Validation happens in this order, each failure producing a warning and
 * -EINVAL: the program must have an FD, the program must not itself be of
 * type BPF_PROG_TYPE_STRUCT_OPS, the map must have an FD, and the map must be
 * a struct_ops map. When all checks pass, the result of
 * bpf_prog_assoc_struct_ops() is returned unchanged.
 */
int bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map,
				  struct bpf_prog_assoc_struct_ops_opts *opts)
{
	int prog_fd, map_fd;

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		pr_warn("prog '%s': can't associate BPF program without FD (was it loaded?)\n",
			prog->name);
		goto err_inval;
	}

	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
		pr_warn("prog '%s': can't associate struct_ops program\n", prog->name);
		goto err_inval;
	}

	map_fd = bpf_map__fd(map);
	if (map_fd < 0) {
		pr_warn("map '%s': can't associate BPF map without FD (was it created?)\n", map->name);
		goto err_inval;
	}

	if (!bpf_map__is_struct_ops(map)) {
		pr_warn("map '%s': can't associate non-struct_ops map\n", map->name);
		goto err_inval;
	}

	return bpf_prog_assoc_struct_ops(prog_fd, map_fd, opts);

err_inval:
	return libbpf_err(-EINVAL);
}
int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz)
{
int err = 0, n, len, start, end = -1;
@@ -14092,7 +14453,10 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
if (!map_skel->mmaped)
continue;
*map_skel->mmaped = map->mmaped;
if (map->def.type == BPF_MAP_TYPE_ARENA)
*map_skel->mmaped = map->mmaped + map->obj->arena_data_off;
else
*map_skel->mmaped = map->mmaped;
}
return 0;

View File

@@ -448,7 +448,7 @@ LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path);
/**
* @brief **bpf_program__unpin()** unpins the BPF program from a file
* in the BPFFS specified by a path. This decrements the programs
* in the BPFFS specified by a path. This decrements program's in-kernel
* reference count.
*
* The file pinning the BPF program can also be unlinked by a different
@@ -481,14 +481,12 @@ LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path);
/**
* @brief **bpf_link__unpin()** unpins the BPF link from a file
* in the BPFFS specified by a path. This decrements the links
* reference count.
* in the BPFFS. This decrements link's in-kernel reference count.
*
* The file pinning the BPF link can also be unlinked by a different
* process in which case this function will return an error.
*
* @param prog BPF program to unpin
* @param path file path to the pin in a BPF file system
* @param link BPF link to unpin
* @return 0, on success; negative error code, otherwise
*/
LIBBPF_API int bpf_link__unpin(struct bpf_link *link);
@@ -995,14 +993,35 @@ LIBBPF_API __u32 bpf_program__line_info_cnt(const struct bpf_program *prog);
* - fentry/fexit/fmod_ret;
* - lsm;
* - freplace.
* @param prog BPF program to set the attach type for
* @param type attach type to set the BPF map to have
* @param prog BPF program to configure; must be not yet loaded.
* @param attach_prog_fd FD of target BPF program (for freplace/extension).
* If >0 and func name omitted, defers BTF ID resolution.
* @param attach_func_name Target function name. Used either with
* attach_prog_fd to find destination BTF type ID in that BPF program, or
* alone (no attach_prog_fd) to resolve kernel (vmlinux/module) BTF ID.
* Must be provided if attach_prog_fd is 0.
* @return error code; or 0 if no error occurred.
*/
LIBBPF_API int
bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
const char *attach_func_name);
struct bpf_prog_assoc_struct_ops_opts; /* defined in bpf.h */
/**
* @brief **bpf_program__assoc_struct_ops()** associates a BPF program with a
* struct_ops map.
*
* @param prog BPF program
* @param map struct_ops map to be associated with the BPF program
* @param opts optional options, can be NULL
*
* @return 0, on success; negative error code, otherwise
*/
LIBBPF_API int
bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map,
struct bpf_prog_assoc_struct_ops_opts *opts);
/**
* @brief **bpf_object__find_map_by_name()** returns BPF map of
* the given name, if it exists within the passed BPF object
@@ -1098,6 +1117,7 @@ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map);
/**
* @brief **bpf_map__set_value_size()** sets map value size.
* @param map the BPF map instance
* @param size the new value size
* @return 0, on success; negative error, otherwise
*
* There is a special case for maps with associated memory-mapped regions, like
@@ -1196,13 +1216,14 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map);
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @param value pointer to memory in which looked up value will be stored
* @param value_sz size in byte of value data memory; it has to match BPF map
* definition's **value_size**. For per-CPU BPF maps value size has to be
* a product of BPF map value size and number of possible CPUs in the system
* (could be fetched with **libbpf_num_possible_cpus()**). Note also that for
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* definition's **value_size**. For per-CPU BPF maps, value size can be
* `value_size` if either **BPF_F_CPU** or **BPF_F_ALL_CPUS** is specified
* in **flags**, otherwise a product of BPF map value size and number of
* possible CPUs in the system (could be fetched with
* **libbpf_num_possible_cpus()**). Note also that for per-CPU values value
* size has to be aligned up to closest 8 bytes, so expected size is:
* `round_up(value_size, 8) * libbpf_num_possible_cpus()`.
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__lookup_elem()** is high-level equivalent of
@@ -1219,14 +1240,8 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map,
* @param key pointer to memory containing bytes of the key
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @param value pointer to memory containing bytes of the value
* @param value_sz size in byte of value data memory; it has to match BPF map
* definition's **value_size**. For per-CPU BPF maps value size has to be
* a product of BPF map value size and number of possible CPUs in the system
* (could be fetched with **libbpf_num_possible_cpus()**). Note also that for
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param value_sz refer to **bpf_map__lookup_elem**'s description.
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__update_elem()** is high-level equivalent of
@@ -1242,7 +1257,7 @@ LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map,
* @param map BPF map to delete element from
* @param key pointer to memory containing bytes of the key
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__delete_elem()** is high-level equivalent of
@@ -1265,7 +1280,7 @@ LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map,
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__lookup_and_delete_elem()** is high-level equivalent of
@@ -1291,6 +1306,28 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map,
*/
LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map,
const void *cur_key, void *next_key, size_t key_sz);
/**
* @brief **bpf_map__set_exclusive_program()** sets a map to be exclusive to the
* specified program. This must be called *before* the map is created.
*
* @param map BPF map to make exclusive.
* @param prog BPF program to be the exclusive user of the map. Must belong
* to the same bpf_object as the map.
* @return 0 on success; a negative error code otherwise.
*
* This function must be called after the BPF object is opened but before
* it is loaded. Once the object is loaded, only the specified program
* will be able to access the map's contents.
*/
LIBBPF_API int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog);
/**
* @brief **bpf_map__exclusive_program()** returns the exclusive program
* that is registered with the map (if any).
* @param map BPF map to which the exclusive program is registered.
* @return the registered exclusive program.
*/
LIBBPF_API struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map);
struct bpf_xdp_set_link_opts {
size_t sz;
@@ -1615,6 +1652,7 @@ struct perf_buffer_opts {
* @param sample_cb function called on each received data record
* @param lost_cb function called when record loss has occurred
* @param ctx user-provided extra context passed into *sample_cb* and *lost_cb*
* @param opts optional parameters for the perf buffer, can be null
* @return a new instance of struct perf_buffer on success, NULL on error with
* *errno* containing an error code
*/
@@ -1835,9 +1873,10 @@ struct gen_loader_opts {
const char *insns;
__u32 data_sz;
__u32 insns_sz;
bool gen_hash;
};
#define gen_loader_opts__last_field insns_sz
#define gen_loader_opts__last_field gen_hash
LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj,
struct gen_loader_opts *opts);

View File

@@ -448,4 +448,10 @@ LIBBPF_1.6.0 {
} LIBBPF_1.5.0;
LIBBPF_1.7.0 {
global:
bpf_map__set_exclusive_program;
bpf_map__exclusive_program;
bpf_prog_assoc_struct_ops;
bpf_program__assoc_struct_ops;
btf__permute;
} LIBBPF_1.6.0;

View File

@@ -1,75 +0,0 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
* Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
* Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
* Copyright (C) 2015 Huawei Inc.
* Copyright (C) 2017 Nicira, Inc.
*/
#undef _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include "libbpf.h"
#include "libbpf_internal.h"
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
/* zero-based index of libbpf error code 'e' within the libbpf-specific range */
#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START)
/* same as ERRNO_OFFSET(), but takes the short name and prepends LIBBPF_ERRNO__ */
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c)
/* number of codes in the half-open range [__LIBBPF_ERRNO__START, __LIBBPF_ERRNO__END) */
#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START)
/*
 * Human-readable messages for libbpf-specific error codes, indexed by
 * ERRNO_OFFSET(code). Read by libbpf_strerror().
 */
static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf",
[ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid",
[ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost",
[ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch",
[ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf",
[ERRCODE_OFFSET(RELOC)] = "Relocation failed",
[ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading",
[ERRCODE_OFFSET(PROG2BIG)] = "Program too big",
[ERRCODE_OFFSET(KVER)] = "Incorrect kernel version",
[ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type",
[ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message",
[ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence",
[ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing",
};
/*
 * Write a textual description of error code 'err' (either sign accepted)
 * into 'buf', which holds 'size' bytes. The buffer is always NUL-terminated
 * once the argument check has passed.
 *
 * - NULL 'buf' or zero 'size': libbpf_err(-EINVAL).
 * - Codes below __LIBBPF_ERRNO__START are handed to strerror_r() and its
 *   result is passed through libbpf_err_errno().
 * - Codes inside the libbpf range are looked up in libbpf_strerror_table;
 *   returns 0, or libbpf_err(-ERANGE) if the message was truncated.
 * - Anything else yields "Unknown libbpf error <n>" and libbpf_err(-ENOENT),
 *   or libbpf_err(-ERANGE) if even that was truncated.
 */
int libbpf_strerror(int err, char *buf, size_t size)
{
	int code, written;

	if (buf == NULL || size == 0)
		return libbpf_err(-EINVAL);

	code = err > 0 ? err : -err;

	/* plain system errno: let libc describe it */
	if (code < __LIBBPF_ERRNO__START) {
		int rc = strerror_r(code, buf, size);

		buf[size - 1] = '\0';
		return libbpf_err_errno(rc);
	}

	if (code >= __LIBBPF_ERRNO__END)
		written = snprintf(buf, size, "Unknown libbpf error %d", code);
	else
		written = snprintf(buf, size, "%s",
				   libbpf_strerror_table[ERRNO_OFFSET(code)]);
	buf[size - 1] = '\0';

	/*
	 * Compared as size_t, exactly like the implicit conversion in a plain
	 * int-vs-size_t comparison: a negative snprintf() result also lands
	 * in the -ERANGE branch.
	 */
	if ((size_t)written >= size)
		return libbpf_err(-ERANGE);

	return code < __LIBBPF_ERRNO__END ? 0 : libbpf_err(-ENOENT);
}

View File

@@ -74,6 +74,8 @@
#define ELF64_ST_VISIBILITY(o) ((o) & 0x03)
#endif
#define JUMPTABLES_SEC ".jumptables"
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)
@@ -172,6 +174,16 @@ do { \
#define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__)
#define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__)
/**
* @brief **libbpf_errstr()** returns string corresponding to numeric errno
* @param err negative numeric errno
* @return pointer to string representation of the errno, that is invalidated
* upon the next call.
*/
const char *libbpf_errstr(int err);
#define errstr(err) libbpf_errstr(err)
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
@@ -712,6 +724,11 @@ static inline bool is_pow_of_2(size_t x)
return x && (x & (x - 1)) == 0;
}
/*
 * Rotate the 32-bit value 'v' right by 'bits' positions.
 *
 * The rotation count is reduced modulo 32 and both shift amounts are kept in
 * the range 0..31. A naive "v << (32 - bits)" shifts a 32-bit operand by 32
 * when bits == 0, which is undefined behaviour in C; with the masking a
 * rotation by 0 (or any multiple of 32) simply returns 'v'.
 */
static inline __u32 ror32(__u32 v, int bits)
{
	unsigned int n = (unsigned int)bits & 31;

	return (v >> n) | (v << ((32 - n) & 31));
}
#define PROG_LOAD_ATTEMPTS 5
int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts);
@@ -736,4 +753,8 @@ int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern,
int probe_fd(int fd);
/* size of a SHA-256 digest in bytes */
#define SHA256_DIGEST_LENGTH 32
/*
 * Number of 64-bit words in a SHA-256 digest. The expansion is parenthesized
 * so the macro behaves as a single operand inside larger expressions
 * (e.g. "n / SHA256_DWORD_SIZE").
 */
#define SHA256_DWORD_SIZE (SHA256_DIGEST_LENGTH / sizeof(__u64))
void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]);
#endif /* __LIBBPF_LIBBPF_INTERNAL_H */

View File

@@ -364,6 +364,10 @@ static int probe_map_create(enum bpf_map_type map_type)
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
break;
case BPF_MAP_TYPE_INSN_ARRAY:
key_size = sizeof(__u32);
value_size = sizeof(struct bpf_insn_array_value);
break;
case BPF_MAP_TYPE_UNSPEC:
default:
return -EOPNOTSUPP;

256
src/libbpf_utils.c Normal file
View File

@@ -0,0 +1,256 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
* Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
* Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
* Copyright (C) 2015 Huawei Inc.
* Copyright (C) 2017 Nicira, Inc.
*/
#undef _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/kernel.h>
#include "libbpf.h"
#include "libbpf_internal.h"
#ifndef ENOTSUPP
#define ENOTSUPP 524
#endif
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
/* zero-based index of libbpf error code 'e' within the libbpf-specific range */
#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START)
/* same as ERRNO_OFFSET(), but takes the short name and prepends LIBBPF_ERRNO__ */
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c)
/* number of codes in the half-open range [__LIBBPF_ERRNO__START, __LIBBPF_ERRNO__END) */
#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START)
/*
 * Human-readable messages for libbpf-specific error codes, indexed by
 * ERRNO_OFFSET(code). Read by libbpf_strerror().
 */
static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf",
[ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid",
[ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost",
[ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch",
[ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf",
[ERRCODE_OFFSET(RELOC)] = "Relocation failed",
[ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading",
[ERRCODE_OFFSET(PROG2BIG)] = "Program too big",
[ERRCODE_OFFSET(KVER)] = "Incorrect kernel version",
[ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type",
[ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message",
[ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence",
[ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing",
};
/*
 * Write a textual description of error code 'err' (either sign accepted)
 * into 'buf', which holds 'size' bytes. The buffer is always NUL-terminated
 * once the argument check has passed.
 *
 * - NULL 'buf' or zero 'size': libbpf_err(-EINVAL).
 * - Codes below __LIBBPF_ERRNO__START are handed to strerror_r() and its
 *   result is passed through libbpf_err_errno().
 * - Codes inside the libbpf range are looked up in libbpf_strerror_table;
 *   returns 0, or libbpf_err(-ERANGE) if the message was truncated.
 * - Anything else yields "Unknown libbpf error <n>" and libbpf_err(-ENOENT),
 *   or libbpf_err(-ERANGE) if even that was truncated.
 */
int libbpf_strerror(int err, char *buf, size_t size)
{
	int code, written;

	if (buf == NULL || size == 0)
		return libbpf_err(-EINVAL);

	code = err > 0 ? err : -err;

	/* plain system errno: let libc describe it */
	if (code < __LIBBPF_ERRNO__START) {
		int rc = strerror_r(code, buf, size);

		buf[size - 1] = '\0';
		return libbpf_err_errno(rc);
	}

	if (code >= __LIBBPF_ERRNO__END)
		written = snprintf(buf, size, "Unknown libbpf error %d", code);
	else
		written = snprintf(buf, size, "%s",
				   libbpf_strerror_table[ERRNO_OFFSET(code)]);
	buf[size - 1] = '\0';

	/*
	 * Compared as size_t, exactly like the implicit conversion in a plain
	 * int-vs-size_t comparison: a negative snprintf() result also lands
	 * in the -ERANGE branch.
	 */
	if ((size_t)written >= size)
		return libbpf_err(-ERANGE);

	return code < __LIBBPF_ERRNO__END ? 0 : libbpf_err(-ENOENT);
}
/*
 * Map an errno value (either sign accepted) to its symbolic name prefixed
 * with '-', e.g. "-ENOENT". Values without an entry in the table below are
 * formatted as a decimal number into a per-thread static buffer, so that
 * result is only valid until the next call on the same thread.
 */
const char *libbpf_errstr(int err)
{
	static const struct {
		int code;
		const char *name;
	} known[] = {
		{ -E2BIG, "-E2BIG" },
		{ -EACCES, "-EACCES" },
		{ -EADDRINUSE, "-EADDRINUSE" },
		{ -EADDRNOTAVAIL, "-EADDRNOTAVAIL" },
		{ -EAGAIN, "-EAGAIN" },
		{ -EALREADY, "-EALREADY" },
		{ -EBADF, "-EBADF" },
		{ -EBADFD, "-EBADFD" },
		{ -EBUSY, "-EBUSY" },
		{ -ECANCELED, "-ECANCELED" },
		{ -ECHILD, "-ECHILD" },
		{ -EDEADLK, "-EDEADLK" },
		{ -EDOM, "-EDOM" },
		{ -EEXIST, "-EEXIST" },
		{ -EFAULT, "-EFAULT" },
		{ -EFBIG, "-EFBIG" },
		{ -EILSEQ, "-EILSEQ" },
		{ -EINPROGRESS, "-EINPROGRESS" },
		{ -EINTR, "-EINTR" },
		{ -EINVAL, "-EINVAL" },
		{ -EIO, "-EIO" },
		{ -EISDIR, "-EISDIR" },
		{ -ELOOP, "-ELOOP" },
		{ -EMFILE, "-EMFILE" },
		{ -EMLINK, "-EMLINK" },
		{ -EMSGSIZE, "-EMSGSIZE" },
		{ -ENAMETOOLONG, "-ENAMETOOLONG" },
		{ -ENFILE, "-ENFILE" },
		{ -ENODATA, "-ENODATA" },
		{ -ENODEV, "-ENODEV" },
		{ -ENOENT, "-ENOENT" },
		{ -ENOEXEC, "-ENOEXEC" },
		{ -ENOLINK, "-ENOLINK" },
		{ -ENOMEM, "-ENOMEM" },
		{ -ENOSPC, "-ENOSPC" },
		{ -ENOTBLK, "-ENOTBLK" },
		{ -ENOTDIR, "-ENOTDIR" },
		{ -ENOTSUPP, "-ENOTSUPP" },
		{ -ENOTTY, "-ENOTTY" },
		{ -ENXIO, "-ENXIO" },
		{ -EOPNOTSUPP, "-EOPNOTSUPP" },
		{ -EOVERFLOW, "-EOVERFLOW" },
		{ -EPERM, "-EPERM" },
		{ -EPIPE, "-EPIPE" },
		{ -EPROTO, "-EPROTO" },
		{ -EPROTONOSUPPORT, "-EPROTONOSUPPORT" },
		{ -ERANGE, "-ERANGE" },
		{ -EROFS, "-EROFS" },
		{ -ESPIPE, "-ESPIPE" },
		{ -ESRCH, "-ESRCH" },
		{ -ETXTBSY, "-ETXTBSY" },
		{ -EUCLEAN, "-EUCLEAN" },
		{ -EXDEV, "-EXDEV" },
	};
	static __thread char buf[12];
	/* normalize to the non-positive form; non-positive input is used as is */
	const int neg = err > 0 ? -err : err;
	size_t idx;

	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
		if (known[idx].code == neg)
			return known[idx].name;
	}

	/* no symbolic name: fall back to the (non-positive) number itself */
	snprintf(buf, sizeof(buf), "%d", neg);
	return buf;
}
/*
 * Read a big-endian 32-bit value from 'p' and return it in CPU byte order.
 * The bytes are fetched with memcpy(), so 'p' is never dereferenced as a
 * 32-bit pointer.
 */
static inline __u32 get_unaligned_be32(const void *p)
{
	__be32 raw;

	memcpy(&raw, p, sizeof(raw));

	return be32_to_cpu(raw);
}
/*
 * Store 'val' at 'p' as a big-endian 32-bit value. The bytes are written
 * with memcpy(), so 'p' is never dereferenced as a 32-bit pointer.
 */
static inline void put_unaligned_be32(__u32 val, void *p)
{
	const __be32 wire = cpu_to_be32(val);

	memcpy(p, &wire, sizeof(wire));
}
/* number of input bytes consumed per sha256_blocks() iteration */
#define SHA256_BLOCK_LENGTH 64
/* bitwise "choose": each result bit comes from y where x is 1, from z where x is 0 */
#define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
/* bitwise majority of x, y and z */
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/* mixing functions used inside SHA256_ROUND (built from ror32() rotations) */
#define Sigma_0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
#define Sigma_1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
/* mixing functions used when expanding the message schedule w[] */
#define sigma_0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
#define sigma_1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
/* per-round additive constants, indexed by round number in SHA256_ROUND */
static const __u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
/*
 * One compression round. Only 'd' and 'h' are modified; the caller rotates
 * the argument order between rounds instead of moving values around.
 * Expects an array named 'w' (the message schedule) to be in scope at the
 * expansion site. Expands to a bare { } block, so it is meant to be used as
 * a standalone statement.
 */
#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \
{ \
__u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \
d += tmp; \
h = tmp + Sigma_0(a) + Maj(a, b, c); \
}
/*
 * Fold 'nblocks' consecutive SHA256_BLOCK_LENGTH-byte blocks starting at
 * 'data' into the eight-word running hash 'state'. 'state' is updated in
 * place; nothing is returned.
 */
static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks)
{
while (nblocks--) {
/* working variables start out as a copy of the current state */
__u32 a = state[0];
__u32 b = state[1];
__u32 c = state[2];
__u32 d = state[3];
__u32 e = state[4];
__u32 f = state[5];
__u32 g = state[6];
__u32 h = state[7];
/* message schedule for this block; read by SHA256_ROUND by name */
__u32 w[64];
int i;
/* first 16 schedule words: the block itself, read as big-endian 32-bit words */
for (i = 0; i < 16; i++)
w[i] = get_unaligned_be32(&data[4 * i]);
/* remaining schedule words are derived from earlier ones */
for (; i < ARRAY_SIZE(w); i++)
w[i] = sigma_1(w[i - 2]) + w[i - 7] +
sigma_0(w[i - 15]) + w[i - 16];
/*
 * Eight rounds per loop iteration. Each successive round passes the
 * variables rotated by one position, so after eight rounds the names are
 * back in their starting roles and no values need to be shuffled.
 */
for (i = 0; i < ARRAY_SIZE(w); i += 8) {
SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
}
/* add this block's result into the running state */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
/* advance to the next input block */
data += SHA256_BLOCK_LENGTH;
}
}
/*
 * Compute the SHA-256 digest of 'len' bytes at 'data' and write the
 * SHA256_DIGEST_LENGTH-byte result to 'out'.
 *
 * All whole SHA256_BLOCK_LENGTH-byte blocks are hashed straight from 'data'.
 * The remaining tail is copied into a zeroed scratch buffer, followed by the
 * 0x80 marker byte and, in the last 8 bytes of the final block, the message
 * length in bits as a big-endian 64-bit value. The scratch buffer holds two
 * blocks because tail + marker + length may not fit into one.
 */
void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH])
{
	__u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
			   0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
	const __be64 bitcount = cpu_to_be64((__u64)len * 8);
	__u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 };
	size_t final_len = len % SHA256_BLOCK_LENGTH;
	/* byte view of the input: arithmetic on 'void *' is not standard C */
	const __u8 *bytes = data;
	size_t i;

	sha256_blocks(state, bytes, len / SHA256_BLOCK_LENGTH);

	/*
	 * Nothing to copy when the input is a whole number of blocks; this
	 * also avoids handing memcpy() a NULL source for an empty input.
	 */
	if (final_len)
		memcpy(final_data, bytes + len - final_len, final_len);
	final_data[final_len] = 0x80;
	/* tail + 1 marker byte + 8 length bytes, rounded up to whole blocks */
	final_len = roundup(final_len + 9, SHA256_BLOCK_LENGTH);
	memcpy(&final_data[final_len - 8], &bitcount, 8);

	sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH);

	/* the digest is the final state written out as big-endian words */
	for (i = 0; i < ARRAY_SIZE(state); i++)
		put_unaligned_be32(state[i], &out[4 * i]);
}

View File

@@ -25,7 +25,6 @@
#include "btf.h"
#include "libbpf_internal.h"
#include "strset.h"
#include "str_error.h"
#define BTF_EXTERN_SEC ".extern"
@@ -2026,6 +2025,9 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj,
obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx;
return 0;
}
if (strcmp(src_sec->sec_name, JUMPTABLES_SEC) == 0)
goto add_sym;
}
if (sym_bind == STB_LOCAL)

View File

@@ -64,7 +64,6 @@ enum libbpf_print_level {
#include "libbpf.h"
#include "bpf.h"
#include "btf.h"
#include "str_error.h"
#include "libbpf_internal.h"
#endif

View File

@@ -21,7 +21,6 @@
#include "libbpf.h"
#include "libbpf_internal.h"
#include "bpf.h"
#include "str_error.h"
struct ring {
ring_buffer_sample_fn sample_cb;

View File

@@ -13,10 +13,15 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <linux/keyctl.h>
#include <stdlib.h>
#include "bpf.h"
#endif
#ifndef SHA256_DIGEST_LENGTH
#define SHA256_DIGEST_LENGTH 32
#endif
#ifndef __NR_bpf
# if defined(__mips__) && defined(_ABIO32)
# define __NR_bpf 4355
@@ -64,6 +69,11 @@ struct bpf_load_and_run_opts {
__u32 data_sz;
__u32 insns_sz;
const char *errstr;
void *signature;
__u32 signature_sz;
__s32 keyring_id;
void *excl_prog_hash;
__u32 excl_prog_hash_sz;
};
long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size);
@@ -220,14 +230,19 @@ static inline int skel_map_create(enum bpf_map_type map_type,
const char *map_name,
__u32 key_size,
__u32 value_size,
__u32 max_entries)
__u32 max_entries,
const void *excl_prog_hash,
__u32 excl_prog_hash_sz)
{
const size_t attr_sz = offsetofend(union bpf_attr, map_extra);
const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
union bpf_attr attr;
memset(&attr, 0, attr_sz);
attr.map_type = map_type;
attr.excl_prog_hash = (unsigned long) excl_prog_hash;
attr.excl_prog_hash_size = excl_prog_hash_sz;
strncpy(attr.map_name, map_name, sizeof(attr.map_name));
attr.key_size = key_size;
attr.value_size = value_size;
@@ -300,6 +315,35 @@ static inline int skel_link_create(int prog_fd, int target_fd,
return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz);
}
static inline int skel_obj_get_info_by_fd(int fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, info);
__u8 sha[SHA256_DIGEST_LENGTH];
struct bpf_map_info info;
__u32 info_len = sizeof(info);
union bpf_attr attr;
memset(&info, 0, sizeof(info));
info.hash = (long) &sha;
info.hash_size = SHA256_DIGEST_LENGTH;
memset(&attr, 0, attr_sz);
attr.info.bpf_fd = fd;
attr.info.info = (long) &info;
attr.info.info_len = info_len;
return skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz);
}
static inline int skel_map_freeze(int fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, map_fd);
union bpf_attr attr;
memset(&attr, 0, attr_sz);
attr.map_fd = fd;
return skel_sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz);
}
#ifdef __KERNEL__
#define set_err
#else
@@ -308,12 +352,13 @@ static inline int skel_link_create(int prog_fd, int target_fd,
static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
{
const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array);
const size_t prog_load_attr_sz = offsetofend(union bpf_attr, keyring_id);
const size_t test_run_attr_sz = offsetofend(union bpf_attr, test);
int map_fd = -1, prog_fd = -1, key = 0, err;
union bpf_attr attr;
err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1);
err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1,
opts->excl_prog_hash, opts->excl_prog_hash_sz);
if (map_fd < 0) {
opts->errstr = "failed to create loader map";
set_err;
@@ -327,11 +372,34 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
goto out;
}
#ifndef __KERNEL__
err = skel_map_freeze(map_fd);
if (err < 0) {
opts->errstr = "failed to freeze map";
set_err;
goto out;
}
err = skel_obj_get_info_by_fd(map_fd);
if (err < 0) {
opts->errstr = "failed to fetch obj info";
set_err;
goto out;
}
#endif
memset(&attr, 0, prog_load_attr_sz);
attr.prog_type = BPF_PROG_TYPE_SYSCALL;
attr.insns = (long) opts->insns;
attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn);
attr.license = (long) "Dual BSD/GPL";
#ifndef __KERNEL__
attr.signature = (long) opts->signature;
attr.signature_size = opts->signature_sz;
#else
if (opts->signature || opts->signature_sz)
pr_warn("signatures are not supported from bpf_preload\n");
#endif
attr.keyring_id = opts->keyring_id;
memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog"));
attr.fd_array = (long) &map_fd;
attr.log_level = opts->ctx->log_level;

View File

@@ -1,104 +0,0 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
#undef _GNU_SOURCE
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "str_error.h"
#ifndef ENOTSUPP
#define ENOTSUPP 524
#endif
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
/*
 * strerror_r() wrapper that builds on non-GNU C libraries (e.g. musl on
 * Alpine Linux) and checks the strerror_r() result itself, so that callers
 * never have to.
 *
 * @err may be given with either sign; its magnitude is what gets looked up.
 * @dst/@len describe the caller's buffer. @dst is always returned, holding
 * either the message produced by strerror_r() or a fallback text written
 * here when strerror_r() reports a failure.
 */
char *libbpf_strerror_r(int err, char *dst, int len)
{
	const int abs_err = err < 0 ? -err : err;
	int rc = strerror_r(abs_err, dst, len);

	/*
	 * glibc < 2.13 signals failure with -1 and sets errno, while
	 * glibc >= 2.13 returns the *positive* errno-like code directly.
	 * Fold both conventions into a single positive code.
	 */
	if (rc == -1)
		rc = errno;

	if (rc == 0)
		return dst;

	if (rc == EINVAL) {
		/* strerror_r() doesn't recognize this specific error */
		snprintf(dst, len, "unknown error (%d)", -abs_err);
	} else {
		snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, rc);
	}

	return dst;
}
/*
 * Map an errno value to its symbolic name prefixed with '-', e.g. "-EINVAL".
 *
 * @err may be positive or negative; a positive value is negated first.
 * Codes listed below yield a string literal. Any other value is formatted
 * as a decimal number into a thread-local buffer, so that result is only
 * valid until the next such call on the same thread.
 */
const char *libbpf_errstr(int err)
{
	/* fits "-2147483648" plus the terminating NUL */
	static __thread char numbuf[12];
	const int code = err > 0 ? -err : err;

/* expands to: case -EFOO: return "-EFOO" */
#define LIBBPF_ERRSTR_CASE(e) case -e: return "-" #e

	switch (code) {
	LIBBPF_ERRSTR_CASE(E2BIG);
	LIBBPF_ERRSTR_CASE(EACCES);
	LIBBPF_ERRSTR_CASE(EADDRINUSE);
	LIBBPF_ERRSTR_CASE(EADDRNOTAVAIL);
	LIBBPF_ERRSTR_CASE(EAGAIN);
	LIBBPF_ERRSTR_CASE(EALREADY);
	LIBBPF_ERRSTR_CASE(EBADF);
	LIBBPF_ERRSTR_CASE(EBADFD);
	LIBBPF_ERRSTR_CASE(EBUSY);
	LIBBPF_ERRSTR_CASE(ECANCELED);
	LIBBPF_ERRSTR_CASE(ECHILD);
	LIBBPF_ERRSTR_CASE(EDEADLK);
	LIBBPF_ERRSTR_CASE(EDOM);
	LIBBPF_ERRSTR_CASE(EEXIST);
	LIBBPF_ERRSTR_CASE(EFAULT);
	LIBBPF_ERRSTR_CASE(EFBIG);
	LIBBPF_ERRSTR_CASE(EILSEQ);
	LIBBPF_ERRSTR_CASE(EINPROGRESS);
	LIBBPF_ERRSTR_CASE(EINTR);
	LIBBPF_ERRSTR_CASE(EINVAL);
	LIBBPF_ERRSTR_CASE(EIO);
	LIBBPF_ERRSTR_CASE(EISDIR);
	LIBBPF_ERRSTR_CASE(ELOOP);
	LIBBPF_ERRSTR_CASE(EMFILE);
	LIBBPF_ERRSTR_CASE(EMLINK);
	LIBBPF_ERRSTR_CASE(EMSGSIZE);
	LIBBPF_ERRSTR_CASE(ENAMETOOLONG);
	LIBBPF_ERRSTR_CASE(ENFILE);
	LIBBPF_ERRSTR_CASE(ENODATA);
	LIBBPF_ERRSTR_CASE(ENODEV);
	LIBBPF_ERRSTR_CASE(ENOENT);
	LIBBPF_ERRSTR_CASE(ENOEXEC);
	LIBBPF_ERRSTR_CASE(ENOLINK);
	LIBBPF_ERRSTR_CASE(ENOMEM);
	LIBBPF_ERRSTR_CASE(ENOSPC);
	LIBBPF_ERRSTR_CASE(ENOTBLK);
	LIBBPF_ERRSTR_CASE(ENOTDIR);
	LIBBPF_ERRSTR_CASE(ENOTSUPP);
	LIBBPF_ERRSTR_CASE(ENOTTY);
	LIBBPF_ERRSTR_CASE(ENXIO);
	LIBBPF_ERRSTR_CASE(EOPNOTSUPP);
	LIBBPF_ERRSTR_CASE(EOVERFLOW);
	LIBBPF_ERRSTR_CASE(EPERM);
	LIBBPF_ERRSTR_CASE(EPIPE);
	LIBBPF_ERRSTR_CASE(EPROTO);
	LIBBPF_ERRSTR_CASE(EPROTONOSUPPORT);
	LIBBPF_ERRSTR_CASE(ERANGE);
	LIBBPF_ERRSTR_CASE(EROFS);
	LIBBPF_ERRSTR_CASE(ESPIPE);
	LIBBPF_ERRSTR_CASE(ESRCH);
	LIBBPF_ERRSTR_CASE(ETXTBSY);
	LIBBPF_ERRSTR_CASE(EUCLEAN);
	LIBBPF_ERRSTR_CASE(EXDEV);
	default:
		break;
	}

#undef LIBBPF_ERRSTR_CASE

	/* no symbolic name known: fall back to the numeric value */
	snprintf(numbuf, sizeof(numbuf), "%d", code);
	return numbuf;
}

View File

@@ -1,19 +0,0 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __LIBBPF_STR_ERROR_H
#define __LIBBPF_STR_ERROR_H
/* presumably the buffer size callers pass to libbpf_strerror_r() -- confirm at call sites */
#define STRERR_BUFSIZE 128
/**
 * @brief **libbpf_strerror_r()** writes a textual description of an errno
 * value into a caller-provided buffer
 * @param err numeric errno, positive or negative (the magnitude is looked up)
 * @param dst destination buffer
 * @param len size of *dst* in bytes
 * @return *dst*; if strerror_r() fails, *dst* holds a fallback message
 * instead of the error description
 */
char *libbpf_strerror_r(int err, char *dst, int len);
/**
 * @brief **libbpf_errstr()** returns string corresponding to numeric errno
 * @param err numeric errno; a positive value is negated before lookup
 * @return pointer to the symbolic name prefixed with '-' (e.g. "-EINVAL")
 * for recognized codes; for any other value, a pointer to its decimal
 * representation in a thread-local buffer, which is invalidated by the next
 * such call on the same thread.
 */
const char *libbpf_errstr(int err);
/* short alias for libbpf_errstr() */
#define errstr(err) libbpf_errstr(err)
#endif /* __LIBBPF_STR_ERROR_H */

View File

@@ -34,13 +34,32 @@ enum __bpf_usdt_arg_type {
BPF_USDT_ARG_CONST,
BPF_USDT_ARG_REG,
BPF_USDT_ARG_REG_DEREF,
BPF_USDT_ARG_SIB,
};
/*
* This struct layout is designed specifically to be backwards/forward
* compatible between libbpf versions for ARG_CONST, ARG_REG, and
* ARG_REG_DEREF modes. ARG_SIB requires libbpf v1.7+.
*/
struct __bpf_usdt_arg_spec {
/* u64 scalar interpreted depending on arg_type, see below */
__u64 val_off;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/* arg location case, see bpf_usdt_arg() for details */
enum __bpf_usdt_arg_type arg_type;
enum __bpf_usdt_arg_type arg_type: 8;
/* index register offset within struct pt_regs */
__u16 idx_reg_off: 12;
/* scale factor for index register (1, 2, 4, or 8) */
__u16 scale_bitshift: 4;
/* reserved for future use, keeps reg_off offset stable */
__u8 __reserved: 8;
#else
__u8 __reserved: 8;
__u16 idx_reg_off: 12;
__u16 scale_bitshift: 4;
enum __bpf_usdt_arg_type arg_type: 8;
#endif
/* offset of referenced register within struct pt_regs */
short reg_off;
/* whether arg should be interpreted as signed value */
@@ -149,7 +168,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
{
struct __bpf_usdt_spec *spec;
struct __bpf_usdt_arg_spec *arg_spec;
unsigned long val;
unsigned long val, idx;
int err, spec_id;
*res = 0;
@@ -202,6 +221,27 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
return err;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
val >>= arg_spec->arg_bitshift;
#endif
break;
case BPF_USDT_ARG_SIB:
/* Arg is in memory addressed by SIB (Scale-Index-Base) mode
* (e.g., "-1@-96(%rbp,%rax,8)" in USDT arg spec). We first
* fetch the base register contents and the index register
* contents from pt_regs. Then we calculate the final address
* as base + (index * scale) + offset, and do a user-space
* probe read to fetch the argument value.
*/
err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
if (err)
return err;
err = bpf_probe_read_kernel(&idx, sizeof(idx), (void *)ctx + arg_spec->idx_reg_off);
if (err)
return err;
err = bpf_probe_read_user(&val, sizeof(val), (void *)(val + (idx << arg_spec->scale_bitshift) + arg_spec->val_off));
if (err)
return err;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
val >>= arg_spec->arg_bitshift;
#endif
break;
default:

View File

@@ -20,7 +20,6 @@
#include "libbpf_common.h"
#include "libbpf_internal.h"
#include "hashmap.h"
#include "str_error.h"
/* libbpf's USDT support consists of BPF-side state/code and user-space
* state/code working together in concert. BPF-side parts are defined in
@@ -200,12 +199,23 @@ enum usdt_arg_type {
USDT_ARG_CONST,
USDT_ARG_REG,
USDT_ARG_REG_DEREF,
USDT_ARG_SIB,
};
/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */
struct usdt_arg_spec {
__u64 val_off;
enum usdt_arg_type arg_type;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
enum usdt_arg_type arg_type: 8;
__u16 idx_reg_off: 12;
__u16 scale_bitshift: 4;
__u8 __reserved: 8; /* keep reg_off offset stable */
#else
__u8 __reserved: 8; /* keep reg_off offset stable */
__u16 idx_reg_off: 12;
__u16 scale_bitshift: 4;
enum usdt_arg_type arg_type: 8;
#endif
short reg_off;
bool arg_signed;
char arg_bitshift;
@@ -570,9 +580,8 @@ static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long o
return NULL;
}
static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr,
const char *data, size_t name_off, size_t desc_off,
struct usdt_note *usdt_note);
static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off,
size_t desc_off, struct usdt_note *usdt_note);
static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie);
@@ -626,7 +635,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *
struct elf_seg *seg = NULL;
void *tmp;
err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, &note);
err = parse_usdt_note(&nhdr, data->d_buf, name_off, desc_off, &note);
if (err)
goto err_out;
@@ -1132,8 +1141,7 @@ err_out:
/* Parse out USDT ELF note from '.note.stapsdt' section.
* Logic inspired by perf's code.
*/
static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr,
const char *data, size_t name_off, size_t desc_off,
static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off,
struct usdt_note *note)
{
const char *provider, *name, *args;
@@ -1283,11 +1291,51 @@ static int calc_pt_regs_off(const char *reg_name)
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz)
{
char reg_name[16];
int len, reg_off;
long off;
char reg_name[16] = {0}, idx_reg_name[16] = {0};
int len, reg_off, idx_reg_off, scale = 1;
long off = 0;
if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) {
if (sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^,] , %d ) %n",
arg_sz, &off, reg_name, idx_reg_name, &scale, &len) == 5 ||
sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^,] , %d ) %n",
arg_sz, reg_name, idx_reg_name, &scale, &len) == 4 ||
sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^)] ) %n",
arg_sz, &off, reg_name, idx_reg_name, &len) == 4 ||
sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^)] ) %n",
arg_sz, reg_name, idx_reg_name, &len) == 3
) {
/*
* Scale Index Base case:
* 1@-96(%rbp,%rax,8)
* 1@(%rbp,%rax,8)
* 1@-96(%rbp,%rax)
* 1@(%rbp,%rax)
*/
arg->arg_type = USDT_ARG_SIB;
arg->val_off = off;
reg_off = calc_pt_regs_off(reg_name);
if (reg_off < 0)
return reg_off;
arg->reg_off = reg_off;
idx_reg_off = calc_pt_regs_off(idx_reg_name);
if (idx_reg_off < 0)
return idx_reg_off;
arg->idx_reg_off = idx_reg_off;
/* validate scale factor and set fields directly */
switch (scale) {
case 1: arg->scale_bitshift = 0; break;
case 2: arg->scale_bitshift = 1; break;
case 4: arg->scale_bitshift = 2; break;
case 8: arg->scale_bitshift = 3; break;
default:
pr_warn("usdt: invalid SIB scale %d, expected 1, 2, 4, 8\n", scale);
return -EINVAL;
}
} else if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n",
arg_sz, &off, reg_name, &len) == 3) {
/* Memory dereference case, e.g., -4@-20(%rbp) */
arg->arg_type = USDT_ARG_REG_DEREF;
arg->val_off = off;
@@ -1306,6 +1354,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec
} else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) {
/* Register read case, e.g., -4@%eax */
arg->arg_type = USDT_ARG_REG;
/* register read has no memory offset */
arg->val_off = 0;
reg_off = calc_pt_regs_off(reg_name);
@@ -1327,8 +1376,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec
#elif defined(__s390x__)
/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz)
{
unsigned int reg;