Compare commits

...

400 Commits

Author SHA1 Message Date
Andrii Nakryiko
85d9be97eb sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   6f0b824a61f212e9707ff68abcabfdfa4724b811
Checkpoint bpf-next commit: 08a7491843224f8b96518fbe70d9e48163046054
Baseline bpf commit:        1d528e794f3db5d32279123a89957c44c4406a09
Checkpoint bpf commit:      22cc16c04b7893d8fc22810599f49a305d600b9e

Donglin Peng (4):
  libbpf: Add BTF permutation support for type reordering
  libbpf: Optimize type lookup with binary search for sorted BTF
  libbpf: Verify BTF sorting
  btf: Refactor the code by calling str_is_empty

Emil Tsalapatis (2):
  libbpf: Turn relo_core->sym_off unsigned
  libbpf: Move arena globals to the end of the arena

Ihor Solodrai (1):
  bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS

Leon Hwang (2):
  bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags
  libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps

Matt Bobrowski (1):
  bpf: add new BPF_CGROUP_ITER_CHILDREN control option

Menglong Dong (2):
  bpf: add fsession support
  libbpf: add fsession support

Thomas Gleixner (1):
  treewide: Update email address

Thomas Weißschuh (1):
  vfs: use UAPI types for new struct delegation definition

Varun R Mallya (1):
  libbpf: Fix OOB read in btf_dump_get_bitfield_value

 include/uapi/linux/bpf.h        |  11 ++
 include/uapi/linux/fcntl.h      |  10 +-
 include/uapi/linux/perf_event.h |   2 +-
 src/bpf.c                       |   1 +
 src/bpf.h                       |   8 +
 src/bpf_helpers.h               |   6 +-
 src/btf.c                       | 276 +++++++++++++++++++++++++++-----
 src/btf.h                       |  42 +++++
 src/btf_dump.c                  |   9 ++
 src/libbpf.c                    |  64 +++++---
 src/libbpf.h                    |  21 +--
 src/libbpf.map                  |   1 +
 12 files changed, 369 insertions(+), 82 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2026-01-29 14:10:19 -08:00
Andrii Nakryiko
fddf93d20b sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2026-01-29 14:10:19 -08:00
Matt Bobrowski
ed6bb65cf1 bpf: add new BPF_CGROUP_ITER_CHILDREN control option
Currently, the BPF cgroup iterator supports walking descendants in
either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order
(BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive
depth-first search (DFS) of the hierarchy. In scenarios where a BPF
program may need to inspect only the direct children of a given parent
cgroup, a full DFS is unnecessarily expensive.

This patch introduces a new BPF cgroup iterator control option,
BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal
to the immediate children of a specified parent cgroup, allowing for
more targeted and efficient iteration, particularly when exhaustive
depth-first search (DFS) traversal is not required.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Menglong Dong
5ee8863eaf libbpf: add fsession support
Add BPF_TRACE_FSESSION to libbpf.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-9-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Menglong Dong
adde4f55b7 bpf: add fsession support
The fsession is something that similar to kprobe session. It allow to
attach a single BPF program to both the entry and the exit of the target
functions.

Introduce the struct bpf_fsession_link, which allows to add the link to
both the fentry and fexit progs_hlist of the trampoline.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Co-developed-by: Leon Hwang <leon.hwang@linux.dev>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Ihor Solodrai
977b1f820c bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS
Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument,
and remote bpf_stream_vprintk_impl from the kernel.

Update the selftests to use the new API with implicit argument.

bpf_stream_vprintk macro is changed to use the new bpf_stream_vprintk
kfunc, and the extern definition of bpf_stream_vprintk_impl is
replaced accordingly.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Thomas Gleixner
5d02120e10 treewide: Update email address
In a vain attempt to consolidate the email zoo switch everything to the
kernel.org account.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2026-01-29 14:10:19 -08:00
Donglin Peng
8a090ef1e5 btf: Refactor the code by calling str_is_empty
Calling the str_is_empty function to clarify the code and
no functional changes are introduced.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-12-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Donglin Peng
ad9c763445 libbpf: Verify BTF sorting
This patch checks whether the BTF is sorted by name in ascending
order. If sorted, binary search will be used when looking up types.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-6-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Donglin Peng
1c96b72cb0 libbpf: Optimize type lookup with binary search for sorted BTF
This patch introduces binary search optimization for BTF type lookups
when the BTF instance contains sorted types.

The optimization significantly improves performance when searching for
types in large BTF instances with sorted types. For unsorted BTF, the
implementation falls back to the original linear search.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-5-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Donglin Peng
b7c6c02b5f libbpf: Add BTF permutation support for type reordering
Introduce btf__permute() API to allow in-place rearrangement of BTF types.
This function reorganizes BTF type order according to a provided array of
type IDs, updating all type references to maintain consistency.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-2-dolinux.peng@gmail.com
2026-01-29 14:10:19 -08:00
Varun R Mallya
2c5038dcf4 libbpf: Fix OOB read in btf_dump_get_bitfield_value
When dumping bitfield data, btf_dump_get_bitfield_value() reads data
based on the underlying type's size (t->size). However, it does not
verify that the provided data buffer (data_sz) is large enough to
contain these bytes.

If btf_dump__dump_type_data() is called with a buffer smaller than
the type's size, this leads to an out-of-bounds read. This was
confirmed by AddressSanitizer in the linked issue.

Fix this by ensuring we do not read past the provided data_sz limit.

Fixes: a1d3cc3c5eca ("libbpf: Avoid use of __int128 in typed dump display")
Reported-by: Harrison Green <harrisonmichaelgreen@gmail.com>
Suggested-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Varun R Mallya <varunrmallya@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260106233527.163487-1-varunrmallya@gmail.com

Closes: https://github.com/libbpf/libbpf/issues/928
2026-01-29 14:10:19 -08:00
Leon Hwang
dc8673b28b libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps
Add libbpf support for the BPF_F_CPU flag for percpu maps by embedding the
cpu info into the high 32 bits of:

1. **flags**: bpf_map_lookup_elem_flags(), bpf_map__lookup_elem(),
   bpf_map_update_elem() and bpf_map__update_elem()
2. **opts->elem_flags**: bpf_map_lookup_batch() and
   bpf_map_update_batch()

And the flag can be BPF_F_ALL_CPUS, but cannot be
'BPF_F_CPU | BPF_F_ALL_CPUS'.

Behavior:

* If the flag is BPF_F_ALL_CPUS, the update is applied across all CPUs.
* If the flag is BPF_F_CPU, it updates value only to the specified CPU.
* If the flag is BPF_F_CPU, lookup value only from the specified CPU.
* lookup does not support BPF_F_ALL_CPUS.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-7-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Leon Hwang
a6d7ceaaeb bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags
Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for
following APIs:

* 'map_lookup_elem()'
* 'map_update_elem()'
* 'generic_map_lookup_batch()'
* 'generic_map_update_batch()'

And, get the correct value size for these APIs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-01-29 14:10:19 -08:00
Thomas Weißschuh
e64e125ef6 vfs: use UAPI types for new struct delegation definition
Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.

Use the fixed-width integer types provided by the UAPI headers instead.

Fixes: 1602bad16d7d ("vfs: expose delegation support to userland")
Fixes: 4be9e04ebf75 ("vfs: add needed headers for new struct delegation definition")
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20251203-uapi-fcntl-v1-1-490c67bf3425@linutronix.de
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2026-01-29 14:10:19 -08:00
Emil Tsalapatis
9dd6fda504 libbpf: Move arena globals to the end of the arena
Arena globals are currently placed at the beginning of the arena
by libbpf. This is convenient, but prevents users from reserving
guard pages in the beginning of the arena to identify NULL pointer
dereferences. Adjust the load logic to place the globals at the
end of the arena instead.

Also modify bpftool to set the arena pointer in the program's BPF
skeleton to point to the globals. Users now call bpf_map__initial_value()
to find the beginning of the arena mapping and use the arena pointer
in the skeleton to determine which part of the mapping holds the
arena globals and which part is free.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-5-emil@etsalapatis.com
2026-01-29 14:10:19 -08:00
Emil Tsalapatis
2c7fe6ec5d libbpf: Turn relo_core->sym_off unsigned
The symbols' relocation offsets in BPF are stored in an int field,
but cannot actually be negative. When in the next patch libbpf relocates
globals to the end of the arena, it is also possible to have valid
offsets > 2GiB that are used to calculate the final relo offsets.
Avoid accidentally interpreting large offsets as negative by turning
the sym_off field unsigned.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-4-emil@etsalapatis.com
2026-01-29 14:10:19 -08:00
Andrii Nakryiko
160423d498 ci: denylist flaky 'bpf_cookie/perf_event' selftest
It keeps failing. It relies on perf_events so not super reliable.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2026-01-26 13:12:58 -08:00
Andrii Nakryiko
afb8b17bc5 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   f8c67d8550ee69ce684c7015b2c8c63cda24bbfb
Checkpoint bpf-next commit: 6f0b824a61f212e9707ff68abcabfdfa4724b811
Baseline bpf commit:        e427054ae7bc8b1268cf1989381a43885795616f
Checkpoint bpf commit:      1d528e794f3db5d32279123a89957c44c4406a09

Alan Maguire (1):
  libbpf: Add debug messaging in dedup equivalence/identity matching

Amery Hung (2):
  bpf: Support associating BPF program with struct_ops
  libbpf: Add support for associating BPF program with struct_ops

Asbjørn Sloth Tønnesen (1):
  tools: ynl-gen: add regeneration comment

Heiko Carstens (1):
  tools: Remove s390 compat support

James Clark (1):
  perf: Add perf_event_attr::config4

Jeff Layton (2):
  vfs: expose delegation support to userland
  vfs: add needed headers for new struct delegation definition

Jianyun Gao (1):
  libbpf: Fix some incorrect @param descriptions in the comment of
    libbpf.h

Kuniyuki Iwashima (1):
  bpf: Introduce SK_BPF_BYPASS_PROT_MEM.

Mikhail Gavrilov (1):
  libbpf: Fix -Wdiscarded-qualifiers under C23

Paul Houssel (1):
  libbpf: Fix BTF dedup to support recursive typedef definitions

Peter Zijlstra (1):
  perf: Support deferred user unwind

Samiullah Khawaja (1):
  net: Extend NAPI threaded polling to allow kthread based busy polling

 include/uapi/linux/bpf.h        |  19 ++++++
 include/uapi/linux/fcntl.h      |  16 +++++
 include/uapi/linux/netdev.h     |   2 +
 include/uapi/linux/perf_event.h |  23 +++++++-
 src/bpf.c                       |  19 ++++++
 src/bpf.h                       |  21 +++++++
 src/btf.c                       | 100 +++++++++++++++++++++++++-------
 src/libbpf.c                    |  42 +++++++++++---
 src/libbpf.h                    |  43 ++++++++++----
 src/libbpf.map                  |   2 +
 src/usdt.c                      |   2 -
 11 files changed, 247 insertions(+), 42 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-12-16 09:52:07 -08:00
Mikhail Gavrilov
fda2bfcb7a libbpf: Fix -Wdiscarded-qualifiers under C23
glibc ≥ 2.42 (GCC 15) defaults to -std=gnu23, which promotes
-Wdiscarded-qualifiers to an error.

In C23, strstr() and strchr() return "const char *".

Change variable types to const char * where the pointers are never
modified (res, sym_sfx, next_path).

Suggested-by: Florian Weimer <fweimer@redhat.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Link: https://lore.kernel.org/r/20251206092825.1471385-1-mikhail.v.gavrilov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-12-16 09:52:07 -08:00
Amery Hung
5635185147 libbpf: Add support for associating BPF program with struct_ops
Add low-level wrapper and libbpf API for BPF_PROG_ASSOC_STRUCT_OPS
command in the bpf() syscall.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-4-ameryhung@gmail.com
2025-12-16 09:52:07 -08:00
Amery Hung
1a41b12b4f bpf: Support associating BPF program with struct_ops
Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating
a BPF program with a struct_ops map. This command takes a file
descriptor of a struct_ops map and a BPF program and set
prog->aux->st_ops_assoc to the kdata of the struct_ops map.

The command does not accept a struct_ops program nor a non-struct_ops
map. Programs of a struct_ops map is automatically associated with the
map during map update. If a program is shared between two struct_ops
maps, prog->aux->st_ops_assoc will be poisoned to indicate that the
associated struct_ops is ambiguous. The pointer, once poisoned, cannot
be reset since we have lost track of associated struct_ops. For other
program types, the associated struct_ops map, once set, cannot be
changed later. This restriction may be lifted in the future if there is
a use case.

A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve
the associated struct_ops pointer. The returned pointer, if not NULL, is
guaranteed to be valid and point to a fully updated struct_ops struct.
For struct_ops program reused in multiple struct_ops map, the return
will be NULL.

prog->aux->st_ops_assoc is protected by bumping the refcount for
non-struct_ops programs and RCU for struct_ops programs. Since it would
be inefficient to track programs associated with a struct_ops map, every
non-struct_ops program will bump the refcount of the map to make sure
st_ops_assoc stays valid. For a struct_ops program, it is protected by
RCU as map_free will wait for an RCU grace period before disassociating
the program with the map. The helper must be called in BPF program
context or RCU read-side critical section.

struct_ops implementers should note that the struct_ops returned may not
be initialized nor attached yet. The struct_ops implementer will be
responsible for tracking and checking the state of the associated
struct_ops map if the use case expects an initialized or attached
struct_ops.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com
2025-12-16 09:52:07 -08:00
Alan Maguire
8ffe064aed libbpf: Add debug messaging in dedup equivalence/identity matching
We have seen a number of issues like [1]; failures to deduplicate
key kernel data structures like task_struct.  These are often hard
to debug from pahole even with verbose output, especially when
identity/equivalence checks fail deep in a nested struct comparison.

Here we add debug messages of the form

libbpf: STRUCT 'task_struct' size=2560 vlen=194 cand_id[54222] canon_id[102820] shallow-equal but not equiv for field#23 'sched_class': 0

These will be emitted during dedup from pahole when --verbose/-V
is specified.  This greatly helps identify exactly where dedup
failures are experienced.

[1] https://lore.kernel.org/bpf/b8e8b560-bce5-414b-846d-0da6d22a9983@oracle.com/

Changes since v1:

- updated debug messages to refer to shallow-equal, added ids (Andrii)

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203191507.55565-1-alan.maguire@oracle.com
2025-12-16 09:52:07 -08:00
Asbjørn Sloth Tønnesen
7ac4e3a670 tools: ynl-gen: add regeneration comment
Add a comment on regeneration to the generated files.

The comment is placed after the YNL-GEN line[1], as to not interfere
with ynl-regen.sh's detection logic.

[1] and after the optional YNL-ARG line.

Link: https://lore.kernel.org/r/aR5m174O7pklKrMR@zx2c4.com/
Suggested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251120174429.390574-3-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-16 09:52:07 -08:00
Samiullah Khawaja
cd173d0ea3 net: Extend NAPI threaded polling to allow kthread based busy polling
Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, enable
NAPI_STATE_THREADED also.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, the
NAPI_STATE_IN_BUSY_POLL will be unset, napi_complete_done unsets the
NAPI_STATE_SCHED_THREADED bit also, which in turn will make the kthread
go to sleep.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-16 09:52:07 -08:00
Kuniyuki Iwashima
3fe0a72123 bpf: Introduce SK_BPF_BYPASS_PROT_MEM.
If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

This is easily controlled by net.core.bypass_prot_mem sysctl, but it
lacks flexibility.

Let's support flagging (and clearing) sk->sk_bypass_prot_mem via
bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook.

  int val = 1;

  bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
                 &val, sizeof(val));

As with net.core.bypass_prot_mem, this is inherited to child sockets,
and BPF always takes precedence over sysctl at socket(2) and accept(2).

SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE
and not supported on other hooks for some reasons:

  1. UDP charges memory under sk->sk_receive_queue.lock instead
     of lock_sock()

  2. Modifying the flag after skb is charged to sk requires such
     adjustment during bpf_setsockopt() and complicates the logic
     unnecessarily

We can support other hooks later if a real use case justifies that.

Most changes are inline and hard to trace, but a microbenchmark on
__sk_mem_raise_allocated() during neper/tcp_stream showed that more
samples completed faster with sk->sk_bypass_prot_mem == 1.  This will
be more visible under tcp_mem pressure (but it's not a fair comparison).

  # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; }
    kretprobe:__sk_mem_raise_allocated /@start[tid]/
    { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }'
  # tcp_stream -6 -F 1000 -N -T 256

Without bpf prog:

  [128, 256)          3846 |                                                    |
  [256, 512)       1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@     |
  [1K, 2K)          198207 |@@@@@@                                              |
  [2K, 4K)           31199 |@                                                   |

With bpf prog in the next patch:
  (must be attached before tcp_stream)
  # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create
  # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test

  [128, 256)          6413 |                                                    |
  [256, 512)       1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                      |
  [1K, 2K)          117031 |@@@@                                                |
  [2K, 4K)           11773 |                                                    |

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com
2025-12-16 09:52:07 -08:00
Jianyun Gao
f561c42074 libbpf: Fix some incorrect @param descriptions in the comment of libbpf.h
Fix up some of missing or incorrect @param descriptions for libbpf public APIs
in libbpf.h.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251118033025.11804-1-jianyungao89@gmail.com
2025-12-16 09:52:07 -08:00
Paul Houssel
370271441c libbpf: Fix BTF dedup to support recursive typedef definitions
Handle recursive typedefs in BTF deduplication

Pahole fails to encode BTF for some Go projects (e.g. Kubernetes and
Podman) due to recursive type definitions that create reference loops
not representable in C. These recursive typedefs trigger a failure in
the BTF deduplication algorithm.

This patch extends btf_dedup_ref_type() to properly handle potential
recursion for BTF_KIND_TYPEDEF, similar to how recursion is already
handled for BTF_KIND_STRUCT. This allows pahole to successfully
generate BTF for Go binaries using recursive types without impacting
existing C-based workflows.

Suggested-by: Tristan d'Audibert <tristan.daudibert@gmail.com>
Co-developed-by: Martin Horth <martin.horth@telecom-sudparis.eu>
Co-developed-by: Ouail Derghal <ouail.derghal@imt-atlantique.fr>
Co-developed-by: Guilhem Jazeron <guilhem.jazeron@inria.fr>
Co-developed-by: Ludovic Paillat <ludovic.paillat@inria.fr>
Co-developed-by: Robin Theveniaut <robin.theveniaut@irit.fr>
Signed-off-by: Martin Horth <martin.horth@telecom-sudparis.eu>
Signed-off-by: Ouail Derghal <ouail.derghal@imt-atlantique.fr>
Signed-off-by: Guilhem Jazeron <guilhem.jazeron@inria.fr>
Signed-off-by: Ludovic Paillat <ludovic.paillat@inria.fr>
Signed-off-by: Robin Theveniaut <robin.theveniaut@irit.fr>
Signed-off-by: Paul Houssel <paul.houssel@orange.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/bf00857b1e06f282aac12f6834de7396a7547ba6.1763037045.git.paul.houssel@orange.com
2025-12-16 09:52:07 -08:00
James Clark
8cc0f2c095 perf: Add perf_event_attr::config4
Arm FEAT_SPE_FDS adds the ability to filter on the data source of a
packet using another 64-bits of event filtering control. As the existing
perf_event_attr::configN fields are all used up for SPE PMU, an
additional field is needed. Add a new 'config4' field.

Reviewed-by: Leo Yan <leo.yan@arm.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Will Deacon <will@kernel.org>
2025-12-16 09:52:07 -08:00
Heiko Carstens
9905b35d8a tools: Remove s390 compat support
Remove s390 compat support from everything within tools, since s390 compat
support will be removed from the kernel.

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Weißschuh <linux@weissschuh.net> # tools/nolibc selftests/nolibc
Reviewed-by: Thomas Weißschuh <linux@weissschuh.net> # selftests/vDSO
Acked-by: Alexei Starovoitov <ast@kernel.org> # bpf bits
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
2025-12-16 09:52:07 -08:00
Peter Zijlstra
8d178bd7b6 perf: Support deferred user unwind
Add support for deferred userspace unwind to perf.

Where perf currently relies on in-place stack unwinding; from NMI
context and all that. This moves the userspace part of the unwind to
right before the return-to-userspace.

This has two distinct benefits, the biggest is that it moves the
unwind to a faultable context. It becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. And secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

After which will come a single entry, denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expect on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
2025-12-16 09:52:07 -08:00
Jeff Layton
530f40421a vfs: add needed headers for new struct delegation definition
The definition of struct delegation uses stdint.h integer types. Add the
necessary headers to ensure that always works.

Fixes: 1602bad16d7d ("vfs: expose delegation support to userland")
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-12-16 09:52:07 -08:00
Jeff Layton
4f10610ae5 vfs: expose delegation support to userland
Now that support for recallable directory delegations is available,
expose this functionality to userland with new F_SETDELEG and F_GETDELEG
commands for fcntl().

Note that this also allows userland to request a FL_DELEG type lease on
files too. Userland applications that do will get signalled when there
are metadata changes in addition to just data changes (which is a
limitation of FL_LEASE leases).

These commands accept a new "struct delegation" argument that contains a
flags field for future expansion.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-17-52f3feebb2f2@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-12-16 09:52:07 -08:00
Andrii Nakryiko
d65dbb412d sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   de7342228b7343774d6a9981c2ddbfb5e201044b
Checkpoint bpf-next commit: f8c67d8550ee69ce684c7015b2c8c63cda24bbfb
Baseline bpf commit:        4d920ed684392ae064af62957d6f5a90312dfaf6
Checkpoint bpf commit:      e427054ae7bc8b1268cf1989381a43885795616f

Alan Maguire (1):
  libbpf: Fix parsing of multi-split BTF

Andrii Nakryiko (1):
  libbpf: Fix powerpc's stack register definition in bpf_tracing.h

Anton Protopopov (4):
  libbpf: fix formatting of bpf_object__append_subprog_code
  bpf, x86: add new map type: instructions array
  libbpf: Recognize insn_array map type
  libbpf: support llvm-generated indirect jumps

Donald Hunter (1):
  docs/bpf: Add missing BPF k/uprobe program types to docs

Jianyun Gao (4):
  libbpf: Optimize the redundant code in the
    bpf_object__init_user_btf_maps() function.
  libbpf: Fix the incorrect reference to the memlock_rlim variable in
    the comment.
  libbpf: Complete the missing @param and @return tags in btf.h
  libbpf: Update the comment to remove the reference to the deprecated
    interface bpf_program__load().

Mykyta Yatsenko (2):
  bpf: widen dynptr size/offset to 64 bit
  bpf: add _impl suffix for bpf_stream_vprintk() kfunc

Xu Kuohai (1):
  bpf: Add overwrite mode for BPF ring buffer

 docs/program_types.rst   |  18 +++
 include/uapi/linux/bpf.h |  33 ++++-
 src/bpf.c                |   2 +-
 src/bpf_helpers.h        |  28 ++--
 src/bpf_tracing.h        |   2 +-
 src/btf.c                |   4 +-
 src/btf.h                |   8 ++
 src/libbpf.c             | 296 +++++++++++++++++++++++++++++++++++----
 src/libbpf_internal.h    |   2 +
 src/libbpf_probes.c      |   4 +
 src/linker.c             |   3 +
 11 files changed, 353 insertions(+), 47 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-11-07 14:00:07 -08:00
Andrii Nakryiko
befbf010d7 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-11-07 14:00:07 -08:00
Mykyta Yatsenko
a00b10df8c bpf: add _impl suffix for bpf_stream_vprintk() kfunc
Rename bpf_stream_vprintk() to bpf_stream_vprintk_impl().

This makes bpf_stream_vprintk() follow the already established "_impl"
suffix-based naming convention for kfuncs with the bpf_prog_aux
argument provided by the verifier implicitly. This convention will be
taken advantage of with the upcoming KF_IMPLICIT_ARGS feature to
preserve backwards compatibility to BPF programs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20251104-implv2-v3-2-4772b9ae0e06@meta.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-11-07 14:00:07 -08:00
Anton Protopopov
24a89cb35d libbpf: support llvm-generated indirect jumps
For v4 instruction set LLVM is allowed to generate indirect jumps for
switch statements and for 'goto *rX' assembly. Every such a jump will
be accompanied by necessary metadata, e.g. (`llvm-objdump -Sr ...`):

       0:       r2 = 0x0 ll
                0000000000000030:  R_BPF_64_64  BPF.JT.0.0

Here BPF.JT.1.0 is a symbol residing in the .jumptables section:

    Symbol table:
       4: 0000000000000000   240 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0

The -bpf-min-jump-table-entries llvm option may be used to control the
minimal size of a switch which will be converted to an indirect jumps.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-11-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Anton Protopopov
349b78117b libbpf: Recognize insn_array map type
Teach libbpf about the existence of the new instruction array map.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-4-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Anton Protopopov
9d159773c5 bpf, x86: add new map type: instructions array
On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are
translated by the verifier into "xlated" BPF programs. During this
process the original instructions offsets might be adjusted and/or
individual instructions might be replaced by new sets of instructions,
or deleted.

Add a new BPF map type which is aimed to keep track of how, for a
given program, the original instructions were relocated during the
verification. Also, besides keeping track of the original -> xlated
mapping, make x86 JIT to build the xlated -> jitted mapping for every
instruction listed in an instruction array. This is required for every
future application of instruction arrays: static keys, indirect jumps
and indirect calls.

A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with a u32
keys and value of size 8. The values have different semantics for
userspace and for BPF space. For userspace a value consists of two
u32 values – xlated and jitted offsets. For BPF side the value is
a real pointer to a jitted instruction.

On map creation/initialization, before loading the program, each
element of the map should be initialized to point to an instruction
offset within the program. Before the program load such maps should
be made frozen. After the program verification xlated and jitted
offsets can be read via the bpf(2) syscall.

If a tracked instruction is removed by the verifier, then the xlated
offset is set to (u32)-1 which is considered to be too big for a valid
BPF program offset.

One such a map can, obviously, be used to track one and only one BPF
program.  If the verification process was unsuccessful, then the same
map can be re-used to verify the program with a different log level.
However, if the program was loaded fine, then such a map, being
frozen in any case, can't be reused by other programs even after the
program release.

Example. Consider the following original and xlated programs:

    Original prog:                      Xlated prog:

     0:  r1 = 0x0                        0: r1 = 0
     1:  *(u32 *)(r10 - 0x4) = r1        1: *(u32 *)(r10 -4) = r1
     2:  r2 = r10                        2: r2 = r10
     3:  r2 += -0x4                      3: r2 += -4
     4:  r1 = 0x0 ll                     4: r1 = map[id:88]
     6:  call 0x1                        6: r1 += 272
                                         7: r0 = *(u32 *)(r2 +0)
                                         8: if r0 >= 0x1 goto pc+3
                                         9: r0 <<= 3
                                        10: r0 += r1
                                        11: goto pc+1
                                        12: r0 = 0
     7:  r6 = r0                        13: r6 = r0
     8:  if r6 == 0x0 goto +0x2         14: if r6 == 0x0 goto pc+4
     9:  call 0x76                      15: r0 = 0xffffffff8d2079c0
                                        17: r0 = *(u64 *)(r0 +0)
    10:  *(u64 *)(r6 + 0x0) = r0        18: *(u64 *)(r6 +0) = r0
    11:  r0 = 0x0                       19: r0 = 0x0
    12:  exit                           20: exit

An instruction array map, containing, e.g., instructions [0,4,7,12]
will be translated by the verifier to [0,4,13,20]. A map with
index 5 (the middle of 16-byte instruction) or indexes greater than 12
(outside the program boundaries) would be rejected.

The functionality provided by this patch will be extended in consequent
patches to implement BPF Static Keys, indirect jumps, and indirect calls.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Alan Maguire
0e14a12a1d libbpf: Fix parsing of multi-split BTF
When creating multi-split BTF we correctly set the start string offset
to be the size of the base string section plus the base BTF start
string offset; the latter is needed for multi-split BTF since the
offset is non-zero there.

Unfortunately the BTF parsing case needed that logic and it was
missed.

Fixes: 4e29128a9ace ("libbpf/btf: Fix string handling to support multi-split BTF")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251104203309.318429-2-alan.maguire@oracle.com
2025-11-07 14:00:07 -08:00
Donald Hunter
813fbe13ab docs/bpf: Add missing BPF k/uprobe program types to docs
Update the table of program types in the libbpf docs with the missing
k/uprobe multi and session program types.

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251029180932.98038-1-donald.hunter@gmail.com
2025-11-07 14:00:07 -08:00
Jianyun Gao
fd00fd999f libbpf: Update the comment to remove the reference to the deprecated interface bpf_program__load().
Commit be2f2d1680df ("libbpf: Deprecate bpf_program__load() API") marked
bpf_program__load() as deprecated starting with libbpf v0.6. And later
in commit 146bf811f5ac ("libbpf: remove most other deprecated high-level
APIs") actually removed the bpf_program__load() implementation and
related old high-level APIs.

This patch update the comment in bpf_program__set_attach_target() to
remove the reference to the deprecated interface bpf_program__load().

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251103120727.145965-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Jianyun Gao
f4b32db745 libbpf: Complete the missing @param and @return tags in btf.h
Complete the missing @param and @return tags in the Doxygen comments of
the btf.h file.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251103115836.144339-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Andrii Nakryiko
99bf90957a libbpf: Fix powerpc's stack register definition in bpf_tracing.h
retsnoop's build on powerpc (ppc64le) architecture ([0]) failed due to
wrong definition of PT_REGS_SP() macro. Looking at powerpc's
implementation of stack unwinding in perf_callchain_user_64() clearly
shows that stack pointer register is gpr[1].

Fix libbpf's definition of __PT_SP_REG for powerpc to fix all this.

  [0] https://kojipkgs.fedoraproject.org/work/tasks/1544/137921544/build.log

Fixes: 138d6153a139 ("samples/bpf: Enable powerpc support")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Naveen N Rao (AMD) <naveen@kernel.org>
Link: https://lore.kernel.org/r/20251020203643.989467-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Jianyun Gao
98b6e51fc6 libbpf: Fix the incorrect reference to the memlock_rlim variable in the comment.
The variable "memlock_rlim_max" referenced in the comment does not exist.
I think that the author probably meant the variable "memlock_rlim". So,
correct it.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251027032008.738944-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Jianyun Gao
bc3b400e06 libbpf: Optimize the redundant code in the bpf_object__init_user_btf_maps() function.
In the elf_sec_data() function, the input parameter 'scn' will be
evaluated. If it is NULL, then it will directly return NULL. Therefore,
the return value of the elf_sec_data() function already takes into
account the case where the input parameter scn is NULL. Therefore,
subsequently, the code only needs to check whether the return value of
the elf_sec_data() function is NULL.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20251024080802.642189-1-jianyungao89@gmail.com
2025-11-07 14:00:07 -08:00
Xu Kuohai
665ad8c7f7 bpf: Add overwrite mode for BPF ring buffer
When the BPF ring buffer is full, a new event cannot be recorded until one
or more old events are consumed to make enough space for it. In cases such
as fault diagnostics, where recent events are more useful than older ones,
this mechanism may lead to critical events being lost.

So add overwrite mode for BPF ring buffer to address it. In this mode, the
new event overwrites the oldest event when the buffer is full.

The basic idea is as follows:

1. producer_pos tracks the next position to record new event. When there
   is enough free space, producer_pos is simply advanced by producer to
   make space for the new event.

2. To avoid waiting for consumer when the buffer is full, a new variable,
   overwrite_pos, is introduced for producer. It points to the oldest event
   committed in the buffer. It is advanced by producer to discard one or more
   oldest events to make space for the new event when the buffer is full.

3. pending_pos tracks the oldest event to be committed. pending_pos is never
   passed by producer_pos, so multiple producers never write to the same
   position at the same time.

The following example diagrams show how it works in a 4096-byte ring buffer.

1. At first, {producer,overwrite,pending,consumer}_pos are all set to 0.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                                                                       |
   |                                                                       |
   |                                                                       |
   +-----------------------------------------------------------------------+
   ^
   |
   |
producer_pos = 0
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

2. Now reserve a 512-byte event A.

   There is enough free space, so A is allocated at offset 0. And producer_pos
   is advanced to 512, the end of A. Since A is not submitted, the BUSY bit is
   set.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                                                              |
   |   A    |                                                              |
   | [BUSY] |                                                              |
   +-----------------------------------------------------------------------+
   ^        ^
   |        |
   |        |
   |    producer_pos = 512
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

3. Reserve event B, size 1024.

   B is allocated at offset 512 with BUSY bit set, and producer_pos is advanced
   to the end of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                            |
   |   A    |        B        |                                            |
   | [BUSY] |      [BUSY]     |                                            |
   +-----------------------------------------------------------------------+
   ^                          ^
   |                          |
   |                          |
   |                   producer_pos = 1536
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

4. Reserve event C, size 2048.

   C is allocated at offset 1536, and producer_pos is advanced to 3584.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   | [BUSY] |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                                                              ^
   |                                                              |
   |                                                              |
   |                                                    producer_pos = 3584
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

5. Submit event A.

   The BUSY bit of A is cleared. B becomes the oldest event to be committed, so
   pending_pos is advanced to 512, the start of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^        ^                                                     ^
   |        |                                                     |
   |        |                                                     |
   |   pending_pos = 512                                  producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

6. Submit event B.

   The BUSY bit of B is cleared, and pending_pos is advanced to the start of C,
   which is now the oldest event to be committed.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |                 |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                          ^                                   ^
   |                          |                                   |
   |                          |                                   |
   |                     pending_pos = 1536               producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

7. Reserve event D, size 1536 (3 * 512).

   There are 2048 bytes not being written between producer_pos (currently 3584)
   and pending_pos, so D is allocated at offset 3584, and producer_pos is advanced
   by 1536 (from 3584 to 5120).

   Since event D will overwrite all bytes of event A and the first 512 bytes of
   event B, overwrite_pos is advanced to the start of event C, the oldest event
   that is not overwritten.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |      [BUSY]     |        |               [BUSY]              | [BUSY] |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   pending_pos = 1536
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |
consumer_pos = 0

8. Reserve event E, size 1024.

   Although there are 512 bytes not being written between producer_pos and
   pending_pos, E cannot be reserved, as it would overwrite the first 512
   bytes of event C, which is still being written.

9. Submit event C and D.

   pending_pos is advanced to the end of D.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |                 |        |                                   |        |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |             pending_pos=5120
   |
consumer_pos = 0

The performance data for overwrite mode will be provided in a follow-up
patch that adds overwrite-mode benchmarks.

A sample of performance data for non-overwrite mode, collected on an x86_64
CPU and an arm64 CPU, before and after this patch, is shown below. As we can
see, no obvious performance regression occurs.

- x86_64 (AMD EPYC 9654)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.623 ± 0.027M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.812 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  7.871 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.703 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.896 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.054 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.864 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.580 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.484 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.369 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.316 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.272 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.239 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.226 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.213 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.193 ± 0.001M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.845 ± 0.036M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.889 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  8.155 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.708 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.918 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.065 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.870 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.582 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.482 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.372 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.323 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.264 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.236 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.209 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.189 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.165 ± 0.002M/s (drops 0.000 ± 0.000M/s)

- arm64 (HiSilicon Kunpeng 920)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.310 ± 0.623M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.947 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.634 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  4.502 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.888 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.372 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.189 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.998 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 3.086 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.845 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.815 ± 0.008M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.771 ± 0.009M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.814 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.752 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.695 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.710 ± 0.006M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.283 ± 0.550M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.993 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.898 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  5.257 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.830 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.528 ± 0.013M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.265 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.990 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 2.929 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.898 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.818 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.789 ± 0.012M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.770 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.651 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.669 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.695 ± 0.009M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251018035738.4039621-2-xukuohai@huaweicloud.com
2025-11-07 14:00:07 -08:00
Mykyta Yatsenko
42995c95b9 bpf: widen dynptr size/offset to 64 bit
Dynptr currently caps size and offset at 24 bits, which isn’t sufficient
for file-backed use cases; even 32 bits can be limiting. Refactor dynptr
helpers/kfuncs to use 64-bit size and offset, ensuring consistency
across the APIs.

This change does not affect internals of xdp, skb or other dynptrs,
which continue to behave as before. Also it does not break binary
compatibility.

The widening enables large-file access support via dynptr, implemented
in the next patches.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Anton Protopopov
49c5e0eef4 libbpf: fix formatting of bpf_object__append_subprog_code
The commit 6c918709bd30 ("libbpf: Refactor bpf_object__reloc_code")
added the bpf_object__append_subprog_code() with incorrect indentations.
Use tabs instead. (This also makes a consequent commit better readable.)

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251019202145.3944697-14-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-07 14:00:07 -08:00
Alain Knaff
30599e72bf include: add __poll_t typedef in include/linux/types.h for Android/Termux
On Android/Termux, linux/types.h is included (indirectly) by sys/epoll.h which
depends on this definition to be present.

Signed-off-by: Alain Knaff <github@misc.lka.org.lu>
2025-11-07 12:35:07 -08:00
Andrii Nakryiko
3d451d916f ci: drop tmp.master testing of pahole
It hasn't been updated for a long time, seems abandoned.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
02b3ec9ffc ci: denylist verif_scale_pyperf600 as it now fails with newer Clang
We get "The sequence of 8193 jumps is too complex." with newer Clang.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
c7f77de09d ci: update clang to v21 for test workflow
Update Clang version used.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
2719a398b0 libbpf: fix Github's Makefile for libbpf_utils.c
Drop removed str_error.o from the list of object to build. Rename
libbpf_errno.o into libbpf_utils.o.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
7e9d669550 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   21aeabb68258ce17b91af113a768760b3a491d93
Checkpoint bpf-next commit: de7342228b7343774d6a9981c2ddbfb5e201044b
Baseline bpf commit:        27861fc720be2c39b861d8bdfb68287f54de6855
Checkpoint bpf commit:      4d920ed684392ae064af62957d6f5a90312dfaf6

Alasdair McWilliam (1):
  rtnetlink: add needed_{head,tail}room attributes

Andrii Nakryiko (5):
  libbpf: make libbpf_errno.c into more generic libbpf_utils.c
  libbpf: remove unused libbpf_strerror_r and STRERR_BUFSIZE
  libbpf: move libbpf_errstr() into libbpf_utils.c
  libbpf: move libbpf_sha256() implementation into libbpf_utils.c
  libbpf: remove linux/unaligned.h dependency for libbpf_sha256()

Christian Brauner (1):
  nsfs: support exhaustive file handles

D. Wythe (1):
  libbpf: Fix error when st-prefix_ops and ops from differ btf

Eric Biggers (2):
  libbpf: Replace AF_ALG with open coded SHA-256
  libbpf: Fix undefined behavior in {get,put}_unaligned_be32()

Hangbin Liu (1):
  bonding: add support for per-port LACP actor priority

Jakub Kicinski (1):
  uapi: wrap compiler_types.h in an ifdef instead of the implicit strip

Jiawei Zhao (2):
  libbpf: Fix USDT SIB argument handling causing unrecognized register
    error
  libbpf: Remove unused args in parse_usdt_note

KP Singh (7):
  bpf: Implement exclusive map creation
  libbpf: Implement SHA256 internal helper
  libbpf: Support exclusive map creation
  bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD
  bpf: Implement signature verification for BPF programs
  libbpf: Update light skeleton for signing
  libbpf: Embed and verify the metadata hash in the loader

Mykyta Yatsenko (1):
  bpf: bpf task work plumbing

Rong Tao (1):
  bpf: Finish constification of 1st parameter of bpf_d_path()

Tony Ambardar (1):
  libbpf: Fix missing #pragma in libbpf_utils.c

 include/uapi/linux/bpf.h     |  24 +++-
 include/uapi/linux/fcntl.h   |   1 +
 include/uapi/linux/if_link.h |   3 +
 include/uapi/linux/stddef.h  |   2 +
 src/bpf.c                    |   6 +-
 src/bpf.h                    |   5 +-
 src/bpf_gen_internal.h       |   2 +
 src/btf.c                    |   1 -
 src/btf_dump.c               |   1 -
 src/elf.c                    |   1 -
 src/features.c               |   1 -
 src/gen_loader.c             |  50 ++++++-
 src/libbpf.c                 | 108 ++++++++++++---
 src/libbpf.h                 |  25 +++-
 src/libbpf.map               |   3 +
 src/libbpf_errno.c           |  75 ----------
 src/libbpf_internal.h        |  19 +++
 src/libbpf_utils.c           | 256 +++++++++++++++++++++++++++++++++++
 src/linker.c                 |   1 -
 src/relo_core.c              |   1 -
 src/ringbuf.c                |   1 -
 src/skel_internal.h          |  76 ++++++++++-
 src/str_error.c              | 104 --------------
 src/str_error.h              |  19 ---
 src/usdt.bpf.h               |  44 +++++-
 src/usdt.c                   |  73 ++++++++--
 26 files changed, 650 insertions(+), 252 deletions(-)
 delete mode 100644 src/libbpf_errno.c
 create mode 100644 src/libbpf_utils.c
 delete mode 100644 src/str_error.c
 delete mode 100644 src/str_error.h

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
e4dc2acd35 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Eric Biggers
2b940bcde1 libbpf: Fix undefined behavior in {get,put}_unaligned_be32()
These violate aliasing rules and may be miscompiled unless
-fno-strict-aliasing is used.  Replace them with the standard memcpy()
solution.  Note that compilers know how to optimize this properly.

Fixes: 4a1c9e544b8d ("libbpf: remove linux/unaligned.h dependency for libbpf_sha256()")
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20251006012037.159295-1-ebiggers@kernel.org
2025-10-06 15:59:27 -07:00
Rong Tao
379ac32f2c bpf: Finish constification of 1st parameter of bpf_d_path()
The commit 1b8abbb12128 ("bpf...d_path(): constify path argument")
constified the first parameter of the bpf_d_path(), but failed to
update it in all places. Finish constification.

Otherwise the selftest fail to build:
.../selftests/bpf/bpf_experimental.h:222:12: error: conflicting types for 'bpf_path_d_path'
  222 | extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym;
      |            ^
.../selftests/bpf/tools/include/vmlinux.h:153922:12: note: previous declaration is here
 153922 | extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __weak __ksym;

Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument")
Signed-off-by: Rong Tao <rongtao@cestc.cn>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
Tony Ambardar
eca524d5a6 libbpf: Fix missing #pragma in libbpf_utils.c
The recent sha256 patch uses a GCC pragma to suppress compile errors for
a packed struct, but omits a needed pragma (see related link) and thus
still raises errors: (e.g. on GCC 12.3 armhf)

libbpf_utils.c:153:29: error: packed attribute causes inefficient alignment for ‘__val’ [-Werror=attributes]
  153 | struct __packed_u32 { __u32 __val; } __attribute__((packed));
      |                             ^~~~~

Resolve by adding the GCC diagnostic pragma to ignore "-Wattributes".

Link: https://lore.kernel.org/bpf/CAP-5=fXURWoZu2j6Y8xQy23i7=DfgThq3WC1RkGFBx-4moQKYQ@mail.gmail.com/

Fixes: 4a1c9e544b8d ("libbpf: remove linux/unaligned.h dependency for libbpf_sha256()")
Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
50d1b8e6b4 libbpf: remove linux/unaligned.h dependency for libbpf_sha256()
linux/unaligned.h include dependency is causing issues for libbpf's
Github mirror due to {get,put}_unaligned_be32() usage.

So get rid of it by implementing custom variants of those macros that
will work both in kernel and Github mirror repos.

Also switch round_up() to roundup(), as the former is not available in
Github mirror (and is just a subtly more specific variant of roundup()
anyways).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
84aad03545 libbpf: move libbpf_sha256() implementation into libbpf_utils.c
Move sha256 implementation out of already large and unwieldy libbpf.c
into libbpf_utils.c where we'll keep reusable helpers.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
6fcb2c1963 libbpf: move libbpf_errstr() into libbpf_utils.c
Get rid of str_err.{c,h} by moving implementation of libbpf_errstr()
into libbpf_utils.c and declarations into libbpf_internal.h.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
ce015f0184 libbpf: remove unused libbpf_strerror_r and STRERR_BUFSIZE
libbpf_strerror_r() is not exposed as public API and neither is it used
inside libbpf itself. Remove it altogether.

Same for STRERR_BUFSIZE, it's just an orphaned leftover constant which
we missed to clean up some time earlier.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
33021bb9dd libbpf: make libbpf_errno.c into more generic libbpf_utils.c
Libbpf is missing one convenient place to put common "utils"-like code
that is generic and usable from multiple places. Use libbpf_errno.c as
the base for more generic libbpf_utils.c.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251001171326.3883055-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
2025-10-06 15:59:27 -07:00
Alasdair McWilliam
97fbf1d106 rtnetlink: add needed_{head,tail}room attributes
Various network interface types make use of needed_{head,tail}room values
to efficiently reserve buffer space for additional encapsulation headers,
such as VXLAN, Geneve, IPSec, etc. However, it is not currently possible
to query these values in a generic way.

Introduce ability to query the needed_{head,tail}room values of a network
device via rtnetlink, such that applications that may wish to use these
values can do so.

For example, Cilium agent iterates over present devices based on user config
(direct routing, vxlan, geneve, wireguard etc.) and in future will configure
netkit in order to expose the needed_{head,tail}room into K8s pods. See
b9ed315d3c4c ("netkit: Allow for configuring needed_{head,tail}room").

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alasdair McWilliam <alasdair@mcwilliam.dev>
Reviewed-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://patch.msgid.link/20250917095543.14039-1-alasdair@mcwilliam.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-06 15:59:27 -07:00
Hangbin Liu
017c96d6e1 bonding: add support for per-port LACP actor priority
Introduce a new netlink attribute 'actor_port_prio' to allow setting
the LACP actor port priority on a per-slave basis. This extends the
existing bonding infrastructure to support more granular control over
LACP negotiations.

The priority value is embedded in LACPDU packets and will be used by
subsequent patches to influence aggregator selection policies.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://patch.msgid.link/20250902064501.360822-2-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-06 15:59:27 -07:00
Jakub Kicinski
1f098bc568 uapi: wrap compiler_types.h in an ifdef instead of the implicit strip
The uAPI stddef header includes compiler_types.h, a kernel-only
header, to make sure that kernel definitions of annotations
like __counted_by() take precedence.

There is a hack in scripts/headers_install.sh which strips includes
of compiler.h and compiler_types.h when installing uAPI headers.
While explicit handling makes sense for compiler.h, which is included
all over the uAPI, compiler_types.h is only included by stddef.h
(within the uAPI, obviously it's included in kernel code a lot).

Remove the stripping from scripts/headers_install.sh and wrap
the include of compiler_types.h in #ifdef __KERNEL__ instead.
This should be equivalent functionally, but is easier to understand
to a casual reader of the code. It also makes it easier to work
with kernel headers directly from under tools/

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250825201828.2370083-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-06 15:59:27 -07:00
Eric Biggers
367798a9cf libbpf: Replace AF_ALG with open coded SHA-256
Reimplement libbpf_sha256() using some basic SHA-256 C code.  This
eliminates the newly-added dependency on AF_ALG, which is a problematic
UAPI that is not supported by all kernels.

Make libbpf_sha256() return void, since it can no longer fail.  This
simplifies some callers.  Also drop the unnecessary 'sha_out_sz'
parameter.  Finally, also fix the typo in "compute_sha_udpate_offsets".

Fixes: c297fe3e9f99 ("libbpf: Implement SHA256 internal helper")
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Link: https://lore.kernel.org/r/20250928003833.138407-1-ebiggers@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
D. Wythe
f9369ca839 libbpf: Fix error when st-prefix_ops and ops from differ btf
When a module registers a struct_ops, the struct_ops type and its
corresponding map_value type ("bpf_struct_ops_") may reside in different
btf objects, here are four possible case:

+--------+---------------+-------------+---------------------------------+
|        |bpf_struct_ops_| xxx_ops     |                                 |
+--------+---------------+-------------+---------------------------------+
| case 0 | btf_vmlinux   | btf_vmlinux | be used and reg only in vmlinux |
+--------+---------------+-------------+---------------------------------+
| case 1 | btf_vmlinux   | mod_btf     | INVALID                         |
+--------+---------------+-------------+---------------------------------+
| case 2 | mod_btf       | btf_vmlinux | reg in mod but be used both in  |
|        |               |             | vmlinux and mod.                |
+--------+---------------+-------------+---------------------------------+
| case 3 | mod_btf       | mod_btf     | be used and reg only in mod     |
+--------+---------------+-------------+---------------------------------+

Currently we figure out the mod_btf by searching with the struct_ops type,
which makes it impossible to figure out the mod_btf when the struct_ops
type is in btf_vmlinux while it's corresponding map_value type is in
mod_btf (case 2).

The fix is to use the corresponding map_value type ("bpf_struct_ops_")
as the lookup anchor instead of the struct_ops type to figure out the
`btf` and `mod_btf` via find_ksym_btf_id(), and then we can locate
the kern_type_id via btf__find_by_name_kind() with the `btf` we just
obtained from find_ksym_btf_id().

With this change the lookup obtains the correct btf and mod_btf for case 2,
preserves correct behavior for other valid cases, and still fails as
expected for the invalid scenario (case 1).

Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20250926071751.108293-1-alibuda@linux.alibaba.com
2025-10-06 15:59:27 -07:00
Mykyta Yatsenko
70619ad135 bpf: bpf task work plumbing
This patch adds necessary plumbing in verifier, syscall and maps to
support handling new kfunc bpf_task_work_schedule and kernel structure
bpf_task_work. The idea is similar to how we already handle bpf_wq and
bpf_timer.
verifier changes validate calls to bpf_task_work_schedule to make sure
it is safe and expected invariants hold.
btf part is required to detect bpf_task_work structure inside map value
and store its offset, which will be used in the next patch to calculate
key and value addresses.
arraymap and hashtab changes are needed to handle freeing of the
bpf_task_work: run code needed to deinitialize it, for example cancel
task_work callback if possible.
The use of bpf_task_work and proper implementation for kfuncs are
introduced in the next patch.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
56cc32b5e3 libbpf: Embed and verify the metadata hash in the loader
To fulfill the BPF signing contract, represented as Sig(I_loader ||
H_meta), the generated trusted loader program must verify the integrity
of the metadata. This signature cryptographically binds the loader's
instructions (I_loader) to a hash of the metadata (H_meta).

The verification process is embedded directly into the loader program.
Upon execution, the loader loads the runtime hash from struct bpf_map
i.e. BPF_PSEUDO_MAP_IDX and compares this runtime hash against an
expected hash value that has been hardcoded directly by
bpf_obj__gen_loader.

The load from bpf_map can be improved by calling
BPF_OBJ_GET_INFO_BY_FD from the kernel context after BPF_OBJ_GET_INFO_BY_FD
has been updated for being called from the kernel context.

The following instructions are generated:

    ld_imm64 r1, const_ptr_to_map // insn[0].src_reg == BPF_PSEUDO_MAP_IDX
    r2 = *(u64 *)(r1 + 0);
    ld_imm64 r3, sha256_of_map_part1 // constant precomputed by
bpftool (part of H_meta)
    if r2 != r3 goto out;

    r2 = *(u64 *)(r1 + 8);
    ld_imm64 r3, sha256_of_map_part2 // (part of H_meta)
    if r2 != r3 goto out;

    r2 = *(u64 *)(r1 + 16);
    ld_imm64 r3, sha256_of_map_part3 // (part of H_meta)
    if r2 != r3 goto out;

    r2 = *(u64 *)(r1 + 24);
    ld_imm64 r3, sha256_of_map_part4 // (part of H_meta)
    if r2 != r3 goto out;
    ...

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250921160120.9711-4-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
986d033976 libbpf: Update light skeleton for signing
* The metadata map is created with as an exclusive map (with an
excl_prog_hash) This restricts map access exclusively to the signed
loader program, preventing tampering by other processes.

* The map is then frozen, making it read-only from userspace.

* BPF_OBJ_GET_INFO_BY_ID instructs the kernel to compute the hash of the
  metadata map (H') and store it in bpf_map->sha.

* The loader is then loaded with the signature which is then verified by
  the kernel.

loading signed programs prebuilt into the kernel are not currently
supported. These can supported by enabling BPF_OBJ_GET_INFO_BY_ID to be
called from the kernel.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250921160120.9711-3-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
decfae3a5d bpf: Implement signature verification for BPF programs
This patch extends the BPF_PROG_LOAD command by adding three new fields
to `union bpf_attr` in the user-space API:

  - signature: A pointer to the signature blob.
  - signature_size: The size of the signature blob.
  - keyring_id: The serial number of a loaded kernel keyring (e.g.,
    the user or session keyring) containing the trusted public keys.

When a BPF program is loaded with a signature, the kernel:

1.  Retrieves the trusted keyring using the provided `keyring_id`.
2.  Verifies the supplied signature against the BPF program's
    instruction buffer.
3.  If the signature is valid and was generated by a key in the trusted
    keyring, the program load proceeds.
4.  If no signature is provided, the load proceeds as before, allowing
    for backward compatibility. LSMs can chose to restrict unsigned
    programs and implement a security policy.
5.  If signature verification fails for any reason,
    the program is not loaded.

Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
a6bb074359 bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD
Currently only array maps are supported, but the implementation can be
extended for other maps and objects. The hash is memoized only for
exclusive and frozen maps as their content is stable until the exclusive
program modifies the map.

This is required for BPF signing, enabling a trusted loader program to
verify a map's integrity. The loader retrieves
the map's runtime hash from the kernel and compares it against an
expected hash computed at build time.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
59ed98f687 libbpf: Support exclusive map creation
Implement setters and getters that allow map to be registered as
exclusive to the specified program. The registration should be done
before the exclusive program is loaded.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-5-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
1202ada5c1 libbpf: Implement SHA256 internal helper
Use AF_ALG sockets to not have libbpf depend on OpenSSL. The helper is
used for the loader generation code to embed the metadata hash in the
loader program and also by the bpf_map__make_exclusive API to calculate
the hash of the program the map is exclusive to.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-4-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
KP Singh
8347a49c62 bpf: Implement exclusive map creation
Exclusive maps allow maps to only be accessed by program with a
program with a matching hash which is specified in the excl_prog_hash
attr.

For the signing use-case, this allows the trusted loader program
to load the map and verify the integrity

Signed-off-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-06 15:59:27 -07:00
Jiawei Zhao
fcc06c3da4 libbpf: Remove unused args in parse_usdt_note
Remove unused 'elf' and 'path' parameters from parse_usdt_note function
signature. These parameters are not referenced within the function body
and only add unnecessary complexity.

The function only requires the note header, data buffer, offsets, and
output structure to perform USDT note parsing.

Update function declaration, definition, and the single call site in
collect_usdt_targets() to match the simplified signature.

This is a safe internal cleanup as parse_usdt_note is a static function.

Signed-off-by: Jiawei Zhao <phoenix500526@163.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250904030525.1932293-1-phoenix500526@163.com
2025-10-06 15:59:27 -07:00
Jiawei Zhao
9236e137e9 libbpf: Fix USDT SIB argument handling causing unrecognized register error
On x86-64, USDT arguments can be specified using Scale-Index-Base (SIB)
addressing, e.g. "1@-96(%rbp,%rax,8)". The current USDT implementation
in libbpf cannot parse this format, causing `bpf_program__attach_usdt()`
to fail with -ENOENT (unrecognized register).

This patch fixes this by implementing the necessary changes:
- add correct handling for SIB-addressed arguments in `bpf_usdt_arg`.
- add adaptive support to `__bpf_usdt_arg_type` and
  `__bpf_usdt_arg_spec` to represent SIB addressing parameters.

Signed-off-by: Jiawei Zhao <phoenix500526@163.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250827053128.1301287-2-phoenix500526@163.com
2025-10-06 15:59:27 -07:00
Christian Brauner
2d769c3bc5 nsfs: support exhaustive file handles
Pidfd file handles are exhaustive meaning they don't require a handle on
another pidfd to pass to open_by_handle_at() so it can derive the
filesystem to decode in. Instead it can be derived from the file
handle itself. The same is possible for namespace file handles.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
9705048c0e include: implement be{32,64}_to_cpu() and cpu_to_be{32,64}() macros
libbpf is now using above macros for libbpf_sha256() implementation,
make them available in Github repo.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
6920913226 include: add BPF_JMP_REG() macro implementation
libbpf's gen_loader is now using BPF_JMP_REG(), so add it to
include/linux/filter.h.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
519d65b564 sync: fix sync script
Add missing {} for find -exec invoation.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-10-06 15:59:27 -07:00
Andrii Nakryiko
3f077472ee sync: sync stddef.h UAPI header
It contains __struct_group() macro needed for pkt_cls.h UAPI header...

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 17:04:39 -07:00
Andrii Nakryiko
0c33cc07f1 sync: sync networking UAPI headers
Some of them were outdated, again due to originally using UAPI headers
from tools/ subdirectory in Linux repo.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 15:43:18 -07:00
Andrii Nakryiko
289e4a2160 sync: add back fcnt.h and openat2.h UAPI headers
They were removed during one of the syncs because Linux repo's tools/
versions of UAPI headers were removed (as they were not needed for perf
anymore). This is no right for libbpf, so add them back. And moving
forward, we'll sync them from Linux repo original UAPIs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 15:43:18 -07:00
Andrii Nakryiko
67901a67cb sync: update sync-kernel.sh to fetch original UAPI headers
Instead of UAPI headers copies from tools/ subdir in kernel repo, fetch
all the original UAPI headers straight from include/uapi/linux location.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-26 15:43:18 -07:00
Andrii Nakryiko
5acba1722d sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   561c80369df0733ba0574882a1635287b20f9de2
Checkpoint bpf-next commit: 21aeabb68258ce17b91af113a768760b3a491d93
Baseline bpf commit:        561c80369df0733ba0574882a1635287b20f9de2
Checkpoint bpf commit:      27861fc720be2c39b861d8bdfb68287f54de6855

Cryolitia PukNgae (1):
  libbpf: Add documentation to version and error API functions

Mykyta Yatsenko (1):
  libbpf: Export bpf_object__prepare symbol

Yureka Lilian (1):
  libbpf: Fix reuse of DEVMAP

 src/libbpf.c | 10 ++++++++++
 src/libbpf.h | 27 ++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-21 13:39:29 -07:00
Andrii Nakryiko
72f3e4fd8e sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-08-21 13:39:29 -07:00
Cryolitia PukNgae
83fe1f4494 libbpf: Add documentation to version and error API functions
Add documentation for the following API functions:

- libbpf_major_version()
- libbpf_minor_version()
- libbpf_version_string()
- libbpf_strerror()

Signed-off-by: Cryolitia PukNgae <cryolitia@uniontech.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250820-libbpf-doc-1-v1-1-13841f25a134@uniontech.com
2025-08-21 13:39:29 -07:00
Mykyta Yatsenko
9f8984fba5 libbpf: Export bpf_object__prepare symbol
Add missing LIBBPF_API macro for bpf_object__prepare function to enable
its export. libbpf.map had bpf_object__prepare already listed.

Fixes: 1315c28ed809 ("libbpf: Split bpf object load into prepare/load")
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20250819215119.37795-1-mykyta.yatsenko5@gmail.com
2025-08-21 13:39:29 -07:00
Yureka Lilian
04a23358c7 libbpf: Fix reuse of DEVMAP
Previously, re-using pinned DEVMAP maps would always fail, because
get_map_info on a DEVMAP always returns flags with BPF_F_RDONLY_PROG set,
but BPF_F_RDONLY_PROG being set on a map during creation is invalid.

Thus, ignore the BPF_F_RDONLY_PROG flag in the flags returned from
get_map_info when checking for compatibility with an existing DEVMAP.

The same problem is handled in a third-party ebpf library:
- https://github.com/cilium/ebpf/issues/925
- https://github.com/cilium/ebpf/pull/930

Fixes: 0cdbb4b09a06 ("devmap: Allow map lookups from eBPF")
Signed-off-by: Yureka Lilian <yuka@yuka.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250814180113.1245565-3-yuka@yuka.dev
2025-08-21 13:39:29 -07:00
Ilya Leoshkevich
fc687b8ee9 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   42be23e8f2dcb100cb9944b2b54b6bf41aff943d
Checkpoint bpf-next commit: 561c80369df0733ba0574882a1635287b20f9de2
Baseline bpf commit:        0238c45fbbf8228f52aa4642f0cdc21c570d1dfe
Checkpoint bpf commit:      561c80369df0733ba0574882a1635287b20f9de2

Achill Gilgenast (1):
  libbpf: Avoid possible use of uninitialized mod_len

Ilya Leoshkevich (1):
  libbpf: Add the ability to suppress perf event enablement

Jason Xing (1):
  net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt

Samiullah Khawaja (2):
  Add support to set NAPI threaded for individual NAPI
  net: define an enum for the napi threaded state

 include/uapi/linux/if_xdp.h |  1 +
 include/uapi/linux/netdev.h |  6 ++++++
 src/libbpf.c                | 15 +++++++++------
 src/libbpf.h                |  4 +++-
 4 files changed, 19 insertions(+), 7 deletions(-)

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
2025-08-12 12:51:17 -07:00
Ilya Leoshkevich
a3e0234f49 sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
2025-08-12 12:51:17 -07:00
Ilya Leoshkevich
8ce18b6b73 libbpf: Add the ability to suppress perf event enablement
Automatically enabling a perf event after attaching a BPF prog to it is
not always desirable.

Add a new "dont_enable" field to struct bpf_perf_event_opts. While
introducing "enable" instead would be nicer in that it would avoid
a double negation in the implementation, it would make
DECLARE_LIBBPF_OPTS() less efficient.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Suggested-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Co-developed-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20250806162417.19666-2-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-08-12 12:51:17 -07:00
Achill Gilgenast
df60ff2a29 libbpf: Avoid possible use of uninitialized mod_len
Though mod_len is only read when mod_name != NULL and both are initialized
together, gcc15 produces a warning with -Werror=maybe-uninitialized:

libbpf.c: In function 'find_kernel_btf_id.constprop':
libbpf.c:10100:33: error: 'mod_len' may be used uninitialized [-Werror=maybe-uninitialized]
10100 |                 if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0)
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
libbpf.c:10070:21: note: 'mod_len' was declared here
10070 |         int ret, i, mod_len;
      |                     ^~~~~~~

Silence the false positive.

Signed-off-by: Achill Gilgenast <fossdd@pwned.life>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250729094611.2065713-1-fossdd@pwned.life
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-08-12 12:51:17 -07:00
Samiullah Khawaja
a547f98fbb net: define an enum for the napi threaded state
Instead of using '0' and '1' for napi threaded state use an enum with
'disabled' and 'enabled' states.

Tested:
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..7
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.napi_set_threaded
 ok 7 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Link: https://patch.msgid.link/20250723013031.2911384-4-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-08-12 12:51:17 -07:00
Samiullah Khawaja
4c853bf66f Add support to set NAPI threaded for individual NAPI
A net device has a threaded sysctl that can be used to enable threaded
NAPI polling on all of the NAPI contexts under that device. Allow
enabling threaded NAPI polling at individual NAPI level using netlink.

Extend the netlink operation `napi-set` and allow setting the threaded
attribute of a NAPI. This will enable the threaded polling on a NAPI
context.

Add a test in `nl_netdev.py` that verifies various cases of threaded
NAPI being set at NAPI and at device level.

Tested
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..7
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.napi_set_threaded
 ok 7 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250710211203.3979655-1-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-08-12 12:51:17 -07:00
Jason Xing
77af22f93d net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt
This patch provides a setsockopt method to let applications leverage to
adjust how many descs to be handled at most in one send syscall. It
mitigates the situation where the default value (32) that is too small
leads to higher frequency of triggering send syscall.

Considering the prosperity/complexity the applications have, there is no
absolutely ideal suggestion fitting all cases. So keep 32 as its default
value like before.

The patch does the following things:
- Add XDP_MAX_TX_SKB_BUDGET socket option.
- Set max_tx_budget to 32 by default in the initialization phase as a
  per-socket granular control.
- Set the range of max_tx_budget as [32, xs->tx->nentries].

The idea behind this comes out of real workloads in production. We use a
user-level stack with xsk support to accelerate sending packets and
minimize triggering syscalls. When the packets are aggregated, it's not
hard to hit the upper bound (namely, 32). The moment user-space stack
fetches the -EAGAIN error number passed from sendto(), it will loop to try
again until all the expected descs from tx ring are sent out to the driver.
Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to less frequency of
sendto() and higher throughput/PPS.

Here is what I did in production, along with some numbers as follows:
For one application I saw lately, I suggested using 128 as max_tx_budget
because I saw two limitations without changing any default configuration:
1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by
net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind
this was I counted how many descs are transmitted to the driver at one
time of sendto() based on [1] patch and then I calculated the
possibility of hitting the upper bound. Finally I chose 128 as a
suitable value because 1) it covers most of the cases, 2) a higher
number would not bring evident results. After twisting the parameters,
a stable improvement of around 4% for both PPS and throughput and less
resources consumption were found to be observed by strace -c -p xxx:
1) %time was decreased by 7.8%
2) error counter was decreased from 18367 to 572

[1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-08-12 12:51:17 -07:00
Eduard Zingerman
58dd1f58b5 docs: describe how to reproduce errors reported by oss-fuzz
Add a description for current oss-fuzz setup and write down the
commands needed to reproduce fuzzer reported errors:
- "Official way" in case exact oss-fuzz environment is necessary.
- "Simple way" for local tinkering.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
2025-07-18 17:23:16 -07:00
Eduard Zingerman
dac1ec64a3 scripts: allow skipping elfutils rebuild in build-fuzzers.sh
This simplifies local reproduction of fuzzer reported errors.
E.g. the following sequence of commands would execute much faster on a
second run:

  $ SKIP_LIBELF_REBUILD=1 scripts/build-fuzzers.sh
  $ out/bpf-object-fuzzer <path-to-test-case>

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
2025-07-18 17:23:08 -07:00
Andrii Nakryiko
9823ef295d libbpf: dump Makefile version to 1.7.0
With new development cycle comes updated Makefile.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-07-18 17:20:48 -07:00
Andrii Nakryiko
cb15da45c2 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   e860a98c8aebd8de82c0ee901acf5a759acd4570
Checkpoint bpf-next commit: 42be23e8f2dcb100cb9944b2b54b6bf41aff943d
Baseline bpf commit:        bf4807c89d8f92c47404b1e4eeeefb42259d1b50
Checkpoint bpf commit:      0238c45fbbf8228f52aa4642f0cdc21c570d1dfe

Andrii Nakryiko (2):
  libbpf: start v1.7 dev cycle
  libbpf: Fix handling of BPF arena relocations

Eduard Zingerman (1):
  libbpf: Verify that arena map exists when adding arena relocations

Matteo Croce (1):
  libbpf: Fix warning in calloc() usage

Tao Chen (1):
  bpf: Add struct bpf_token_info

 include/uapi/linux/bpf.h |  8 ++++++++
 src/libbpf.c             | 27 +++++++++++++++++++--------
 src/libbpf.map           |  3 +++
 src/libbpf_version.h     |  2 +-
 4 files changed, 31 insertions(+), 9 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-07-18 17:20:48 -07:00
Eduard Zingerman
8e59f80a93 libbpf: Verify that arena map exists when adding arena relocations
Fuzzer reported a memory access error in bpf_program__record_reloc()
that happens when:
- ".addr_space.1" section exists
- there is a relocation referencing this section
- there are no arena maps defined in BTF.

Sanity checks for maps existence are already present in
bpf_program__record_reloc(), hence this commit adds another one.

[1] https://github.com/libbpf/libbpf/actions/runs/16375110681/job/46272998064

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250718222059.281526-1-eddyz87@gmail.com
2025-07-18 17:20:48 -07:00
Andrii Nakryiko
fb66fb4948 libbpf: Fix handling of BPF arena relocations
Initial __arena global variable support implementation in libbpf
contains a bug: it remembers struct bpf_map pointer for arena, which is
used later on to process relocations. Recording this pointer is
problematic because map pointers are not stable during ELF relocation
collection phase, as an array of struct bpf_map's can be reallocated,
invalidating all the pointers. Libbpf is dealing with similar issues by
using a stable internal map index, though for BPF arena map specifically
this approach wasn't used due to an oversight.

The resulting behavior is non-deterministic issue which depends on exact
layout of ELF object file, number of actual maps, etc. We didn't hit
this until very recently, when this bug started triggering crash in BPF
CI when validating one of sched-ext BPF programs.

The fix is rather straightforward: we just follow an established pattern
of remembering map index (just like obj->kconfig_map_idx, for example)
instead of `struct bpf_map *`, and resolving index to a pointer at the
point where map information is necessary.

While at it also add debug-level message for arena-related relocation
resolution information, which we already have for all other kinds of
maps.

Fixes: 2e7ba4f8fd1f ("libbpf: Recognize __arena global variables.")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250718001009.610955-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-18 17:20:48 -07:00
Matteo Croce
a9dbcc32fd libbpf: Fix warning in calloc() usage
When compiling libbpf with some compilers, this warning is triggered:

libbpf.c: In function ‘bpf_object__gen_loader’:
libbpf.c:9209:28: error: ‘calloc’ sizes specified with ‘sizeof’ in the earlier argument and not in the later argument [-Werror=calloc-transposed-args]
 9209 |         gen = calloc(sizeof(*gen), 1);
      |                            ^
libbpf.c:9209:28: note: earlier argument should specify number of elements, later size of each element

Fix this by inverting the calloc() arguments.

Signed-off-by: Matteo Croce <teknoraver@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250717200337.49168-1-technoboy85@gmail.com
2025-07-18 17:20:48 -07:00
Tao Chen
a3dadc5a42 bpf: Add struct bpf_token_info
The 'commit 35f96de04127 ("bpf: Introduce BPF token object")' added
BPF token as a new kind of BPF kernel object. And BPF_OBJ_GET_INFO_BY_FD
already used to get BPF object info, so we can also get token info with
this cmd.
One usage scenario, when program runs failed with token, because of
the permission failure, we can report what BPF token is allowing with
this API for debugging.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Link: https://lore.kernel.org/r/20250716134654.1162635-1-chen.dylane@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-18 17:20:48 -07:00
Andrii Nakryiko
2b39ea081f libbpf: start v1.7 dev cycle
With libbpf 1.6.0 released, adjust libbpf.map and libbpf_version.h to
start v1.7 development cycles.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250716175936.2343013-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-18 17:20:48 -07:00
Andrii Nakryiko
da08818f4f sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   07ee18a0bc946b6b407942c88faed089e20f47d1
Checkpoint bpf-next commit: e860a98c8aebd8de82c0ee901acf5a759acd4570
Baseline bpf commit:        e34a79b96ab9d49ed8b605fee11099cf3efbb428
Checkpoint bpf commit:      bf4807c89d8f92c47404b1e4eeeefb42259d1b50

Eduard Zingerman (1):
  libbpf: __arg_untrusted in bpf_helpers.h

Kumar Kartikeya Dwivedi (3):
  bpf: Introduce BPF standard streams
  libbpf: Add bpf_stream_printk() macro
  libbpf: Introduce bpf_prog_stream_read() API

 include/uapi/linux/bpf.h | 24 ++++++++++++++++++++++++
 src/bpf.c                | 20 ++++++++++++++++++++
 src/bpf.h                | 21 +++++++++++++++++++++
 src/bpf_helpers.h        | 17 +++++++++++++++++
 src/libbpf.map           |  1 +
 5 files changed, 83 insertions(+)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-07-16 08:57:15 -07:00
Eduard Zingerman
5f4a34c606 libbpf: __arg_untrusted in bpf_helpers.h
Make btf_decl_tag("arg:untrusted") available for libbpf users via
macro. Makes the following usage possible:

  void foo(struct bar *p __arg_untrusted) { ... }
  void bar(struct foo *p __arg_trusted) {
    ...
    foo(p->buz->bar); // buz derefrence looses __trusted
    ...
  }

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250704230354.1323244-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-16 08:57:15 -07:00
Kumar Kartikeya Dwivedi
c2850a3840 libbpf: Introduce bpf_prog_stream_read() API
Introduce a libbpf API so that users can read data from a given BPF
stream for a BPF prog fd. For now, only the low-level syscall wrapper
is provided, we can add a bpf_program__* accessor as a follow up if
needed.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-11-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-16 08:57:15 -07:00
Kumar Kartikeya Dwivedi
9bb5c46da4 libbpf: Add bpf_stream_printk() macro
Add a convenience macro to print data to the BPF streams. BPF_STDOUT and
BPF_STDERR stream IDs in the vmlinux.h can be passed to the macro to
print to the respective streams.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-10-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-16 08:57:15 -07:00
Kumar Kartikeya Dwivedi
ae131a0b7c bpf: Introduce BPF standard streams
Add support for a stream API to the kernel and expose related kfuncs to
BPF programs. Two streams are exposed, BPF_STDOUT and BPF_STDERR. These
can be used for printing messages that can be consumed from user space,
thus it's similar in spirit to existing trace_pipe interface.

The kernel will use the BPF_STDERR stream to notify the program of any
errors encountered at runtime. BPF programs themselves may use both
streams for writing debug messages. BPF library-like code may use
BPF_STDERR to print warnings or errors on misuse at runtime.

The implementation of a stream is as follows. Everytime a message is
emitted from the kernel (directly, or through a BPF program), a record
is allocated by bump allocating from per-cpu region backed by a page
obtained using alloc_pages_nolock(). This ensures that we can allocate
memory from any context. The eventual plan is to discard this scheme in
favor of Alexei's kmalloc_nolock() [0].

This record is then locklessly inserted into a list (llist_add()) so
that the printing side doesn't require holding any locks, and works in
any context. Each stream has a maximum capacity of 4MB of text, and each
printed message is accounted against this limit.

Messages from a program are emitted using the bpf_stream_vprintk kfunc,
which takes a stream_id argument in addition to working otherwise
similar to bpf_trace_vprintk.

The bprintf buffer helpers are extracted out to be reused for printing
the string into them before copying it into the stream, so that we can
(with the defined max limit) format a string and know its true length
before performing allocations of the stream element.

For consuming elements from a stream, we expose a bpf(2) syscall command
named BPF_PROG_STREAM_READ_BY_FD, which allows reading data from the
stream of a given prog_fd into a user space buffer. The main logic is
implemented in bpf_stream_read(). The log messages are queued in
bpf_stream::log by the bpf_stream_vprintk kfunc, and then pulled and
ordered correctly in the stream backlog.

For this purpose, we hold a lock around bpf_stream_backlog_peek(), as
llist_del_first() (if we maintained a second lockless list for the
backlog) wouldn't be safe from multiple threads anyway. Then, if we
fail to find something in the backlog log, we splice out everything from
the lockless log, and place it in the backlog log, and then return the
head of the backlog. Once the full length of the element is consumed, we
will pop it and free it.

The lockless list bpf_stream::log is a LIFO stack. Elements obtained
using a llist_del_all() operation are in LIFO order, thus would break
the chronological ordering if printed directly. Hence, this batch of
messages is first reversed. Then, it is stashed into a separate list in
the stream, i.e. the backlog_log. The head of this list is the actual
message that should always be returned to the caller. All of this is
done in bpf_stream_backlog_fill().

From the kernel side, the writing into the stream will be a bit more
involved than the typical printk. First, the kernel typically may print
a collection of messages into the stream, and parallel writers into the
stream may suffer from interleaving of messages. To ensure each group of
messages is visible atomically, we can lift the advantage of using a
lockless list for pushing in messages.

To enable this, we add a bpf_stream_stage() macro, and require kernel
users to use bpf_stream_printk statements for the passed expression to
write into the stream. Underneath the macro, we have a message staging
API, where a bpf_stream_stage object on the stack accumulates the
messages being printed into a local llist_head, and then a commit
operation splices the whole batch into the stream's lockless log list.

This is especially pertinent for rqspinlock deadlock messages printed to
program streams. After this change, we see each deadlock invocation as a
non-interleaving contiguous message without any confusion on the
reader's part, improving their user experience in debugging the fault.

While programs cannot benefit from this staged stream writing API, they
could just as well hold an rqspinlock around their print statements to
serialize messages, hence this is kept kernel-internal for now.

Overall, this infrastructure provides NMI-safe any context printing of
messages to two dedicated streams.

Later patches will add support for printing splats in case of BPF arena
page faults, rqspinlock deadlocks, and cond_break timeouts, and
integration of this facility into bpftool for dumping messages to user
space.

  [0]: https://lore.kernel.org/bpf/20250501032718.65476-1-alexei.starovoitov@gmail.com

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-07-16 08:57:15 -07:00
Andrii Nakryiko
7a6e6b484d sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   4a4b84ba9e453295c746d81cb245c0c5d80050f0
Checkpoint bpf-next commit: 07ee18a0bc946b6b407942c88faed089e20f47d1
Baseline bpf commit:        a766cfbbeb3a
Checkpoint bpf commit:      e34a79b96ab9d49ed8b605fee11099cf3efbb428

Adin Scannell (1):
  libbpf: Fix possible use-after-free for externs

Yuan Chen (1):
  libbpf: Fix null pointer dereference in btf_dump__free on allocation
    failure

 src/btf_dump.c |  3 +++
 src/libbpf.c   | 10 +++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-07-07 17:21:39 -07:00
Andrii Nakryiko
dc031df06a sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-07-01 15:38:22 -07:00
Adin Scannell
b58f5a3e77 libbpf: Fix possible use-after-free for externs
The `name` field in `obj->externs` points into the BTF data at initial
open time. However, some functions may invalidate this after opening and
before loading (e.g. `bpf_map__set_value_size`), which results in
pointers into freed memory and undefined behavior.

The simplest solution is to simply `strdup` these strings, similar to
the `essent_name`, and free them at the same time.

In order to test this path, the `global_map_resize` BPF selftest is
modified slightly to ensure the presence of an extern, which causes this
test to fail prior to the fix. Given there isn't an obvious API or error
to test against, I opted to add this to the existing test as an aspect
of the resizing feature rather than duplicate the test.

Fixes: 9d0a23313b1a ("libbpf: Add capability for resizing datasec maps")
Signed-off-by: Adin Scannell <amscanne@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250625050215.2777374-1-amscanne@meta.com
2025-07-01 15:38:22 -07:00
Yuan Chen
de1d0a25a8 libbpf: Fix null pointer dereference in btf_dump__free on allocation failure
When btf_dump__new() fails to allocate memory for the internal hashmap
(btf_dump->type_names), it returns an error code. However, the cleanup
function btf_dump__free() does not check if btf_dump->type_names is NULL
before attempting to free it. This leads to a null pointer dereference
when btf_dump__free() is called on a btf_dump object.

Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion")
Signed-off-by: Yuan Chen <chenyuan@kylinos.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250618011933.11423-1-chenyuan_fl@163.com
2025-07-01 15:38:22 -07:00
Andrii Nakryiko
95a9035e8b sync: adjust sync-kernel.sh script to handle UAPI header guards better
Adjust the sync script to handle UAPI header guards with singular and
double underscore between UAPI and LINUX. Kernel seems to have a mix of
both approaches.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-06-16 11:30:04 -07:00
Andrii Nakryiko
0e2ac81b00 sync: normalize more of Linux UAPI headers
Normalize UAPI headers that had single underscore between UAPI and LINUX.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-06-16 11:30:04 -07:00
Amery Hung
fdb04dd485 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   9325d53fe9adff354b6a93fda5f38c165947da0f
Checkpoint bpf-next commit: 4a4b84ba9e453295c746d81cb245c0c5d80050f0
Baseline bpf commit:        b4432656b36e5cc1d50a1f2dc15357543add530e
Checkpoint bpf commit:      d60d09eadb7cb17690c847f1623436cd4b58c19c

Alan Maguire (1):
  libbpf/btf: Fix string handling to support multi-split BTF

Amery Hung (1):
  libbpf: Support creating and destroying qdisc

Andrii Nakryiko (1):
  libbpf: Handle unsupported mmap-based /sys/kernel/btf/vmlinux
    correctly

Blake Jones (1):
  libbpf: Add support for printing BTF character arrays as strings

Ian Rogers (1):
  perf/uapi: Fix PERF_RECORD_SAMPLE comments in
    <uapi/linux/perf_event.h>

Ingo Molnar (1):
  perf/uapi: Clean up <uapi/linux/perf_event.h> a bit

Jiawei Zhao (1):
  libbpf: Correct some typos and syntax issues in usdt doc

Lorenz Bauer (1):
  libbpf: Use mmap to parse vmlinux BTF from sysfs

Paul Chaignon (2):
  bpf: Clarify handling of mark and tstamp by redirect_peer
  bpf: Fix L4 csum update on IPv6 in CHECKSUM_COMPLETE

Saket Kumar Bhaskar (1):
  selftests/bpf: Fix bpf selftest build warning

Stanislav Fomichev (1):
  net: devmem: TCP tx netlink api

Tao Chen (2):
  bpf: Add cookie to raw_tp bpf_link_info
  bpf: Add cookie to tracing bpf_link_info

Tobias Klauser (1):
  bpf: adjust path to trace_output sample eBPF program

Yonghong Song (2):
  bpf: Implement mprog API on top of existing cgroup progs
  libbpf: Support link-based cgroup attach with options

 include/uapi/linux/bpf.h        |  18 +-
 include/uapi/linux/if_xdp.h     |   6 +-
 include/uapi/linux/netdev.h     |   1 +
 include/uapi/linux/perf_event.h | 657 ++++++++++++++++----------------
 src/bpf.c                       |  44 +++
 src/bpf.h                       |   5 +
 src/btf.c                       |  91 ++++-
 src/btf.h                       |   3 +-
 src/btf_dump.c                  |  55 ++-
 src/libbpf.c                    |  28 ++
 src/libbpf.h                    |  20 +-
 src/libbpf.map                  |   1 +
 src/netlink.c                   |  20 +-
 src/usdt.c                      |  10 +-
 14 files changed, 602 insertions(+), 357 deletions(-)

Signed-off-by: Amery Hung <ameryhung@gmail.com>
2025-06-16 08:52:44 -07:00
Amery Hung
f6284bb875 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
2025-06-16 08:52:44 -07:00
Andrii Nakryiko
7356be641d libbpf: Handle unsupported mmap-based /sys/kernel/btf/vmlinux correctly
libbpf_err_ptr() helpers are meant to return NULL and set errno, if
there is an error. But btf_parse_raw_mmap() is meant to be used
internally and is expected to return ERR_PTR() values. Because of this
mismatch, when libbpf tries to mmap /sys/kernel/btf/vmlinux, we don't
detect the error correctly with IS_ERR() check, and never fallback to
old non-mmap-based way of loading vmlinux BTF.

Fix this by using proper ERR_PTR() returns internally.

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Fixes: 3c0421c93ce4 ("libbpf: Use mmap to parse vmlinux BTF from sysfs")
Cc: Lorenz Bauer <lmb@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20250606202134.2738910-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-06-16 08:52:44 -07:00
Paul Chaignon
d3d933ac9b bpf: Fix L4 csum update on IPv6 in CHECKSUM_COMPLETE
In Cilium, we use bpf_csum_diff + bpf_l4_csum_replace to, among other
things, update the L4 checksum after reverse SNATing IPv6 packets. That
use case is however not currently supported and leads to invalid
skb->csum values in some cases. This patch adds support for IPv6 address
changes in bpf_l4_csum_update via a new flag.

When calling bpf_l4_csum_replace in Cilium, it ends up calling
inet_proto_csum_replace_by_diff:

    1:  void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
    2:                                       __wsum diff, bool pseudohdr)
    3:  {
    4:      if (skb->ip_summed != CHECKSUM_PARTIAL) {
    5:          csum_replace_by_diff(sum, diff);
    6:          if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
    7:              skb->csum = ~csum_sub(diff, skb->csum);
    8:      } else if (pseudohdr) {
    9:          *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
    10:     }
    11: }

The bug happens when we're in the CHECKSUM_COMPLETE state. We've just
updated one of the IPv6 addresses. The helper now updates the L4 header
checksum on line 5. Next, it updates skb->csum on line 7. It shouldn't.

For an IPv6 packet, the updates of the IPv6 address and of the L4
checksum will cancel each other. The checksums are set such that
computing a checksum over the packet including its checksum will result
in a sum of 0. So the same is true here when we update the L4 checksum
on line 5. We'll update it as to cancel the previous IPv6 address
update. Hence skb->csum should remain untouched in this case.

The same bug doesn't affect IPv4 packets because, in that case, three
fields are updated: the IPv4 address, the IP checksum, and the L4
checksum. The change to the IPv4 address and one of the checksums still
cancel each other in skb->csum, but we're left with one checksum update
and should therefore update skb->csum accordingly. That's exactly what
inet_proto_csum_replace_by_diff does.

This special case for IPv6 L4 checksums is also described atop
inet_proto_csum_replace16, the function we should be using in this case.

This patch introduces a new bpf_l4_csum_replace flag, BPF_F_IPV6,
to indicate that we're updating the L4 checksum of an IPv6 packet. When
the flag is set, inet_proto_csum_replace_by_diff will skip the
skb->csum update.

Fixes: 7d672345ed295 ("bpf: add generic bpf_csum_diff helper")
Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://patch.msgid.link/96a6bc3a443e6f0b21ff7b7834000e17fb549e05.1748509484.git.paul.chaignon@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-06-16 08:52:44 -07:00
Tobias Klauser
161752932b bpf: adjust path to trace_output sample eBPF program
The sample file was renamed from trace_output_kern.c to
trace_output.bpf.c in commit d4fffba4d04b ("samples/bpf: Change _kern
suffix to .bpf with syscall tracing program"). Adjust the path in the
documentation comment for bpf_perf_event_output.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Link: https://lore.kernel.org/r/20250610140756.16332-1-tklauser@distanz.ch
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-06-16 08:52:44 -07:00
Tao Chen
ae67e5966a bpf: Add cookie to tracing bpf_link_info
bpf_tramp_link includes cookie info, we can add it in bpf_link_info.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250606165818.3394397-1-chen.dylane@linux.dev
2025-06-16 08:52:44 -07:00
Yonghong Song
553676e8f5 libbpf: Support link-based cgroup attach with options
Currently libbpf supports bpf_program__attach_cgroup() with signature:
  LIBBPF_API struct bpf_link *
  bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd);

To support mprog style attachment, additionsl fields like flags,
relative_{fd,id} and expected_revision are needed.

Add a new API:
  LIBBPF_API struct bpf_link *
  bpf_program__attach_cgroup_opts(const struct bpf_program *prog, int cgroup_fd,
                                  const struct bpf_cgroup_opts *opts);
where bpf_cgroup_opts contains all above needed fields.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250606163146.2429212-1-yonghong.song@linux.dev
2025-06-16 08:52:44 -07:00
Yonghong Song
15dfc869f8 bpf: Implement mprog API on top of existing cgroup progs
Current cgroup prog ordering is appending at attachment time. This is not
ideal. In some cases, users want specific ordering at a particular cgroup
level. To address this, the existing mprog API seems an ideal solution with
supporting BPF_F_BEFORE and BPF_F_AFTER flags.

But there are a few obstacles to directly use kernel mprog interface.
Currently cgroup bpf progs already support prog attach/detach/replace
and link-based attach/detach/replace. For example, in struct
bpf_prog_array_item, the cgroup_storage field needs to be together
with bpf prog. But the mprog API struct bpf_mprog_fp only has bpf_prog
as the member, which makes it difficult to use kernel mprog interface.

In another case, the current cgroup prog detach tries to use the
same flag as in attach. This is different from mprog kernel interface
which uses flags passed from user space.

So to avoid modifying existing behavior, I made the following changes to
support mprog API for cgroup progs:
 - The support is for prog list at cgroup level. Cross-level prog list
   (a.k.a. effective prog list) is not supported.
 - Previously, BPF_F_PREORDER is supported only for prog attach, now
   BPF_F_PREORDER is also supported by link-based attach.
 - For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID/BPF_F_LINK is supported
   similar to kernel mprog but with different implementation.
 - For detach and replace, use the existing implementation.
 - For attach, detach and replace, the revision for a particular prog
   list, associated with a particular attach type, will be updated
   by increasing count by 1.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250606163141.2428937-1-yonghong.song@linux.dev
2025-06-16 08:52:44 -07:00
Blake Jones
439433a909 libbpf: Add support for printing BTF character arrays as strings
The BTF dumper code currently displays arrays of characters as just that -
arrays, with each character formatted individually. Sometimes this is what
makes sense, but it's nice to be able to treat that array as a string.

This change adds a special case to the btf_dump functionality to allow
0-terminated arrays of single-byte integer values to be printed as
character strings. Characters for which isprint() returns false are
printed as hex-escaped values. This is enabled when the new ".emit_strings"
is set to 1 in the btf_dump_type_data_opts structure.

As an example, here's what it looks like to dump the string "hello" using
a few different field values for btf_dump_type_data_opts (.compact = 1):

- .emit_strings = 0, .skip_names = 0:  (char[6])['h','e','l','l','o',]
- .emit_strings = 0, .skip_names = 1:  ['h','e','l','l','o',]
- .emit_strings = 1, .skip_names = 0:  (char[6])"hello"
- .emit_strings = 1, .skip_names = 1:  "hello"

Here's the string "h\xff", dumped with .compact = 1 and .skip_names = 1:

- .emit_strings = 0:  ['h',-1,]
- .emit_strings = 1:  "h\xff"

Signed-off-by: Blake Jones <blakejones@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250603203701.520541-1-blakejones@google.com
2025-06-16 08:52:44 -07:00
Jiawei Zhao
224ea3ec50 libbpf: Correct some typos and syntax issues in usdt doc
Fix some incorrect words, such as "and" -> "an", "it's" -> "its".  Fix
some grammar issues, such as removing redundant "will", "would
complicated" -> "would complicate".

Signed-off-by: Jiawei Zhao <Phoenix500526@163.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250531095111.57824-1-Phoenix500526@163.com
2025-06-16 08:52:44 -07:00
Tao Chen
b73864fc10 bpf: Add cookie to raw_tp bpf_link_info
After commit 68ca5d4eebb8 ("bpf: support BPF cookie in raw tracepoint
(raw_tp, tp_btf) programs"), we can show the cookie in bpf_link_info
like kprobe etc.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20250603154309.3063644-1-chen.dylane@linux.dev
2025-06-16 08:52:44 -07:00
Lorenz Bauer
01dd871a20 libbpf: Use mmap to parse vmlinux BTF from sysfs
Teach libbpf to use mmap when parsing vmlinux BTF from /sys. We don't
apply this to fall-back paths on the regular file system because there
is no way to ensure that modifications underlying the MAP_PRIVATE
mapping are not visible to the process.

Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250520-vmlinux-mmap-v5-3-e8c941acc414@isovalent.com
2025-06-16 08:52:44 -07:00
Alan Maguire
8df8d67f63 libbpf/btf: Fix string handling to support multi-split BTF
libbpf handling of split BTF has been written largely with the
assumption that multiple splits are possible, i.e. split BTF on top of
split BTF on top of base BTF.  One area where this does not quite work
is string handling in split BTF; the start string offset should be the
base BTF string section length + the base BTF string offset.  This
worked in the past because for a single split BTF with base the start
string offset was always 0.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250519165935.261614-2-alan.maguire@oracle.com
2025-06-16 08:52:44 -07:00
Stanislav Fomichev
31bb8f7936 net: devmem: TCP tx netlink api
Add bind-tx netlink call to attach dmabuf for TX; queue is not
required, only ifindex and dmabuf fd for attachment.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20250508004830.4100853-4-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-06-16 08:51:34 -07:00
Amery Hung
f580871b42 libbpf: Support creating and destroying qdisc
Extend struct bpf_tc_hook with handle, qdisc name and a new attach type,
BPF_TC_QDISC, to allow users to add or remove any qdisc specified in
addition to clsact.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20250409214606.2000194-8-ameryhung@gmail.com
2025-06-16 08:51:34 -07:00
Ingo Molnar
52b9b38a22 perf/uapi: Clean up <uapi/linux/perf_event.h> a bit
When applying a recent commit to the <uapi/linux/perf_event.h>
header I noticed that we have accumulated quite a bit of
historic noise in this header, so do a bit of spring cleaning:

 - Define bitfields in a vertically aligned fashion, like
   perf_event_mmap_page::capabilities already does. This
   makes it easier to see the distribution and sizing of
   bits within a word, at a glance. The following is much
   more readable:

			__u64	cap_bit0		: 1,
				cap_bit0_is_deprecated	: 1,
				cap_user_rdpmc		: 1,
				cap_user_time		: 1,
				cap_user_time_zero	: 1,
				cap_user_time_short	: 1,
				cap_____res		: 58;

   Than:

			__u64	cap_bit0:1,
				cap_bit0_is_deprecated:1,
				cap_user_rdpmc:1,
				cap_user_time:1,
				cap_user_time_zero:1,
				cap_user_time_short:1,
				cap_____res:58;

   So convert all bitfield definitions from the latter style to the
   former style.

 - Fix typos and grammar

 - Fix capitalization

 - Remove whitespace noise

 - Harmonize the definitions of various generations and groups of
   PERF_MEM_ ABI values.

 - Vertically align all definitions and assignments to the same
   column (48), as the first definition (enum perf_type_id),
   throughout the entire header.

 - And in general make the code and comments to be more in sync
   with each other and to be more readable overall.

No change in functionality.

Copy the changes over to tools/include/uapi/linux/perf_event.h.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20250521221529.2547099-1-irogers@google.com
2025-06-16 08:51:34 -07:00
Ian Rogers
bae6a269ca perf/uapi: Fix PERF_RECORD_SAMPLE comments in <uapi/linux/perf_event.h>
AAUX data for PERF_SAMPLE_AUX appears last. PERF_SAMPLE_CGROUP is
missing from the comment.

This makes the <uapi/linux/perf_event.h> comment match that in the
perf_event_open man page.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-perf-users@vger.kernel.org
Link: https://lore.kernel.org/r/20250521221529.2547099-1-irogers@google.com
2025-06-16 08:51:34 -07:00
Paul Chaignon
9b06cd15e0 bpf: Clarify handling of mark and tstamp by redirect_peer
When switching network namespaces with the bpf_redirect_peer helper, the
skb->mark and skb->tstamp fields are not zeroed out like they can be on
a typical netns switch. This patch clarifies that in the helper
description.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/ccc86af26d43c5c0b776bcba2601b7479c0d46d0.1746460653.git.paul.chaignon@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-06-16 08:51:34 -07:00
Andrii Nakryiko
346532d711 sync: one-time strip out _UAPI prefix from UAPI header guards
Normalize already synced UAPI headers. For all subsequent syncs this
will be done automatically by the sync script.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-06-13 13:18:23 -07:00
Andrii Nakryiko
15c5317b6c sync: update sync script to strip _UAPI prefix in UAPI headers
It's expected that kernel UAPI headers have #ifndef guards starting with
__LINUX prefix, while in the kernel source code these guards are
actually starting with _UAPI__LINUX. The stripping of _UAPI prefix is
done (among other things) by kernel's scripts/headers_install.sh script.

Given libbpf vendors its own UAPI header under include/uapi subdir, and
those "internal" UAPI headers are sometimes used by libbpf users for
convenience, let's stick to the __LINUX prefix rule and do that during
the sync.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-06-13 13:18:23 -07:00
Mykyta Yatsenko
02bdeb7a2c sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   8e64c387c942229c551d0f23de4d9993d3a2acb6
Checkpoint bpf-next commit: 9325d53fe9adff354b6a93fda5f38c165947da0f
Baseline bpf commit:        b4432656b36e5cc1d50a1f2dc15357543add530e
Checkpoint bpf commit:      b4432656b36e5cc1d50a1f2dc15357543add530e

Andrii Nakryiko (1):
  libbpf: Improve BTF dedup handling of "identical" BTF types

Anton Protopopov (3):
  libbpf: Use proper errno value in linker
  bpf: Fix uninitialized values in BPF_{CORE,PROBE}_READ
  libbpf: Use proper errno value in nlattr

Jiri Olsa (1):
  bpf: Add support to retrieve ref_ctr_offset for uprobe perf link

Mykyta Yatsenko (1):
  libbpf: Check bpf_map_skeleton link for NULL

 include/uapi/linux/bpf.h |   1 +
 src/bpf_core_read.h      |   6 ++
 src/btf.c                | 137 +++++++++++++++++++++++++--------------
 src/libbpf.c             |   6 ++
 src/linker.c             |   4 +-
 src/nlattr.c             |  15 ++---
 6 files changed, 111 insertions(+), 58 deletions(-)

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
2025-05-19 10:07:42 -07:00
Mykyta Yatsenko
453601a65a libbpf: Check bpf_map_skeleton link for NULL
Avoid dereferencing bpf_map_skeleton's link field if it's NULL.
If BPF map skeleton is created with the size, that indicates containing
link field, but the field was not actually initialized with valid
bpf_link pointer, libbpf crashes. This may happen when using libbpf-rs
skeleton.
Skeleton loading may still progress, but user needs to attach struct_ops
map separately.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250514113220.219095-1-mykyta.yatsenko5@gmail.com
2025-05-19 10:07:42 -07:00
Anton Protopopov
8e34ca4e8f libbpf: Use proper errno value in nlattr
Return value of the validate_nla() function can be propagated all the
way up to users of libbpf API. In case of error this libbpf version
of validate_nla returns -1 which will be seen as -EPERM from user's
point of view. Instead, return a more reasonable -EINVAL.

Fixes: bbf48c18ee0c ("libbpf: add error reporting in XDP")
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250510182011.2246631-1-a.s.protopopov@gmail.com
2025-05-19 10:07:42 -07:00
Jiri Olsa
eda0e4ca46 bpf: Add support to retrieve ref_ctr_offset for uprobe perf link
Adding support to retrieve ref_ctr_offset for uprobe perf link,
which got somehow omitted from the initial uprobe link info changes.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/bpf/20250509153539.779599-2-jolsa@kernel.org
2025-05-19 10:07:42 -07:00
Andrii Nakryiko
5ee9fbf7d7 libbpf: Improve BTF dedup handling of "identical" BTF types
BTF dedup has a strong assumption that compiler with deduplicate identical
types within any given compilation unit (i.e., .c file). This property
is used when establishing equilvalence of two subgraphs of types.

Unfortunately, this property doesn't always holds in practice. We've
seen cases of having truly identical structs, unions, array definitions,
and, most recently, even pointers to the same type being duplicated
within CU.

Previously, we mitigated this on a case-by-case basis, adding a few
simple heuristics for validating that two BTF types (having two
different type IDs) are structurally the same. But this approach scales
poorly, and we can have more weird cases come up in the future.

So let's take a half-step back, and implement a bit more generic
structural equivalence check, recursively. We still limit it to
reasonable depth to avoid long reference loops. Depth-wise limiting of
potentially cyclical graph isn't great, but as I mentioned below doesn't
seem to be detrimental performance-wise. We can always improve this in
the future with per-type visited markers, if necessary.

Performance-wise this doesn't seem too affect vmlinux BTF dedup, which
makes sense because this logic kicks in not so frequently and only if we
already established a canonical candidate type match, but suddenly find
a different (but probably identical) type.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/r/20250501235231.1339822-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-05-19 10:07:42 -07:00
Anton Protopopov
d88ca95133 bpf: Fix uninitialized values in BPF_{CORE,PROBE}_READ
With the latest LLVM bpf selftests build will fail with
the following error message:

    progs/profiler.inc.h:710:31: error: default initialization of an object of type 'typeof ((parent_task)->real_cred->uid.val)' (aka 'const unsigned int') leaves the object uninitialized and is incompatible with C++ [-Werror,-Wdefault-const-init-unsafe]
      710 |         proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val);
          |                                      ^
    tools/testing/selftests/bpf/tools/include/bpf/bpf_core_read.h:520:35: note: expanded from macro 'BPF_CORE_READ'
      520 |         ___type((src), a, ##__VA_ARGS__) __r;                               \
          |                                          ^

This happens because BPF_CORE_READ (and other macro) declare the
variable __r using the ___type macro which can inherit const modifier
from intermediate types.

Fix this by using __typeof_unqual__, when supported. (And when it
is not supported, the problem shouldn't appear, as older compilers
haven't complained.)

Fixes: 792001f4f7aa ("libbpf: Add user-space variants of BPF_CORE_READ() family of macros")
Fixes: a4b09a9ef945 ("libbpf: Add non-CO-RE variants of BPF_CORE_READ() macro family")
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250502193031.3522715-1-a.s.protopopov@gmail.com
2025-05-19 10:07:42 -07:00
Anton Protopopov
28deee2663 libbpf: Use proper errno value in linker
Return values of the linker_append_sec_data() and the
linker_append_elf_relos() functions are propagated all the
way up to users of libbpf API. In some error cases these
functions return -1 which will be seen as -EPERM from user's
point of view. Instead, return a more reasonable -EINVAL.

Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs")
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250430120820.2262053-1-a.s.protopopov@gmail.com
2025-05-19 10:07:42 -07:00
Andrii Nakryiko
374f7807e1 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   25601e85441dd91cf7973b002f27af4c5b8691ea
Checkpoint bpf-next commit: 8e64c387c942229c551d0f23de4d9993d3a2acb6
Baseline bpf commit:        3f8ad18f8184
Checkpoint bpf commit:      b4432656b36e5cc1d50a1f2dc15357543add530e

Alan Maguire (1):
  libbpf: Add identical pointer detection to btf_dedup_is_equiv()

Anton Protopopov (2):
  bpf: Fix a comment describing bpf_attr
  libbpf: Add likely/unlikely macros and use them in selftests

Carlos Llamas (1):
  libbpf: Fix implicit memfd_create() for bionic

Feng Yang (1):
  libbpf: Fix event name too long error

Ihor Solodrai (1):
  libbpf: Verify section type in btf_find_elf_sections

Jonathan Wiepert (1):
  Use thread-safe function pointer in libbpf_print

Mykyta Yatsenko (1):
  libbpf: Add getters for BTF.ext func and line info

Paul Chaignon (2):
  bpf: Clarify role of BPF_F_RECOMPUTE_CSUM
  bpf: Clarify the meaning of BPF_F_PSEUDO_HDR

Tao Chen (1):
  libbpf: Remove sample_period init in perf_buffer

Viktor Malik (1):
  libbpf: Fix buffer overflow in bpf_object__init_prog

 include/uapi/linux/bpf.h | 18 +++++----
 src/bpf_helpers.h        |  8 ++++
 src/btf.c                | 22 +++++++++++
 src/libbpf.c             | 81 +++++++++++++++++++++-------------------
 src/libbpf.h             |  6 +++
 src/libbpf.map           |  4 ++
 src/libbpf_internal.h    |  9 +++++
 src/linker.c             |  2 +-
 8 files changed, 103 insertions(+), 47 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-04-29 11:33:37 -07:00
Andrii Nakryiko
27dc274f68 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-04-29 11:33:37 -07:00
Alan Maguire
c8a28812fb libbpf: Add identical pointer detection to btf_dedup_is_equiv()
Recently as a side-effect of

commit ac053946f5c4 ("compiler.h: introduce TYPEOF_UNQUAL() macro")

issues were observed in deduplication between modules and kernel BTF
such that a large number of kernel types were not deduplicated so
were found in module BTF (task_struct, bpf_prog etc).  The root cause
appeared to be a failure to dedup struct types, specifically those
with members that were pointers with __percpu annotations.

The issue in dedup is at the point that we are deduplicating structures,
we have not yet deduplicated reference types like pointers.  If multiple
copies of a pointer point at the same (deduplicated) integer as in this
case, we do not see them as identical.  Special handling already exists
to deal with structures and arrays, so add pointer handling here too.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250429161042.2069678-1-alan.maguire@oracle.com
2025-04-29 11:33:37 -07:00
Jonathan Wiepert
88ae865423 Use thread-safe function pointer in libbpf_print
This patch fixes a thread safety bug where libbpf_print uses the
global variable storing the print function pointer rather than the local
variable that had the print function set via __atomic_load_n.

Fixes: f1cb927cdb62 ("libbpf: Ensure print callback usage is thread-safe")
Signed-off-by: Jonathan Wiepert <jonathan.wiepert@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
Link: https://lore.kernel.org/bpf/20250424221457.793068-1-jonathan.wiepert@gmail.com
2025-04-29 11:33:37 -07:00
Tao Chen
a2dc135196 libbpf: Remove sample_period init in perf_buffer
It seems that sample_period is not used in perf buffer. Actually, only
wakeup_events are meaningful to enable events aggregation for wakeup notification.
Remove sample_period setting code to avoid confusion.

Fixes: fb84b8224655 ("libbpf: add perf buffer API")
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/bpf/20250423163901.2983689-1-chen.dylane@linux.dev
2025-04-29 11:33:37 -07:00
Feng Yang
715808d3e2 libbpf: Fix event name too long error
When the binary path is excessively long, the generated probe_name in libbpf
exceeds the kernel's MAX_EVENT_NAME_LEN limit (64 bytes).
This causes legacy uprobe event attachment to fail with error code -22.

The fix reorders the fields to place the unique ID before the name.
This ensures that even if truncation occurs via snprintf, the unique ID
remains intact, preserving event name uniqueness. Additionally, explicit
checks with MAX_EVENT_NAME_LEN are added to enforce length constraints.

Before Fix:
	./test_progs -t attach_probe/kprobe-long_name
	......
	libbpf: failed to add legacy kprobe event for 'bpf_testmod_looooooooooooooooooooooooooooooong_name+0x0': -EINVAL
	libbpf: prog 'handle_kprobe': failed to create kprobe 'bpf_testmod_looooooooooooooooooooooooooooooong_name+0x0' perf event: -EINVAL
	test_attach_kprobe_long_event_name:FAIL:attach_kprobe_long_event_name unexpected error: -22
	test_attach_probe:PASS:uprobe_ref_ctr_cleanup 0 nsec
	#13/11   attach_probe/kprobe-long_name:FAIL
	#13      attach_probe:FAIL

	./test_progs -t attach_probe/uprobe-long_name
	......
	libbpf: failed to add legacy uprobe event for /root/linux-bpf/bpf-next/tools/testing/selftests/bpf/test_progs:0x13efd9: -EINVAL
	libbpf: prog 'handle_uprobe': failed to create uprobe '/root/linux-bpf/bpf-next/tools/testing/selftests/bpf/test_progs:0x13efd9' perf event: -EINVAL
	test_attach_uprobe_long_event_name:FAIL:attach_uprobe_long_event_name unexpected error: -22
	#13/10   attach_probe/uprobe-long_name:FAIL
	#13      attach_probe:FAIL
After Fix:
	./test_progs -t attach_probe/uprobe-long_name
	#13/10   attach_probe/uprobe-long_name:OK
	#13      attach_probe:OK
	Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED

	./test_progs -t attach_probe/kprobe-long_name
	#13/11   attach_probe/kprobe-long_name:OK
	#13      attach_probe:OK
	Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED

Fixes: 46ed5fc33db9 ("libbpf: Refactor and simplify legacy kprobe code")
Fixes: cc10623c6810 ("libbpf: Add legacy uprobe attaching support")
Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
Signed-off-by: Feng Yang <yangfeng@kylinos.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250417014848.59321-2-yangfeng59949@163.com
2025-04-29 11:33:37 -07:00
Ihor Solodrai
02bc656f90 libbpf: Verify section type in btf_find_elf_sections
A valid ELF file may contain a SHT_NOBITS .BTF section. This case is
not handled correctly in btf_parse_elf, which leads to a segfault.

Before attempting to load BTF section data, check that the section
type is SHT_PROGBITS, which is the expected type for BTF data.  Fail
with an error if the type is different.

Bug report: https://github.com/libbpf/libbpf/issues/894
v1: https://lore.kernel.org/bpf/20250408184104.3962949-1-ihor.solodrai@linux.dev/

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250410182823.1591681-1-ihor.solodrai@linux.dev
2025-04-29 11:33:37 -07:00
Viktor Malik
806b4e0a9f libbpf: Fix buffer overflow in bpf_object__init_prog
As shown in [1], it is possible to corrupt a BPF ELF file such that
arbitrary BPF instructions are loaded by libbpf. This can be done by
setting a symbol (BPF program) section offset to a large (unsigned)
number such that <section start + symbol offset> overflows and points
before the section data in the memory.

Consider the situation below where:
- prog_start = sec_start + symbol_offset    <-- size_t overflow here
- prog_end   = prog_start + prog_size

    prog_start        sec_start        prog_end        sec_end
        |                |                 |              |
        v                v                 v              v
    .....................|################################|............

The report in [1] also provides a corrupted BPF ELF which can be used as
a reproducer:

    $ readelf -S crash
    Section Headers:
      [Nr] Name              Type             Address           Offset
           Size              EntSize          Flags  Link  Info  Align
    ...
      [ 2] uretprobe.mu[...] PROGBITS         0000000000000000  00000040
           0000000000000068  0000000000000000  AX       0     0     8

    $ readelf -s crash
    Symbol table '.symtab' contains 8 entries:
       Num:    Value          Size Type    Bind   Vis      Ndx Name
    ...
         6: ffffffffffffffb8   104 FUNC    GLOBAL DEFAULT    2 handle_tp

Here, the handle_tp prog has section offset ffffffffffffffb8, i.e. will
point before the actual memory where section 2 is allocated.

This is also reported by AddressSanitizer:

    =================================================================
    ==1232==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x7c7302fe0000 at pc 0x7fc3046e4b77 bp 0x7ffe64677cd0 sp 0x7ffe64677490
    READ of size 104 at 0x7c7302fe0000 thread T0
        #0 0x7fc3046e4b76 in memcpy (/lib64/libasan.so.8+0xe4b76)
        #1 0x00000040df3e in bpf_object__init_prog /src/libbpf/src/libbpf.c:856
        #2 0x00000040df3e in bpf_object__add_programs /src/libbpf/src/libbpf.c:928
        #3 0x00000040df3e in bpf_object__elf_collect /src/libbpf/src/libbpf.c:3930
        #4 0x00000040df3e in bpf_object_open /src/libbpf/src/libbpf.c:8067
        #5 0x00000040f176 in bpf_object__open_file /src/libbpf/src/libbpf.c:8090
        #6 0x000000400c16 in main /poc/poc.c:8
        #7 0x7fc3043d25b4 in __libc_start_call_main (/lib64/libc.so.6+0x35b4)
        #8 0x7fc3043d2667 in __libc_start_main@@GLIBC_2.34 (/lib64/libc.so.6+0x3667)
        #9 0x000000400b34 in _start (/poc/poc+0x400b34)

    0x7c7302fe0000 is located 64 bytes before 104-byte region [0x7c7302fe0040,0x7c7302fe00a8)
    allocated by thread T0 here:
        #0 0x7fc3046e716b in malloc (/lib64/libasan.so.8+0xe716b)
        #1 0x7fc3045ee600 in __libelf_set_rawdata_wrlock (/lib64/libelf.so.1+0xb600)
        #2 0x7fc3045ef018 in __elf_getdata_rdlock (/lib64/libelf.so.1+0xc018)
        #3 0x00000040642f in elf_sec_data /src/libbpf/src/libbpf.c:3740

The problem here is that currently, libbpf only checks that the program
end is within the section bounds. There used to be a check
`while (sec_off < sec_sz)` in bpf_object__add_programs, however, it was
removed by commit 6245947c1b3c ("libbpf: Allow gaps in BPF program
sections to support overriden weak functions").

Add a check for detecting the overflow of `sec_off + prog_sz` to
bpf_object__init_prog to fix this issue.

[1] https://github.com/lmarch2/poc/blob/main/libbpf/libbpf.md

Fixes: 6245947c1b3c ("libbpf: Allow gaps in BPF program sections to support overriden weak functions")
Reported-by: lmarch2 <2524158037@qq.com>
Signed-off-by: Viktor Malik <vmalik@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://github.com/lmarch2/poc/blob/main/libbpf/libbpf.md
Link: https://lore.kernel.org/bpf/20250415155014.397603-1-vmalik@redhat.com
2025-04-29 11:33:37 -07:00
Paul Chaignon
0c85f5a154 bpf: Clarify the meaning of BPF_F_PSEUDO_HDR
In the bpf_l4_csum_replace helper, the BPF_F_PSEUDO_HDR flag should only
be set if the modified header field is part of the pseudo-header.

If you modify for example the UDP ports and pass BPF_F_PSEUDO_HDR,
inet_proto_csum_replace4 will update skb->csum even though it shouldn't
(the port and the UDP checksum updates null each other).

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Link: https://lore.kernel.org/r/5126ef84ba75425b689482cbc98bffe75e5d8ab0.1744102490.git.paul.chaignon@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-04-29 11:33:37 -07:00
Paul Chaignon
7de6a44a0f bpf: Clarify role of BPF_F_RECOMPUTE_CSUM
BPF_F_RECOMPUTE_CSUM doesn't update the actual L3 and L4 checksums in
the packet, but simply updates skb->csum (according to skb->ip_summed).
This patch clarifies that to avoid confusions.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Link: https://lore.kernel.org/r/ff6895d42936f03dbb82334d8bcfd50e00c79086.1744102490.git.paul.chaignon@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-04-29 11:33:37 -07:00
Mykyta Yatsenko
abdb15bedd libbpf: Add getters for BTF.ext func and line info
Introducing new libbpf API getters for BTF.ext func and line info,
namely:
  bpf_program__func_info
  bpf_program__func_info_cnt
  bpf_program__line_info
  bpf_program__line_info_cnt

This change enables scenarios, when user needs to load bpf_program
directly using `bpf_prog_load`, instead of higher-level
`bpf_object__load`. Line and func info are required for checking BTF
info in verifier; verification may fail without these fields if, for
example, program calls `bpf_obj_new`.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250408234417.452565-2-mykyta.yatsenko5@gmail.com
2025-04-29 11:33:37 -07:00
Anton Protopopov
7a1388d55f libbpf: Add likely/unlikely macros and use them in selftests
A few selftests and, more importantly, consequent changes to the
bpf_helpers.h file, use likely/unlikely macros, so define them here
and remove duplicate definitions from existing selftests.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250331203618.1973691-3-a.s.protopopov@gmail.com
2025-04-29 11:33:37 -07:00
Anton Protopopov
4687560af9 bpf: Fix a comment describing bpf_attr
The map_fd field of the bpf_attr union is used in the BPF_MAP_FREEZE
syscall.  Explicitly mention this in the comments.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250331203618.1973691-2-a.s.protopopov@gmail.com
2025-04-29 11:33:37 -07:00
Carlos Llamas
732d6c011f libbpf: Fix implicit memfd_create() for bionic
Since memfd_create() is not consistently available across different
bionic libc implementations, using memfd_create() directly can break
some Android builds:

  tools/lib/bpf/linker.c:576:7: error: implicit declaration of function 'memfd_create' [-Werror,-Wimplicit-function-declaration]
    576 |         fd = memfd_create(filename, 0);
        |              ^

To fix this, relocate and inline the sys_memfd_create() helper so that
it can be used in "linker.c". Similar issues were previously fixed by
commit 9fa5e1a180aa ("libbpf: Call memfd_create() syscall directly").

Fixes: 6d5e5e5d7ce1 ("libbpf: Extend linker API to support in-memory ELF files")
Signed-off-by: Carlos Llamas <cmllamas@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250330211325.530677-1-cmllamas@google.com
2025-04-29 11:33:37 -07:00
Mykyta Yatsenko
4659eaafa4 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   488a8544f839048064ea0647596af2aaca7ecc25
Checkpoint bpf-next commit: 25601e85441dd91cf7973b002f27af4c5b8691ea
Baseline bpf commit:        46e88299d19695c2b21e245c52a86ed26ed5cfee
Checkpoint bpf commit:      0c2623cef4f49e1ef6a908a389eea86130d11057

David Wei (1):
  netdev: add io_uring memory provider info

Ian Rogers (1):
  libbpf: Add namespace for errstr making it libbpf_errstr

Jason Xing (6):
  bpf: Add networking timestamping support to bpf_get/setsockopt()
  bpf: Add BPF_SOCK_OPS_TSTAMP_SCHED_CB callback
  bpf: Add BPF_SOCK_OPS_TSTAMP_SND_SW_CB callback
  bpf: Add BPF_SOCK_OPS_TSTAMP_SND_HW_CB callback
  bpf: Add BPF_SOCK_OPS_TSTAMP_ACK_CB callback
  bpf: Add BPF_SOCK_OPS_TSTAMP_SENDMSG_CB callback

Joe Damato (1):
  netdev-genl: Add an XSK attribute to queues

Kan Liang (1):
  perf: Extend per event callchain limit to branch stack

Mykyta Yatsenko (2):
  bpf: BPF token support for BPF_BTF_GET_FD_BY_ID
  libbpf: Pass BPF token from find_prog_btf_id to BPF_BTF_GET_FD_BY_ID

Song Yoong Siang (1):
  xsk: Add launch time hardware offload support to XDP Tx metadata

 include/uapi/linux/bpf.h        | 31 +++++++++++++++++++++++++++++++
 include/uapi/linux/if_xdp.h     | 10 ++++++++++
 include/uapi/linux/netdev.h     | 16 ++++++++++++++++
 include/uapi/linux/perf_event.h |  2 ++
 src/bpf.c                       |  3 ++-
 src/bpf.h                       |  3 ++-
 src/btf.c                       | 15 +++++++++++++--
 src/libbpf.c                    | 10 +++++-----
 src/libbpf_internal.h           |  1 +
 src/str_error.c                 |  2 +-
 src/str_error.h                 |  7 +++++--
 11 files changed, 88 insertions(+), 12 deletions(-)

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
2025-04-02 14:24:25 -07:00
Ian Rogers
2a228c7885 libbpf: Add namespace for errstr making it libbpf_errstr
When statically linking symbols can be replaced with those from other
statically linked libraries depending on the link order and the hoped
for "multiple definition" error may not appear. To avoid conflicts it
is good practice to namespace symbols, this change renames errstr to
libbpf_errstr. To avoid churn a #define is used to turn use of
errstr(err) to libbpf_errstr(err).

Fixes: 1633a83bf993 ("libbpf: Introduce errstr() for stringifying errno")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250320222439.1350187-1-irogers@google.com
2025-04-02 14:24:25 -07:00
Mykyta Yatsenko
90844e28dc libbpf: Pass BPF token from find_prog_btf_id to BPF_BTF_GET_FD_BY_ID
Pass BPF token from bpf_program__set_attach_target to
BPF_BTF_GET_FD_BY_ID bpf command.
When freplace program attaches to target program, it needs to look up
for BTF of the target, this may require BPF token, if, for example,
running from user namespace.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250317174039.161275-4-mykyta.yatsenko5@gmail.com
2025-04-02 14:24:25 -07:00
Mykyta Yatsenko
009a8cb452 bpf: BPF token support for BPF_BTF_GET_FD_BY_ID
Currently BPF_BTF_GET_FD_BY_ID requires CAP_SYS_ADMIN, which does not
allow running it from user namespace. This creates a problem when
freplace program running from user namespace needs to query target
program BTF.
This patch relaxes capable check from CAP_SYS_ADMIN to CAP_BPF and adds
support for BPF token that can be passed in attributes to syscall.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250317174039.161275-2-mykyta.yatsenko5@gmail.com
2025-04-02 14:24:25 -07:00
Song Yoong Siang
89cad6a160 xsk: Add launch time hardware offload support to XDP Tx metadata
Extend the XDP Tx metadata framework so that user can requests launch time
hardware offload, where the Ethernet device will schedule the packet for
transmission at a pre-determined time called launch time. The value of
launch time is communicated from user space to Ethernet driver via
launch_time field of struct xsk_tx_metadata.

Suggested-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Song Yoong Siang <yoong.siang.song@intel.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20250216093430.957880-2-yoong.siang.song@intel.com
2025-04-02 14:24:25 -07:00
Jason Xing
5cbd13ee02 bpf: Add BPF_SOCK_OPS_TSTAMP_SENDMSG_CB callback
This patch introduces a new callback in tcp_tx_timestamp() to correlate
tcp_sendmsg timestamp with timestamps from other tx timestamping
callbacks (e.g., SND/SW/ACK).

Without this patch, BPF program wouldn't know which timestamps belong
to which flow because of no socket lock protection. This new callback
is inserted in tcp_tx_timestamp() to address this issue because
tcp_tx_timestamp() still owns the same socket lock with
tcp_sendmsg_locked() in the meanwhile tcp_tx_timestamp() initializes
the timestamping related fields for the skb, especially tskey. The
tskey is the bridge to do the correlation.

For TCP, BPF program hooks the beginning of tcp_sendmsg_locked() and
then stores the sendmsg timestamp at the bpf_sk_storage, correlating
this timestamp with its tskey that are later used in other sending
timestamping callbacks.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250220072940.99994-11-kerneljasonxing@gmail.com
2025-04-02 14:24:25 -07:00
Jason Xing
79e19bb62b bpf: Add BPF_SOCK_OPS_TSTAMP_ACK_CB callback
Support the ACK case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_ACK_CB. This
callback will occur at the same timestamping point as the user
space's SCM_TSTAMP_ACK. The BPF program can use it to get the
same SCM_TSTAMP_ACK timestamp without modifying the user-space
application.

This patch extends txstamp_ack to two bits: 1 stands for
SO_TIMESTAMPING mode, 2 bpf extension.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250220072940.99994-10-kerneljasonxing@gmail.com
2025-04-02 14:24:25 -07:00
Jason Xing
253b5ce758 bpf: Add BPF_SOCK_OPS_TSTAMP_SND_HW_CB callback
Support hw SCM_TSTAMP_SND case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_SND_HW_CB. This
callback will occur at the same timestamping point as the user
space's hardware SCM_TSTAMP_SND. The BPF program can use it to
get the same SCM_TSTAMP_SND timestamp without modifying the
user-space application.

To avoid increasing the code complexity, replace SKBTX_HW_TSTAMP
with SKBTX_HW_TSTAMP_NOBPF instead of changing numerous callers
from driver side using SKBTX_HW_TSTAMP. The new definition of
SKBTX_HW_TSTAMP means the combination tests of socket timestamping
and bpf timestamping. After this patch, drivers can work under the
bpf timestamping.

Considering some drivers don't assign the skb with hardware
timestamp, this patch does the assignment and then BPF program
can acquire the hwstamp from skb directly.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250220072940.99994-9-kerneljasonxing@gmail.com
2025-04-02 14:24:25 -07:00
Jason Xing
d855493df1 bpf: Add BPF_SOCK_OPS_TSTAMP_SND_SW_CB callback
Support sw SCM_TSTAMP_SND case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_SND_SW_CB. This
callback will occur at the same timestamping point as the user
space's software SCM_TSTAMP_SND. The BPF program can use it to
get the same SCM_TSTAMP_SND timestamp without modifying the
user-space application.

Based on this patch, BPF program will get the software
timestamp when the driver is ready to send the skb. In the
sebsequent patch, the hardware timestamp will be supported.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250220072940.99994-8-kerneljasonxing@gmail.com
2025-04-02 14:24:25 -07:00
Jason Xing
7ea10cfba8 bpf: Add BPF_SOCK_OPS_TSTAMP_SCHED_CB callback
Support SCM_TSTAMP_SCHED case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_SCHED_CB. This
callback will occur at the same timestamping point as the user
space's SCM_TSTAMP_SCHED. The BPF program can use it to get the
same SCM_TSTAMP_SCHED timestamp without modifying the user-space
application.

A new SKBTX_BPF flag is added to mark skb_shinfo(skb)->tx_flags,
ensuring that the new BPF timestamping and the current user
space's SO_TIMESTAMPING do not interfere with each other.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250220072940.99994-7-kerneljasonxing@gmail.com
2025-04-02 14:24:25 -07:00
Jason Xing
43b6c2cd70 bpf: Add networking timestamping support to bpf_get/setsockopt()
The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
added to bpf_get/setsockopt. The later patches will implement the
BPF networking timestamping. The BPF program will use
bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
enable the BPF networking timestamping on a socket.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250220072940.99994-2-kerneljasonxing@gmail.com
2025-04-02 14:24:25 -07:00
Joe Damato
b1cb441916 netdev-genl: Add an XSK attribute to queues
Expose a new per-queue nest attribute, xsk, which will be present for
queues that are being used for AF_XDP. If the queue is not being used for
AF_XDP, the nest will not be present.

In the future, this attribute can be extended to include more data about
XSK as it is needed.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20250214211255.14194-3-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-04-02 14:24:25 -07:00
David Wei
01500813ad netdev: add io_uring memory provider info
Add a nested attribute for io_uring memory provider info. For now it is
empty and its presence indicates that a particular page pool or queue
has an io_uring memory provider attached.

$ ./cli.py --spec netlink/specs/netdev.yaml --dump page-pool-get
[{'id': 80,
  'ifindex': 2,
  'inflight': 64,
  'inflight-mem': 262144,
  'napi-id': 525},
 {'id': 79,
  'ifindex': 2,
  'inflight': 320,
  'inflight-mem': 1310720,
  'io_uring': {},
  'napi-id': 525},
...

$ ./cli.py --spec netlink/specs/netdev.yaml --dump queue-get
[{'id': 0, 'ifindex': 1, 'type': 'rx'},
 {'id': 0, 'ifindex': 1, 'type': 'tx'},
 {'id': 0, 'ifindex': 2, 'napi-id': 513, 'type': 'rx'},
 {'id': 1, 'ifindex': 2, 'napi-id': 514, 'type': 'rx'},
...
 {'id': 12, 'ifindex': 2, 'io_uring': {}, 'napi-id': 525, 'type': 'rx'},
...

Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://patch.msgid.link/20250204215622.695511-6-dw@davidwei.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-04-02 14:24:25 -07:00
Kan Liang
59171f49e9 perf: Extend per event callchain limit to branch stack
The commit 97c79a38cd45 ("perf core: Per event callchain limit")
introduced a per-event term to allow finer tuning of the depth of
callchains to save space.

It should be applied to the branch stack as well. For example, autoFDO
collections require maximum LBR entries. In the meantime, other
system-wide LBR users may only be interested in the latest a few number
of LBRs. A per-event LBR depth would save the perf output buffer.

The patch simply drops the uninterested branches, but HW still collects
the maximum branches. There may be a model-specific optimization that
can reduce the HW depth for some cases to reduce the overhead further.
But it isn't included in the patch set. Because it's not useful for all
cases. For example, ARCH LBR can utilize the PEBS and XSAVE to collect
LBRs. The depth should have less impact on the collecting overhead.
The model-specific optimization may be implemented later separately.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250310181536.3645382-1-kan.liang@linux.intel.com
2025-04-02 14:24:25 -07:00
Ihor Solodrai
1b8768339f ci: add temporary patches for selftests
* https://lore.kernel.org/all/20250327185528.1740787-1-song@kernel.org/
* https://lore.kernel.org/bpf/20250328193124.808784-1-song@kernel.org/
* https://lore.kernel.org/bpf/20250331033828.365077-1-yonghong.song@linux.dev/

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-04-02 10:23:23 -07:00
Ihor Solodrai
374036c9f1 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   239860828f8660e2be487e2fbdae2640cce3fd67
Checkpoint bpf-next commit: 79d93c8ff35855d3283ee7d82dfe0c54f90b9986
Baseline bpf commit:        319fc77f8f45a1b3dba15b0cc1a869778fd222f7
Checkpoint bpf commit:      6ccf6adb05d0fe3dbb1a77ab90bf054da8a2198d

Ihor Solodrai (1):
  libbpf: Implement bpf_usdt_arg_size BPF function

Mykyta Yatsenko (3):
  libbpf: Use map_is_created helper in map setters
  libbpf: Introduce more granular state for bpf_object
  libbpf: Split bpf object load into prepare/load

Nandakumar Edamana (1):
  libbpf: Fix out-of-bound read

Peilin Ye (1):
  bpf: Introduce load-acquire and store-release instructions

Yonghong Song (1):
  bpf: Allow pre-ordering for bpf cgroup progs

 include/uapi/linux/bpf.h |   4 +
 src/libbpf.c             | 201 ++++++++++++++++++++++++++-------------
 src/libbpf.h             |  13 +++
 src/libbpf.map           |   1 +
 src/usdt.bpf.h           |  32 +++++++
 5 files changed, 183 insertions(+), 68 deletions(-)

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Peilin Ye
bf62e0dcfd bpf: Introduce load-acquire and store-release instructions
Introduce BPF instructions with load-acquire and store-release
semantics, as discussed in [1].  Define 2 new flags:

  #define BPF_LOAD_ACQ    0x100
  #define BPF_STORE_REL   0x110

A "load-acquire" is a BPF_STX | BPF_ATOMIC instruction with the 'imm'
field set to BPF_LOAD_ACQ (0x100).

Similarly, a "store-release" is a BPF_STX | BPF_ATOMIC instruction with
the 'imm' field set to BPF_STORE_REL (0x110).

Unlike existing atomic read-modify-write operations that only support
BPF_W (32-bit) and BPF_DW (64-bit) size modifiers, load-acquires and
store-releases also support BPF_B (8-bit) and BPF_H (16-bit).  As an
exception, however, 64-bit load-acquires/store-releases are not
supported on 32-bit architectures (to fix a build error reported by the
kernel test robot).

An 8- or 16-bit load-acquire zero-extends the value before writing it to
a 32-bit register, just like ARM64 instruction LDARH and friends.

Similar to existing atomic read-modify-write operations, misaligned
load-acquires/store-releases are not allowed (even if
BPF_F_ANY_ALIGNMENT is set).

As an example, consider the following 64-bit load-acquire BPF
instruction (assuming little-endian):

  db 10 00 00 00 01 00 00  r0 = load_acquire((u64 *)(r1 + 0x0))

  opcode (0xdb): BPF_ATOMIC | BPF_DW | BPF_STX
  imm (0x00000100): BPF_LOAD_ACQ

Similarly, a 16-bit BPF store-release:

  cb 21 00 00 10 01 00 00  store_release((u16 *)(r1 + 0x0), w2)

  opcode (0xcb): BPF_ATOMIC | BPF_H | BPF_STX
  imm (0x00000110): BPF_STORE_REL

In arch/{arm64,s390,x86}/net/bpf_jit_comp.c, have
bpf_jit_supports_insn(..., /*in_arena=*/true) return false for the new
instructions, until the corresponding JIT compiler supports them in
arena.

[1] https://lore.kernel.org/all/20240729183246.4110549-1-yepeilin@google.com/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Peilin Ye <yepeilin@google.com>
Link: https://lore.kernel.org/r/a217f46f0e445fbd573a1a024be5c6bf1d5fe716.1741049567.git.yepeilin@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Mykyta Yatsenko
855a5d7904 libbpf: Split bpf object load into prepare/load
Introduce bpf_object__prepare API: additional intermediate preparation
step that performs ELF processing, relocations, prepares final state of
BPF program instructions (accessible with bpf_program__insns()), creates
and (potentially) pins maps, and stops short of loading BPF programs.

We anticipate few use cases for this API, such as:
* Use prepare to initialize bpf_token, without loading freplace
programs, unlocking possibility to lookup BTF of other programs.
* Execute prepare to obtain finalized BPF program instructions without
loading programs, enabling tools like veristat to process one program at
a time, without incurring cost of ELF parsing and processing.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250303135752.158343-4-mykyta.yatsenko5@gmail.com
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Mykyta Yatsenko
16c58c33c8 libbpf: Introduce more granular state for bpf_object
We are going to split bpf_object loading into 2 stages: preparation and
loading. This will increase flexibility when working with bpf_object
and unlock some optimizations and use cases.
This patch substitutes a boolean flag (loaded) by more finely-grained
state for bpf_object.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250303135752.158343-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Mykyta Yatsenko
e14bb3629f libbpf: Use map_is_created helper in map setters
Refactoring: use map_is_created helper in map setters that need to check
the state of the map. This helps to reduce the number of the places that
depend explicitly on the loaded flag, simplifying refactoring in the
next patch of this set.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250303135752.158343-2-mykyta.yatsenko5@gmail.com
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Yonghong Song
fbda5d7d2f bpf: Allow pre-ordering for bpf cgroup progs
Currently for bpf progs in a cgroup hierarchy, the effective prog array
is computed from bottom cgroup to upper cgroups (post-ordering). For
example, the following cgroup hierarchy
    root cgroup: p1, p2
        subcgroup: p3, p4
have BPF_F_ALLOW_MULTI for both cgroup levels.
The effective cgroup array ordering looks like
    p3 p4 p1 p2
and at run time, progs will execute based on that order.

But in some cases, it is desirable to have root prog executes earlier than
children progs (pre-ordering). For example,
  - prog p1 intends to collect original pkt dest addresses.
  - prog p3 will modify original pkt dest addresses to a proxy address for
    security reason.
The end result is that prog p1 gets proxy address which is not what it
wants. Putting p1 to every child cgroup is not desirable either as it
will duplicate itself in many child cgroups. And this is exactly a use case
we are encountering in Meta.

To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag
is specified at attachment time, the prog has higher priority and the
ordering with that flag will be from top to bottom (pre-ordering).
For example, in the above example,
    root cgroup: p1, p2
        subcgroup: p3, p4
Let us say p2 and p4 are marked with BPF_F_PREORDER. The final
effective array ordering will be
    p2 p4 p3 p1

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250224230116.283071-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Ihor Solodrai
be18fdb16a libbpf: Implement bpf_usdt_arg_size BPF function
Information about USDT argument size is implicitly stored in
__bpf_usdt_arg_spec, but currently it's not accessbile to BPF programs
that use USDT.

Implement bpf_sdt_arg_size() that returns the size of an USDT argument
in bytes.

v1->v2:
  * do not add __bpf_usdt_arg_spec() helper

v1: https://lore.kernel.org/bpf/20250220215904.3362709-1-ihor.solodrai@linux.dev/

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20250224235756.2612606-1-ihor.solodrai@linux.dev
2025-03-10 15:35:17 -07:00
Nandakumar Edamana
82f60c9b5e libbpf: Fix out-of-bound read
In `set_kcfg_value_str`, an untrusted string is accessed with the assumption
that it will be at least two characters long due to the presence of checks for
opening and closing quotes. But the check for the closing quote
(value[len - 1] != '"') misses the fact that it could be checking the opening
quote itself in case of an invalid input that consists of just the opening
quote.

This commit adds an explicit check to make sure the string is at least two
characters long.

Signed-off-by: Nandakumar Edamana <nandakumar@nandakumar.co.in>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250221210110.3182084-1-nandakumar@nandakumar.co.in
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-03-10 15:35:17 -07:00
Andrii Nakryiko
4c893341f5 Makefile: detect pkg-config availability
Detect whether build system has pkg-config tool, and if not, fallback to
manually specifying -lelf -lz as dependency.

Closes: https://github.com/libbpf/libbpf/issues/885
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-03-03 18:59:47 -08:00
Ihor Solodrai
42a6ef6316 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   01f3ce5328c405179b2c69ea047c423dad2bfa6d
Checkpoint bpf-next commit: 239860828f8660e2be487e2fbdae2640cce3fd67
Baseline bpf commit:        c45323b7560ec87c37c729b703c86ee65f136d75
Checkpoint bpf commit:      319fc77f8f45a1b3dba15b0cc1a869778fd222f7

Andrii Nakryiko (2):
  libbpf: fix LDX/STX/ST CO-RE relocation size adjustment logic
  libbpf: Fix hypothetical STT_SECTION extern NULL deref case

Daniel Borkmann (1):
  netkit: Allow for configuring needed_{head,tail}room

Ihor Solodrai (3):
  libbpf: Introduce kflag for type_tags and decl_tags in BTF
  docs/bpf: Document the semantics of BTF tags with kind_flag
  libbpf: Check the kflag of type tags in btf_dump

Tao Chen (1):
  libbpf: Wrap libbpf API direct err with libbpf_err

Tony Ambardar (1):
  libbpf: Fix accessing BTF.ext core_relo header

Yonghong Song (1):
  bpf: Sync uapi bpf.h header for the tooling infra

 include/uapi/linux/bpf.h     |  5 +-
 include/uapi/linux/btf.h     |  3 +-
 include/uapi/linux/if_link.h |  2 +
 src/btf.c                    | 90 ++++++++++++++++++++++++++----------
 src/btf.h                    |  3 ++
 src/btf_dump.c               |  5 +-
 src/libbpf.c                 | 26 +++++------
 src/libbpf.map               |  2 +
 src/linker.c                 |  2 +-
 src/relo_core.c              | 24 ++++++++--
 10 files changed, 116 insertions(+), 46 deletions(-)

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Andrii Nakryiko
041d5948f3 libbpf: Fix hypothetical STT_SECTION extern NULL deref case
Fix theoretical NULL dereference in linker when resolving *extern*
STT_SECTION symbol against not-yet-existing ELF section. Not sure if
it's possible in practice for valid ELF object files (this would require
embedded assembly manipulations, at which point BTF will be missing),
but fix the s/dst_sym/dst_sec/ typo guarding this condition anyways.

Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs")
Fixes: a46349227cd8 ("libbpf: Add linker extern resolution support for functions and global variables")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20250220002821.834400-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Tao Chen
39a589c74e libbpf: Wrap libbpf API direct err with libbpf_err
Just wrap the direct err with libbpf_err, keep consistency
with other APIs.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20250219153711.29651-1-chen.dylane@linux.dev
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Andrii Nakryiko
d7a4ab1548 libbpf: fix LDX/STX/ST CO-RE relocation size adjustment logic
Libbpf has a somewhat obscure feature of automatically adjusting the
"size" of LDX/STX/ST instruction (memory store and load instructions),
based on originally recorded access size (u8, u16, u32, or u64) and the
actual size of the field on target kernel. This is meant to facilitate
using BPF CO-RE on 32-bit architectures (pointers are always 64-bit in
BPF, but host kernel's BTF will have it as 32-bit type), as well as
generally supporting safe type changes (unsigned integer type changes
can be transparently "relocated").

One issue that surfaced only now, 5 years after this logic was
implemented, is how this all works when dealing with fields that are
arrays. This isn't all that easy and straightforward to hit (see
selftests that reproduce this condition), but one of sched_ext BPF
programs did hit it with innocent looking loop.

Long story short, libbpf used to calculate entire array size, instead of
making sure to only calculate array's element size. But it's the element
that is loaded by LDX/STX/ST instructions (1, 2, 4, or 8 bytes), so
that's what libbpf should check. This patch adjusts the logic for
arrays and fixed the issue.

Reported-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250207014809.1573841-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Yonghong Song
4eed43c229 bpf: Sync uapi bpf.h header for the tooling infra
Commit 0abff462d802 ("bpf: Add comment about helper freeze") missed the
tooling header sync. Fix it.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250213050427.2788837-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Ihor Solodrai
cc278ff7c0 libbpf: Check the kflag of type tags in btf_dump
If the kflag is set for a BTF type tag, then the tag represents an
arbitrary __attribute__. Change btf_dump accordingly.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250130201239.1429648-4-ihor.solodrai@linux.dev
2025-02-24 15:10:59 -08:00
Ihor Solodrai
2b8b896bca docs/bpf: Document the semantics of BTF tags with kind_flag
Explain the meaning of kind_flag in BTF type_tags and decl_tags.
Update uapi btf.h kind_flag comment to reflect the changes.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250130201239.1429648-3-ihor.solodrai@linux.dev
2025-02-24 15:10:59 -08:00
Ihor Solodrai
32bda80136 libbpf: Introduce kflag for type_tags and decl_tags in BTF
Add the following functions to libbpf API:
  * btf__add_type_attr()
  * btf__add_decl_attr()

These functions allow to add to BTF the type tags and decl tags with
info->kflag set to 1. The kflag indicates that the tag directly
encodes an __attribute__ and not a normal tag.

See Documentation/bpf/btf.rst changes in the subsequent patch for
details on the semantics.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250130201239.1429648-2-ihor.solodrai@linux.dev
2025-02-24 15:10:59 -08:00
Tony Ambardar
71208c3362 libbpf: Fix accessing BTF.ext core_relo header
Update btf_ext_parse_info() to ensure the core_relo header is present
before reading its fields. This avoids a potential buffer read overflow
reported by the OSS Fuzz project.

Fixes: cf579164e9ea ("libbpf: Support BTF.ext loading and output in either endianness")
Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://issues.oss-fuzz.com/issues/388905046
Link: https://lore.kernel.org/bpf/20250125065236.2603346-1-itugrok@yahoo.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Daniel Borkmann
9544a909f1 netkit: Allow for configuring needed_{head,tail}room
Allow the user to configure needed_{head,tail}room for both netkit
devices. The idea is similar to 163e529200af ("veth: implement
ndo_set_rx_headroom") with the difference that the two parameters
can be specified upon device creation. By default the current behavior
stays as is which is needed_{head,tail}room is 0.

In case of Cilium, for example, the netkit devices are not enslaved
into a bridge or openvswitch device (rather, BPF-based redirection
is used out of tcx), and as such these parameters are not propagated
into the Pod's netns via peer device.

Given Cilium can run in vxlan/geneve tunneling mode (needed_headroom)
and/or be used in combination with WireGuard (needed_{head,tail}room),
allow the Cilium CNI plugin to specify these two upon netkit device
creation.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/bpf/20241220234658.490686-1-daniel@iogearbox.net
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-02-24 15:10:59 -08:00
Ihor Solodrai
d4a841a32b ci: remove dependency on run-on-arch-action
run-on-arch-action is simply a wrapper around docker. There is no
value in using it in libbpf, as it is not complicated to run
non-native arch docker images directly on github-hosted runners.

Docker relies on qemu-user-static installed on the system to emulate
different architectures.

Recently there were various reports about multi-arch docker builds
failing with seemingly random issues, and it appears to boil down to
qemu [1]. I stumbled on this problem while updating s390x runners [2]
for BPF CI, and setting up more recent version of qemu helped.

This change addresses recent build failures on s390x and ppc64le.

[1] https://github.com/docker/setup-qemu-action/issues/188
[2] https://github.com/kernel-patches/runner/pull/69
[3] https://docs.docker.com/build/buildkit/#getting-started

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
2025-01-31 22:30:52 -08:00
Ihor Solodrai
324f3c3846 ci: run coverty scan on push to master
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2025-01-17 13:53:19 -08:00
Ihor Solodrai
63528b7a4d ci: remove sourcing helpers.sh from coverity workflow
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2025-01-17 13:53:19 -08:00
Andrii Nakryiko
7abfe520df sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   f44275e7155dc310d36516fc25be503da099781c
Checkpoint bpf-next commit: 01f3ce5328c405179b2c69ea047c423dad2bfa6d
Baseline bpf commit:        9d89551994a430b50c4fffcb1e617a057fa76e20
Checkpoint bpf commit:      c45323b7560ec87c37c729b703c86ee65f136d75

Andrii Nakryiko (1):
  libbpf: Work around kernel inconsistently stripping '.llvm.' suffix

Pu Lehui (2):
  libbpf: Fix return zero when elf_begin failed
  libbpf: Fix incorrect traversal end type ID when marking
    BTF_IS_EMBEDDED

Vishal Chourasia (1):
  tools: Sync if_xdp.h uapi tooling header

Yonghong Song (1):
  libbpf: Add unique_match option for multi kprobe

 include/uapi/linux/if_xdp.h |  4 ++--
 src/btf.c                   |  1 +
 src/btf_relocate.c          |  2 +-
 src/libbpf.c                | 39 +++++++++++++++++++++++++++++++++++--
 src/libbpf.h                |  4 +++-
 5 files changed, 44 insertions(+), 6 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-01-17 12:31:44 -08:00
Vishal Chourasia
d76c770473 tools: Sync if_xdp.h uapi tooling header
Sync if_xdp.h uapi header to remove following warning:

  Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h'
  differs from latest version at 'include/uapi/linux/if_xdp.h'

Fixes: 48eb03dd2630 ("xsk: Add TX timestamp and TX checksum offload support")
Signed-off-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Song Yoong Siang <yoong.siang.song@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20250115032248.125742-1-yoong.siang.song@intel.com
2025-01-17 12:31:44 -08:00
Andrii Nakryiko
444f3c0e7a libbpf: Work around kernel inconsistently stripping '.llvm.' suffix
Some versions of kernel were stripping out '.llvm.<hash>' suffix from
kerne symbols (produced by Clang LTO compilation) from function names
reported in available_filter_functions, while kallsyms reported full
original name. This confuses libbpf's multi-kprobe logic of finding all
matching kernel functions for specified user glob pattern by joining
available_filter_functions and kallsyms contents, because joining by
full symbol name won't work for symbols containing '.llvm.<hash>' suffix.

This was eventually fixed by [0] in the kernel, but we'd like to not
regress multi-kprobe experience and add a work around for this bug on
libbpf side, stripping kallsym's name if it matches user pattern and
contains '.llvm.' suffix.

  [0] fb6a421fb615 ("kallsyms: Match symbols exactly with CONFIG_LTO_CLANG")

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250117003957.179331-1-andrii@kernel.org
2025-01-17 12:31:44 -08:00
Pu Lehui
719aeb7a6e libbpf: Fix incorrect traversal end type ID when marking BTF_IS_EMBEDDED
When redirecting the split BTF to the vmlinux base BTF, we need to mark
the distilled base struct/union members of split BTF structs/unions in
id_map with BTF_IS_EMBEDDED. This indicates that these types must match
both name and size later. Therefore, we need to traverse the entire
split BTF, which involves traversing type IDs from nr_dist_base_types to
nr_types. However, the current implementation uses an incorrect
traversal end type ID, so let's correct it.

Fixes: 19e00c897d50 ("libbpf: Split BTF relocation")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250115100241.4171581-3-pulehui@huaweicloud.com
2025-01-17 12:31:44 -08:00
Pu Lehui
a7edf4aec8 libbpf: Fix return zero when elf_begin failed
The error number of elf_begin is omitted when encapsulating the
btf_find_elf_sections function.

Fixes: c86f180ffc99 ("libbpf: Make btf_parse_elf process .BTF.base transparently")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250115100241.4171581-2-pulehui@huaweicloud.com
2025-01-17 12:31:44 -08:00
Yonghong Song
32792ec66c libbpf: Add unique_match option for multi kprobe
Jordan reported an issue in Meta production environment where func
try_to_wake_up() is renamed to try_to_wake_up.llvm.<hash>() by clang
compiler at lto mode. The original 'kprobe/try_to_wake_up' does not
work any more since try_to_wake_up() does not match the actual func
name in /proc/kallsyms.

There are a couple of ways to resolve this issue. For example, in
attach_kprobe(), we could do lookup in /proc/kallsyms so try_to_wake_up()
can be replaced by try_to_wake_up.llvm.<hach>(). Or we can force users
to use bpf_program__attach_kprobe() where they need to lookup
/proc/kallsyms to find out try_to_wake_up.llvm.<hach>(). But these two
approaches requires extra work by either libbpf or user.

Luckily, suggested by Andrii, multi kprobe already supports wildcard ('*')
for symbol matching. In the above example, 'try_to_wake_up*' can match
to try_to_wake_up() or try_to_wake_up.llvm.<hash>() and this allows
bpf prog works for different kernels as some kernels may have
try_to_wake_up() and some others may have try_to_wake_up.llvm.<hash>().

The original intention is to kprobe try_to_wake_up() only, so an optional
field unique_match is added to struct bpf_kprobe_multi_opts. If the
field is set to true, the number of matched functions must be one.
Otherwise, the attachment will fail. In the above case, multi kprobe
with 'try_to_wake_up*' and unique_match preserves user functionality.

Reported-by: Jordan Rome <linux@jordanrome.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250109174023.3368432-1-yonghong.song@linux.dev
2025-01-17 12:31:44 -08:00
Ihor Solodrai
c924f8d3dd ci: sync with libbpf/ci@v3
* vmtest.yml
  * use v3 of libbpf/ci actions
  * remove unnecessary selftests preparation steps
* ci/vmtest
  * remove unnecessary scripts and configs
  * add libbpf-specific run-vmtest.env [1]

[1] https://github.com/libbpf/ci/pull/166

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2025-01-15 19:30:48 -08:00
Daniel Müller
0ff2f8e0ee sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   a1087da9d11e5bcacc706002bc0f84b790881f69
Checkpoint bpf-next commit: f44275e7155dc310d36516fc25be503da099781c
Baseline bpf commit:        fb86c42a2a5d44e849ddfbc98b8d2f4f40d36ee3
Checkpoint bpf commit:      9d89551994a430b50c4fffcb1e617a057fa76e20

Adrian Hunter (1):
  perf/core: Add aux_pause, aux_resume, aux_start_paused

Alastair Robertson (2):
  libbpf: Pull file-opening logic up to top-level functions
  libbpf: Extend linker API to support in-memory ELF files

Andrii Nakryiko (1):
  libbpf: don't adjust USDT semaphore address if .stapsdt.base addr is
    missing

Anton Protopopov (2):
  bpf: Add fd_array_cnt attribute for prog_load
  libbpf: prog load: Allow to use fd_array_cnt

Ben Olson (1):
  libbpf: Improve debug message when the base BTF cannot be found

Daniel Borkmann (1):
  tools: Sync if_link.h uapi tooling header

Daniel Xu (1):
  libbpf: Set MFD_NOEXEC_SEAL when creating memfd

Eric Dumazet (1):
  net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute

Jiri Olsa (1):
  libbpf: Fix memory leak in bpf_program__attach_uprobe_multi

Joe Damato (3):
  netdev-genl: Dump napi_defer_hard_irqs
  netdev-genl: Dump gro_flush_timeout
  netdev-genl: Support setting per-NAPI config values

Martin Karsten (1):
  net: Add napi_struct parameter irq_suspend_timeout

Quentin Monnet (1):
  libbpf: Fix segfault due to libelf functions not setting errno

Sidong Yang (1):
  libbpf: Change hash_combine parameters from long to unsigned long

 include/uapi/linux/bpf.h        |  10 +
 include/uapi/linux/if_link.h    | 554 +++++++++++++++++++++++++++++++-
 include/uapi/linux/netdev.h     |   4 +
 include/uapi/linux/perf_event.h |  11 +-
 src/bpf.c                       |   3 +-
 src/bpf.h                       |   5 +-
 src/btf.c                       |   4 +-
 src/libbpf.c                    |  25 +-
 src/libbpf.h                    |   5 +
 src/libbpf.map                  |   4 +
 src/linker.c                    | 248 ++++++++++----
 src/usdt.c                      |   2 +-
 12 files changed, 794 insertions(+), 81 deletions(-)

Signed-off-by: Daniel Müller <deso@posteo.net>
2025-01-08 14:58:04 -08:00
Daniel Xu
f468e83c85 libbpf: Set MFD_NOEXEC_SEAL when creating memfd
Starting from 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and
MFD_EXEC") and until 1717449b4417 ("memfd: drop warning for missing
exec-related flags"), the kernel would print a warning if neither
MFD_NOEXEC_SEAL nor MFD_EXEC is set in memfd_create().

If libbpf runs on on a kernel between these two commits (eg. on an
improperly backported system), it'll trigger this warning.

To avoid this warning (and also be more secure), explicitly set
MFD_NOEXEC_SEAL. But since libbpf can be run on potentially very old
kernels, leave a fallback for kernels without MFD_NOEXEC_SEAL support.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/6e62c2421ad7eb1da49cbf16da95aaaa7f94d394.1735594195.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-01-08 14:58:04 -08:00
Anton Protopopov
48c771c4ce libbpf: prog load: Allow to use fd_array_cnt
Add new fd_array_cnt field to bpf_prog_load_opts
and pass it in bpf_attr, if set.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241213130934.1087929-6-aspsk@isovalent.com
2025-01-08 14:58:04 -08:00
Anton Protopopov
266da73237 bpf: Add fd_array_cnt attribute for prog_load
The fd_array attribute of the BPF_PROG_LOAD syscall may contain a set
of file descriptors: maps or btfs. This field was introduced as a
sparse array. Introduce a new attribute, fd_array_cnt, which, if
present, indicates that the fd_array is a continuous array of the
corresponding length.

If fd_array_cnt is non-zero, then every map in the fd_array will be
bound to the program, as if it was used by the program. This
functionality is similar to the BPF_PROG_BIND_MAP syscall, but such
maps can be used by the verifier during the program load.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241213130934.1087929-5-aspsk@isovalent.com
2025-01-08 14:58:04 -08:00
Alastair Robertson
d2f1f4490b libbpf: Extend linker API to support in-memory ELF files
The new_fd and add_fd functions correspond to the original new and
add_file functions, but accept an FD instead of a file name. This
gives API consumers the option of using anonymous files/memfds to
avoid writing ELFs to disk.

This new API will be useful for performing linking as part of
bpftrace's JIT compilation.

The add_buf function is a convenience wrapper that does the work of
creating a memfd for the caller.

Signed-off-by: Alastair Robertson <ajor@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241211164030.573042-3-ajor@meta.com
2025-01-08 14:58:04 -08:00
Alastair Robertson
f00fad0951 libbpf: Pull file-opening logic up to top-level functions
Move the filename arguments and file-descriptor handling from
init_output_elf() and linker_load_obj_file() and instead handle them
at the top-level in bpf_linker__new() and bpf_linker__add_file().

This will allow the inner functions to be shared with a new,
non-filename-based, API in the next commit.

Signed-off-by: Alastair Robertson <ajor@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241211164030.573042-2-ajor@meta.com
2025-01-08 14:58:04 -08:00
Quentin Monnet
984dcc97ae libbpf: Fix segfault due to libelf functions not setting errno
Libelf functions do not set errno on failure. Instead, it relies on its
internal _elf_errno value, that can be retrieved via elf_errno (or the
corresponding message via elf_errmsg()). From "man libelf":

    If a libelf function encounters an error it will set an internal
    error code that can be retrieved with elf_errno. Each thread
    maintains its own separate error code. The meaning of each error
    code can be determined with elf_errmsg, which returns a string
    describing the error.

As a consequence, libbpf should not return -errno when a function from
libelf fails, because an empty value will not be interpreted as an error
and won't prevent the program to stop. This is visible in
bpf_linker__add_file(), for example, where we call a succession of
functions that rely on libelf:

    err = err ?: linker_load_obj_file(linker, filename, opts, &obj);
    err = err ?: linker_append_sec_data(linker, &obj);
    err = err ?: linker_append_elf_syms(linker, &obj);
    err = err ?: linker_append_elf_relos(linker, &obj);
    err = err ?: linker_append_btf(linker, &obj);
    err = err ?: linker_append_btf_ext(linker, &obj);

If the object file that we try to process is not, in fact, a correct
object file, linker_load_obj_file() may fail with errno not being set,
and return 0. In this case we attempt to run linker_append_elf_sysms()
and may segfault.

This can happen (and was discovered) with bpftool:

    $ bpftool gen object output.o sample_ret0.bpf.c
    libbpf: failed to get ELF header for sample_ret0.bpf.c: invalid `Elf' handle
    zsh: segmentation fault (core dumped)  bpftool gen object output.o sample_ret0.bpf.c

Fix the issue by returning a non-null error code (-EINVAL) when libelf
functions fail.

Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs")
Signed-off-by: Quentin Monnet <qmo@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241205135942.65262-1-qmo@kernel.org
2025-01-08 14:58:04 -08:00
Ben Olson
3ed57f68e5 libbpf: Improve debug message when the base BTF cannot be found
When running `bpftool` on a kernel module installed in `/lib/modules...`,
this error is encountered if the user does not specify `--base-btf` to
point to a valid base BTF (e.g. usually in `/sys/kernel/btf/vmlinux`).
However, looking at the debug output to determine the cause of the error
simply says `Invalid BTF string section`, which does not point to the
actual source of the error. This just improves that debug message to tell
users what happened.

Signed-off-by: Ben Olson <matthew.olson@intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/Z0YqzQ5lNz7obQG7@bolson-desk
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-01-08 14:58:04 -08:00
Andrii Nakryiko
69d85c5fb3 libbpf: don't adjust USDT semaphore address if .stapsdt.base addr is missing
USDT ELF note optionally can record an offset of .stapsdt.base, which is
used to make adjustments to USDT target attach address. Currently,
libbpf will do this address adjustment unconditionally if it finds
.stapsdt.base ELF section in target binary. But there is a corner case
where .stapsdt.base ELF section is present, but specific USDT note
doesn't reference it. In such case, libbpf will basically just add base
address and end up with absolutely incorrect USDT target address.

This adjustment has to be done only if both .stapsdt.sema section is
present and USDT note is recording a reference to it.

Fixes: 74cc6311cec9 ("libbpf: Add USDT notes parsing and resolution logic")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20241121224558.796110-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-01-08 14:58:04 -08:00
Martin Karsten
0d822312fa net: Add napi_struct parameter irq_suspend_timeout
Add a per-NAPI IRQ suspension parameter, which can be get/set with
netdev-genl.

This patch doesn't change any behavior but prepares the code for other
changes in the following commits which use irq_suspend_timeout as a
timeout for IRQ suspension.

Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Co-developed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Tested-by: Joe Damato <jdamato@fastly.com>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-01-08 14:58:04 -08:00
Daniel Borkmann
ba2ba19f6d tools: Sync if_link.h uapi tooling header
Sync if_link uapi header to the latest version as we need the refresher
in tooling for netkit device. Given it's been a while since the last sync
and the diff is fairly big, it has been done as its own commit.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20241004101335.117711-4-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-01-08 14:58:04 -08:00
Joe Damato
c9a728c329 netdev-genl: Support setting per-NAPI config values
Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241011184527.16393-7-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-01-08 14:58:04 -08:00
Joe Damato
adf7973417 netdev-genl: Dump gro_flush_timeout
Support dumping gro_flush_timeout for a NAPI ID.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241011184527.16393-5-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-01-08 14:58:04 -08:00
Joe Damato
4f5f2597ce netdev-genl: Dump napi_defer_hard_irqs
Support dumping defer_hard_irqs for a NAPI ID.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20241011184527.16393-3-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-01-08 14:58:04 -08:00
Eric Dumazet
114881acba net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute
Some network devices have the ability to offload EDT (Earliest
Departure Time) which is the model used for TCP pacing and FQ
packet scheduler.

Some of them implement the timing wheel mechanism described in
https://saeed.github.io/files/carousel-sigcomm17.pdf
with an associated 'timing wheel horizon'.

This patch adds dev->max_pacing_offload_horizon expressing
this timing wheel horizon in nsec units.

This is a read-only attribute.

Unless a driver sets it, dev->max_pacing_offload_horizon
is zero.

v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 )
v3: added yaml doc (also per Jakub feedback)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-01-08 14:58:04 -08:00
Sidong Yang
b1f223b5b8 libbpf: Change hash_combine parameters from long to unsigned long
The hash_combine() could be trapped when compiled with sanitizer like "zig cc"
or clang with signed-integer-overflow option. This patch parameters and return
type to unsigned long to remove the potential overflow.

Signed-off-by: Sidong Yang <sidong.yang@furiosa.ai>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241116081054.65195-1-sidong.yang@furiosa.ai
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-01-08 14:58:04 -08:00
Jiri Olsa
05333afec1 libbpf: Fix memory leak in bpf_program__attach_uprobe_multi
Andrii reported memory leak detected by Coverity on error path
in bpf_program__attach_uprobe_multi. Fixing that by moving
the check earlier before the offsets allocations.

Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241115115843.694337-1-jolsa@kernel.org
2025-01-08 14:58:04 -08:00
Adrian Hunter
d22e0d8721 perf/core: Add aux_pause, aux_resume, aux_start_paused
Hardware traces, such as instruction traces, can produce a vast amount of
trace data, so being able to reduce tracing to more specific circumstances
can be useful.

The ability to pause or resume tracing when another event happens, can do
that.

Add ability for an event to "pause" or "resume" AUX area tracing.

Add aux_pause bit to perf_event_attr to indicate that, if the event
happens, the associated AUX area tracing should be paused. Ditto
aux_resume. Do not allow aux_pause and aux_resume to be set together.

Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
event that it should start in a "paused" state.

Add aux_paused to struct hw_perf_event for AUX area events to keep track of
the "paused" state. aux_paused is initialized to aux_start_paused.

Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
callbacks. Call as needed, during __perf_event_output(). Add
aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
handler. Pause/resume in NMI context will miss out if it coincides with
another pause/resume.

To use aux_pause or aux_resume, an event must be in a group with the AUX
area event as the group leader.

Example (requires Intel PT and tools patches also):

 $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
 Linux
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.043 MB perf.data ]
 $ perf script --call-trace
 uname   30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
 uname   30805 [000] 24001.058784424:  psb offs: 0
 uname   30805 [000] 24001.058784424:  cbr: 39 freq: 3904 MHz (139%)
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])        debug_smp_processor_id
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])        __x64_sys_newuname
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])            down_read
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                __cond_resched
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_add
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                    in_lock_functions
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_sub
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])            up_read
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_add
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                    in_lock_functions
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                preempt_count_sub
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])            _copy_to_user
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])        syscall_exit_to_user_mode
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])            syscall_exit_work
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                perf_syscall_exit
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                    debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                    perf_trace_buf_alloc
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_swevent_get_recursion_context
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                    perf_tp_event
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_trace_buf_update
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            tracing_gen_ctx_irq_test
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_swevent_event
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            __perf_event_account_interrupt
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                __this_cpu_preempt_check
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            perf_event_output_forward
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                perf_event_aux_pause
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                    ring_buffer_get
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                        __rcu_read_lock
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                        __rcu_read_unlock
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                    pt_event_stop
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        debug_smp_processor_id
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        debug_smp_processor_id
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        native_write_msr
 uname   30805 [000] 24001.058785463: ([kernel.kallsyms])                                        native_write_msr
 uname   30805 [000] 24001.058785639: 0x0

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: James Clark <james.clark@arm.com>
Link: https://lkml.kernel.org/r/20241022155920.17511-3-adrian.hunter@intel.com
2025-01-08 14:58:04 -08:00
Ihor Solodrai
c5f22aca0f ci: remove llvm-17 variant of the workflow
Also try prettifying the job names.

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-20 10:23:35 -08:00
Ihor Solodrai
bfc9770b24 ci: switch to libbpf/ci actions @v2
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
cd73a17321 ci: configure CI test jobs
* Don't run pahole@tmp.master + llvm-17 combination.
* Use descriptive name of for vmtest jobs
* Don't run test_progs_cpuv4 when LLVM_VERSION < 18 (same as on BPF CI)
* Add some logging to prepare-selftests-run.sh

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
39e4e86263 ci: cleanup now unused local actions and workflows
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
c1f8925561 ci: bump llvm version to 18
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
c7bf7b8977 ci: update temporary kernel patches
Remove old patches applied to kernel source for CI. They haven't been
applied in a while.

Add a fix for token/obj_priv_implicit_token_envvar

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
e0687f9f54 ci: add vmtest as a reusable workflow
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
dcf6ad6c70 ci: switch to libbpf/ci/build-selftests action
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
a453ffb7ea ci: add a vmtest step for setting up selftests run
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Ihor Solodrai
779cb2b65b ci: use libbpf/ci/run-vmtest action to run selftests
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-11-18 22:00:09 -08:00
Andrii Nakryiko
244485ce72 ci: don't fail test kmodule builds on older kernels
We are now getting:

  WARNING: Module.symvers is missing.
           Modules may not have dependencies or modversions.
           You may get many unresolved symbol errors.
           You can set KBUILD_MODPOST_WARN=1 to turn errors into warning
           if you want to proceed at your own risk.

So let's set KBUILD_MODPOST_WARN=1.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-11-13 19:25:37 -08:00
Andrii Nakryiko
713f5b0779 libbpf: bump version to v1.6.0 for new dev cycle
We are now in v1.6.0 dev cycles, reflect that in Makefile.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-11-13 19:25:37 -08:00
Andrii Nakryiko
6c25f7dcb5 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   c6fb8030b4baa01c850f99fc6da051b1017edc46
Checkpoint bpf-next commit: a1087da9d11e5bcacc706002bc0f84b790881f69
Baseline bpf commit:        d5fb316e2af1d947f0f6c3666e373a54d9f27c6f
Checkpoint bpf commit:      fb86c42a2a5d44e849ddfbc98b8d2f4f40d36ee3

Andrii Nakryiko (1):
  libbpf: start v1.6 development cycle

Jiri Olsa (2):
  bpf: Add support for uprobe multi session attach
  libbpf: Add support for uprobe multi session attach

Mykyta Yatsenko (4):
  libbpf: Introduce errstr() for stringifying errno
  libbpf: Stringify errno in log messages in libbpf.c
  libbpf: Stringify errno in log messages in btf*.c
  libbpf: Stringify errno in log messages in the remaining code

 include/uapi/linux/bpf.h |   1 +
 src/bpf.c                |   1 +
 src/btf.c                |  26 +--
 src/btf_dump.c           |   3 +-
 src/elf.c                |   4 +-
 src/features.c           |  15 +-
 src/gen_loader.c         |   3 +-
 src/libbpf.c             | 375 ++++++++++++++++++---------------------
 src/libbpf.h             |   4 +-
 src/libbpf.map           |   3 +
 src/libbpf_version.h     |   2 +-
 src/linker.c             |  21 ++-
 src/ringbuf.c            |  34 ++--
 src/str_error.c          |  71 ++++++++
 src/str_error.h          |   7 +
 src/usdt.c               |  32 ++--
 16 files changed, 332 insertions(+), 270 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-11-13 19:25:37 -08:00
Andrii Nakryiko
8576598d64 sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-11-13 19:25:37 -08:00
Mykyta Yatsenko
db73e46709 libbpf: Stringify errno in log messages in the remaining code
Convert numeric error codes into the string representations in log
messages in the rest of libbpf source files.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241111212919.368971-5-mykyta.yatsenko5@gmail.com
2024-11-13 19:25:37 -08:00
Mykyta Yatsenko
da7d63a690 libbpf: Stringify errno in log messages in btf*.c
Convert numeric error codes into the string representations in log
messages in btf.c and btf_dump.c.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241111212919.368971-4-mykyta.yatsenko5@gmail.com
2024-11-13 19:25:37 -08:00
Mykyta Yatsenko
6d8af6175e libbpf: Stringify errno in log messages in libbpf.c
Convert numeric error codes into the string representations in log
messages in libbpf.c.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241111212919.368971-3-mykyta.yatsenko5@gmail.com
2024-11-13 19:25:37 -08:00
Mykyta Yatsenko
8232da46d6 libbpf: Introduce errstr() for stringifying errno
Add function errstr(int err) that allows converting numeric error codes
into string representations.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241111212919.368971-2-mykyta.yatsenko5@gmail.com
2024-11-13 19:25:37 -08:00
Jiri Olsa
c975e02612 libbpf: Add support for uprobe multi session attach
Adding support to attach program in uprobe session mode
with bpf_program__attach_uprobe_multi function.

Adding session bool to bpf_uprobe_multi_opts struct that allows
to load and attach the bpf program via uprobe session.
the attachment to create uprobe multi session.

Also adding new program loader section that allows:
  SEC("uprobe.session/bpf_fentry_test*")

and loads/attaches uprobe program as uprobe session.

Adding sleepable hook (uprobe.session.s) as well.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-6-jolsa@kernel.org
2024-11-13 19:25:37 -08:00
Jiri Olsa
183d84803a bpf: Add support for uprobe multi session attach
Adding support to attach BPF program for entry and return probe
of the same function. This is common use case which at the moment
requires to create two uprobe multi links.

Adding new BPF_TRACE_UPROBE_SESSION attach type that instructs
kernel to attach single link program to both entry and exit probe.

It's possible to control execution of the BPF program on return
probe simply by returning zero or non zero from the entry BPF
program execution to execute or not the BPF program on return
probe respectively.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-4-jolsa@kernel.org
2024-11-13 19:25:37 -08:00
Andrii Nakryiko
6210515c78 libbpf: start v1.6 development cycle
With libbpf v1.5.0 release out, start v1.6 dev cycle.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241029184045.581537-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-11-13 19:25:37 -08:00
Ihor Solodrai
94610d4c27 ci: remove CI jobs for 4.9.0 and 5.5.0 kernels
Signed-off-by: Ihor Solodrai <isolodrai@meta.com>
2024-11-13 19:22:53 -08:00
Andrii Nakryiko
09b9e83102 ci: bump uraimo/run-on-arch-action version
Bump to latest uraimo/run-on-arch-action@v2.8.1 version, hoping that
fixes the CI.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-10-24 14:34:52 -07:00
Andrii Nakryiko
891438c086 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   989a29cfed9b5092c3e18be14e9032c51bb1c9f6
Checkpoint bpf-next commit: c6fb8030b4baa01c850f99fc6da051b1017edc46
Baseline bpf commit:        b836cbdf3b81a4a22b3452186efa2e5105a77e10
Checkpoint bpf commit:      d5fb316e2af1d947f0f6c3666e373a54d9f27c6f

Andrii Nakryiko (1):
  libbpf: move global data mmap()'ing into bpf_object__load()

Eder Zulian (1):
  libbpf: Prevent compiler warnings/errors

Hou Tao (1):
  bpf: Add the missing BPF_LINK_TYPE invocation for sockmap

Kui-Feng Lee (1):
  libbpf: define __uptr.

 include/uapi/linux/bpf.h |  3 ++
 src/bpf_helpers.h        |  1 +
 src/btf_dump.c           |  4 +-
 src/libbpf.c             | 83 +++++++++++++++++++---------------------
 4 files changed, 46 insertions(+), 45 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-10-24 14:34:52 -07:00
Hou Tao
2d7a79a984 bpf: Add the missing BPF_LINK_TYPE invocation for sockmap
There is an out-of-bounds read in bpf_link_show_fdinfo() for the sockmap
link fd. Fix it by adding the missing BPF_LINK_TYPE invocation for
sockmap link

Also add comments for bpf_link_type to prevent missing updates in the
future.

Fixes: 699c23f02c65 ("bpf: Add bpf_link support for sk_msg and sk_skb progs")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241024013558.1135167-2-houtao@huaweicloud.com
2024-10-24 14:34:52 -07:00
Kui-Feng Lee
ee92f521ab libbpf: define __uptr.
Make __uptr available to BPF programs to enable them to define uptrs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-8-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-24 14:34:52 -07:00
Andrii Nakryiko
2dea4b86ee libbpf: move global data mmap()'ing into bpf_object__load()
Since BPF skeleton inception libbpf has been doing mmap()'ing of global
data ARRAY maps in bpf_object__load_skeleton() API, which is used by
code generated .skel.h files (i.e., by BPF skeletons only).

This is wrong because if BPF object is loaded through generic
bpf_object__load() API, global data maps won't be re-mmap()'ed after
load step, and memory pointers returned from bpf_map__initial_value()
would be wrong and won't reflect the actual memory shared between BPF
program and user space.

bpf_map__initial_value() return result is rarely used after load, so
this went unnoticed for a really long time, until bpftrace project
attempted to load BPF object through generic bpf_object__load() API and
then used BPF subskeleton instantiated from such bpf_object. It turned
out that .data/.rodata/.bss data updates through such subskeleton was
"blackholed", all because libbpf wouldn't re-mmap() those maps during
bpf_object__load() phase.

Long story short, this step should be done by libbpf regardless of BPF
skeleton usage, right after BPF map is created in the kernel. This patch
moves this functionality into bpf_object__populate_internal_map() to
achieve this. And bpf_object__load_skeleton() is now simple and almost
trivial, only propagating these mmap()'ed pointers into user-supplied
skeleton structs.

We also do trivial adjustments to error reporting inside
bpf_object__populate_internal_map() for consistency with the rest of
libbpf's map-handling code.

Reported-by: Alastair Robertson <ajor@meta.com>
Reported-by: Jonathan Wiepert <jwiepert@meta.com>
Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241023043908.3834423-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-24 14:34:52 -07:00
Eder Zulian
fdbdbb6b8a libbpf: Prevent compiler warnings/errors
Initialize 'new_off' and 'pad_bits' to 0 and 'pad_type' to  NULL in
btf_dump_emit_bit_padding to prevent compiler warnings/errors which are
observed when compiling with 'EXTRA_CFLAGS=-g -Og' options, but do not
happen when compiling with current default options.

For example, when compiling libbpf with

  $ make "EXTRA_CFLAGS=-g -Og" -C tools/lib/bpf/ clean all

Clang version 17.0.6 and GCC 13.3.1 fail to compile btf_dump.c due to
following errors:

  btf_dump.c: In function ‘btf_dump_emit_bit_padding’:
  btf_dump.c:903:42: error: ‘new_off’ may be used uninitialized [-Werror=maybe-uninitialized]
    903 |         if (new_off > cur_off && new_off <= next_off) {
        |                                  ~~~~~~~~^~~~~~~~~~~
  btf_dump.c:870:13: note: ‘new_off’ was declared here
    870 |         int new_off, pad_bits, bits, i;
        |             ^~~~~~~
  btf_dump.c:917:25: error: ‘pad_type’ may be used uninitialized [-Werror=maybe-uninitialized]
    917 |                         btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type,
        |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    918 |                                         in_bitfield ? new_off - cur_off : 0);
        |                                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  btf_dump.c:871:21: note: ‘pad_type’ was declared here
    871 |         const char *pad_type;
        |                     ^~~~~~~~
  btf_dump.c:930:20: error: ‘pad_bits’ may be used uninitialized [-Werror=maybe-uninitialized]
    930 |                 if (bits == pad_bits) {
        |                    ^
  btf_dump.c:870:22: note: ‘pad_bits’ was declared here
    870 |         int new_off, pad_bits, bits, i;
        |                      ^~~~~~~~
  cc1: all warnings being treated as errors

Signed-off-by: Eder Zulian <ezulian@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20241022172329.3871958-3-ezulian@redhat.com
2024-10-24 14:34:52 -07:00
Andrii Nakryiko
fc064eb41e sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   b24d7f0da6ef5a23456a301eaf51b170f961d4ae
Checkpoint bpf-next commit: 989a29cfed9b5092c3e18be14e9032c51bb1c9f6
Baseline bpf commit:        b24d7f0da6ef5a23456a301eaf51b170f961d4ae
Checkpoint bpf commit:      b836cbdf3b81a4a22b3452186efa2e5105a77e10

Andrii Nakryiko (2):
  libbpf: fix sym_is_subprog() logic for weak global subprogs
  libbpf: never interpret subprogs in .text as entry programs

Chen Ni (1):
  libbpf: Remove unneeded semicolon

Eduard Zingerman (1):
  bpf: __bpf_fastcall for bpf_get_smp_processor_id in uapi

Eric Long (1):
  libbpf: Do not resolve size on duplicate FUNCs

Ihor Solodrai (1):
  libbpf: Change log level of BTF loading error message

Martin Kelly (1):
  bpf: Update bpf_override_return() comment

Matteo Croce (1):
  bpf: fix argument type in bpf_loop documentation

Namhyung Kim (1):
  libbpf: Fix possible compiler warnings in hashmap

Tao Chen (1):
  libbpf: Fix expected_attach_type set handling in program load callback

Tony Ambardar (7):
  libbpf: Improve log message formatting
  libbpf: Fix header comment typos for BTF.ext
  libbpf: Fix output .symtab byte-order during linking
  libbpf: Support BTF.ext loading and output in either endianness
  libbpf: Support opening bpf objects of either endianness
  libbpf: Support linking bpf objects of either endianness
  libbpf: Support creating light skeleton of either endianness

 include/uapi/linux/bpf.h |   8 +-
 src/bpf_gen_internal.h   |   1 +
 src/btf.c                | 280 ++++++++++++++++++++++++++++++---------
 src/btf.h                |   3 +
 src/btf_dump.c           |   2 +-
 src/btf_relocate.c       |   2 +-
 src/gen_loader.c         | 187 ++++++++++++++++++--------
 src/hashmap.h            |  20 +--
 src/libbpf.c             |  81 ++++++++---
 src/libbpf.map           |   2 +
 src/libbpf_internal.h    |  43 +++++-
 src/linker.c             |  84 +++++++++---
 src/relo_core.c          |   2 +-
 src/skel_internal.h      |   3 +-
 src/zip.c                |   2 +-
 15 files changed, 550 insertions(+), 170 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-10-11 14:12:43 -07:00
Andrii Nakryiko
db8a210964 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-10-11 14:12:43 -07:00
Namhyung Kim
f69995d909 libbpf: Fix possible compiler warnings in hashmap
The hashmap__for_each_entry[_safe] is accessing 'map' as a pointer.
But it does without parentheses so passing a static hash map with an
ampersand (like '&slab_hash') will cause compiler warnings due
to unmatched types as '->' operator has a higher precedence.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241011170021.1490836-1-namhyung@kernel.org
2024-10-11 14:12:43 -07:00
Andrii Nakryiko
ac9ced9eb3 libbpf: never interpret subprogs in .text as entry programs
Libbpf pre-1.0 had a legacy logic of allowing singular non-annotated
(i.e., not having explicit SEC() annotation) function to be treated as
sole entry BPF program (unless there were other explicit entry
programs).

This behavior was dropped during libbpf 1.0 transition period (unless
LIBBPF_STRICT_SEC_NAME flag was unset in libbpf_mode). When 1.0 was
released and all the legacy behavior was removed, the bug slipped
through leaving this legacy behavior around.

Fix this for good, as it actually causes very confusing behavior if BPF
object file only has subprograms, but no entry programs.

Fixes: bd054102a8c7 ("libbpf: enforce strict libbpf 1.0 behaviors")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241010211731.4121837-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Martin Kelly
ba8bd24bbb bpf: Update bpf_override_return() comment
The documentation says CONFIG_FUNCTION_ERROR_INJECTION is supported only
on x86. This was presumably true at the time of writing, but it's now
supported on many other architectures too. Drop this statement, since
it's not correct anymore and it fits better in other documentation
anyway.

Signed-off-by: Martin Kelly <martin.kelly@crowdstrike.com>
Link: https://lore.kernel.org/r/20241010193301.995909-1-martin.kelly@crowdstrike.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Matteo Croce
8ea6e12372 bpf: fix argument type in bpf_loop documentation
The `index` argument to bpf_loop() is threaded as an u64.
This lead in a subtle verifier denial where clang cloned the argument
in another register[1].

[1] https://github.com/systemd/systemd/pull/34650#issuecomment-2401092895

Signed-off-by: Matteo Croce <teknoraver@meta.com>
Link: https://lore.kernel.org/r/20241010035652.17830-1-technoboy85@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Andrii Nakryiko
0e3971339f libbpf: fix sym_is_subprog() logic for weak global subprogs
sym_is_subprog() is incorrectly rejecting relocations against *weak*
global subprogs. Fix that by realizing that STB_WEAK is also a global
function.

While it seems like verifier doesn't support taking an address of
non-static subprog right now, it's still best to fix support for it on
libbpf side, otherwise users will get a very confusing error during BPF
skeleton generation or static linking due to misinterpreted relocation:

  libbpf: prog 'handle_tp': bad map relo against 'foo' in section '.text'
  Error: failed to open BPF object file: Relocation failed

It's clearly not a map relocation, but is treated and reported as such
without this fix.

Fixes: 53eddb5e04ac ("libbpf: Support subprog address relocation")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241009011554.880168-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Eric Long
ecf998ed8f libbpf: Do not resolve size on duplicate FUNCs
FUNCs do not have sizes, thus currently btf__resolve_size will fail
with -EINVAL. Add conditions so that we only update size when the BTF
object is not function or function prototype.

Signed-off-by: Eric Long <i@hack3r.moe>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241002-libbpf-dup-extern-funcs-v4-1-560eb460ff90@hack3r.moe
2024-10-11 14:12:43 -07:00
Eduard Zingerman
89df6536bf bpf: __bpf_fastcall for bpf_get_smp_processor_id in uapi
Since [1] kernel supports __bpf_fastcall attribute for helper function
bpf_get_smp_processor_id(). Update uapi definition for this helper in
order to have this attribute in the generated bpf_helper_defs.h

[1] commit 91b7fbf3936f ("bpf, x86, riscv, arm: no_caller_saved_registers for bpf_get_smp_processor_id()")

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240916091712.2929279-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
8244006267 libbpf: Support creating light skeleton of either endianness
Track target endianness in 'struct bpf_gen' and process in-memory data in
native byte-order, but on finalization convert the embedded loader BPF
insns to target endianness.

The light skeleton also includes a target-accessed data blob which is
heterogeneous and thus difficult to convert to target byte-order on
finalization. Add support functions to convert data to target endianness
as it is added to the blob.

Also add additional debug logging for data blob structure details and
skeleton loading.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/569562e1d5bf1cce80a1f1a3882461ee2da1ffd5.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
6ac8762ecd libbpf: Support linking bpf objects of either endianness
Allow static linking object files of either endianness, checking that input
files have consistent byte-order, and setting output endianness from input.

Linking requires in-memory processing of programs, relocations, sections,
etc. in native endianness, and output conversion to target byte-order. This
is enabled by built-in ELF translation and recent BTF/BTF.ext endianness
functions. Further add local functions for swapping byte-order of sections
containing BPF insns.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/b47ca686d02664843fc99b96262fe3259650bc43.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
628b21dbcd libbpf: Support opening bpf objects of either endianness
Allow bpf_object__open() to access files of either endianness, and convert
included BPF programs to native byte-order in-memory for introspection.
Loading BPF objects of non-native byte-order is still disallowed however.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/26353c1a1887a54400e1acd6c138fa90c99cdd40.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
5ae8432d15 libbpf: Support BTF.ext loading and output in either endianness
Support for handling BTF data of either endianness was added in [1], but
did not include BTF.ext data for lack of use cases. Later, support for
static linking [2] provided a use case, but this feature and later ones
were restricted to native-endian usage.

Add support for BTF.ext handling in either endianness. Convert BTF.ext data
to native endianness when read into memory for further processing, and
support raw data access that restores the original byte-order for output.
Add internal header functions for byte-swapping func, line, and core info
records.

Add new API functions btf_ext__endianness() and btf_ext__set_endianness()
for query and setting byte-order, as already exist for BTF data.

[1] 3289959b97ca ("libbpf: Support BTF loading and raw data output in both endianness")
[2] 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support")

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/133407ab20e0dd5c07cab2a6fa7879dee1ffa4bc.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
f2668a0a71 libbpf: Fix output .symtab byte-order during linking
Object linking output data uses the default ELF_T_BYTE type for '.symtab'
section data, which disables any libelf-based translation. Explicitly set
the ELF_T_SYM type for output to restore libelf's byte-order conversion,
noting that input '.symtab' data is already correctly translated.

Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs")
Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/87868bfeccf3f51aec61260073f8778e9077050a.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
5060f172cc libbpf: Fix header comment typos for BTF.ext
Mention struct btf_ext_info_sec rather than non-existent btf_sec_func_info
in BTF.ext struct documentation.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/cde65e01a5f2945c578485fab265ef711e2daeb6.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tony Ambardar
ceeb7211c9 libbpf: Improve log message formatting
Fix missing newlines and extraneous terminal spaces in messages.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/086884b7cbf87e524d584f9bf87f7a580e378b2b.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Chen Ni
3fb92e63e0 libbpf: Remove unneeded semicolon
Remove unneeded semicolon in zip_archive_open().

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240926023823.3632993-1-nichen@iscas.ac.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Tao Chen
ad633fb142 libbpf: Fix expected_attach_type set handling in program load callback
Referenced commit broke the logic of resetting expected_attach_type to
zero for allowed program types if kernel doesn't yet support such field.
We do need to overwrite and preserve expected_attach_type for
multi-uprobe though, but that can be done explicitly in
libbpf_prepare_prog_load().

Fixes: 5902da6d8a52 ("libbpf: Add uprobe multi link support to bpf_program__attach_usdt")
Suggested-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240925153012.212866-1-chen.dylane@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Ihor Solodrai
3ea36843b3 libbpf: Change log level of BTF loading error message
Reduce log level of BTF loading error to INFO if BTF is not required.

Andrii says:

  Nowadays the expectation is that the BPF program will have a valid
  .BTF section, so even though .BTF is "optional", I think it's fine
  to emit a warning for that case (any reasonably recent Clang will
  produce valid BTF).

  Ihor's patch is fixing the situation with an outdated host kernel
  that doesn't understand BTF. libbpf will try to "upload" the
  program's BTF, but if that fails and the BPF object doesn't use
  any features that require having BTF uploaded, then it's just an
  information message to the user, but otherwise can be ignored.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-11 14:12:43 -07:00
Jordan Rome
80b16457cb ci: add temporary patch for failing upstream BPF uprobe selftest
Signed-off-by: Jordan Rome <linux@jordanrome.com>
2024-10-09 14:13:35 -07:00
Jordan Rome
7827ca87d1 ci: regenerate vmlinux.h
Regenerate latest vmlinux.h for old kernel CI tests.

Signed-off-by: Jordan Rome <linux@jordanrome.com>
2024-10-09 14:13:35 -07:00
Jordan Rome
91ccd57ca9 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   2ad6d23f465a4f851e3bcf6d74c315ce7b2c205b
Checkpoint bpf-next commit: b24d7f0da6ef5a23456a301eaf51b170f961d4ae
Baseline bpf commit:        b408473ea01b2e499d23503e2bf898416da9d7ac
Checkpoint bpf commit:      b24d7f0da6ef5a23456a301eaf51b170f961d4ae

Alan Maguire (1):
  bpf/bpf_get,set_sockopt: add option to set TCP-BPF sock ops flags

Daniel Borkmann (1):
  bpf: Sync uapi bpf.h header to tools directory

Donald Hunter (1):
  docs/bpf: Add missing BPF program types to docs

Ihor Solodrai (1):
  libbpf: Add bpf_object__token_fd accessor

Jiri Olsa (1):
  libbpf: Fix uretprobe.multi.s programs auto attachment

Lin Yikai (1):
  libbpf: fix some typos in libbpf

Mina Almasry (2):
  net: netdev netlink api to bind dma-buf to a net device
  netdev: add dmabuf introspection

Pu Lehui (3):
  libbpf: Access first syscall argument with CO-RE direct read on s390
  libbpf: Access first syscall argument with CO-RE direct read on arm64
  libbpf: Fix accessing first syscall argument on RV64

Sam James (1):
  libbpf: Workaround (another) -Wmaybe-uninitialized false positive

Shuyi Cheng (1):
  libbpf: Fixed getting wrong return address on arm64 architecture

Yusheng Zheng (1):
  libbpf: Fix some typos in comments

 docs/program_types.rst      | 30 ++++++++++++++++++++++++++----
 include/uapi/linux/bpf.h    | 25 ++++++++++++-------------
 include/uapi/linux/netdev.h | 13 +++++++++++++
 src/bpf.h                   |  4 ++--
 src/bpf_helpers.h           |  2 +-
 src/bpf_tracing.h           | 25 ++++++++++++++++---------
 src/btf.c                   |  4 ++--
 src/btf.h                   |  2 +-
 src/btf_dump.c              |  2 +-
 src/libbpf.c                | 13 +++++++++----
 src/libbpf.h                | 18 +++++++++++++-----
 src/libbpf.map              |  1 +
 src/libbpf_legacy.h         |  4 ++--
 src/linker.c                |  4 ++--
 src/skel_internal.h         |  2 +-
 src/usdt.bpf.h              |  2 +-
 16 files changed, 103 insertions(+), 48 deletions(-)

Signed-off-by: Jordan Rome <linux@jordanrome.com>
2024-10-09 14:13:35 -07:00
Jordan Rome
f0a307f61c sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Jordan Rome <linux@jordanrome.com>
2024-10-09 14:13:35 -07:00
Daniel Borkmann
80b97bd0b8 bpf: Sync uapi bpf.h header to tools directory
There is a delta between kernel UAPI bpf.h and tools UAPI bpf.h,
thus sync them again.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2024-10-09 14:13:35 -07:00
Ihor Solodrai
7c2f492a88 libbpf: Add bpf_object__token_fd accessor
Add a LIBBPF_API function to retrieve the token_fd from a bpf_object.

Without this accessor, if user needs a token FD they have to get it
manually via bpf_token_create, even though a token might have been
already created by bpf_object__load.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240913001858.3345583-1-ihor.solodrai@pm.me
2024-10-09 14:13:35 -07:00
Donald Hunter
114f6ce2fd docs/bpf: Add missing BPF program types to docs
Update the table of program types in the libbpf documentation with the
recently added program types.

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240912095944.6386-1-donald.hunter@gmail.com
2024-10-09 14:13:35 -07:00
Jiri Olsa
69671302df libbpf: Fix uretprobe.multi.s programs auto attachment
As reported by Andrii we don't currently recognize uretprobe.multi.s
programs as return probes due to using (wrong) strcmp function.

Using str_has_pfx() instead to match uretprobe.multi prefix.

Tests are passing, because the return program was executed
as entry program and all counts were incremented properly.

Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240910125336.3056271-1-jolsa@kernel.org
2024-10-09 14:13:35 -07:00
Yusheng Zheng
e1833cff9c libbpf: Fix some typos in comments
Fix some spelling errors in the code comments of libbpf:

betwen -> between
paremeters -> parameters
knowning -> knowing
definiton -> definition
compatiblity -> compatibility
overriden -> overridden
occured -> occurred
proccess -> process
managment -> management
nessary -> necessary

Signed-off-by: Yusheng Zheng <yunwei356@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240909225952.30324-1-yunwei356@gmail.com
2024-10-09 14:13:35 -07:00
Shuyi Cheng
81ac790dc8 libbpf: Fixed getting wrong return address on arm64 architecture
ARM64 has a separate lr register to store the return address, so here
you only need to read the lr register to get the return address, no need
to dereference it again.

Signed-off-by: Shuyi Cheng <chengshuyi@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1725787433-77262-1-git-send-email-chengshuyi@linux.alibaba.com
2024-10-09 14:13:35 -07:00
Sam James
3b301cf75d libbpf: Workaround (another) -Wmaybe-uninitialized false positive
We get this with GCC 15 -O3 (at least):
```
libbpf.c: In function ‘bpf_map__init_kern_struct_ops’:
libbpf.c:1109:18: error: ‘mod_btf’ may be used uninitialized [-Werror=maybe-uninitialized]
 1109 |         kern_btf = mod_btf ? mod_btf->btf : obj->btf_vmlinux;
      |         ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
libbpf.c:1094:28: note: ‘mod_btf’ was declared here
 1094 |         struct module_btf *mod_btf;
      |                            ^~~~~~~
In function ‘find_struct_ops_kern_types’,
    inlined from ‘bpf_map__init_kern_struct_ops’ at libbpf.c:1102:8:
libbpf.c:982:21: error: ‘btf’ may be used uninitialized [-Werror=maybe-uninitialized]
  982 |         kern_type = btf__type_by_id(btf, kern_type_id);
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
libbpf.c: In function ‘bpf_map__init_kern_struct_ops’:
libbpf.c:967:21: note: ‘btf’ was declared here
  967 |         struct btf *btf;
      |                     ^~~
```

This is similar to the other libbpf fix from a few weeks ago for
the same modelling-errno issue (fab45b962749184e1a1a57c7c583782b78fad539).

Signed-off-by: Sam James <sam@gentoo.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://bugs.gentoo.org/939106
Link: https://lore.kernel.org/bpf/f6962729197ae7cdf4f6d1512625bd92f2322d31.1725630494.git.sam@gentoo.org
2024-10-09 14:13:35 -07:00
Lin Yikai
6c8dde3554 libbpf: fix some typos in libbpf
Hi, fix some spelling errors in libbpf, the details are as follows:

-in the code comments:
	termintaing->terminating
	architecutre->architecture
	requring->requiring
	recored->recoded
	sanitise->sanities
	allowd->allowed
	abover->above
	see bpf_udst_arg()->see bpf_usdt_arg()

Signed-off-by: Lin Yikai <yikai.lin@vivo.com>
Link: https://lore.kernel.org/r/20240905110354.3274546-3-yikai.lin@vivo.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-10-09 14:13:35 -07:00
Pu Lehui
9045c3ab53 libbpf: Fix accessing first syscall argument on RV64
On RV64, as Ilya mentioned before [0], the first syscall parameter should be
accessed through orig_a0 (see arch/riscv64/include/asm/syscall.h),
otherwise it will cause selftests like bpf_syscall_macro, vmlinux,
test_lsm, etc. to fail on RV64. Let's fix it by using the struct pt_regs
style CO-RE direct access.

Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220209021745.2215452-1-iii@linux.ibm.com [0]
Link: https://lore.kernel.org/bpf/20240831041934.1629216-5-pulehui@huaweicloud.com
2024-10-09 14:13:35 -07:00
Pu Lehui
53a645402f libbpf: Access first syscall argument with CO-RE direct read on arm64
Currently PT_REGS_PARM1 SYSCALL(x) is consistent with PT_REGS_PARM1_CORE
SYSCALL(x), which will introduce the overhead of BPF_CORE_READ(), taking
into account the read pt_regs comes directly from the context, let's use
CO-RE direct read to access the first system call argument.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Xu Kuohai <xukuohai@huawei.com>
Link: https://lore.kernel.org/bpf/20240831041934.1629216-3-pulehui@huaweicloud.com
2024-10-09 14:13:35 -07:00
Pu Lehui
6d01681b02 libbpf: Access first syscall argument with CO-RE direct read on s390
Currently PT_REGS_PARM1 SYSCALL(x) is consistent with PT_REGS_PARM1_CORE
SYSCALL(x), which will introduce the overhead of BPF_CORE_READ(), taking
into account the read pt_regs comes directly from the context, let's use
CO-RE direct read to access the first system call argument.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240831041934.1629216-2-pulehui@huaweicloud.com
2024-10-09 14:13:35 -07:00
Mina Almasry
9a37057800 netdev: add dmabuf introspection
Add dmabuf information to page_pool stats:

$ ./cli.py --spec ../netlink/specs/netdev.yaml --dump page-pool-get
...
 {'dmabuf': 10,
  'id': 456,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 455,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 454,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 453,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 452,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 451,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 450,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},
 {'dmabuf': 10,
  'id': 449,
  'ifindex': 3,
  'inflight': 1023,
  'inflight-mem': 4190208},

And queue stats:

$ ./cli.py --spec ../netlink/specs/netdev.yaml --dump queue-get
...
{'dmabuf': 10, 'id': 8, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 9, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 10, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 11, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 12, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 13, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 14, 'ifindex': 3, 'type': 'rx'},
{'dmabuf': 10, 'id': 15, 'ifindex': 3, 'type': 'rx'},

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-14-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-10-09 14:13:35 -07:00
Mina Almasry
3578ab89fb net: netdev netlink api to bind dma-buf to a net device
API takes the dma-buf fd as input, and binds it to the netdevice. The
user can specify the rx queues to bind the dma-buf to.

Suggested-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-3-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-10-09 14:13:35 -07:00
Alan Maguire
178df3d885 bpf/bpf_get,set_sockopt: add option to set TCP-BPF sock ops flags
Currently the only opportunity to set sock ops flags dictating
which callbacks fire for a socket is from within a TCP-BPF sockops
program.  This is problematic if the connection is already set up
as there is no further chance to specify callbacks for that socket.
Add TCP_BPF_SOCK_OPS_CB_FLAGS to bpf_setsockopt() and bpf_getsockopt()
to allow users to specify callbacks later, either via an iterator
over sockets or via a socket-specific program triggered by a
setsockopt() on the socket.

Previous discussion on this here [1].

[1] https://lore.kernel.org/bpf/f42f157b-6e52-dd4d-3d97-9b86c84c0b00@oracle.com/

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/r/20240808150558.1035626-2-alan.maguire@oracle.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-10-09 14:13:35 -07:00
Ihor Solodrai
1f98105e54 ci: bump actions/upload-artifact to v4
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
2024-10-07 15:38:01 -07:00
Andrii Nakryiko
a4161e00f9 ci: get rid of s390x kernel tests
Kernel/libbpf code is very well tested on s390x in BPF CI, so get rid of
it here as it often is a source of trouble and noise, without really
benefiting us much.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-10-07 15:38:01 -07:00
Andrii Nakryiko
caa17bdcbf ci: regenerate vmlinux.h
Regenerated latest vmlinux.h for old kernels.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-08-30 16:29:01 -07:00
Andrii Nakryiko
76c9f50f3e sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   ec5b8c76ab1c6d163762d60cfbedcd27e7527144
Checkpoint bpf-next commit: 2ad6d23f465a4f851e3bcf6d74c315ce7b2c205b
Baseline bpf commit:        e1533b6319ab9c3a97dad314dd88b3783bc41b69
Checkpoint bpf commit:      b408473ea01b2e499d23503e2bf898416da9d7ac

Alan Maguire (1):
  libbpf: Fix license for btf_relocate.c

Andrii Nakryiko (2):
  libbpf: Fix no-args func prototype BTF dumping syntax
  libbpf: Fix bpf_object__open_skeleton()'s mishandling of options

David Vernet (1):
  libbpf: Don't take direct pointers into BTF data from st_ops

Jordan Rome (1):
  bpf: Add bpf_copy_from_user_str kfunc

Kan Liang (1):
  perf/x86/intel: Support new data source for Lunar Lake

Sam James (1):
  libbpf: Workaround -Wmaybe-uninitialized false positive

Stanislav Fomichev (1):
  selftests/bpf: Add XDP_UMEM_TX_METADATA_LEN to XSK TX metadata test

Tony Ambardar (1):
  libbpf: Ensure new BTF objects inherit input endianness

 include/uapi/linux/bpf.h        |  9 ++++
 include/uapi/linux/if_xdp.h     |  4 ++
 include/uapi/linux/perf_event.h |  6 ++-
 src/btf.c                       |  4 ++
 src/btf_dump.c                  |  8 ++--
 src/btf_relocate.c              |  2 +-
 src/elf.c                       |  3 ++
 src/libbpf.c                    | 75 ++++++++++++++-------------------
 8 files changed, 62 insertions(+), 49 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-08-30 16:29:01 -07:00
Tony Ambardar
fe28fae57a libbpf: Ensure new BTF objects inherit input endianness
New split BTF needs to preserve base's endianness. Similarly, when
creating a distilled BTF, we need to preserve original endianness.

Fix by updating libbpf's btf__distill_base() and btf_new_empty() to retain
the byte order of any source BTF objects when creating new ones.

Fixes: ba451366bf44 ("libbpf: Implement basic split BTF support")
Fixes: 58e185a0dc35 ("libbpf: Add btf__distill_base() creating split BTF with distilled base BTF")
Reported-by: Song Liu <song@kernel.org>
Reported-by: Eduard Zingerman <eddyz87@gmail.com>
Suggested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/6358db36c5f68b07873a0a5be2d062b1af5ea5f8.camel@gmail.com/
Link: https://lore.kernel.org/bpf/20240830095150.278881-1-tony.ambardar@gmail.com
2024-08-30 16:29:01 -07:00
Andrii Nakryiko
f6f24022d3 libbpf: Fix bpf_object__open_skeleton()'s mishandling of options
We do an ugly copying of options in bpf_object__open_skeleton() just to
be able to set object name from skeleton's recorded name (while still
allowing user to override it through opts->object_name).

This is not just ugly, but it also is broken due to memcpy() that
doesn't take into account potential skel_opts' and user-provided opts'
sizes differences due to backward and forward compatibility. This leads
to copying over extra bytes and then failing to validate options
properly. It could, technically, lead also to SIGSEGV, if we are unlucky.

So just get rid of that memory copy completely and instead pass
default object name into bpf_object_open() directly, simplifying all
this significantly. The rule now is that obj_name should be non-NULL for
bpf_object_open() when called with in-memory buffer, so validate that
explicitly as well.

We adopt bpf_object__open_mem() to this as well and generate default
name (based on buffer memory address and size) outside of bpf_object_open().

Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support")
Reported-by: Daniel Müller <deso@posteo.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Daniel Müller <deso@posteo.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240827203721.1145494-1-andrii@kernel.org
2024-08-30 16:29:01 -07:00
Jordan Rome
4bd31a1044 bpf: Add bpf_copy_from_user_str kfunc
This adds a kfunc wrapper around strncpy_from_user,
which can be called from sleepable BPF programs.

This matches the non-sleepable 'bpf_probe_read_user_str'
helper except it includes an additional 'flags'
param, which allows consumers to clear the entire
destination buffer on success or failure.

Signed-off-by: Jordan Rome <linux@jordanrome.com>
Link: https://lore.kernel.org/r/20240823195101.3621028-1-linux@jordanrome.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-08-30 16:29:01 -07:00
Sam James
33b22671c2 libbpf: Workaround -Wmaybe-uninitialized false positive
In `elf_close`, we get this with GCC 15 -O3 (at least):
```
In function ‘elf_close’,
    inlined from ‘elf_close’ at elf.c:53:6,
    inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2:
elf.c:57:9: warning: ‘elf_fd.elf’ may be used uninitialized [-Wmaybe-uninitialized]
   57 |         elf_end(elf_fd->elf);
      |         ^~~~~~~~~~~~~~~~~~~~
elf.c: In function ‘elf_find_func_offset_from_file’:
elf.c:377:23: note: ‘elf_fd.elf’ was declared here
  377 |         struct elf_fd elf_fd;
      |                       ^~~~~~
In function ‘elf_close’,
    inlined from ‘elf_close’ at elf.c:53:6,
    inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2:
elf.c:58:9: warning: ‘elf_fd.fd’ may be used uninitialized [-Wmaybe-uninitialized]
   58 |         close(elf_fd->fd);
      |         ^~~~~~~~~~~~~~~~~
elf.c: In function ‘elf_find_func_offset_from_file’:
elf.c:377:23: note: ‘elf_fd.fd’ was declared here
  377 |         struct elf_fd elf_fd;
      |                       ^~~~~~
```

In reality, our use is fine, it's just that GCC doesn't model errno
here (see linked GCC bug). Suppress -Wmaybe-uninitialized accordingly
by initializing elf_fd.fd to -1 and elf_fd.elf to NULL.

I've done this in two other functions as well given it could easily
occur there too (same access/use pattern).

Signed-off-by: Sam James <sam@gentoo.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://gcc.gnu.org/PR114952
Link: https://lore.kernel.org/bpf/14ec488a1cac02794c2fa2b83ae0cef1bce2cb36.1723578546.git.sam@gentoo.org
2024-08-30 16:29:01 -07:00
Alan Maguire
8b29484790 libbpf: Fix license for btf_relocate.c
License should be

// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

...as with other libbpf files.

Fixes: 19e00c897d50 ("libbpf: Split BTF relocation")
Reported-by: Neill Kapron <nkapron@google.com>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20240810093504.2111134-1-alan.maguire@oracle.com
2024-08-30 16:29:01 -07:00
David Vernet
7b5237996a libbpf: Don't take direct pointers into BTF data from st_ops
In struct bpf_struct_ops, we have take a pointer to a BTF type name, and
a struct btf_type. This was presumably done for convenience, but can
actually result in subtle and confusing bugs given that BTF data can be
invalidated before a program is loaded. For example, in sched_ext, we
may sometimes resize a data section after a skeleton has been opened,
but before the struct_ops scheduler map has been loaded. This may cause
the BTF data to be realloc'd, which can then cause a UAF when loading
the program because the struct_ops map has pointers directly into the
BTF data.

We're already storing the BTF type_id in struct bpf_struct_ops. Because
type_id is stable, we can therefore just update the places where we were
looking at those pointers to instead do the lookups we need from the
type_id.

Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support")
Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240724171459.281234-1-void@manifault.com
2024-08-30 16:29:01 -07:00
Stanislav Fomichev
a89e519b40 selftests/bpf: Add XDP_UMEM_TX_METADATA_LEN to XSK TX metadata test
This flag is now required to use tx_metadata_len.

Fixes: 40808a237d9c ("selftests/bpf: Add TX side to xdp_metadata")
Reported-by: Julian Schindel <mail@arctic-alpaca.de>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20240713015253.121248-3-sdf@fomichev.me
2024-08-30 16:29:01 -07:00
Andrii Nakryiko
205e86de8b libbpf: Fix no-args func prototype BTF dumping syntax
For all these years libbpf's BTF dumper has been emitting not strictly
valid syntax for function prototypes that have no input arguments.

Instead of `int (*blah)()` we should emit `int (*blah)(void)`.

This is not normally a problem, but it manifests when we get kfuncs in
vmlinux.h that have no input arguments. Due to compiler internal
specifics, we get no BTF information for such kfuncs, if they are not
declared with proper `(void)`.

The fix is trivial. We also need to adjust a few ancient tests that
happily assumed `()` is correct.

Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion")
Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://lore.kernel.org/bpf/20240712224442.282823-1-andrii@kernel.org
2024-08-30 16:29:01 -07:00
Kan Liang
86fc78bd2b perf/x86/intel: Support new data source for Lunar Lake
A new PEBS data source format is introduced for the p-core of Lunar
Lake. The data source field is extended to 8 bits with new encodings.

A new layout is introduced into the union intel_x86_pebs_dse.
Introduce the lnl_latency_data() to parse the new format.
Enlarge the pebs_data_source[] accordingly to include new encodings.

Only the mem load and the mem store events can generate the data source.
Introduce INTEL_HYBRID_LDLAT_CONSTRAINT and
INTEL_HYBRID_STLAT_CONSTRAINT to mark them.

Add two new bits for the new cache-related data src, L2_MHB and MSC.
The L2_MHB is short for L2 Miss Handling Buffer, which is similar to
LFB (Line Fill Buffer), but to track the L2 Cache misses.
The MSC stands for the memory-side cache.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Link: https://lkml.kernel.org/r/20240626143545.480761-6-kan.liang@linux.intel.com
2024-08-30 16:29:01 -07:00
Andrii Nakryiko
20ccbb303a ci: take into account common local DENYLIST/ALLOWLIST
Similar to naming convention in BPF selftests.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-08-30 09:14:00 -07:00
chantra
26443a6d43 ci: fix test job names
* use the architecture name in job name instead of `runs_on` labels

Signed-off-by: Manu Bretelle <chantr4@gmail.com>
2024-08-29 10:58:52 +01:00
Andrii Nakryiko
22ec3eb15d ci: deny verify_pkcs7_sig as it keeps failing
This has nothing to do with libbpf and is probably failing due to
environment setup.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-08-27 12:51:55 -07:00
Manu Bretelle
bc24cd126a ci: run test on Ubuntu 24.04
Signed-off-by: Manu Bretelle <chantr4@gmail.com>
2024-08-22 12:59:18 -07:00
Manu Bretelle
92316f5072 ci: Pass llvm-version as an input and enforce passing it to build-selftests action
Signed-off-by: Manu Bretelle <chantr4@gmail.com>
2024-08-21 16:04:36 -07:00
Manu Bretelle
a73c6f7f80 ci: Use llvm repositories matching the host we are running on
As this will change to a Ubuntu 24.04 runner, we want this to automatically detect
which ubuntu version it is running on.

Signed-off-by: Manu Bretelle <chantr4@gmail.com>
2024-08-21 16:04:36 -07:00
Manu Bretelle
8e47e755cd ci: bump default llvm version to 17
Ubuntu 24.04's minimum llvm version is 17. Bumping this now to limit changes later.

Signed-off-by: Manu Bretelle <chantr4@gmail.com>
2024-08-21 16:04:36 -07:00
Manu Bretelle
ec0d0fda8b ci: lock down s390x CI to Ubuntu 20.04 runners
I am working on upgrading to 24.04 runners. In order to make sure that current jobs are scheduled
on Ubuntu 20.04, we need to ask for runners with tag `docker-main`, which is currently
set by those old runners.
Later, we will be able to switch this tag to `docker-noble-main` which are Ubuntu 24.04 runners.

Signed-off-by: Manu Bretelle <chantr4@gmail.com>
2024-08-21 16:04:36 -07:00
Ivan Shapovalov
b07dfe3b2a Makefile: ensure $(OBJDIR) is created before writing to it
Signed-off-by: Ivan Shapovalov <intelfx@intelfx.name>
2024-07-29 14:05:05 -07:00
Andrii Nakryiko
686f600bca sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   a12978712d9001b060bcc10eaae42ad5102abe2b
Checkpoint bpf-next commit: ec5b8c76ab1c6d163762d60cfbedcd27e7527144
Baseline bpf commit:        b1c4b4d45263241ec6c2405a8df8265d4b58e707
Checkpoint bpf commit:      e1533b6319ab9c3a97dad314dd88b3783bc41b69

Alan Maguire (1):
  libbpf: Fix error handling in btf__distill_base()

Andreas Ziegler (1):
  libbpf: Add NULL checks to bpf_object__{prev_map,next_map}

Andrii Nakryiko (2):
  libbpf: fix BPF skeleton forward/backward compat handling
  libbpf: improve old BPF skeleton handling for map auto-attach

 src/btf.c    |  2 +-
 src/libbpf.c | 75 +++++++++++++++++++++++++++++-----------------------
 2 files changed, 43 insertions(+), 34 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-07-10 14:22:00 -07:00
Andrii Nakryiko
726d7f3722 sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-07-10 14:22:00 -07:00
Andrii Nakryiko
e6f1ae2557 libbpf: improve old BPF skeleton handling for map auto-attach
Improve how we handle old BPF skeletons when it comes to BPF map
auto-attachment. Emit one warn-level message per each struct_ops map
that could have been auto-attached, if user provided recent enough BPF
skeleton version. Don't spam log if there are no relevant struct_ops
maps, though.

This should help users realize that they probably need to regenerate BPF
skeleton header with more recent bpftool/libbpf-cargo (or whatever other
means of BPF skeleton generation).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240708204540.4188946-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-10 14:22:00 -07:00
Andrii Nakryiko
bf7ddbef99 libbpf: fix BPF skeleton forward/backward compat handling
BPF skeleton was designed from day one to be extensible. Generated BPF
skeleton code specifies actual sizes of map/prog/variable skeletons for
that reason and libbpf is supposed to work with newer/older versions
correctly.

Unfortunately, it was missed that we implicitly embed hard-coded most
up-to-date (according to libbpf's version of libbpf.h header used to
compile BPF skeleton header) sizes of those structs, which can differ
from the actual sizes at runtime when libbpf is used as a shared
library.

We have a few places were we just index array of maps/progs/vars, which
implicitly uses these potentially invalid sizes of structs.

This patch aims to fix this problem going forward. Once this lands,
we'll backport these changes in Github repo to create patched releases
for older libbpfs.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support")
Fixes: 430025e5dca5 ("libbpf: Add subskeleton scaffolding")
Fixes: 08ac454e258e ("libbpf: Auto-attach struct_ops BPF maps in BPF skeleton")
Co-developed-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240708204540.4188946-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-10 14:22:00 -07:00
Andreas Ziegler
1867490d8f libbpf: Add NULL checks to bpf_object__{prev_map,next_map}
In the current state, an erroneous call to
bpf_object__find_map_by_name(NULL, ...) leads to a segmentation
fault through the following call chain:

  bpf_object__find_map_by_name(obj = NULL, ...)
  -> bpf_object__for_each_map(pos, obj = NULL)
  -> bpf_object__next_map((obj = NULL), NULL)
  -> return (obj = NULL)->maps

While calling bpf_object__find_map_by_name with obj = NULL is
obviously incorrect, this should not lead to a segmentation
fault but rather be handled gracefully.

As __bpf_map__iter already handles this situation correctly, we
can delegate the check for the regular case there and only add
a check in case the prev or next parameter is NULL.

Signed-off-by: Andreas Ziegler <ziegler.andreas@siemens.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240703083436.505124-1-ziegler.andreas@siemens.com
2024-07-10 14:22:00 -07:00
Alan Maguire
24aca0740b libbpf: Fix error handling in btf__distill_base()
Coverity points out that after calling btf__new_empty_split() the wrong
value is checked for error.

Fixes: 58e185a0dc35 ("libbpf: Add btf__distill_base() creating split BTF with distilled base BTF")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240629100058.2866763-1-alan.maguire@oracle.com
2024-07-10 14:22:00 -07:00
Andrii Nakryiko
c1a6c770c4 libbpf: add btf_iter.o and btf_relocate.o to Makefile
Upstream libbpf got two new .c files, make sure they are built with
Github Makefile as well.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
223cd2273e sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   531876c80004ecff7bfdbd8ba6c6b48835ef5e22
Checkpoint bpf-next commit: a12978712d9001b060bcc10eaae42ad5102abe2b
Baseline bpf commit:        62da3acd28955e7299babebdfcb14243b789e773
Checkpoint bpf commit:      b1c4b4d45263241ec6c2405a8df8265d4b58e707

Alan Maguire (6):
  libbpf: Add btf__distill_base() creating split BTF with distilled base
    BTF
  libbpf: Split BTF relocation
  libbpf: BTF relocation followup fixing naming, loop logic
  libbpf: Split field iter code into its own file kernel
  libbpf,bpf: Share BTF relocate-related code with kernel
  libbpf: Fix clang compilation error in btf_relocate.c

Andrii Nakryiko (4):
  libbpf: Add BTF field iterator
  libbpf: Make use of BTF field iterator in BPF linker code
  libbpf: Make use of BTF field iterator in BTF handling code
  libbpf: Remove callback-based type/string BTF field visitor helpers

Antoine Tenart (1):
  libbpf: Skip base btf sanity checks

Donglin Peng (1):
  libbpf: Checking the btf_type kind when fixing variable offsets

Eduard Zingerman (1):
  libbpf: Make btf_parse_elf process .BTF.base transparently

Mykyta Yatsenko (1):
  libbpf: Auto-attach struct_ops BPF maps in BPF skeleton

Vadim Fedorenko (1):
  bpf: Add CHECKSUM_COMPLETE to bpf test progs

 include/uapi/linux/bpf.h |   2 +
 src/btf.c                | 696 +++++++++++++++++++++++++++------------
 src/btf.h                |  36 ++
 src/btf_iter.c           | 177 ++++++++++
 src/btf_relocate.c       | 519 +++++++++++++++++++++++++++++
 src/libbpf.c             |  64 +++-
 src/libbpf.h             |  18 +
 src/libbpf.map           |   4 +
 src/libbpf_internal.h    |  29 +-
 src/linker.c             |  69 ++--
 10 files changed, 1378 insertions(+), 236 deletions(-)
 create mode 100644 src/btf_iter.c
 create mode 100644 src/btf_relocate.c

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
dcd076347c sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-27 10:01:42 -07:00
Alan Maguire
e4982342e7 libbpf: Fix clang compilation error in btf_relocate.c
When building with clang for ARCH=i386, the following errors are
observed:

  CC      kernel/bpf/btf_relocate.o
./tools/lib/bpf/btf_relocate.c:206:23: error: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Werror,-Wsingle-bit-bitfield-constant-conversion]
  206 |                 info[id].needs_size = true;
      |                                     ^ ~
./tools/lib/bpf/btf_relocate.c:256:25: error: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Werror,-Wsingle-bit-bitfield-constant-conversion]
  256 |                         base_info.needs_size = true;
      |                                              ^ ~
2 errors generated.

The problem is we use 1-bit, 31-bit bitfields in a signed int.
Changing to

	bool needs_size: 1;
	unsigned int size:31;

...resolves the error and pahole reports that 4 bytes are used
for the underlying representation:

$ pahole btf_name_info tools/lib/bpf/btf_relocate.o
struct btf_name_info {
	const char  *              name;                 /*     0     8 */
	unsigned int               needs_size:1;         /*     8: 0  4 */
	unsigned int               size:31;              /*     8: 1  4 */
	__u32                      id;                   /*    12     4 */

	/* size: 16, cachelines: 1, members: 4 */
	/* last cacheline: 16 bytes */
};

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240624192903.854261-1-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Antoine Tenart
95c63a08f2 libbpf: Skip base btf sanity checks
When upgrading to libbpf 1.3 we noticed a big performance hit while
loading programs using CORE on non base-BTF symbols. This was tracked
down to the new BTF sanity check logic. The issue is the base BTF
definitions are checked first for the base BTF and then again for every
module BTF.

Loading 5 dummy programs (using libbpf-rs) that are using CORE on a
non-base BTF symbol on my system:
- Before this fix: 3s.
- With this fix: 0.1s.

Fix this by only checking the types starting at the BTF start id. This
should ensure the base BTF is still checked as expected but only once
(btf->start_id == 1 when creating the base BTF), and then only
additional types are checked for each module BTF.

Fixes: 3903802bb99a ("libbpf: Add basic BTF sanity validation")
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/bpf/20240624090908.171231-1-atenart@kernel.org
2024-06-27 10:01:42 -07:00
Alan Maguire
27f0169332 libbpf,bpf: Share BTF relocate-related code with kernel
Share relocation implementation with the kernel.  As part of this,
we also need the type/string iteration functions so also share
btf_iter.c file. Relocation code in kernel and userspace is identical
save for the impementation of the reparenting of split BTF to the
relocated base BTF and retrieval of the BTF header from "struct btf";
these small functions need separate user-space and kernel implementations
for the separate "struct btf"s they operate upon.

One other wrinkle on the kernel side is we have to map .BTF.ids in
modules as they were generated with the type ids used at BTF encoding
time. btf_relocate() optionally returns an array mapping from old BTF
ids to relocated ids, so we use that to fix up these references where
needed for kfuncs.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240620091733.1967885-5-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Alan Maguire
4ffb92e204 libbpf: Split field iter code into its own file kernel
This will allow it to be shared with the kernel.  No functional change.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240620091733.1967885-4-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Alan Maguire
bc021a8b42 libbpf: BTF relocation followup fixing naming, loop logic
Use less verbose names in BTF relocation code and fix off-by-one error
and typo in btf_relocate.c.  Simplify loop over matching distilled
types, moving from assigning a _next value in loop body to moving
match check conditions into the guard.

Suggested-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240620091733.1967885-2-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Donglin Peng
88a0787335 libbpf: Checking the btf_type kind when fixing variable offsets
I encountered an issue when building the test_progs from the repository [1]:

  $ pwd
  /work/Qemu/x86_64/linux-6.10-rc2/tools/testing/selftests/bpf/

  $ make test_progs V=1
  [...]
  ./tools/sbin/bpftool gen object ./ip_check_defrag.bpf.linked2.o ./ip_check_defrag.bpf.linked1.o
  libbpf: failed to find symbol for variable 'bpf_dynptr_slice' in section '.ksyms'
  Error: failed to link './ip_check_defrag.bpf.linked1.o': No such file or directory (2)
  [...]

Upon investigation, I discovered that the btf_types referenced in the '.ksyms'
section had a kind of BTF_KIND_FUNC instead of BTF_KIND_VAR:

  $ bpftool btf dump file ./ip_check_defrag.bpf.linked1.o
  [...]
  [2] DATASEC '.ksyms' size=0 vlen=2
        type_id=16 offset=0 size=0 (FUNC 'bpf_dynptr_from_skb')
        type_id=17 offset=0 size=0 (FUNC 'bpf_dynptr_slice')
  [...]
  [16] FUNC 'bpf_dynptr_from_skb' type_id=82 linkage=extern
  [17] FUNC 'bpf_dynptr_slice' type_id=85 linkage=extern
  [...]

For a detailed analysis, please refer to [2]. We can add a kind checking to
fix the issue.

  [1] https://github.com/eddyz87/bpf/tree/binsort-btf-dedup
  [2] https://lore.kernel.org/all/0c0ef20c-c05e-4db9-bad7-2cbc0d6dfae7@oracle.com/

Fixes: 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support")
Signed-off-by: Donglin Peng <dolinux.peng@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240619122355.426405-1-dolinux.peng@gmail.com
2024-06-27 10:01:42 -07:00
Eduard Zingerman
4bc5a64933 libbpf: Make btf_parse_elf process .BTF.base transparently
Update btf_parse_elf() to check if .BTF.base section is present.
The logic is as follows:

  if .BTF.base section exists:
     distilled_base := btf_new(.BTF.base)
  if distilled_base:
     btf := btf_new(.BTF, .base_btf=distilled_base)
     if base_btf:
        btf_relocate(btf, base_btf)
  else:
     btf := btf_new(.BTF)
  return btf

In other words:
- if .BTF.base section exists, load BTF from it and use it as a base
  for .BTF load;
- if base_btf is specified and .BTF.base section exist, relocate newly
  loaded .BTF against base_btf.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240613095014.357981-6-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Alan Maguire
2afe409348 libbpf: Split BTF relocation
Map distilled base BTF type ids referenced in split BTF and their
references to the base BTF passed in, and if the mapping succeeds,
reparent the split BTF to the base BTF.

Relocation is done by first verifying that distilled base BTF
only consists of named INT, FLOAT, ENUM, FWD, STRUCT and
UNION kinds; then we sort these to speed lookups.  Once sorted,
the base BTF is iterated, and for each relevant kind we check
for an equivalent in distilled base BTF.  When found, the
mapping from distilled -> base BTF id and string offset is recorded.
In establishing mappings, we need to ensure we check STRUCT/UNION
size when the STRUCT/UNION is embedded in a split BTF STRUCT/UNION,
and when duplicate names exist for the same STRUCT/UNION.  Otherwise
size is ignored in matching STRUCT/UNIONs.

Once all mappings are established, we can update type ids
and string offsets in split BTF and reparent it to the new base.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240613095014.357981-4-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Alan Maguire
36cb1ad3ae libbpf: Add btf__distill_base() creating split BTF with distilled base BTF
To support more robust split BTF, adding supplemental context for the
base BTF type ids that split BTF refers to is required.  Without such
references, a simple shuffling of base BTF type ids (without any other
significant change) invalidates the split BTF.  Here the attempt is made
to store additional context to make split BTF more robust.

This context comes in the form of distilled base BTF providing minimal
information (name and - in some cases - size) for base INTs, FLOATs,
STRUCTs, UNIONs, ENUMs and ENUM64s along with modified split BTF that
points at that base and contains any additional types needed (such as
TYPEDEF, PTR and anonymous STRUCT/UNION declarations).  This
information constitutes the minimal BTF representation needed to
disambiguate or remove split BTF references to base BTF.  The rules
are as follows:

- INT, FLOAT, FWD are recorded in full.
- if a named base BTF STRUCT or UNION is referred to from split BTF, it
  will be encoded as a zero-member sized STRUCT/UNION (preserving
  size for later relocation checks).  Only base BTF STRUCT/UNIONs
  that are either embedded in split BTF STRUCT/UNIONs or that have
  multiple STRUCT/UNION instances of the same name will _need_ size
  checks at relocation time, but as it is possible a different set of
  types will be duplicates in the later to-be-resolved base BTF,
  we preserve size information for all named STRUCT/UNIONs.
- if an ENUM[64] is named, a ENUM forward representation (an ENUM
  with no values) of the same size is used.
- in all other cases, the type is added to the new split BTF.

Avoiding struct/union/enum/enum64 expansion is important to keep the
distilled base BTF representation to a minimum size.

When successful, new representations of the distilled base BTF and new
split BTF that refers to it are returned.  Both need to be freed by the
caller.

So to take a simple example, with split BTF with a type referring
to "struct sk_buff", we will generate distilled base BTF with a
0-member STRUCT sk_buff of the appropriate size, and the split BTF
will refer to it instead.

Tools like pahole can utilize such split BTF to populate the .BTF
section (split BTF) and an additional .BTF.base section.  Then
when the split BTF is loaded, the distilled base BTF can be used
to relocate split BTF to reference the current (and possibly changed)
base BTF.

So for example if "struct sk_buff" was id 502 when the split BTF was
originally generated,  we can use the distilled base BTF to see that
id 502 refers to a "struct sk_buff" and replace instances of id 502
with the current (relocated) base BTF sk_buff type id.

Distilled base BTF is small; when building a kernel with all modules
using distilled base BTF as a test, overall module size grew by only
5.3Mb total across ~2700 modules.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240613095014.357981-2-alan.maguire@oracle.com
2024-06-27 10:01:42 -07:00
Vadim Fedorenko
0a66859bf1 bpf: Add CHECKSUM_COMPLETE to bpf test progs
Add special flag to validate that TC BPF program properly updates
checksum information in skb.

Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240606145851.229116-1-vadfed@meta.com
2024-06-27 10:01:42 -07:00
Mykyta Yatsenko
be998aa3d4 libbpf: Auto-attach struct_ops BPF maps in BPF skeleton
Similarly to `bpf_program`, support `bpf_map` automatic attachment in
`bpf_object__attach_skeleton`. Currently only struct_ops maps could be
attached.

On bpftool side, code-generate links in skeleton struct for struct_ops maps.
Similarly to `bpf_program_skeleton`, set links in `bpf_map_skeleton`.

On libbpf side, extend `bpf_map` with new `autoattach` field to support
enabling or disabling autoattach functionality, introducing
getter/setter for this field.

`bpf_object__(attach|detach)_skeleton` is extended with
attaching/detaching struct_ops maps logic.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240605175135.117127-1-yatsenko@meta.com
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
78c78e90cd libbpf: Remove callback-based type/string BTF field visitor helpers
Now that all libbpf/bpftool code switched to btf_field_iter, remove
btf_type_visit_type_ids() and btf_type_visit_str_offs() callback-based
helpers as not needed anymore.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240605001629.4061937-6-andrii@kernel.org
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
dd19c7ef77 libbpf: Make use of BTF field iterator in BTF handling code
Use new BTF field iterator logic to replace all the callback-based
visitor calls. There is still a .BTF.ext callback-based visitor APIs
that should be converted, which will happens as a follow up.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240605001629.4061937-4-andrii@kernel.org
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
13182b94f3 libbpf: Make use of BTF field iterator in BPF linker code
Switch all BPF linker code dealing with iterating BTF type ID and string
offset fields to new btf_field_iter facilities.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240605001629.4061937-3-andrii@kernel.org
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
cece3242fb libbpf: Add BTF field iterator
Implement iterator-based type ID and string offset BTF field iterator.
This is used extensively in BTF-handling code and BPF linker code for
various sanity checks, rewriting IDs/offsets, etc. Currently this is
implemented as visitor pattern calling custom callbacks, which makes the
logic (especially in simple cases) unnecessarily obscure and harder to
follow.

Having equivalent functionality using iterator pattern makes for simpler
to understand and maintain code. As we add more code for BTF processing
logic in libbpf, it's best to switch to iterator pattern before adding
more callback-based code.

The idea for iterator-based implementation is to record offsets of
necessary fields within fixed btf_type parts (which should be iterated
just once), and, for kinds that have multiple members (based on vlen
field), record where in each member necessary fields are located.

Generic iteration code then just keeps track of last offset that was
returned and handles N members correctly. Return type is just u32
pointer, where NULL is returned when all relevant fields were already
iterated.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240605001629.4061937-2-andrii@kernel.org
2024-06-27 10:01:42 -07:00
Andrii Nakryiko
42065ea662 ci: make pahole-staging workflow manually triggerable
Allow to manually trigger pahole-staging workflow.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-06 14:39:09 -07:00
Andrii Nakryiko
764d19da07 ci: revert switching to ubuntu-latest for pahole-staging workflow
pahole staging workflow is using the same old VM image as BPF selftests
stages. It doesn't have recent enough glibc, so we can't yet switch to
newer Ubuntu, unfortunately.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-06 14:32:23 -07:00
Andrii Nakryiko
fbcb2871fe ci: regenerate vmlinux.h
Regenerated latest vmlinux.h.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-03 13:41:26 -07:00
Andrii Nakryiko
61a6e8edd7 github: remove PR template
No one is looking at it anyways. It just gets in the way.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-03 13:41:26 -07:00
Andrii Nakryiko
4ab7361e64 libbpf: don't close(-1) in multi-uprobe feature detector
Guard close(link_fd) with extra link_fd >= 0 check to prevent close(-1).

Detected by Coverity static analysis.

Fixes: 04d939a2ab22 ("libbpf: detect broken PID filtering logic for multi-uprobe")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240529231212.768828-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-03 13:41:26 -07:00
Andrii Nakryiko
ff856238e2 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   eb4e7726279a344c82e3c23be396bcfd0a4d5669
Checkpoint bpf-next commit: 531876c80004ecff7bfdbd8ba6c6b48835ef5e22
Baseline bpf commit:        9dfdb706e164ae869b1d97f83ebf8523b2809714
Checkpoint bpf commit:      62da3acd28955e7299babebdfcb14243b789e773

Andrii Nakryiko (1):
  libbpf: keep FD_CLOEXEC flag when dup()'ing FD

Jakub Kicinski (1):
  netdev: add qstat for csum complete

 include/uapi/linux/netdev.h |  1 +
 src/libbpf_internal.h       | 10 +++-------
 2 files changed, 4 insertions(+), 7 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-06-03 13:41:26 -07:00
Jakub Kicinski
c085e9c364 netdev: add qstat for csum complete
Recent commit 0cfe71f45f42 ("netdev: add queue stats") added
a lot of useful stats, but only those immediately needed by virtio.
Presumably virtio does not support CHECKSUM_COMPLETE,
so statistic for that form of checksumming wasn't included.
Other drivers will definitely need it, in fact we expect it
to be needed in net-next soon (mlx5). So let's add the definition
of the counter for CHECKSUM_COMPLETE to uAPI in net already,
so that the counters are in a more natural order (all subsequent
counters have not been present in any released kernel, yet).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Fixes: 0cfe71f45f42 ("netdev: add queue stats")
Link: https://lore.kernel.org/r/20240529163547.3693194-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-06-03 13:41:26 -07:00
Andrii Nakryiko
805b689cd2 libbpf: keep FD_CLOEXEC flag when dup()'ing FD
Make sure to preserve and/or enforce FD_CLOEXEC flag on duped FDs.
Use dup3() with O_CLOEXEC flag for that.

Without this fix libbpf effectively clears FD_CLOEXEC flag on each of BPF
map/prog FD, which is definitely not the right or expected behavior.

Reported-by: Lennart Poettering <lennart@poettering.net>
Fixes: bc308d011ab8 ("libbpf: call dup2() syscall directly")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240529223239.504241-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-03 13:41:26 -07:00
Andrii Nakryiko
9b789075a9 ci: switch to ubuntu-latest where possible
Track ubuntu-latest where relevant and possible.
We can't update to ubuntu-latest when building and running BPF
selftests, though, because our QEMU image has too old of an GLIBC.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-28 22:37:25 -07:00
Andrii Nakryiko
c22d662a95 ci: update vmlinux.h to latest version
Re-generate vmlinux.h to add latest kernel types necessary for BPF
selftests.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-28 21:15:00 -07:00
Andrii Nakryiko
074445067f ci: add temporary patch for failing upstream BPF selftest
Add fix that landed in bpf tree to fix sk_storage_tracing selftest.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-28 20:39:55 -07:00
Andrii Nakryiko
9a1f1f28c6 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   009367099eb61a4fc2af44d4eb06b6b4de7de6db
Checkpoint bpf-next commit: eb4e7726279a344c82e3c23be396bcfd0a4d5669
Baseline bpf commit:        3e9bc0472b910d4115e16e9c2d684c7757cb6c60
Checkpoint bpf commit:      9dfdb706e164ae869b1d97f83ebf8523b2809714

Abhishek Chauhan (1):
  net: Add additional bit to support clockid_t timestamp type

Andrii Nakryiko (2):
  libbpf: fix feature detectors when using token_fd
  libbpf: detect broken PID filtering logic for multi-uprobe

Arnaldo Carvalho de Melo (1):
  tools headers: Remove now unused copies of uapi/{fcntl,openat2}.h and
    asm/fcntl.h

Daniel Jurgens (1):
  netdev: Add queue stats for TX stop and wake

Mykyta Yatsenko (1):
  libbpf: Configure log verbosity with env variable

Xuan Zhuo (1):
  netdev: add queue stats

 docs/libbpf_overview.rst     |   8 +++
 include/uapi/linux/bpf.h     |  15 +++--
 include/uapi/linux/fcntl.h   | 123 -----------------------------------
 include/uapi/linux/netdev.h  |  21 ++++++
 include/uapi/linux/openat2.h |  43 ------------
 src/bpf.c                    |   2 +-
 src/features.c               |  33 +++++++++-
 src/libbpf.c                 |  25 ++++++-
 src/libbpf.h                 |   5 +-
 9 files changed, 99 insertions(+), 176 deletions(-)
 delete mode 100644 include/uapi/linux/fcntl.h
 delete mode 100644 include/uapi/linux/openat2.h

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-28 20:39:55 -07:00
Andrii Nakryiko
0a519f87ee sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-28 20:39:55 -07:00
Andrii Nakryiko
d9f9fd5b22 libbpf: detect broken PID filtering logic for multi-uprobe
Libbpf is automatically (and transparently to user) detecting
multi-uprobe support in the kernel, and, if supported, uses
multi-uprobes to improve USDT attachment speed.

USDTs can be attached system-wide or for the specific process by PID. In
the latter case, we rely on correct kernel logic of not triggering USDT
for unrelated processes.

As such, on older kernels that do support multi-uprobes, but still have
broken PID filtering logic, we need to fall back to singular uprobes.

Unfortunately, whether user is using PID filtering or not is known at
the attachment time, which happens after relevant BPF programs were
loaded into the kernel. Also unfortunately, we need to make a call
whether to use multi-uprobes or singular uprobe for SEC("usdt") programs
during BPF object load time, at which point we have no information about
possible PID filtering.

The distinction between single and multi-uprobes is small, but important
for the kernel. Multi-uprobes get BPF_TRACE_UPROBE_MULTI attach type,
and kernel internally substitiute different implementation of some of
BPF helpers (e.g., bpf_get_attach_cookie()) depending on whether uprobe
is multi or singular. So, multi-uprobes and singular uprobes cannot be
intermixed.

All the above implies that we have to make an early and conservative
call about the use of multi-uprobes. And so this patch modifies libbpf's
existing feature detector for multi-uprobe support to also check correct
PID filtering. If PID filtering is not yet fixed, we fall back to
singular uprobes for USDTs.

This extension to feature detection is simple thanks to kernel's -EINVAL
addition for pid < 0.

Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240521163401.3005045-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-05-28 20:39:55 -07:00
Mykyta Yatsenko
d4d3e68e8d libbpf: Configure log verbosity with env variable
Configure logging verbosity by setting LIBBPF_LOG_LEVEL environment
variable, which is applied only to default logger. Once user set their
custom logging callback, it is up to them to handle filtering.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240524131840.114289-1-yatsenko@meta.com
2024-05-28 20:39:55 -07:00
Abhishek Chauhan
0babfb126a net: Add additional bit to support clockid_t timestamp type
tstamp_type is now set based on actual clockid_t compressed
into 2 bits.

To make the design scalable for future needs this commit bring in
the change to extend the tstamp_type:1 to tstamp_type:2 to support
other clockid_t timestamp.

We now support CLOCK_TAI as part of tstamp_type as part of this
commit with existing support CLOCK_MONOTONIC and CLOCK_REALTIME.

Signed-off-by: Abhishek Chauhan <quic_abchauha@quicinc.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240509211834.3235191-3-quic_abchauha@quicinc.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-28 20:39:55 -07:00
Arnaldo Carvalho de Melo
89ed67d7ab tools headers: Remove now unused copies of uapi/{fcntl,openat2}.h and asm/fcntl.h
These were used to build perf to provide defines not available in older
distros, but this was back in 2017, nowadays all the distros that are
supported and I have build containers for work using just the system
headers, so ditch them.

Some of these older distros may not have things that are used in 'perf
trace', but then they also don't have libtraceevent packages, so don't
build 'perf trace'.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/20240315204835.748716-5-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-05-28 20:39:55 -07:00
Andrii Nakryiko
8dfa981c53 libbpf: fix feature detectors when using token_fd
Adjust `union bpf_attr` size passed to kernel in two feature-detecting
functions to take into account prog_token_fd field.

Libbpf is avoiding memset()'ing entire `union bpf_attr` by only using
minimal set of bpf_attr's fields. Two places have been missed when
wiring BPF token support in libbpf's feature detection logic.

Fix them trivially.

Fixes: f3dcee938f48 ("libbpf: Wire up token_fd into feature probing logic")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240513180804.403775-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-05-28 20:39:55 -07:00
Daniel Jurgens
15b461a608 netdev: Add queue stats for TX stop and wake
TX queue stop and wake are counted by some drivers.
Support reporting these via netdev-genl queue stats.

Signed-off-by: Daniel Jurgens <danielj@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/r/20240510201927.1821109-2-danielj@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-28 20:39:55 -07:00
Xuan Zhuo
ec3c369941 netdev: add queue stats
These stats are commonly. Support reporting those via netdev-genl queue
stats.

name: rx-hw-drops
name: rx-hw-drop-overruns
name: rx-csum-unnecessary
name: rx-csum-none
name: rx-csum-bad
name: rx-hw-gro-packets
name: rx-hw-gro-bytes
name: rx-hw-gro-wire-packets
name: rx-hw-gro-wire-bytes
name: rx-hw-drop-ratelimits
name: tx-hw-drops
name: tx-hw-drop-errors
name: tx-csum-none
name: tx-needs-csum
name: tx-hw-gso-packets
name: tx-hw-gso-bytes
name: tx-hw-gso-wire-packets
name: tx-hw-gso-wire-bytes
name: tx-hw-drop-ratelimits

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-05-28 20:39:55 -07:00
Andrii Nakryiko
02724cfd07 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   0737df6de94661ae55fd3343ce9abec32c687e62
Checkpoint bpf-next commit: 009367099eb61a4fc2af44d4eb06b6b4de7de6db
Baseline bpf commit:        3e9bc0472b910d4115e16e9c2d684c7757cb6c60
Checkpoint bpf commit:      3e9bc0472b910d4115e16e9c2d684c7757cb6c60

Andrii Nakryiko (6):
  libbpf: fix potential overflow in ring__consume_n()
  libbpf: fix ring_buffer__consume_n() return result logic
  libbpf: remove unnecessary struct_ops prog validity check
  libbpf: handle yet another corner case of nulling out struct_ops
    program
  libbpf: fix libbpf_strerror_r() handling unknown errors
  libbpf: improve early detection of doomed-to-fail BPF program loading

Jiri Olsa (2):
  libbpf: Fix error message in attach_kprobe_session
  libbpf: Fix error message in attach_kprobe_multi

Jose E. Marchesi (3):
  libbpf: Fix bpf_ksym_exists() in GCC
  libbpf: Avoid casts from pointers to enums in bpf_tracing.h
  bpf: Avoid uninitialized value in BPF_CORE_READ_BITFIELD

 src/bpf_core_read.h |  1 +
 src/bpf_helpers.h   | 17 +++++++++--
 src/bpf_tracing.h   | 70 ++++++++++++++++++++++-----------------------
 src/libbpf.c        | 42 ++++++++++++++++++---------
 src/ringbuf.c       |  4 +--
 src/str_error.c     | 16 +++++++++--
 src/usdt.bpf.h      | 24 ++++++++--------
 7 files changed, 106 insertions(+), 68 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-08 16:04:40 -07:00
Jose E. Marchesi
3827aa514c bpf: Avoid uninitialized value in BPF_CORE_READ_BITFIELD
[Changes from V1:
 - Use a default branch in the switch statement to initialize `val'.]

GCC warns that `val' may be used uninitialized in the
BPF_CRE_READ_BITFIELD macro, defined in bpf_core_read.h as:

	[...]
	unsigned long long val;						      \
	[...]								      \
	switch (__CORE_RELO(s, field, BYTE_SIZE)) {			      \
	case 1: val = *(const unsigned char *)p; break;			      \
	case 2: val = *(const unsigned short *)p; break;		      \
	case 4: val = *(const unsigned int *)p; break;			      \
	case 8: val = *(const unsigned long long *)p; break;		      \
        }       							      \
	[...]
	val;								      \
	}								      \

This patch adds a default entry in the switch statement that sets
`val' to zero in order to avoid the warning, and random values to be
used in case __builtin_preserve_field_info returns unexpected values
for BPF_FIELD_BYTE_SIZE.

Tested in bpf-next master.
No regressions.

Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240508101313.16662-1-jose.marchesi@oracle.com
2024-05-08 16:04:40 -07:00
Andrii Nakryiko
e5146eff75 libbpf: improve early detection of doomed-to-fail BPF program loading
Extend libbpf's pre-load checks for BPF programs, detecting more typical
conditions that are destinated to cause BPF program failure. This is an
opportunity to provide more helpful and actionable error message to
users, instead of potentially very confusing BPF verifier log and/or
error.

In this case, we detect struct_ops BPF program that was not referenced
anywhere, but still attempted to be loaded (according to libbpf logic).
Suggest that the program might need to be used in some struct_ops
variable. User will get a message of the following kind:

  libbpf: prog 'test_1_forgotten': SEC("struct_ops") program isn't referenced anywhere, did you forget to use it?

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240507001335.1445325-6-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-08 16:04:40 -07:00
Andrii Nakryiko
ed54f30307 libbpf: fix libbpf_strerror_r() handling unknown errors
strerror_r(), used from libbpf-specific libbpf_strerror_r() wrapper is
documented to return error in two different ways, depending on glibc
version. Take that into account when handling strerror_r()'s own errors,
which happens when we pass some non-standard (internal) kernel error to
it. Before this patch we'd have "ERROR: strerror_r(524)=22", which is
quite confusing. Now for the same situation we'll see a bit less
visually scary "unknown error (-524)".

At least we won't confuse user with irrelevant EINVAL (22).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240507001335.1445325-5-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-08 16:04:40 -07:00
Andrii Nakryiko
fe5fe762b9 libbpf: handle yet another corner case of nulling out struct_ops program
There is yet another corner case where user can set STRUCT_OPS program
reference in STRUCT_OPS map to NULL, but libbpf will fail to disable
autoload for such BPF program. This time it's the case of "new" kernel
which has type information about callback field, but user explicitly
nulled-out program reference from user-space after opening BPF object.

Fix, hopefully, the last remaining unhandled case.

Fixes: 0737df6de946 ("libbpf: better fix for handling nulled-out struct_ops program")
Fixes: f973fccd43d3 ("libbpf: handle nulled-out program in struct_ops correctly")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240507001335.1445325-3-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-08 16:04:40 -07:00
Andrii Nakryiko
504369cba4 libbpf: remove unnecessary struct_ops prog validity check
libbpf ensures that BPF program references set in map->st_ops->progs[i]
during open phase are always valid STRUCT_OPS programs. This is done in
bpf_object__collect_st_ops_relos(). So there is no need to double-check
that in bpf_map__init_kern_struct_ops().

Simplify the code by removing unnecessary check. Also, we avoid using
local prog variable to keep code similar to the upcoming fix, which adds
similar logic in another part of bpf_map__init_kern_struct_ops().

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240507001335.1445325-2-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-08 16:04:40 -07:00
Jose E. Marchesi
ea02e10fc4 libbpf: Avoid casts from pointers to enums in bpf_tracing.h
[Differences from V1:
  - Do not introduce a global typedef, as this is a public header.
  - Keep the void* casts in BPF_KPROBE_READ_RET_IP and
    BPF_KRETPROBE_READ_RET_IP, as these are necessary
    for converting to a const void* argument of
    bpf_probe_read_kernel.]

The BPF_PROG, BPF_KPROBE and BPF_KSYSCALL macros defined in
tools/lib/bpf/bpf_tracing.h use a clever hack in order to provide a
convenient way to define entry points for BPF programs as if they were
normal C functions that get typed actual arguments, instead of as
elements in a single "context" array argument.

For example, PPF_PROGS allows writing:

  SEC("struct_ops/cwnd_event")
  void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event)
  {
        bbr_cwnd_event(sk, event);
        dctcp_cwnd_event(sk, event);
        cubictcp_cwnd_event(sk, event);
  }

That expands into a pair of functions:

  void ____cwnd_event (unsigned long long *ctx, struct sock *sk, enum tcp_ca_event event)
  {
        bbr_cwnd_event(sk, event);
        dctcp_cwnd_event(sk, event);
        cubictcp_cwnd_event(sk, event);
  }

  void cwnd_event (unsigned long long *ctx)
  {
        _Pragma("GCC diagnostic push")
        _Pragma("GCC diagnostic ignored \"-Wint-conversion\"")
        return ____cwnd_event(ctx, (void*)ctx[0], (void*)ctx[1]);
        _Pragma("GCC diagnostic pop")
  }

Note how the 64-bit unsigned integers in the incoming CTX get casted
to a void pointer, and then implicitly converted to whatever type of
the actual argument in the wrapped function.  In this case:

  Arg1: unsigned long long -> void * -> struct sock *
  Arg2: unsigned long long -> void * -> enum tcp_ca_event

The behavior of GCC and clang when facing such conversions differ:

  pointer -> pointer

    Allowed by the C standard.
    GCC: no warning nor error.
    clang: no warning nor error.

  pointer -> integer type

    [C standard says the result of this conversion is implementation
     defined, and it may lead to unaligned pointer etc.]

    GCC: error: integer from pointer without a cast [-Wint-conversion]
    clang: error: incompatible pointer to integer conversion [-Wint-conversion]

  pointer -> enumerated type

    GCC: error: incompatible types in assigment (*)
    clang: error: incompatible pointer to integer conversion [-Wint-conversion]

These macros work because converting pointers to pointers is allowed,
and converting pointers to integers also works provided a suitable
integer type even if it is implementation defined, much like casting a
pointer to uintptr_t is guaranteed to work by the C standard.  The
conversion errors emitted by both compilers by default are silenced by
the pragmas.

However, the GCC error marked with (*) above when assigning a pointer
to an enumerated value is not associated with the -Wint-conversion
warning, and it is not possible to turn it off.

This is preventing building the BPF kernel selftests with GCC.

This patch fixes this by avoiding intermediate casts to void*,
replaced with casts to `unsigned long long', which is an integer type
capable of safely store a BPF pointer, much like the standard
uintptr_t.

Testing performed in bpf-next master:
  - vmtest.sh -- ./test_verifier
  - vmtest.sh -- ./test_progs
  - make M=samples/bpf
No regressions.

Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240502170925.3194-1-jose.marchesi@oracle.com
2024-05-08 16:04:40 -07:00
Jose E. Marchesi
4ec5e360ae libbpf: Fix bpf_ksym_exists() in GCC
The macro bpf_ksym_exists is defined in bpf_helpers.h as:

  #define bpf_ksym_exists(sym) ({								\
  	_Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak");	\
  	!!sym;											\
  })

The purpose of the macro is to determine whether a given symbol has
been defined, given the address of the object associated with the
symbol.  It also has a compile-time check to make sure the object
whose address is passed to the macro has been declared as weak, which
makes the check on `sym' meaningful.

As it happens, the check for weak doesn't work in GCC in all cases,
because __builtin_constant_p not always folds at parse time when
optimizing.  This is because optimizations that happen later in the
compilation process, like inlining, may make a previously non-constant
expression a constant.  This results in errors like the following when
building the selftests with GCC:

  bpf_helpers.h:190:24: error: expression in static assertion is not constant
  190 |         _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak");       \
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~

Fortunately recent versions of GCC support a __builtin_has_attribute
that can be used to directly check for the __weak__ attribute.  This
patch changes bpf_helpers.h to use that builtin when building with a
recent enough GCC, and to omit the check if GCC is too old to support
the builtin.

The macro used for GCC becomes:

  #define bpf_ksym_exists(sym) ({									\
	_Static_assert(__builtin_has_attribute (*sym, __weak__), #sym " should be marked as __weak");	\
	!!sym;												\
  })

Note that since bpf_ksym_exists is designed to get the address of the
object associated with symbol SYM, we pass *sym to
__builtin_has_attribute instead of sym.  When an expression is passed
to __builtin_has_attribute then it is the type of the passed
expression that is checked for the specified attribute.  The
expression itself is not evaluated.  This accommodates well with the
existing usages of the macro:

- For function objects:

  struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym __weak;
  [...]
  bpf_ksym_exists(bpf_task_acquire)

- For variable objects:

  extern const struct rq runqueues __ksym __weak; /* typed */
  [...]
  bpf_ksym_exists(&runqueues)

Note also that BPF support was added in GCC 10 and support for
__builtin_has_attribute in GCC 9.

Locally tested in bpf-next master branch.
No regressions.

Signed-of-by: Jose E. Marchesi <jose.marchesi@oracle.com>

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20240428112559.10518-1-jose.marchesi@oracle.com
2024-05-08 16:04:40 -07:00
Andrii Nakryiko
cb7bfc5e51 libbpf: fix ring_buffer__consume_n() return result logic
Add INT_MAX check to ring_buffer__consume_n(). We do the similar check
to handle int return result of all these ring buffer APIs in other APIs
and ring_buffer__consume_n() is missing one. This patch fixes this
omission.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20240430201952.888293-2-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-08 16:04:40 -07:00
Andrii Nakryiko
e3e84bd7d0 libbpf: fix potential overflow in ring__consume_n()
ringbuf_process_ring() return int64_t, while ring__consume_n() assigns
it to int. It's highly unlikely, but possible for ringbuf_process_ring()
to return value larger than INT_MAX, so use int64_t. ring__consume_n()
does check INT_MAX before returning int result to the user.

Fixes: 4d22ea94ea33 ("libbpf: Add ring__consume_n / ring_buffer__consume_n")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20240430201952.888293-1-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-08 16:04:40 -07:00
Jiri Olsa
f3c4172c61 libbpf: Fix error message in attach_kprobe_multi
We just failed to retrieve pattern, so we need to print spec instead.

Fixes: ddc6b04989eb ("libbpf: Add bpf_program__attach_kprobe_multi_opts function")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240502075541.1425761-2-jolsa@kernel.org
2024-05-08 16:04:40 -07:00
Jiri Olsa
d045f7682b libbpf: Fix error message in attach_kprobe_session
We just failed to retrieve pattern, so we need to print spec instead.

Fixes: 2ca178f02b2f ("libbpf: Add support for kprobe session attach")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240502075541.1425761-1-jolsa@kernel.org
2024-05-08 16:04:40 -07:00
Quentin Monnet
e055420033 sync: Commit .mailmap changes from script when sync-ing repo
In commit 4794f18bf4 ("sync: Sync .mailmap entries"), we updated the
sync-up script to automatically update libbpf's .mailmap; however, the
script would not take care of committing the changes. Let's address
this.

The code is copied and adapted from the part where we commit changes to
src/bpf_helper_defs.h.

Signed-off-by: Quentin Monnet <qmo@kernel.org>
2024-05-01 17:38:26 -07:00
Andrii Nakryiko
255b705a16 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   1bba3b3d373dbafae891e7cb06b8c82c8d62aba1
Checkpoint bpf-next commit: 0737df6de94661ae55fd3343ce9abec32c687e62
Baseline bpf commit:        b867247555c4181bf84eb10b72b176862c29112d
Checkpoint bpf commit:      3e9bc0472b910d4115e16e9c2d684c7757cb6c60

Andrii Nakryiko (1):
  libbpf: better fix for handling nulled-out struct_ops program

Jiri Olsa (3):
  bpf: Add support for kprobe session attach
  libbpf: Add support for kprobe session attach
  libbpf: Add kprobe session attach type name to attach_type_name

Viktor Malik (1):
  libbpf: support "module: Function" syntax for tracing programs

 include/uapi/linux/bpf.h |   1 +
 src/bpf.c                |   1 +
 src/libbpf.c             | 112 +++++++++++++++++++++++++++++++--------
 src/libbpf.h             |   4 +-
 4 files changed, 95 insertions(+), 23 deletions(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-05-01 15:20:15 -07:00
Andrii Nakryiko
6a41f02ad4 libbpf: better fix for handling nulled-out struct_ops program
Previous attempt to fix the handling of nulled-out (from skeleton)
struct_ops program is working well only if struct_ops program is defined
as non-autoloaded by default (i.e., has SEC("?struct_ops") annotation,
with question mark).

Unfortunately, that fix is incomplete due to how
bpf_object_adjust_struct_ops_autoload() is marking referenced or
non-referenced struct_ops program as autoloaded (or not). Because
bpf_object_adjust_struct_ops_autoload() is run after
bpf_map__init_kern_struct_ops() step, which sets program slot to NULL,
such programs won't be considered "referenced", and so its autoload
property won't be changed.

This all sounds convoluted and it is, but the desire is to have as
natural behavior (as far as struct_ops usage is concerned) as possible.

This fix is redoing the original fix but makes it work for
autoloaded-by-default struct_ops programs as well. We achieve this by
forcing prog->autoload to false if prog was declaratively set for some
struct_ops map, but then nulled-out from skeleton (programmatically).
This achieves desired effect of not autoloading it. If such program is
still referenced somewhere else (different struct_ops map or different
callback field), it will get its autoload property adjusted by
bpf_object_adjust_struct_ops_autoload() later.

We also fix selftest, which accidentally used SEC("?struct_ops")
annotation. It was meant to use autoload-by-default program from the
very beginning.

Fixes: f973fccd43d3 ("libbpf: handle nulled-out program in struct_ops correctly")
Cc: Kui-Feng Lee <thinker.li@gmail.com>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240501041706.3712608-1-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-05-01 15:20:15 -07:00
Viktor Malik
dd589c3b31 libbpf: support "module: Function" syntax for tracing programs
In some situations, it is useful to explicitly specify a kernel module
to search for a tracing program target (e.g. when a function of the same
name exists in multiple modules or in vmlinux).

This patch enables that by allowing the "module:function" syntax for the
find_kernel_btf_id function. Thanks to this, the syntax can be used both
from a SEC macro (i.e. `SEC(fentry/module:function)`) and via the
bpf_program__set_attach_target API call.

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/9085a8cb9a552de98e554deb22ff7e977d025440.1714469650.git.vmalik@redhat.com
2024-05-01 15:20:15 -07:00
Jiri Olsa
045a0372ef libbpf: Add kprobe session attach type name to attach_type_name
Adding kprobe session attach type name to attach_type_name,
so libbpf_bpf_attach_type_str returns proper string name for
BPF_TRACE_KPROBE_SESSION attach type.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240430112830.1184228-6-jolsa@kernel.org
2024-05-01 15:20:15 -07:00
Jiri Olsa
6c3cf5108e libbpf: Add support for kprobe session attach
Adding support to attach program in kprobe session mode
with bpf_program__attach_kprobe_multi_opts function.

Adding session bool to bpf_kprobe_multi_opts struct that allows
to load and attach the bpf program via kprobe session.
the attachment to create kprobe multi session.

Also adding new program loader section that allows:
 SEC("kprobe.session/bpf_fentry_test*")

and loads/attaches kprobe program as kprobe session.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240430112830.1184228-5-jolsa@kernel.org
2024-05-01 15:20:15 -07:00
Jiri Olsa
b63d2945ff bpf: Add support for kprobe session attach
Adding support to attach bpf program for entry and return probe
of the same function. This is common use case which at the moment
requires to create two kprobe multi links.

Adding new BPF_TRACE_KPROBE_SESSION attach type that instructs
kernel to attach single link program to both entry and exit probe.

It's possible to control execution of the bpf program on return
probe simply by returning zero or non zero from the entry bpf
program execution to execute or not the bpf program on return
probe respectively.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240430112830.1184228-2-jolsa@kernel.org
2024-05-01 15:20:15 -07:00
Andrii Nakryiko
d3e18fceec ci: remove tcp_rtt test from 5.5 ALLOWLIST
It's been updated to expecte the very latest kernel, can't succeed on
5.5 anymore.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-04-30 09:09:32 -07:00
Andrii Nakryiko
22bd976613 ci: update vmlinux.h
Regenerate vmlinux.h to get all the latest types.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-04-30 09:09:32 -07:00
Andrii Nakryiko
f9f3fbf72d sync: update .mailmap
Update .mailmap generated during sync.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-04-30 09:09:32 -07:00
Andrii Nakryiko
37b8e0eb2d sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   82e38a505c9868e784ec31e743fd8a9fa5ca1084
Checkpoint bpf-next commit: 1bba3b3d373dbafae891e7cb06b8c82c8d62aba1
Baseline bpf commit:        5bcf0dcbf9066348058b88a510c57f70f384c92c
Checkpoint bpf commit:      b867247555c4181bf84eb10b72b176862c29112d

Andrii Nakryiko (1):
  libbpf: handle nulled-out program in struct_ops correctly

Jose E. Marchesi (1):
  bpf_helpers.h: Define bpf_tail_call_static when building with GCC

Philo Lu (1):
  bpf: add mrtt and srtt as BPF_SOCK_OPS_RTT_CB args

 include/uapi/linux/bpf.h | 2 ++
 src/bpf_helpers.h        | 4 +++-
 src/libbpf.c             | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2024-04-30 09:09:32 -07:00
Andrii Nakryiko
f28271ab72 libbpf: handle nulled-out program in struct_ops correctly
If struct_ops has one of program callbacks set declaratively and host
kernel is old and doesn't support this callback, libbpf will allow to
load such struct_ops as long as that callback was explicitly nulled-out
(presumably through skeleton). This is all working correctly, except we
won't reset corresponding program slot to NULL before bailing out, which
will lead to libbpf not detecting that BPF program has to be not
auto-loaded. Fix this by unconditionally resetting corresponding program
slot to NULL.

Fixes: c911fc61a7ce ("libbpf: Skip zeroed or null fields if not found in the kernel type.")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240428030954.3918764-1-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-04-30 09:09:32 -07:00
Jose E. Marchesi
b1051d9361 bpf_helpers.h: Define bpf_tail_call_static when building with GCC
The definition of bpf_tail_call_static in tools/lib/bpf/bpf_helpers.h
is guarded by a preprocessor check to assure that clang is recent
enough to support it.  This patch updates the guard so the function is
compiled when using GCC 13 or later as well.

Tested in bpf-next master. No regressions.

Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240426145158.14409-1-jose.marchesi@oracle.com
2024-04-30 09:09:32 -07:00
Philo Lu
43df08cd17 bpf: add mrtt and srtt as BPF_SOCK_OPS_RTT_CB args
Two important arguments in RTT estimation, mrtt and srtt, are passed to
tcp_bpf_rtt(), so that bpf programs get more information about RTT
computation in BPF_SOCK_OPS_RTT_CB.

The difference between bpf_sock_ops->srtt_us and the srtt here is: the
former is an old rtt before update, while srtt passed by tcp_bpf_rtt()
is that after update.

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240425161724.73707-2-lulie@linux.alibaba.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2024-04-30 09:09:32 -07:00
Quentin Monnet
4794f18bf4 sync: Sync .mailmap entries
The kernel repository has a .mailmap file to remap author names and
email addresses to their desired format in Git logs (for details, see
gitmailmap documentation [0]). Alas, this is only visible for author
information when looking at the logs locally, as GitHub does not support
mailmaps at the moment [1].

This commit adds a .mailmap file for libbpf, automatically generated
from the kernel's version. The script to generate the .mailmap is added,
too: it works by grepping email addresses from authors in the
repository, and collecting all lines ending with this address in the
kernel's .mailmap - in other words, all lines where this address is used
as a pattern for a remapping.

To keep the .mailmap up-to-date, add a call to the script to
sync-kernel.sh.

[0] https://git-scm.com/docs/gitmailmap
[1] https://github.com/orgs/community/discussions/22518

Signed-off-by: Quentin Monnet <qmo@kernel.org>
2024-04-25 22:42:05 -07:00
Yonghong Song
2fdcc365a0 ci: regenerate latest vmlinux.h
Update vmlinux.h to make BPF selftests compile.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
2024-04-24 15:16:35 -07:00
Yonghong Song
52c37177cc Makefile: Ensure github libbpf version the same as the kernel one
The kernel libbpf version is 1.5 now. So change github libbpf
version to be 1.5 as well.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
2024-04-24 15:16:35 -07:00
Yonghong Song
7cbfddfdf2 sync: latest libbpf changes from kernel
Syncing latest libbpf commits from kernel repository.
Baseline bpf-next commit:   14bb1e8c8d4ad5d9d2febb7d19c70a3cf536e1e5
Checkpoint bpf-next commit: 82e38a505c9868e784ec31e743fd8a9fa5ca1084
Baseline bpf commit:        443574b033876c85a35de4c65c14f7fe092222b2
Checkpoint bpf commit:      5bcf0dcbf9066348058b88a510c57f70f384c92c

Andrea Righi (3):
  libbpf: Start v1.5 development cycle
  libbpf: ringbuf: Allow to consume up to a certain amount of items
  libbpf: Add ring__consume_n / ring_buffer__consume_n

Anton Protopopov (2):
  bpf: Add support for passing mark with bpf_fib_lookup
  bpf: Pack struct bpf_fib_lookup

Benjamin Tissoires (1):
  tools: sync include/uapi/linux/bpf.h

David Lechner (1):
  bpf: Fix typo in uapi doc comments

Mykyta Yatsenko (1):
  bpf: improve error message for unsupported helper

Quentin Deslandes (2):
  libbpf: Fix misaligned array closing bracket
  libbpf: Fix dump of subsequent char arrays

Tobias Böhm (1):
  libbpf: Use local bpf_helpers.h include

Yonghong Song (4):
  libbpf: Mark libbpf_kallsyms_parse static function
  libbpf: Handle <orig_name>.llvm.<hash> symbol properly
  bpf: Add bpf_link support for sk_msg and sk_skb progs
  libbpf: Add bpf_link support for BPF_PROG_TYPE_SOCKMAP

 include/uapi/linux/bpf.h | 35 +++++++++++++++++++++----
 src/bpf_core_read.h      |  2 +-
 src/btf_dump.c           |  5 ++++
 src/libbpf.c             | 33 ++++++++++++++++++++++--
 src/libbpf.h             | 14 ++++++++++
 src/libbpf.map           |  7 +++++
 src/libbpf_internal.h    |  5 ----
 src/libbpf_probes.c      |  6 +++--
 src/libbpf_version.h     |  2 +-
 src/ringbuf.c            | 55 +++++++++++++++++++++++++++++++++-------
 10 files changed, 139 insertions(+), 25 deletions(-)

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
2024-04-24 15:16:35 -07:00
Yonghong Song
f2fe16ec95 sync: auto-generate latest BPF helpers
Latest changes to BPF helper definitions.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
2024-04-24 15:16:35 -07:00
Benjamin Tissoires
a911ca1e3e tools: sync include/uapi/linux/bpf.h
cp include/uapi/linux/bpf.h tools/include/uapi/linux/bpf.h

Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-6-6c986a5a741f@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Quentin Deslandes
24924003c6 libbpf: Fix dump of subsequent char arrays
When dumping a character array, libbpf will watch for a '\0' and set
is_array_terminated=true if found. This prevents libbpf from printing
the remaining characters of the array, treating it as a nul-terminated
string.

However, once this flag is set, it's never reset, leading to subsequent
characters array not being printed properly:

.str_multi = (__u8[2][16])[
    [
        'H',
        'e',
        'l',
    ],
],

This patch saves the is_array_terminated flag and restores its
default (false) value before looping over the elements of an array,
then restores it afterward. This way, libbpf's behavior is unchanged
when dumping the characters of an array, but subsequent arrays are
printed properly:

.str_multi = (__u8[2][16])[
    [
        'H',
        'e',
        'l',
    ],
    [
        'l',
        'o',
    ],
],

Signed-off-by: Quentin Deslandes <qde@naccy.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240413211258.134421-3-qde@naccy.de
2024-04-24 15:16:35 -07:00
Quentin Deslandes
2c6f445a8e libbpf: Fix misaligned array closing bracket
In btf_dump_array_data(), libbpf will call btf_dump_dump_type_data() for
each element. For an array of characters, each element will be
processed the following way:

- btf_dump_dump_type_data() is called to print the character
- btf_dump_data_pfx() prefixes the current line with the proper number
  of indentations
- btf_dump_int_data() is called to print the character
- After the last character is printed, btf_dump_dump_type_data() calls
  btf_dump_data_pfx() before writing the closing bracket

However, for an array containing characters, btf_dump_int_data() won't
print any '\0' and subsequent characters. This leads to situations where
the line prefix is written, no character is added, then the prefix is
written again before adding the closing bracket:

(struct sk_metadata){
    .str_array = (__u8[14])[
        'H',
        'e',
        'l',
        'l',
        'o',
                ],

This change solves this issue by printing the '\0' character, which
has two benefits:

- The bracket closing the array is properly aligned
- It's clear from a user point of view that libbpf uses '\0' as a
  terminator for arrays of characters.

Signed-off-by: Quentin Deslandes <qde@naccy.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240413211258.134421-2-qde@naccy.de
2024-04-24 15:16:35 -07:00
Yonghong Song
09397e309a libbpf: Add bpf_link support for BPF_PROG_TYPE_SOCKMAP
Introduce a libbpf API function bpf_program__attach_sockmap()
which allow user to get a bpf_link for their corresponding programs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240410043532.3737722-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Yonghong Song
62217fb32a bpf: Add bpf_link support for sk_msg and sk_skb progs
Add bpf_link support for sk_msg and sk_skb programs. We have an
internal request to support bpf_link for sk_msg programs so user
space can have a uniform handling with bpf_link based libbpf
APIs. Using bpf_link based libbpf API also has a benefit which
makes system robust by decoupling prog life cycle and
attachment life cycle.

Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240410043527.3737160-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Andrea Righi
b521a722b9 libbpf: Add ring__consume_n / ring_buffer__consume_n
Introduce a new API to consume items from a ring buffer, limited to a
specified amount, and return to the caller the actual number of items
consumed.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/lkml/20240310154726.734289-1-andrea.righi@canonical.com/T
Link: https://lore.kernel.org/bpf/20240406092005.92399-4-andrea.righi@canonical.com
2024-04-24 15:16:35 -07:00
Andrea Righi
98de9ace4d libbpf: ringbuf: Allow to consume up to a certain amount of items
In some cases, instead of always consuming all items from ring buffers
in a greedy way, we may want to consume up to a certain amount of items,
for example when we need to copy items from the BPF ring buffer to a
limited user buffer.

This change allows to set an upper limit to the amount of items consumed
from one or more ring buffers.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240406092005.92399-3-andrea.righi@canonical.com
2024-04-24 15:16:35 -07:00
Andrea Righi
26d9ab5f78 libbpf: Start v1.5 development cycle
Bump libbpf.map to v1.5.0 to start a new libbpf version cycle.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240406092005.92399-2-andrea.righi@canonical.com
2024-04-24 15:16:35 -07:00
Anton Protopopov
c5219d1b3d bpf: Pack struct bpf_fib_lookup
The struct bpf_fib_lookup is supposed to be of size 64. A recent commit
59b418c7063d ("bpf: Add a check for struct bpf_fib_lookup size") added
a static assertion to check this property so that future changes to the
structure will not accidentally break this assumption.

As it immediately turned out, on some 32-bit arm systems, when AEABI=n,
the total size of the structure was equal to 68, see [1]. This happened
because the bpf_fib_lookup structure contains a union of two 16-bit
fields:

    union {
            __u16 tot_len;
            __u16 mtu_result;
    };

which was supposed to compile to a 16-bit-aligned 16-bit field. On the
aforementioned setups it was instead both aligned and padded to 32-bits.

Declare this inner union as __attribute__((packed, aligned(2))) such
that it always is of size 2 and is aligned to 16 bits.

  [1] https://lore.kernel.org/all/CA+G9fYtsoP51f-oP_Sp5MOq-Ffv8La2RztNpwvE6+R1VtFiLrw@mail.gmail.com/#t

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Fixes: e1850ea9bd9e ("bpf: bpf_fib_lookup return MTU value as output when looked up")
Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240403123303.1452184-1-aspsk@isovalent.com
2024-04-24 15:16:35 -07:00
Tobias Böhm
8d3a3e138b libbpf: Use local bpf_helpers.h include
Commit 20d59ee55172fdf6 ("libbpf: add bpf_core_cast() macro") added a
bpf_helpers include in bpf_core_read.h as a system include. Usually, the
includes are local, though, like in bpf_tracing.h. This commit adjusts
the include to be local as well.

Signed-off-by: Tobias Böhm <tobias@aibor.de>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/q5d5bgc6vty2fmaazd5e73efd6f5bhiru2le6fxn43vkw45bls@fhlw2s5ootdb
2024-04-24 15:16:35 -07:00
David Lechner
9f2853a352 bpf: Fix typo in uapi doc comments
In a few places in the bpf uapi headers, EOPNOTSUPP is missing a "P" in
the doc comments. This adds the missing "P".

Signed-off-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240329152900.398260-2-dlechner@baylibre.com
2024-04-24 15:16:35 -07:00
Yonghong Song
d2f83fb976 libbpf: Handle <orig_name>.llvm.<hash> symbol properly
With CONFIG_LTO_CLANG_THIN enabled, with some of previous
version of kernel code base ([1]), I hit the following
error:
   test_ksyms:PASS:kallsyms_fopen 0 nsec
   test_ksyms:FAIL:ksym_find symbol 'bpf_link_fops' not found
   #118     ksyms:FAIL

The reason is that 'bpf_link_fops' is renamed to
   bpf_link_fops.llvm.8325593422554671469
Due to cross-file inlining, the static variable 'bpf_link_fops'
in syscall.c is used by a function in another file. To avoid
potential duplicated names, the llvm added suffix
'.llvm.<hash>' ([2]) to 'bpf_link_fops' variable.
Such renaming caused a problem in libbpf if 'bpf_link_fops'
is used in bpf prog as a ksym but 'bpf_link_fops' does not
match any symbol in /proc/kallsyms.

To fix this issue, libbpf needs to understand that suffix '.llvm.<hash>'
is caused by clang lto kernel and to process such symbols properly.

With latest bpf-next code base built with CONFIG_LTO_CLANG_THIN,
I cannot reproduce the above failure any more. But such an issue
could happen with other symbols or in the future for bpf_link_fops symbol.

For example, with my current kernel, I got the following from
/proc/kallsyms:
  ffffffff84782154 d __func__.net_ratelimit.llvm.6135436931166841955
  ffffffff85f0a500 d tk_core.llvm.726630847145216431
  ffffffff85fdb960 d __fs_reclaim_map.llvm.10487989720912350772
  ffffffff864c7300 d fake_dst_ops.llvm.54750082607048300

I could not easily create a selftest to test newly-added
libbpf functionality with a static C test since I do not know
which symbol is cross-file inlined. But based on my particular kernel,
the following test change can run successfully.

>  diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c
>  index 6a86d1f07800..904a103f7b1d 100644
>  --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c
>  +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c
>  @@ -42,6 +42,7 @@ void test_ksyms(void)
>          ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops");
>          ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1");
>          ASSERT_EQ(data->out__btf_size, btf_size, "btf_size");
>  +       ASSERT_NEQ(data->out__fake_dst_ops, 0, "fake_dst_ops");
>          ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start");
>
>   cleanup:
>  diff --git a/tools/testing/selftests/bpf/progs/test_ksyms.c b/tools/testing/selftests/bpf/progs/test_ksyms.c
>  index 6c9cbb5a3bdf..fe91eef54b66 100644
>  --- a/tools/testing/selftests/bpf/progs/test_ksyms.c
>  +++ b/tools/testing/selftests/bpf/progs/test_ksyms.c
>  @@ -9,11 +9,13 @@ __u64 out__bpf_link_fops = -1;
>   __u64 out__bpf_link_fops1 = -1;
>   __u64 out__btf_size = -1;
>   __u64 out__per_cpu_start = -1;
>  +__u64 out__fake_dst_ops = -1;
>
>   extern const void bpf_link_fops __ksym;
>   extern const void __start_BTF __ksym;
>   extern const void __stop_BTF __ksym;
>   extern const void __per_cpu_start __ksym;
>  +extern const void fake_dst_ops __ksym;
>   /* non-existing symbol, weak, default to zero */
>   extern const void bpf_link_fops1 __ksym __weak;
>
>  @@ -23,6 +25,7 @@ int handler(const void *ctx)
>          out__bpf_link_fops = (__u64)&bpf_link_fops;
>          out__btf_size = (__u64)(&__stop_BTF - &__start_BTF);
>          out__per_cpu_start = (__u64)&__per_cpu_start;
>  +       out__fake_dst_ops = (__u64)&fake_dst_ops;
>
>          out__bpf_link_fops1 = (__u64)&bpf_link_fops1;

This patch fixed the issue in libbpf such that
the suffix '.llvm.<hash>' will be ignored during comparison of
bpf prog ksym vs. symbols in /proc/kallsyms, this resolved the issue.
Currently, only static variables in /proc/kallsyms are checked
with '.llvm.<hash>' suffix since in bpf programs function ksyms
with '.llvm.<hash>' suffix are most likely kfunc's and unlikely
to be cross-file inlined.

Note that currently kernel does not support gcc build with lto.

  [1] https://lore.kernel.org/bpf/20240302165017.1627295-1-yonghong.song@linux.dev/
  [2] https://github.com/llvm/llvm-project/blob/release/18.x/llvm/include/llvm/IR/ModuleSummaryIndex.h#L1714-L1719

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240326041458.1198161-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Yonghong Song
8b9cb7d479 libbpf: Mark libbpf_kallsyms_parse static function
Currently libbpf_kallsyms_parse() function is declared as a global
function but actually it is not a API and there is no external
users in bpftool/bpf-selftests. So let us mark the function as
static.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240326041453.1197949-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Mykyta Yatsenko
b062410166 bpf: improve error message for unsupported helper
BPF verifier emits "unknown func" message when given BPF program type
does not support BPF helper. This message may be confusing for users, as
important context that helper is unknown only to current program type is
not provided.

This patch changes message to "program of this type cannot use helper "
and aligns dependent code in libbpf and tests. Any suggestions on
improving/changing this message are welcome.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <qmo@kernel.org>
Link: https://lore.kernel.org/r/20240325152210.377548-1-yatsenko@meta.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Anton Protopopov
89d8cdf741 bpf: Add support for passing mark with bpf_fib_lookup
Extend the bpf_fib_lookup() helper by making it to utilize mark if
the BPF_FIB_LOOKUP_MARK flag is set. In order to pass the mark the
four bytes of struct bpf_fib_lookup are used, shared with the
output-only smac/dmac fields.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: David Ahern <dsahern@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240326101742.17421-2-aspsk@isovalent.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-04-24 15:16:35 -07:00
Song Liu
8a2054f417 ci/diffs: Add temporary fix for mitigation config
Upstream is discussing the exact config to ship. In the meanwhile, which
would unblock CI.

More discussions here:

https://lore.kernel.org/lkml/20240423045548.1324969-1-song@kernel.org/T/#u

Signed-off-by: Song Liu <song@kernel.org>
2024-04-24 10:42:01 -07:00
91 changed files with 109510 additions and 95855 deletions

View File

@@ -1,3 +0,0 @@
Thank you for considering a contribution!
Please note that the `libbpf` authoritative source code is developed as part of bpf-next Linux source tree under tools/lib/bpf subdirectory and is periodically synced to Github. As such, all the libbpf changes should be sent to BPF mailing list, please don't open PRs here unless you are changing Github-specific parts of libbpf (e.g., Github-specific Makefile).

View File

@@ -1,31 +0,0 @@
name: 'build-selftests'
description: 'Build BPF selftests'
inputs:
repo-path:
description: 'where is the source code'
required: true
kernel:
description: 'kernel version or LATEST'
required: true
default: 'LATEST'
vmlinux:
description: 'where is vmlinux file'
required: true
default: '${{ github.workspace }}/vmlinux'
runs:
using: "composite"
steps:
- shell: bash
run: |
source $GITHUB_ACTION_PATH/../../../ci/vmtest/helpers.sh
foldable start "Setup Env"
sudo apt-get install -y qemu-kvm zstd binutils-dev elfutils libcap-dev libelf-dev libdw-dev python3-docutils
foldable end
- shell: bash
run: |
export KERNEL=${{ inputs.kernel }}
export REPO_ROOT="${{ github.workspace }}"
export REPO_PATH="${{ inputs.repo-path }}"
export VMLINUX_BTF="${{ inputs.vmlinux }}"
${{ github.action_path }}/build_selftests.sh

View File

@@ -1,60 +0,0 @@
#!/bin/bash
set -euo pipefail
THISDIR="$(cd $(dirname $0) && pwd)"
source ${THISDIR}/helpers.sh
foldable start prepare_selftests "Building selftests"
LIBBPF_PATH="${REPO_ROOT}"
llvm_default_version() {
echo "16"
}
llvm_latest_version() {
echo "17"
}
LLVM_VERSION=$(llvm_default_version)
if [[ "${LLVM_VERSION}" == $(llvm_latest_version) ]]; then
REPO_DISTRO_SUFFIX=""
else
REPO_DISTRO_SUFFIX="-${LLVM_VERSION}"
fi
echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal${REPO_DISTRO_SUFFIX} main" \
| sudo tee /etc/apt/sources.list.d/llvm.list
PREPARE_SELFTESTS_SCRIPT=${THISDIR}/prepare_selftests-${KERNEL}.sh
if [ -f "${PREPARE_SELFTESTS_SCRIPT}" ]; then
(cd "${REPO_ROOT}/${REPO_PATH}/tools/testing/selftests/bpf" && ${PREPARE_SELFTESTS_SCRIPT})
fi
if [[ "${KERNEL}" = 'LATEST' ]]; then
VMLINUX_H=
else
VMLINUX_H=${THISDIR}/vmlinux.h
fi
cd ${REPO_ROOT}/${REPO_PATH}
make headers
make \
CLANG=clang-${LLVM_VERSION} \
LLC=llc-${LLVM_VERSION} \
LLVM_STRIP=llvm-strip-${LLVM_VERSION} \
VMLINUX_BTF="${VMLINUX_BTF}" \
VMLINUX_H=${VMLINUX_H} \
-C "${REPO_ROOT}/${REPO_PATH}/tools/testing/selftests/bpf" \
-j $((4*$(nproc))) > /dev/null
cd -
mkdir ${LIBBPF_PATH}/selftests
cp -R "${REPO_ROOT}/${REPO_PATH}/tools/testing/selftests/bpf" \
${LIBBPF_PATH}/selftests
cd ${LIBBPF_PATH}
rm selftests/bpf/.gitignore
git add selftests
foldable end prepare_selftests

View File

@@ -1,38 +0,0 @@
# shellcheck shell=bash
# $1 - start or end
# $2 - fold identifier, no spaces
# $3 - fold section description
foldable() {
local YELLOW='\033[1;33m'
local NOCOLOR='\033[0m'
if [ $1 = "start" ]; then
line="::group::$2"
if [ ! -z "${3:-}" ]; then
line="$line - ${YELLOW}$3${NOCOLOR}"
fi
else
line="::endgroup::"
fi
echo -e "$line"
}
__print() {
local TITLE=""
if [[ -n $2 ]]; then
TITLE=" title=$2"
fi
echo "::$1${TITLE}::$3"
}
# $1 - title
# $2 - message
print_error() {
__print error $1 $2
}
# $1 - title
# $2 - message
print_notice() {
__print notice $1 $2
}

View File

@@ -1,5 +0,0 @@
#!/bin/bash
printf "all:\n\ttouch bpf_testmod.ko\n\nclean:\n" > bpf_testmod/Makefile
printf "all:\n\ttouch bpf_test_no_cfi.ko\n\nclean:\n" > bpf_test_no_cfi/Makefile

View File

@@ -1,5 +0,0 @@
#!/bin/bash
printf "all:\n\ttouch bpf_testmod.ko\n\nclean:\n" > bpf_testmod/Makefile
printf "all:\n\ttouch bpf_test_no_cfi.ko\n\nclean:\n" > bpf_test_no_cfi/Makefile

File diff suppressed because it is too large Load Diff

View File

@@ -1,118 +0,0 @@
name: 'vmtest'
description: 'Build + run vmtest'
inputs:
kernel:
description: 'kernel version or LATEST'
required: true
default: 'LATEST'
arch:
description: 'what arch to test'
required: true
default: 'x86_64'
pahole:
description: 'pahole rev or master'
required: true
default: 'master'
runs:
using: "composite"
steps:
# Allow CI user to access /dev/kvm (via qemu) w/o group change/relogin
# by changing permissions set by udev.
- name: Set /dev/kvm permissions
shell: bash
run: |
if [ -e /dev/kvm ]; then
echo "/dev/kvm exists"
if [ $(id -u) != 0 ]; then
echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
| sudo tee /etc/udev/rules.d/99-kvm4all.rules > /dev/null
sudo udevadm control --reload-rules
sudo udevadm trigger --name-match=kvm
fi
else
echo "/dev/kvm does not exist"
fi
# setup environment
- name: Setup environment
uses: libbpf/ci/setup-build-env@main
with:
pahole: ${{ inputs.pahole }}
arch: ${{ inputs.arch }}
# 1. download CHECKPOINT kernel source
- name: Get checkpoint commit
shell: bash
run: |
cat CHECKPOINT-COMMIT
echo "CHECKPOINT=$(cat CHECKPOINT-COMMIT)" >> $GITHUB_ENV
- name: Get kernel source at checkpoint
uses: libbpf/ci/get-linux-source@main
with:
repo: 'https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git'
rev: ${{ env.CHECKPOINT }}
dest: '${{ github.workspace }}/.kernel'
- name: Patch kernel source
uses: libbpf/ci/patch-kernel@main
with:
patches-root: '${{ github.workspace }}/ci/diffs'
repo-root: '.kernel'
- name: Prepare to build BPF selftests
shell: bash
run: |
source $GITHUB_ACTION_PATH/../../../ci/vmtest/helpers.sh
foldable start "Prepare building selftest"
cd .kernel
cat tools/testing/selftests/bpf/config \
tools/testing/selftests/bpf/config.${{ inputs.arch }} > .config
# this file might or mihgt not exist depending on kernel version
cat tools/testing/selftests/bpf/config.vm >> .config || :
make olddefconfig && make prepare
cd -
foldable end
# 2. if kernel == LATEST, build kernel image from tree
- name: Build kernel image
if: ${{ inputs.kernel == 'LATEST' }}
shell: bash
run: |
source $GITHUB_ACTION_PATH/../../../ci/vmtest/helpers.sh
foldable start "Build Kernel Image"
cd .kernel
make -j $((4*$(nproc))) all > /dev/null
cp vmlinux ${{ github.workspace }}
cd -
foldable end
# else, just download prebuilt kernel image
- name: Download prebuilt kernel
if: ${{ inputs.kernel != 'LATEST' }}
uses: libbpf/ci/download-vmlinux@main
with:
kernel: ${{ inputs.kernel }}
arch: ${{ inputs.arch }}
# 3. build selftests
- name: Build BPF selftests
uses: ./.github/actions/build-selftests
with:
repo-path: '.kernel'
kernel: ${{ inputs.kernel }}
# 4. prepare rootfs
- name: prepare rootfs
uses: libbpf/ci/prepare-rootfs@main
env:
KBUILD_OUTPUT: '.kernel'
with:
project-name: 'libbpf'
arch: ${{ inputs.arch }}
kernel: ${{ inputs.kernel }}
kernel-root: '.kernel'
kbuild-output: ${{ env.KBUILD_OUTPUT }}
image-output: '/tmp/root.img'
# 5. run selftest in QEMU
- name: Run selftests
env:
KERNEL: ${{ inputs.kernel }}
REPO_ROOT: ${{ github.workspace }}
uses: libbpf/ci/run-qemu@main
with:
arch: ${{ inputs.arch }}
img: '/tmp/root.img'
vmlinuz: 'vmlinuz'
kernel-root: '.kernel'

View File

@@ -53,7 +53,7 @@ jobs:
ubuntu:
runs-on: ubuntu-latest
name: Ubuntu Focal Build (${{ matrix.arch }})
name: Ubuntu Build (${{ matrix.arch }})
strategy:
fail-fast: false
matrix:
@@ -61,31 +61,32 @@ jobs:
- arch: aarch64
- arch: ppc64le
- arch: s390x
- arch: x86
- arch: amd64
steps:
- uses: actions/checkout@v4
name: Checkout
- name: Setup QEMU
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v8.1.5
- uses: ./.github/actions/setup
name: Pre-Setup
- run: source /tmp/ci_setup && sudo -E $CI_ROOT/managers/ubuntu.sh
if: matrix.arch == 'x86'
if: matrix.arch == 'amd64'
name: Setup
- uses: uraimo/run-on-arch-action@v2.7.1
name: Build in docker
if: matrix.arch != 'x86'
with:
distro:
ubuntu20.04
arch:
${{ matrix.arch }}
setup:
cp /tmp/ci_setup $GITHUB_WORKSPACE
dockerRunArgs: |
--volume "${GITHUB_WORKSPACE}:${GITHUB_WORKSPACE}"
shell: /bin/bash
install: |
export DEBIAN_FRONTEND=noninteractive
export TZ="America/Los_Angeles"
apt-get update -y
apt-get install -y tzdata build-essential sudo
run: source ${GITHUB_WORKSPACE}/ci_setup && $CI_ROOT/managers/ubuntu.sh
- name: Build in docker
if: matrix.arch != 'amd64'
run: |
cp /tmp/ci_setup ${GITHUB_WORKSPACE}
docker run --rm \
--platform linux/${{ matrix.arch }} \
-v ${GITHUB_WORKSPACE}:${GITHUB_WORKSPACE} \
-e GITHUB_WORKSPACE=${GITHUB_WORKSPACE} \
-w /ci/workspace \
ubuntu:noble \
${GITHUB_WORKSPACE}/ci/build-in-docker.sh

View File

@@ -33,7 +33,7 @@ jobs:
dry-run: false
sanitizer: ${{ matrix.sanitizer }}
- name: Upload Crash
uses: actions/upload-artifact@v1
uses: actions/upload-artifact@v4
if: failure() && steps.build.outcome == 'success'
with:
name: ${{ matrix.sanitizer }}-artifacts

View File

@@ -17,7 +17,7 @@ permissions:
jobs:
analyze:
name: Analyze
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ matrix.language }}-${{ github.ref }}
cancel-in-progress: true

View File

@@ -1,30 +1,30 @@
name: libbpf-ci-coverity
on:
push:
branches:
- master
schedule:
- cron: '0 18 * * *'
jobs:
coverity:
runs-on: ubuntu-latest
if: github.repository == 'libbpf/libbpf'
name: Coverity
env:
COVERITY_SCAN_TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }}
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/setup
- name: Run coverity
if: ${{ env.COVERITY_SCAN_TOKEN }}
run: |
source "${GITHUB_WORKSPACE}"/ci/vmtest/helpers.sh
foldable start "Setup CI env"
source /tmp/ci_setup
export COVERITY_SCAN_NOTIFICATION_EMAIL="${AUTHOR_EMAIL}"
export COVERITY_SCAN_BRANCH_PATTERN=${GITHUB_REF##refs/*/}
export TRAVIS_BRANCH=${COVERITY_SCAN_BRANCH_PATTERN}
foldable end
scripts/coverity.sh
env:
COVERITY_SCAN_TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }}
COVERITY_SCAN_PROJECT_NAME: libbpf
COVERITY_SCAN_BUILD_COMMAND_PREPEND: 'cd src/'
COVERITY_SCAN_BUILD_COMMAND: 'make'

View File

@@ -3,34 +3,29 @@ name: ondemand
on:
workflow_dispatch:
inputs:
kernel-origin:
description: 'git repo for linux kernel'
default: 'https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git'
arch:
default: 'x86_64'
required: true
kernel-rev:
description: 'rev/tag/branch for linux kernel'
llvm-version:
default: '18'
required: true
kernel:
default: 'LATEST'
required: true
pahole:
default: "master"
required: true
pahole-origin:
description: 'git repo for pahole'
default: 'https://git.kernel.org/pub/scm/devel/pahole/pahole.git'
required: true
pahole-rev:
description: 'ref/tag/branch for pahole'
default: "master"
runs-on:
default: 'ubuntu-24.04'
required: true
jobs:
vmtest:
runs-on: ubuntu-latest
name: vmtest with customized pahole/Kernel
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/setup
- uses: ./.github/actions/vmtest
with:
kernel: 'LATEST'
kernel-rev: ${{ github.event.inputs.kernel-rev }}
kernel-origin: ${{ github.event.inputs.kernel-origin }}
pahole: ${{ github.event.inputs.pahole-rev }}
pahole-origin: ${{ github.event.inputs.pahole-origin }}
name: ${{ inputs.kernel }} kernel llvm-${{ inputs.llvm-version }} pahole@${{ inputs.pahole }}
uses: ./.github/workflows/vmtest.yml
with:
runs_on: ${{ inputs.runs-on }}
kernel: ${{ inputs.kernel }}
arch: ${{ inputs.arch }}
llvm-version: ${{ inputs.llvm-version }}
pahole: ${{ inputs.pahole }}

View File

@@ -1,20 +0,0 @@
name: pahole-staging
on:
schedule:
- cron: '0 18 * * *'
jobs:
vmtest:
runs-on: ubuntu-20.04
name: Kernel LATEST + staging pahole
env:
STAGING: tmp.master
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/setup
- uses: ./.github/actions/vmtest
with:
kernel: LATEST
pahole: $STAGING

View File

@@ -1,42 +1,31 @@
name: libbpf-ci
on:
on:
pull_request:
push:
schedule:
- cron: '0 18 * * *'
concurrency:
concurrency:
group: ci-test-${{ github.head_ref }}
cancel-in-progress: true
jobs:
vmtest:
runs-on: ${{ matrix.runs_on }}
name: Kernel ${{ matrix.kernel }} on ${{ matrix.runs_on }} + selftests
strategy:
fail-fast: false
matrix:
include:
- kernel: 'LATEST'
runs_on: ubuntu-20.04
runs_on: 'ubuntu-24.04'
arch: 'x86_64'
- kernel: '5.5.0'
runs_on: ubuntu-20.04
arch: 'x86_64'
- kernel: '4.9.0'
runs_on: ubuntu-20.04
arch: 'x86_64'
- kernel: 'LATEST'
runs_on: s390x
arch: 's390x'
steps:
- uses: actions/checkout@v4
name: Checkout
- uses: ./.github/actions/setup
name: Setup
- uses: ./.github/actions/vmtest
name: vmtest
with:
kernel: ${{ matrix.kernel }}
arch: ${{ matrix.arch }}
llvm-version: '21'
pahole: 'master'
name: Linux ${{ matrix.kernel }} llvm-${{ matrix.llvm-version }}
uses: ./.github/workflows/vmtest.yml
with:
runs_on: ${{ matrix.runs_on }}
kernel: ${{ matrix.kernel }}
arch: ${{ matrix.arch }}
llvm-version: ${{ matrix.llvm-version }}
pahole: ${{ matrix.pahole }}

117
.github/workflows/vmtest.yml vendored Normal file
View File

@@ -0,0 +1,117 @@
name: 'Build kernel and selftests/bpf, run selftests via vmtest'
on:
workflow_call:
inputs:
runs_on:
required: true
default: 'ubuntu-24.04'
type: string
arch:
description: 'what arch to test'
required: true
default: 'x86_64'
type: string
kernel:
description: 'kernel version or LATEST'
required: true
default: 'LATEST'
type: string
pahole:
description: 'pahole rev or branch'
required: false
default: 'master'
type: string
llvm-version:
description: 'llvm version'
required: false
default: '18'
type: string
jobs:
vmtest:
name: pahole@${{ inputs.pahole }}
runs-on: ${{ inputs.runs_on }}
steps:
- uses: actions/checkout@v4
- name: Setup environment
uses: libbpf/ci/setup-build-env@v3
with:
pahole: ${{ inputs.pahole }}
arch: ${{ inputs.arch }}
llvm-version: ${{ inputs.llvm-version }}
- name: Get checkpoint commit
shell: bash
run: |
cat CHECKPOINT-COMMIT
echo "CHECKPOINT=$(cat CHECKPOINT-COMMIT)" >> $GITHUB_ENV
- name: Get kernel source at checkpoint
uses: libbpf/ci/get-linux-source@v3
with:
repo: 'https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git'
rev: ${{ env.CHECKPOINT }}
dest: '${{ github.workspace }}/.kernel'
- name: Patch kernel source
uses: libbpf/ci/patch-kernel@v3
with:
patches-root: '${{ github.workspace }}/ci/diffs'
repo-root: '.kernel'
- name: Configure kernel build
shell: bash
run: |
cd .kernel
cat tools/testing/selftests/bpf/config \
tools/testing/selftests/bpf/config.${{ inputs.arch }} > .config
# this file might or might not exist depending on kernel version
cat tools/testing/selftests/bpf/config.vm >> .config || :
make olddefconfig && make prepare
cd -
- name: Build kernel image
if: ${{ inputs.kernel == 'LATEST' }}
shell: bash
run: |
cd .kernel
make -j $((4*$(nproc))) all
cp vmlinux ${{ github.workspace }}
cd -
- name: Download prebuilt kernel
if: ${{ inputs.kernel != 'LATEST' }}
uses: libbpf/ci/download-vmlinux@v3
with:
kernel: ${{ inputs.kernel }}
arch: ${{ inputs.arch }}
- name: Build selftests/bpf
uses: libbpf/ci/build-selftests@v3
env:
MAX_MAKE_JOBS: 32
VMLINUX_BTF: ${{ github.workspace }}/vmlinux
VMLINUX_H: ${{ inputs.kernel != 'LATEST' && format('{0}/.github/actions/build-selftests/vmlinux.h', github.workspace) || '' }}
with:
arch: ${{ inputs.arch }}
kernel-root: ${{ github.workspace }}/.kernel
llvm-version: ${{ inputs.llvm-version }}
- name: Run selftests
env:
ALLOWLIST_FILE: /tmp/allowlist
DENYLIST_FILE: /tmp/denylist
KERNEL: ${{ inputs.kernel }}
VMLINUX: ${{ github.workspace }}/vmlinux
LLVM_VERSION: ${{ inputs.llvm-version }}
SELFTESTS_BPF: ${{ github.workspace }}/.kernel/tools/testing/selftests/bpf
VMTEST_CONFIGS: ${{ github.workspace }}/ci/vmtest/configs
uses: libbpf/ci/run-vmtest@v3
with:
arch: ${{ inputs.arch }}
kbuild-output: ${{ github.workspace }}/.kernel
kernel-root: ${{ github.workspace }}/.kernel
vmlinuz: ${{ inputs.arch }}/vmlinuz-${{ inputs.kernel }}

24
.mailmap Normal file
View File

@@ -0,0 +1,24 @@
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
Antoine Tenart <atenart@kernel.org> <antoine.tenart@bootlin.com>
Benjamin Tissoires <bentiss@kernel.org> <benjamin.tissoires@redhat.com>
Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
Changbin Du <changbin.du@intel.com> <changbin.du@gmail.com>
Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
Dan Carpenter <error27@gmail.com> <dan.carpenter@oracle.com>
Geliang Tang <geliang@kernel.org> <geliang.tang@suse.com>
Herbert Xu <herbert@gondor.apana.org.au>
Jakub Kicinski <kuba@kernel.org> <jakub.kicinski@netronome.com>
Jean-Philippe Brucker <jpb@kernel.org> <jean-philippe@linaro.org>
Jesper Dangaard Brouer <hawk@kernel.org> <brouer@redhat.com>
Kees Cook <kees@kernel.org> <keescook@chromium.org>
Kuniyuki Iwashima <kuniyu@google.com> <kuniyu@amazon.co.jp>
Leo Yan <leo.yan@linux.dev> <leo.yan@linaro.org>
Mark Starovoytov <mstarovo@pm.me> <mstarovoitov@marvell.com>
Maxim Mikityanskiy <maxtram95@gmail.com> <maximmi@mellanox.com>
Maxim Mikityanskiy <maxtram95@gmail.com> <maximmi@nvidia.com>
Puranjay Mohan <puranjay@kernel.org> <puranjay12@gmail.com>
Quentin Monnet <qmo@kernel.org> <quentin@isovalent.com>
Quentin Monnet <qmo@kernel.org> <quentin.monnet@netronome.com>
Stanislav Fomichev <sdf@fomichev.me> <sdf@google.com>
Vadim Fedorenko <vadim.fedorenko@linux.dev> <vadfed@meta.com>
Vadim Fedorenko <vadim.fedorenko@linux.dev> <vfedorenko@novek.ru>

View File

@@ -1 +1 @@
443574b033876c85a35de4c65c14f7fe092222b2
22cc16c04b7893d8fc22810599f49a305d600b9e

View File

@@ -1 +1 @@
14bb1e8c8d4ad5d9d2febb7d19c70a3cf536e1e5
08a7491843224f8b96518fbe70d9e48163046054

14
ci/build-in-docker.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
export TZ="America/Los_Angeles"
apt-get update -y
apt-get install -y tzdata build-essential sudo
source ${GITHUB_WORKSPACE}/ci_setup
$CI_ROOT/managers/ubuntu.sh
exit 0

View File

@@ -0,0 +1,85 @@
From e3a4f5092e847ec00e2b66c060f2cef52b8d0177 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@pm.me>
Date: Thu, 14 Nov 2024 12:49:34 -0800
Subject: [PATCH bpf-next] selftests/bpf: set test path for
token/obj_priv_implicit_token_envvar
token/obj_priv_implicit_token_envvar test may fail in an environment
where the process executing tests can not write to the root path.
Example:
https://github.com/libbpf/libbpf/actions/runs/11844507007/job/33007897936
Change default path used by the test to /tmp/bpf-token-fs, and make it
runtime configurable via an environment variable.
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
---
tools/testing/selftests/bpf/prog_tests/token.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c
index fe86e4fdb89c..39f5414b674b 100644
--- a/tools/testing/selftests/bpf/prog_tests/token.c
+++ b/tools/testing/selftests/bpf/prog_tests/token.c
@@ -828,8 +828,11 @@ static int userns_obj_priv_btf_success(int mnt_fd, struct token_lsm *lsm_skel)
return validate_struct_ops_load(mnt_fd, true /* should succeed */);
}
+static const char* token_bpffs_custom_dir() {
+ return getenv("BPF_SELFTESTS_BPF_TOKEN_DIR") ? : "/tmp/bpf-token-fs";
+}
+
#define TOKEN_ENVVAR "LIBBPF_BPF_TOKEN_PATH"
-#define TOKEN_BPFFS_CUSTOM "/bpf-token-fs"
static int userns_obj_priv_implicit_token(int mnt_fd, struct token_lsm *lsm_skel)
{
@@ -892,6 +895,7 @@ static int userns_obj_priv_implicit_token(int mnt_fd, struct token_lsm *lsm_skel
static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *lsm_skel)
{
+ const char *custom_dir = token_bpffs_custom_dir();
LIBBPF_OPTS(bpf_object_open_opts, opts);
struct dummy_st_ops_success *skel;
int err;
@@ -909,10 +913,10 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l
* BPF token implicitly, unless pointed to it through
* LIBBPF_BPF_TOKEN_PATH envvar
*/
- rmdir(TOKEN_BPFFS_CUSTOM);
- if (!ASSERT_OK(mkdir(TOKEN_BPFFS_CUSTOM, 0777), "mkdir_bpffs_custom"))
+ rmdir(custom_dir);
+ if (!ASSERT_OK(mkdir(custom_dir, 0777), "mkdir_bpffs_custom"))
goto err_out;
- err = sys_move_mount(mnt_fd, "", AT_FDCWD, TOKEN_BPFFS_CUSTOM, MOVE_MOUNT_F_EMPTY_PATH);
+ err = sys_move_mount(mnt_fd, "", AT_FDCWD, custom_dir, MOVE_MOUNT_F_EMPTY_PATH);
if (!ASSERT_OK(err, "move_mount_bpffs"))
goto err_out;
@@ -925,7 +929,7 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l
goto err_out;
}
- err = setenv(TOKEN_ENVVAR, TOKEN_BPFFS_CUSTOM, 1 /*overwrite*/);
+ err = setenv(TOKEN_ENVVAR, custom_dir, 1 /*overwrite*/);
if (!ASSERT_OK(err, "setenv_token_path"))
goto err_out;
@@ -951,11 +955,11 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l
if (!ASSERT_ERR(err, "obj_empty_token_path_load"))
goto err_out;
- rmdir(TOKEN_BPFFS_CUSTOM);
+ rmdir(custom_dir);
unsetenv(TOKEN_ENVVAR);
return 0;
err_out:
- rmdir(TOKEN_BPFFS_CUSTOM);
+ rmdir(custom_dir);
unsetenv(TOKEN_ENVVAR);
return -EINVAL;
}
--
2.47.0

View File

@@ -1,56 +0,0 @@
From f267f262815033452195f46c43b572159262f533 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 5 Mar 2024 10:08:28 +0100
Subject: [PATCH 2/2] xdp, bonding: Fix feature flags when there are no slave
devs anymore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Commit 9b0ed890ac2a ("bonding: do not report NETDEV_XDP_ACT_XSK_ZEROCOPY")
changed the driver from reporting everything as supported before a device
was bonded into having the driver report that no XDP feature is supported
until a real device is bonded as it seems to be more truthful given
eventually real underlying devices decide what XDP features are supported.
The change however did not take into account when all slave devices get
removed from the bond device. In this case after 9b0ed890ac2a, the driver
keeps reporting a feature mask of 0x77, that is, NETDEV_XDP_ACT_MASK &
~NETDEV_XDP_ACT_XSK_ZEROCOPY whereas it should have reported a feature
mask of 0.
Fix it by resetting XDP feature flags in the same way as if no XDP program
is attached to the bond device. This was uncovered by the XDP bond selftest
which let BPF CI fail. After adjusting the starting masks on the latter
to 0 instead of NETDEV_XDP_ACT_MASK the test passes again together with
this fix.
Fixes: 9b0ed890ac2a ("bonding: do not report NETDEV_XDP_ACT_XSK_ZEROCOPY")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Magnus Karlsson <magnus.karlsson@intel.com>
Cc: Prashant Batra <prbatra.mail@gmail.com>
Cc: Toke Høiland-Jørgensen <toke@redhat.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Message-ID: <20240305090829.17131-1-daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
drivers/net/bonding/bond_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a11748b8d69b..cd0683bcca03 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1811,7 +1811,7 @@ void bond_xdp_set_features(struct net_device *bond_dev)
ASSERT_RTNL();
- if (!bond_xdp_check(bond)) {
+ if (!bond_xdp_check(bond) || !bond_has_slaves(bond)) {
xdp_clear_features_flag(bond_dev);
return;
}
--
2.43.0

View File

@@ -0,0 +1,69 @@
From bd06a13f44e15e2e83561ea165061c445a15bd9e Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Thu, 27 Mar 2025 11:55:28 -0700
Subject: [PATCH 4000/4002] selftests/bpf: Fix tests after fields reorder in
struct file
The change in struct file [1] moved f_ref to the 3rd cache line.
It made *(u64 *)file dereference invalid from the verifier point of view,
because btf_struct_walk() walks into f_lock field, which is 4-byte long.
Fix the selftests to deference the file pointer as a 4-byte access.
[1] commit e249056c91a2 ("fs: place f_ref to 3rd cache line in struct file to resolve false sharing")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250327185528.1740787-1-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
tools/testing/selftests/bpf/progs/test_module_attach.c | 2 +-
tools/testing/selftests/bpf/progs/test_subprogs_extable.c | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c
index fb07f5773888..7f3c233943b3 100644
--- a/tools/testing/selftests/bpf/progs/test_module_attach.c
+++ b/tools/testing/selftests/bpf/progs/test_module_attach.c
@@ -117,7 +117,7 @@ int BPF_PROG(handle_fexit_ret, int arg, struct file *ret)
bpf_probe_read_kernel(&buf, 8, ret);
bpf_probe_read_kernel(&buf, 8, (char *)ret + 256);
- *(volatile long long *)ret;
+ *(volatile int *)ret;
*(volatile int *)&ret->f_mode;
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c
index e2a21fbd4e44..dcac69f5928a 100644
--- a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c
+++ b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c
@@ -21,7 +21,7 @@ static __u64 test_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data)
SEC("fexit/bpf_testmod_return_ptr")
int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret)
{
- *(volatile long *)ret;
+ *(volatile int *)ret;
*(volatile int *)&ret->f_mode;
bpf_for_each_map_elem(&test_array, test_cb, NULL, 0);
triggered++;
@@ -31,7 +31,7 @@ int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret)
SEC("fexit/bpf_testmod_return_ptr")
int BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret)
{
- *(volatile long *)ret;
+ *(volatile int *)ret;
*(volatile int *)&ret->f_mode;
bpf_for_each_map_elem(&test_array, test_cb, NULL, 0);
triggered++;
@@ -41,7 +41,7 @@ int BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret)
SEC("fexit/bpf_testmod_return_ptr")
int BPF_PROG(handle_fexit_ret_subprogs3, int arg, struct file *ret)
{
- *(volatile long *)ret;
+ *(volatile int *)ret;
*(volatile int *)&ret->f_mode;
bpf_for_each_map_elem(&test_array, test_cb, NULL, 0);
triggered++;
--
2.49.0

View File

@@ -0,0 +1,71 @@
From 8be3a12f9f266aaf3f06f0cfe0e90cfe4d956f3d Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 28 Mar 2025 12:31:24 -0700
Subject: [PATCH 4001/4002] selftests/bpf: Fix verifier_bpf_fastcall test
Commit [1] moves percpu data on x86 from address 0x000... to address
0xfff...
Before [1]:
159020: 0000000000030700 0 OBJECT GLOBAL DEFAULT 23 pcpu_hot
After [1]:
152602: ffffffff83a3e034 4 OBJECT GLOBAL DEFAULT 35 pcpu_hot
As a result, verifier_bpf_fastcall tests should now expect a negative
value for pcpu_hot, IOW, the disassemble should show "r=" instead of
"w=".
Fix this in the test.
Note that, a later change created a new variable "cpu_number" for
bpf_get_smp_processor_id() [2]. The inlining logic is updated properly
as part of this change, so there is no need to fix anything on the
kernel side.
[1] commit 9d7de2aa8b41 ("x86/percpu/64: Use relative percpu offsets")
[2] commit 01c7bc5198e9 ("x86/smp: Move cpu number to percpu hot section")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250328193124.808784-1-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c
index a9be6ae49454..c258b0722e04 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c
@@ -12,7 +12,7 @@ SEC("raw_tp")
__arch_x86_64
__log_level(4) __msg("stack depth 8")
__xlated("4: r5 = 5")
-__xlated("5: w0 = ")
+__xlated("5: r0 = ")
__xlated("6: r0 = &(void __percpu *)(r0)")
__xlated("7: r0 = *(u32 *)(r0 +0)")
__xlated("8: exit")
@@ -704,7 +704,7 @@ SEC("raw_tp")
__arch_x86_64
__log_level(4) __msg("stack depth 32+0")
__xlated("2: r1 = 1")
-__xlated("3: w0 =")
+__xlated("3: r0 =")
__xlated("4: r0 = &(void __percpu *)(r0)")
__xlated("5: r0 = *(u32 *)(r0 +0)")
/* bpf_loop params setup */
@@ -753,7 +753,7 @@ __arch_x86_64
__log_level(4) __msg("stack depth 40+0")
/* call bpf_get_smp_processor_id */
__xlated("2: r1 = 42")
-__xlated("3: w0 =")
+__xlated("3: r0 =")
__xlated("4: r0 = &(void __percpu *)(r0)")
__xlated("5: r0 = *(u32 *)(r0 +0)")
/* call bpf_get_prandom_u32 */
--
2.49.0

View File

@@ -0,0 +1,71 @@
From 07be1f644ff9eeb842fd0490ddd824df0828cb0e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 30 Mar 2025 20:38:28 -0700
Subject: [PATCH 4002/4002] selftests/bpf: Fix verifier_private_stack test
failure
Several verifier_private_stack tests failed with latest bpf-next.
For example, for 'Private stack, single prog' subtest, the
jitted code:
func #0:
0: f3 0f 1e fa endbr64
4: 0f 1f 44 00 00 nopl (%rax,%rax)
9: 0f 1f 00 nopl (%rax)
c: 55 pushq %rbp
d: 48 89 e5 movq %rsp, %rbp
10: f3 0f 1e fa endbr64
14: 49 b9 58 74 8a 8f 7d 60 00 00 movabsq $0x607d8f8a7458, %r9
1e: 65 4c 03 0c 25 28 c0 48 87 addq %gs:-0x78b73fd8, %r9
27: bf 2a 00 00 00 movl $0x2a, %edi
2c: 49 89 b9 00 ff ff ff movq %rdi, -0x100(%r9)
33: 31 c0 xorl %eax, %eax
35: c9 leave
36: e9 20 5d 0f e1 jmp 0xffffffffe10f5d5b
The insn 'addq %gs:-0x78b73fd8, %r9' does not match the expected
regex 'addq %gs:0x{{.*}}, %r9' and this caused test failure.
Fix it by changing '%gs:0x{{.*}}' to '%gs:{{.*}}' to accommodate the
possible negative offset. A few other subtests are fixed in a similar way.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250331033828.365077-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
tools/testing/selftests/bpf/progs/verifier_private_stack.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
index b1fbdf119553..fc91b414364e 100644
--- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
@@ -27,7 +27,7 @@ __description("Private stack, single prog")
__success
__arch_x86_64
__jited(" movabsq $0x{{.*}}, %r9")
-__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" addq %gs:{{.*}}, %r9")
__jited(" movl $0x2a, %edi")
__jited(" movq %rdi, -0x100(%r9)")
__naked void private_stack_single_prog(void)
@@ -74,7 +74,7 @@ __success
__arch_x86_64
/* private stack fp for the main prog */
__jited(" movabsq $0x{{.*}}, %r9")
-__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" addq %gs:{{.*}}, %r9")
__jited(" movl $0x2a, %edi")
__jited(" movq %rdi, -0x200(%r9)")
__jited(" pushq %r9")
@@ -122,7 +122,7 @@ __jited(" pushq %rbp")
__jited(" movq %rsp, %rbp")
__jited(" endbr64")
__jited(" movabsq $0x{{.*}}, %r9")
-__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" addq %gs:{{.*}}, %r9")
__jited(" pushq %r9")
__jited(" callq")
__jited(" popq %r9")
--
2.49.0

View File

@@ -1,8 +0,0 @@
# btf_dump -- need to disable data dump sub-tests
core_retro
cpu_mask
hashmap
legacy_printk
perf_buffer
section_names

View File

@@ -1,50 +0,0 @@
# attach_probe
autoload
bpf_verif_scale
cgroup_attach_autodetach
cgroup_attach_override
core_autosize
core_extern
core_read_macros
core_reloc
core_retro
cpu_mask
endian
get_branch_snapshot
get_stackid_cannot_attach
global_data
global_data_init
global_func_args
hashmap
legacy_printk
linked_funcs
linked_maps
map_lock
obj_name
perf_buffer
perf_event_stackmap
pinning
pkt_md_access
probe_user
queue_stack_map
raw_tp_writable_reject_nbd_invalid
raw_tp_writable_test_run
rdonly_maps
section_names
signal_pending
sockmap_ktls
spinlock
stacktrace_map
stacktrace_map_raw_tp
static_linked
task_fd_query_rawtp
task_fd_query_tp
tc_bpf
tcp_estats
tcp_rtt
test_global_funcs/arg_tag_ctx*
tp_attach_query
usdt/urand_pid_attach
xdp
xdp_noinline
xdp_perf

View File

@@ -1,5 +1,6 @@
# TEMPORARY
btf_dump/btf_dump: syntax
bpf_cookie/perf_event
kprobe_multi_bench_attach
core_reloc/enum64val
core_reloc/size___diff_sz
@@ -12,3 +13,5 @@ xdp_bonding/xdp_bonding_features # started failing after net merge from 359e
tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets")
migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation
migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation
verify_pkcs7_sig # keeps failing
verif_scale_pyperf600 # fails on newer Clangs

View File

@@ -1,5 +0,0 @@
# This complements ALLOWLIST-5.5.0 but excludes subtest that can't work on 5.5
btf # "size check test", "func (Non zero vlen)"
tailcalls # tailcall_bpf2bpf_1, tailcall_bpf2bpf_2, tailcall_bpf2bpf_3
tc_bpf/tc_bpf_non_root

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# This file is sourced by libbpf/ci/run-vmtest Github Action scripts.
# $SELFTESTS_BPF and $VMTEST_CONFIGS are set in the workflow, before
# libbpf/ci/run-vmtest action is called
# See .github/workflows/kernel-test.yml
ALLOWLIST_FILES=(
"${SELFTESTS_BPF}/ALLOWLIST"
"${SELFTESTS_BPF}/ALLOWLIST.${ARCH}"
"${VMTEST_CONFIGS}/ALLOWLIST"
"${VMTEST_CONFIGS}/ALLOWLIST-${KERNEL}"
"${VMTEST_CONFIGS}/ALLOWLIST-${KERNEL}.${ARCH}"
)
DENYLIST_FILES=(
"${SELFTESTS_BPF}/DENYLIST"
"${SELFTESTS_BPF}/DENYLIST.${ARCH}"
"${VMTEST_CONFIGS}/DENYLIST"
"${VMTEST_CONFIGS}/DENYLIST-${KERNEL}"
"${VMTEST_CONFIGS}/DENYLIST-${KERNEL}.${ARCH}"
)
# Export pipe-separated strings, because bash doesn't support array export
export SELFTESTS_BPF_ALLOWLIST_FILES=$(IFS="|"; echo "${ALLOWLIST_FILES[*]}")
export SELFTESTS_BPF_DENYLIST_FILES=$(IFS="|"; echo "${DENYLIST_FILES[*]}")
if [[ "${LLVM_VERSION}" -lt 18 ]]; then
echo "KERNEL_TEST=test_progs test_progs_no_alu32 test_maps test_verifier" >> $GITHUB_ENV
else # all
echo "KERNEL_TEST=test_progs test_progs_cpuv4 test_progs_no_alu32 test_maps test_verifier" >> $GITHUB_ENV
fi
echo "cp -R ${SELFTESTS_BPF} ${GITHUB_WORKSPACE}/selftests"
mkdir -p "${GITHUB_WORKSPACE}/selftests"
cp -R "${SELFTESTS_BPF}" "${GITHUB_WORKSPACE}/selftests"

View File

@@ -1,38 +0,0 @@
# shellcheck shell=bash
# $1 - start or end
# $2 - fold identifier, no spaces
# $3 - fold section description
foldable() {
local YELLOW='\033[1;33m'
local NOCOLOR='\033[0m'
if [ $1 = "start" ]; then
line="::group::$2"
if [ ! -z "${3:-}" ]; then
line="$line - ${YELLOW}$3${NOCOLOR}"
fi
else
line="::endgroup::"
fi
echo -e "$line"
}
__print() {
local TITLE=""
if [[ -n $2 ]]; then
TITLE=" title=$2"
fi
echo "::$1${TITLE}::$3"
}
# $1 - title
# $2 - message
print_error() {
__print error $1 $2
}
# $1 - title
# $2 - message
print_notice() {
__print notice $1 $2
}

View File

@@ -1,94 +0,0 @@
#!/bin/bash
set -euo pipefail
source $(cd $(dirname $0) && pwd)/helpers.sh
ARCH=$(uname -m)
STATUS_FILE=/exitstatus
read_lists() {
(for path in "$@"; do
if [[ -s "$path" ]]; then
cat "$path"
fi;
done) | cut -d'#' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -s '\n' ','
}
test_progs() {
if [[ "${KERNEL}" != '4.9.0' ]]; then
foldable start test_progs "Testing test_progs"
# "&& true" does not change the return code (it is not executed
# if the Python script fails), but it prevents exiting on a
# failure due to the "set -e".
./test_progs ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true
echo "test_progs:$?" >> "${STATUS_FILE}"
foldable end test_progs
fi
}
test_progs_no_alu32() {
foldable start test_progs-no_alu32 "Testing test_progs-no_alu32"
./test_progs-no_alu32 ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true
echo "test_progs-no_alu32:$?" >> "${STATUS_FILE}"
foldable end test_progs-no_alu32
}
test_maps() {
if [[ "${KERNEL}" == 'latest' ]]; then
foldable start test_maps "Testing test_maps"
./test_maps && true
echo "test_maps:$?" >> "${STATUS_FILE}"
foldable end test_maps
fi
}
test_verifier() {
if [[ "${KERNEL}" == 'latest' ]]; then
foldable start test_verifier "Testing test_verifier"
./test_verifier && true
echo "test_verifier:$?" >> "${STATUS_FILE}"
foldable end test_verifier
fi
}
foldable end vm_init
foldable start kernel_config "Kconfig"
zcat /proc/config.gz
foldable end kernel_config
configs_path=/${PROJECT_NAME}/selftests/bpf
local_configs_path=${PROJECT_NAME}/vmtest/configs
DENYLIST=$(read_lists \
"$configs_path/DENYLIST" \
"$configs_path/DENYLIST.${ARCH}" \
"$local_configs_path/DENYLIST-${KERNEL}" \
"$local_configs_path/DENYLIST-${KERNEL}.${ARCH}" \
)
ALLOWLIST=$(read_lists \
"$configs_path/ALLOWLIST" \
"$configs_path/ALLOWLIST.${ARCH}" \
"$local_configs_path/ALLOWLIST-${KERNEL}" \
"$local_configs_path/ALLOWLIST-${KERNEL}.${ARCH}" \
)
echo "DENYLIST: ${DENYLIST}"
echo "ALLOWLIST: ${ALLOWLIST}"
cd ${PROJECT_NAME}/selftests/bpf
if [ $# -eq 0 ]; then
test_progs
test_progs_no_alu32
# test_maps
test_verifier
else
for test_name in "$@"; do
"${test_name}"
done
fi

View File

@@ -219,6 +219,14 @@ compilation and skeleton generation. Using Libbpf-rs will make building user
space part of the BPF application easier. Note that the BPF program themselves
must still be written in plain C.
libbpf logging
==============
By default, libbpf logs informational and warning messages to stderr. The
verbosity of these messages can be controlled by setting the environment
variable LIBBPF_LOG_LEVEL to either warn, info, or debug. A custom log
callback can be set using ``libbpf_set_print()``.
Additional Documentation
========================

View File

@@ -100,10 +100,26 @@ described in more detail in the footnotes.
| | | ``uretprobe.s+`` [#uprobe]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``usdt+`` [#usdt]_ | |
+ + +----------------------------------+-----------+
| | | ``usdt.s+`` [#usdt]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
+ + +----------------------------------+-----------+
| | | ``kretprobe.multi+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_SESSION`` | ``kprobe.session+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_MULTI`` | ``uprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.multi.s+`` [#upmul]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi.s+`` [#upmul]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_SESSION`` | ``uprobe.session+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.session.s+`` [#upmul]_ | Yes |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
@@ -121,6 +137,8 @@ described in more detail in the footnotes.
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_LWT_XMIT`` | | ``lwt_xmit`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_NETFILTER`` | | ``netfilter`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_PERF_EVENT`` | | ``perf_event`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tp.w+`` [#rawtp]_ | |
@@ -131,11 +149,23 @@ described in more detail in the footnotes.
+ + +----------------------------------+-----------+
| | | ``raw_tracepoint+`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` | |
| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` [#tc_legacy]_ | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` | |
| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` [#tc_legacy]_ | |
+ + +----------------------------------+-----------+
| | | ``tc`` | |
| | | ``tc`` [#tc_legacy]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_NETKIT_PRIMARY`` | ``netkit/primary`` | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_NETKIT_PEER`` | ``netkit/peer`` | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TCX_INGRESS`` | ``tc/ingress`` | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TCX_EGRESS`` | ``tc/egress`` | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TCX_INGRESS`` | ``tcx/ingress`` | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TCX_EGRESS`` | ``tcx/egress`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_SK_LOOKUP`` | ``BPF_SK_LOOKUP`` | ``sk_lookup`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
@@ -155,7 +185,9 @@ described in more detail in the footnotes.
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_SOCK_OPS`` | ``BPF_CGROUP_SOCK_OPS`` | ``sockops`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` | |
| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` [#struct_ops]_ | |
+ + +----------------------------------+-----------+
| | | ``struct_ops.s+`` [#struct_ops]_ | Yes |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_SYSCALL`` | | ``syscall`` | Yes |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
@@ -203,11 +235,19 @@ described in more detail in the footnotes.
non-negative integer.
.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``.
.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``.
.. [#upmul] The ``uprobe.multi`` attach format is ``uprobe.multi[.s]/<path>:<function-pattern>``
where ``function-pattern`` supports ``*`` and ``?`` wildcards.
.. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``.
.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern``
supports ``*`` and ``?`` wildcards. Valid characters for pattern are
``a-zA-Z0-9_.*?``.
.. [#lsm] The ``lsm`` attachment format is ``lsm[.s]/<hook>``.
.. [#rawtp] The ``raw_tp`` attach format is ``raw_tracepoint[.w]/<tracepoint>``.
.. [#tc_legacy] The ``tc``, ``classifier`` and ``action`` attach types are deprecated, use
``tcx/*`` instead.
.. [#struct_ops] The ``struct_ops`` attach format supports ``struct_ops[.s]/<name>`` convention,
but ``name`` is ignored and it is recommended to just use plain
``SEC("struct_ops[.s]")``. The attachments are defined in a struct initializer
that is tagged with ``SEC(".struct_ops[.link]")``.
.. [#tp] The ``tracepoint`` attach format is ``tracepoint/<category>/<name>``.
.. [#iter] The ``iter`` attach format is ``iter[.s]/<struct-name>``.

58
fuzz/readme.md Normal file
View File

@@ -0,0 +1,58 @@
# About fuzzing
Fuzzing is done by [OSS-Fuzz](https://google.github.io/oss-fuzz/).
It works by creating a project-specific binary that combines fuzzer
itself and a project provided entry point named
`LLVMFuzzerTestOneInput()`. When invoked, this executable either
searches for new test cases or runs an existing test case.
File `fuzz/bpf-object-fuzzer.c` defines an entry point for the robot:
- robot supplies bytes supposed to be an ELF file;
- wrapper invokes `bpf_object__open_mem()` to process these bytes.
File `scripts/build-fuzzers.sh` provides a recipe for fuzzer
infrastructure on how to build the executable described above (see
[here](https://github.com/google/oss-fuzz/tree/master/projects/libbpf)).
# Reproducing fuzzing errors
## Official way
OSS-Fuzz project describes error reproduction steps in the official
[documentation](https://google.github.io/oss-fuzz/advanced-topics/reproducing/).
Suppose you received an email linking to the fuzzer generated test
case, or got one as an artifact of the `CIFuzz` job (e.g. like
[here](https://github.com/libbpf/libbpf/actions/runs/16375110681)).
Actions to reproduce the error locally are:
```sh
git clone --depth=1 https://github.com/google/oss-fuzz.git
cd oss-fuzz
python infra/helper.py pull_images
python infra/helper.py build_image libbpf
python infra/helper.py build_fuzzers --sanitizer address libbpf <path-to-libbpf-checkout>
python infra/helper.py reproduce libbpf bpf-object-fuzzer <path-to-test-case>
```
`<path-to-test-case>` is usually a `crash-<many-hex-digits>` file w/o
extension, CI job wraps it into zip archive and attaches as an artifact.
To recompile after some fixes, repeat the `build_fuzzers` and
`reproduce` steps after modifying source code in
`<path-to-libbpf-checkout>`.
Note: the `build_fuzzers` step creates a binary
`build/out/libbpf/bpf-object-fuzzer`, it can be executed directly if
your environment is compatible.
## Simple way
From the project root:
```sh
SKIP_LIBELF_REBUILD=1 scripts/build-fuzzers.sh
out/bpf-object-fuzzer <path-to-test-case>
```
`out/bpf-object-fuzzer` is the fuzzer executable described earlier,
can be run with gdb etc.

View File

@@ -123,6 +123,14 @@
BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \
MAP_FD, VALUE_OFF)
#define BPF_JMP_REG(OP, DST, SRC, OFF) \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = 0 })
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_OP(OP) | BPF_K, \

View File

@@ -43,4 +43,18 @@
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define be32_to_cpu(x) __builtin_bswap32(x)
#define cpu_to_be32(x) __builtin_bswap32(x)
#define be64_to_cpu(x) __builtin_bswap64(x)
#define cpu_to_be64(x) __builtin_bswap64(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define be32_to_cpu(x) (x)
#define cpu_to_be32(x) (x)
#define be64_to_cpu(x) (x)
#define cpu_to_be64(x) (x)
#else
# error "__BYTE_ORDER__ undefined or invalid"
#endif
#endif

View File

@@ -30,4 +30,7 @@ struct list_head {
struct list_head *next, *prev;
};
/* needed on Android/Termux, where this linux/types.h is included by other include files */
typedef unsigned __bitwise __poll_t;
#endif

View File

@@ -5,8 +5,8 @@
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef _UAPI__LINUX_BPF_H__
#define _UAPI__LINUX_BPF_H__
#ifndef __LINUX_BPF_H__
#define __LINUX_BPF_H__
#include <linux/types.h>
#include <linux/bpf_common.h>
@@ -51,6 +51,9 @@
#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */
#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */
#define BPF_LOAD_ACQ 0x100 /* load-acquire */
#define BPF_STORE_REL 0x110 /* store-release */
enum bpf_cond_pseudo_jmp {
BPF_MAY_GOTO = 0,
};
@@ -116,6 +119,14 @@ enum bpf_cgroup_iter_order {
BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
/*
* Walks the immediate children of the specified parent
* cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
* BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
* the iterator does not include the specified parent as one of the
* returned iterator elements.
*/
BPF_CGROUP_ITER_CHILDREN,
};
union bpf_iter_link_info {
@@ -447,6 +458,7 @@ union bpf_iter_link_info {
* * **struct bpf_map_info**
* * **struct bpf_btf_info**
* * **struct bpf_link_info**
* * **struct bpf_token_info**
*
* Return
* Returns zero on success. On error, -1 is returned and *errno*
@@ -903,6 +915,27 @@ union bpf_iter_link_info {
* A new file descriptor (a nonnegative integer), or -1 if an
* error occurred (in which case, *errno* is set appropriately).
*
* BPF_PROG_STREAM_READ_BY_FD
* Description
* Read data of a program's BPF stream. The program is identified
* by *prog_fd*, and the stream is identified by the *stream_id*.
* The data is copied to a buffer pointed to by *stream_buf*, and
* filled less than or equal to *stream_buf_len* bytes.
*
* Return
* Number of bytes read from the stream on success, or -1 if an
* error occurred (in which case, *errno* is set appropriately).
*
* BPF_PROG_ASSOC_STRUCT_OPS
* Description
* Associate a BPF program with a struct_ops map. The struct_ops
* map is identified by *map_fd* and the BPF program is
* identified by *prog_fd*.
*
* Return
* 0 on success or -1 if an error occurred (in which case,
* *errno* is set appropriately).
*
* NOTES
* eBPF objects (maps and programs) can be shared between processes.
*
@@ -958,6 +991,8 @@ enum bpf_cmd {
BPF_LINK_DETACH,
BPF_PROG_BIND_MAP,
BPF_TOKEN_CREATE,
BPF_PROG_STREAM_READ_BY_FD,
BPF_PROG_ASSOC_STRUCT_OPS,
__MAX_BPF_CMD,
};
@@ -1010,6 +1045,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
BPF_MAP_TYPE_ARENA,
BPF_MAP_TYPE_INSN_ARRAY,
__MAX_BPF_MAP_TYPE
};
@@ -1115,11 +1151,17 @@ enum bpf_attach_type {
BPF_CGROUP_UNIX_GETSOCKNAME,
BPF_NETKIT_PRIMARY,
BPF_NETKIT_PEER,
BPF_TRACE_KPROBE_SESSION,
BPF_TRACE_UPROBE_SESSION,
BPF_TRACE_FSESSION,
__MAX_BPF_ATTACH_TYPE
};
#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
/* Add BPF_LINK_TYPE(type, name) in bpf_types.h to keep bpf_link_type_strs[]
* in sync with the definitions below.
*/
enum bpf_link_type {
BPF_LINK_TYPE_UNSPEC = 0,
BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
@@ -1135,6 +1177,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_TCX = 11,
BPF_LINK_TYPE_UPROBE_MULTI = 12,
BPF_LINK_TYPE_NETKIT = 13,
BPF_LINK_TYPE_SOCKMAP = 14,
__MAX_BPF_LINK_TYPE,
};
@@ -1201,6 +1244,7 @@ enum bpf_perf_event_type {
#define BPF_F_BEFORE (1U << 3)
#define BPF_F_AFTER (1U << 4)
#define BPF_F_ID (1U << 5)
#define BPF_F_PREORDER (1U << 6)
#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */
/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
@@ -1349,6 +1393,8 @@ enum {
BPF_NOEXIST = 1, /* create new element if it didn't exist */
BPF_EXIST = 2, /* update existing element */
BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */
BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */
};
/* flags for BPF_MAP_CREATE command */
@@ -1407,6 +1453,9 @@ enum {
/* Do not translate kernel bpf_arena pointers to user pointers */
BPF_F_NO_USER_CONV = (1U << 18),
/* Enable BPF ringbuf overwrite mode */
BPF_F_RB_OVERWRITE = (1U << 19),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1423,6 +1472,8 @@ enum {
#define BPF_F_TEST_RUN_ON_CPU (1U << 0)
/* If set, XDP frames will be transmitted after processing */
#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1)
/* If set, apply CHECKSUM_COMPLETE to skb and validate the checksum */
#define BPF_F_TEST_SKB_CHECKSUM_COMPLETE (1U << 2)
/* type for BPF_ENABLE_STATS */
enum bpf_stats_type {
@@ -1451,6 +1502,11 @@ struct bpf_stack_build_id {
#define BPF_OBJ_NAME_LEN 16U
enum {
BPF_STREAM_STDOUT = 1,
BPF_STREAM_STDERR = 2,
};
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
@@ -1492,9 +1548,15 @@ union bpf_attr {
* If provided, map_flags should have BPF_F_TOKEN_FD flag set.
*/
__s32 map_token_fd;
/* Hash of the program that has exclusive access to the map.
*/
__aligned_u64 excl_prog_hash;
/* Size of the passed excl_prog_hash. */
__u32 excl_prog_hash_size;
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */
__u32 map_fd;
__aligned_u64 key;
union {
@@ -1565,6 +1627,26 @@ union bpf_attr {
* If provided, prog_flags should have BPF_F_TOKEN_FD flag set.
*/
__s32 prog_token_fd;
/* The fd_array_cnt can be used to pass the length of the
* fd_array array. In this case all the [map] file descriptors
* passed in this array will be bound to the program, even if
* the maps are not referenced directly. The functionality is
* similar to the BPF_PROG_BIND_MAP syscall, but maps can be
* used by the verifier during the program load. If provided,
* then the fd_array[0,...,fd_array_cnt-1] is expected to be
* continuous.
*/
__u32 fd_array_cnt;
/* Pointer to a buffer containing the signature of the BPF
* program.
*/
__aligned_u64 signature;
/* Size of the signature buffer in bytes. */
__u32 signature_size;
/* ID of the kernel keyring to be used for signature
* verification.
*/
__s32 keyring_id;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1630,6 +1712,7 @@ union bpf_attr {
};
__u32 next_id;
__u32 open_flags;
__s32 fd_by_id_token_fd;
};
struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
@@ -1771,6 +1854,13 @@ union bpf_attr {
};
__u64 expected_revision;
} netkit;
struct {
union {
__u32 relative_fd;
__u32 relative_id;
};
__u64 expected_revision;
} cgroup;
};
} link_create;
@@ -1819,6 +1909,19 @@ union bpf_attr {
__u32 bpffs_fd;
} token_create;
struct {
__aligned_u64 stream_buf;
__u32 stream_buf_len;
__u32 stream_id;
__u32 prog_fd;
} prog_stream_read;
struct {
__u32 map_fd;
__u32 prog_fd;
__u32 flags;
} prog_assoc_struct_ops;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -1966,15 +2069,21 @@ union bpf_attr {
* program.
* Return
* The SMP id of the processor running the program.
* Attributes
* __bpf_fastcall
*
* long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
* Description
* Store *len* bytes from address *from* into the packet
* associated to *skb*, at *offset*. *flags* are a combination of
* **BPF_F_RECOMPUTE_CSUM** (automatically recompute the
* checksum for the packet after storing the bytes) and
* **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
* **->swhash** and *skb*\ **->l4hash** to 0).
* associated to *skb*, at *offset*. The *flags* are a combination
* of the following values:
*
* **BPF_F_RECOMPUTE_CSUM**
* Automatically update *skb*\ **->csum** after storing the
* bytes.
* **BPF_F_INVALIDATE_HASH**
* Set *skb*\ **->hash**, *skb*\ **->swhash** and *skb*\
* **->l4hash** to 0.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
@@ -2026,7 +2135,8 @@ union bpf_attr {
* untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
* for updates resulting in a null checksum the value is set to
* **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
* the checksum is to be computed against a pseudo-header.
* that the modified header field is part of the pseudo-header.
* Flag **BPF_F_IPV6** should be set for IPv6 packets.
*
* This helper works in combination with **bpf_csum_diff**\ (),
* which does not update the checksum in-place, but offers more
@@ -2373,7 +2483,7 @@ union bpf_attr {
* into it. An example is available in file
* *samples/bpf/trace_output_user.c* in the Linux kernel source
* tree (the eBPF program counterpart is in
* *samples/bpf/trace_output_kern.c*).
* *samples/bpf/trace_output.bpf.c*).
*
* **bpf_perf_event_output**\ () achieves better performance
* than **bpf_trace_printk**\ () for sharing data with user
@@ -2847,7 +2957,7 @@ union bpf_attr {
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
* **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**,
* **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**,
* **TCP_BPF_RTO_MIN**.
* **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports the following *optname*\ s:
* **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**.
@@ -3097,10 +3207,6 @@ union bpf_attr {
* with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
* option, and in this case it only works on functions tagged with
* **ALLOW_ERROR_INJECTION** in the kernel code.
*
* Also, the helper is only available for the architectures having
* the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
* x86 architecture is the only one to support this feature.
* Return
* 0
*
@@ -3394,6 +3500,10 @@ union bpf_attr {
* for the nexthop. If the src addr cannot be derived,
* **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
* case, *params*->dmac and *params*->smac are not set either.
* **BPF_FIB_LOOKUP_MARK**
* Use the mark present in *params*->mark for the fib lookup.
* This option should not be used with BPF_FIB_LOOKUP_DIRECT,
* as it only has meaning for full lookups.
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
@@ -4813,7 +4923,7 @@ union bpf_attr {
*
* **-ENOENT** if the bpf_local_storage cannot be found.
*
* long bpf_d_path(struct path *path, char *buf, u32 sz)
* long bpf_d_path(const struct path *path, char *buf, u32 sz)
* Description
* Return full path for given **struct path** object, which
* needs to be the kernel BTF *path* object. The path is
@@ -4943,6 +5053,9 @@ union bpf_attr {
* the netns switch takes place from ingress to ingress without
* going through the CPU's backlog queue.
*
* *skb*\ **->mark** and *skb*\ **->tstamp** are not cleared during
* the netns switch.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types at the
* ingress hook and for veth and netkit target device types. The
@@ -5022,7 +5135,7 @@ union bpf_attr {
* bytes will be copied to *dst*
* Return
* The **hash_algo** is returned on success,
* **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
* **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if
* invalid arguments are passed.
*
* struct socket *bpf_sock_from_file(struct file *file)
@@ -5361,7 +5474,7 @@ union bpf_attr {
* Currently, the **flags** must be 0. Currently, nr_loops is
* limited to 1 << 23 (~8 million) loops.
*
* long (\*callback_fn)(u32 index, void \*ctx);
* long (\*callback_fn)(u64 index, void \*ctx);
*
* where **index** is the current index in the loop. The index
* is zero-indexed.
@@ -5508,14 +5621,15 @@ union bpf_attr {
* bytes will be copied to *dst*
* Return
* The **hash_algo** is returned on success,
* **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if
* **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
* invalid arguments are passed.
*
* void *bpf_kptr_xchg(void *map_value, void *ptr)
* void *bpf_kptr_xchg(void *dst, void *ptr)
* Description
* Exchange kptr at pointer *map_value* with *ptr*, and return the
* old value. *ptr* can be NULL, otherwise it must be a referenced
* pointer which will be released when this helper is called.
* Exchange kptr at pointer *dst* with *ptr*, and return the old value.
* *dst* can be map value or local kptr. *ptr* can be NULL, otherwise
* it must be a referenced pointer which will be released when this helper
* is called.
* Return
* The old value of kptr (which can be NULL). The returned pointer
* if not NULL, is a reference which must be released using its
@@ -5536,7 +5650,7 @@ union bpf_attr {
* Return
* *sk* if casting is valid, or **NULL** otherwise.
*
* long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
* long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr)
* Description
* Get a dynptr to local memory *data*.
*
@@ -5579,7 +5693,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
* long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5589,7 +5703,7 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
@@ -5610,7 +5724,7 @@ union bpf_attr {
* is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
* other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len)
* Description
* Get a pointer to the underlying dynptr data.
*
@@ -5998,7 +6112,10 @@ union bpf_attr {
FN(user_ringbuf_drain, 209, ##ctx) \
FN(cgrp_storage_get, 210, ##ctx) \
FN(cgrp_storage_delete, 211, ##ctx) \
/* */
/* This helper list is effectively frozen. If you are trying to \
* add a new helper, you should add a kfunc instead which has \
* less stability guarantees. See Documentation/bpf/kfuncs.rst \
*/
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
* know or care about integer value that is now passed as second argument
@@ -6036,11 +6153,7 @@ enum {
BPF_F_PSEUDO_HDR = (1ULL << 4),
BPF_F_MARK_MANGLED_0 = (1ULL << 5),
BPF_F_MARK_ENFORCE = (1ULL << 6),
};
/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
enum {
BPF_F_INGRESS = (1ULL << 0),
BPF_F_IPV6 = (1ULL << 7),
};
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
@@ -6150,6 +6263,7 @@ enum {
BPF_RB_RING_SIZE = 1,
BPF_RB_CONS_POS = 2,
BPF_RB_PROD_POS = 3,
BPF_RB_OVERWRITE_POS = 4,
};
/* BPF ring buffer constants */
@@ -6189,10 +6303,12 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
/* Flags for bpf_redirect_map helper */
/* Flags for bpf_redirect and bpf_redirect_map helpers */
enum {
BPF_F_BROADCAST = (1ULL << 3),
BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
BPF_F_INGRESS = (1ULL << 0), /* used for skb path */
BPF_F_BROADCAST = (1ULL << 3), /* used for XDP path */
BPF_F_EXCLUDE_INGRESS = (1ULL << 4), /* used for XDP path */
#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
};
#define __bpf_md_ptr(type, name) \
@@ -6201,12 +6317,17 @@ union { \
__u64 :64; \
} __attribute__((aligned(8)))
/* The enum used in skb->tstamp_type. It specifies the clock type
* of the time stored in the skb->tstamp.
*/
enum {
BPF_SKB_TSTAMP_UNSPEC,
BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */
/* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
* the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
* and try to deduce it by ingress, egress or skb->sk->sk_clockid.
BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */
BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */
BPF_SKB_CLOCK_REALTIME = 0,
BPF_SKB_CLOCK_MONOTONIC = 1,
BPF_SKB_CLOCK_TAI = 2,
/* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle,
* the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid.
*/
};
@@ -6594,6 +6715,8 @@ struct bpf_map_info {
__u32 btf_value_type_id;
__u32 btf_vmlinux_id;
__u64 map_extra;
__aligned_u64 hash;
__u32 hash_size;
} __attribute__((aligned(8)));
struct bpf_btf_info {
@@ -6613,11 +6736,15 @@ struct bpf_link_info {
struct {
__aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
__u32 tp_name_len; /* in/out: tp_name buffer len */
__u32 :32;
__u64 cookie;
} raw_tracepoint;
struct {
__u32 attach_type;
__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
__u32 target_btf_id; /* BTF type id inside the object */
__u32 :32;
__u64 cookie;
} tracing;
struct {
__u64 cgroup_id;
@@ -6689,6 +6816,7 @@ struct bpf_link_info {
__u32 name_len;
__u32 offset; /* offset from file_name */
__u64 cookie;
__u64 ref_ctr_offset;
} uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */
struct {
__aligned_u64 func_name; /* in/out */
@@ -6720,9 +6848,20 @@ struct bpf_link_info {
__u32 ifindex;
__u32 attach_type;
} netkit;
struct {
__u32 map_id;
__u32 attach_type;
} sockmap;
};
} __attribute__((aligned(8)));
struct bpf_token_info {
__u64 allowed_cmds;
__u64 allowed_maps;
__u64 allowed_progs;
__u64 allowed_attachs;
} __attribute__((aligned(8)));
/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
* by user and intended to be used by socket (e.g. to bind to, depends on
* attach type).
@@ -6886,6 +7025,12 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};
enum {
SK_BPF_CB_TX_TIMESTAMPING = 1<<0,
SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
SK_BPF_CB_TX_TIMESTAMPING
};
/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -6938,6 +7083,8 @@ enum {
* socket transition to LISTEN state.
*/
BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
* Arg1: measured RTT input (mrtt)
* Arg2: updated srtt
*/
BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option.
* It will be called to handle
@@ -6996,6 +7143,29 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing
* through dev layer when
* SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send
* to the nic when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when
* SK_BPF_CB_TX_TIMESTAMPING feature
* is on.
*/
BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the
* same sendmsg call are acked
* when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall
* is triggered. It's used to correlate
* sendmsg timestamp with corresponding
* tskey.
*/
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7061,6 +7231,10 @@ enum {
TCP_BPF_SYN = 1005, /* Copy the TCP header */
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */
};
enum {
@@ -7120,6 +7294,7 @@ enum {
BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
BPF_FIB_LOOKUP_TBID = (1U << 3),
BPF_FIB_LOOKUP_SRC = (1U << 4),
BPF_FIB_LOOKUP_MARK = (1U << 5),
};
enum {
@@ -7152,7 +7327,7 @@ struct bpf_fib_lookup {
/* output: MTU value */
__u16 mtu_result;
};
} __attribute__((packed, aligned(2)));
/* input: L3 device index for lookup
* output: device index from FIB lookup
*/
@@ -7197,8 +7372,19 @@ struct bpf_fib_lookup {
__u32 tbid;
};
__u8 smac[6]; /* ETH_ALEN */
__u8 dmac[6]; /* ETH_ALEN */
union {
/* input */
struct {
__u32 mark; /* policy routing */
/* 2 4-byte holes for input */
};
/* output: source and dest mac */
struct {
__u8 smac[6]; /* ETH_ALEN */
__u8 dmac[6]; /* ETH_ALEN */
};
};
};
struct bpf_redir_neigh {
@@ -7285,6 +7471,14 @@ struct bpf_timer {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_task_work {
__u64 __opaque;
} __attribute__((aligned(8)));
struct bpf_wq {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_dynptr {
__u64 __opaque[2];
} __attribute__((aligned(8)));
@@ -7477,4 +7671,33 @@ struct bpf_iter_num {
__u64 __opaque[1];
} __attribute__((aligned(8)));
#endif /* _UAPI__LINUX_BPF_H__ */
/*
* Flags to control BPF kfunc behaviour.
* - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective
* helper documentation for details.)
*/
enum bpf_kfunc_flags {
BPF_F_PAD_ZEROS = (1ULL << 0),
};
/*
* Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type.
*
* Before the map is used the orig_off field should point to an
* instruction inside the program being loaded. The other fields
* must be set to 0.
*
* After the program is loaded, the xlated_off will be adjusted
* by the verifier to point to the index of the original instruction
* in the xlated program. If the instruction is deleted, it will
* be set to (u32)-1. The jitted_off will be set to the corresponding
* offset in the jitted image of the program.
*/
struct bpf_insn_array_value {
__u32 orig_off;
__u32 xlated_off;
__u32 jitted_off;
__u32 :32;
};
#endif /* __LINUX_BPF_H__ */

View File

@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI__LINUX_BPF_COMMON_H__
#define _UAPI__LINUX_BPF_COMMON_H__
#ifndef __LINUX_BPF_COMMON_H__
#define __LINUX_BPF_COMMON_H__
/* Instruction classes */
#define BPF_CLASS(code) ((code) & 0x07)
@@ -54,4 +54,4 @@
#define BPF_MAXINSNS 4096
#endif
#endif /* _UAPI__LINUX_BPF_COMMON_H__ */
#endif /* __LINUX_BPF_COMMON_H__ */

View File

@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/* Copyright (c) 2018 Facebook */
#ifndef _UAPI__LINUX_BTF_H__
#define _UAPI__LINUX_BTF_H__
#ifndef __LINUX_BTF_H__
#define __LINUX_BTF_H__
#include <linux/types.h>
@@ -36,7 +36,8 @@ struct btf_type {
* bits 24-28: kind (e.g. int, ptr, array...etc)
* bits 29-30: unused
* bit 31: kind_flag, currently used by
* struct, union, enum, fwd and enum64
* struct, union, enum, fwd, enum64,
* decl_tag and type_tag
*/
__u32 info;
/* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64.
@@ -197,4 +198,4 @@ struct btf_enum64 {
__u32 val_hi32;
};
#endif /* _UAPI__LINUX_BTF_H__ */
#endif /* __LINUX_BTF_H__ */

View File

@@ -1,13 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_FCNTL_H
#define _UAPI_LINUX_FCNTL_H
#ifndef _LINUX_FCNTL_H
#define _LINUX_FCNTL_H
#include <asm/fcntl.h>
#include <linux/openat2.h>
#include <linux/types.h>
#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0)
#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1)
/*
* Request nofications on a directory.
* See below for events that may be notified.
*/
#define F_NOTIFY (F_LINUX_SPECIFIC_BASE + 2)
#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3)
/* Was the file just created? */
#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4)
/*
* Cancel a blocking posix lock; internal use only until we expose an
* asynchronous lock api to userspace:
@@ -17,12 +29,6 @@
/* Create a file descriptor with FD_CLOEXEC set. */
#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6)
/*
* Request nofications on a directory.
* See below for events that may be notified.
*/
#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2)
/*
* Set and get of pipe page size array
*/
@@ -74,6 +80,17 @@
*/
#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
/* Set/Get delegations */
#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15)
#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16)
/* Argument structure for F_GETDELEG and F_SETDELEG */
struct delegation {
__u32 d_flags; /* Must be 0 */
__u16 d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */
__u16 __pad; /* Must be 0 */
};
/*
* Types of directory notifications that may be requested.
*/
@@ -85,39 +102,92 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
/* Reserved kernel ranges [-100], [-10000, -40000]. */
#define AT_FDCWD -100 /* Special value for dirfd used to
indicate openat should use the
current working directory. */
/*
* The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is
* meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to
* unlinkat. The two functions do completely different things and therefore,
* the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to
* faccessat would be undefined behavior and thus treating it equivalent to
* AT_EACCESS is valid undefined behavior.
* The concept of process and threads in userland and the kernel is a confusing
* one - within the kernel every thread is a 'task' with its own individual PID,
* however from userland's point of view threads are grouped by a single PID,
* which is that of the 'thread group leader', typically the first thread
* spawned.
*
* To cut the Gideon knot, for internal kernel usage, we refer to
* PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
* perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
* group leader...
*/
#define AT_FDCWD -100 /* Special value used to indicate
openat should use the current
working directory. */
#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */
#define PIDFD_SELF_THREAD -10000 /* Current thread. */
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */
/* Reserved for per-syscall flags 0xff. */
#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic
links. */
/* Reserved for per-syscall flags 0x200 */
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount
traversal. */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative
pathname to operate on dirfd
directly. */
/*
* These flags are currently statx(2)-specific, but they could be made generic
* in the future and so they should not be used for other per-syscall flags.
*/
#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */
#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
/*
* Per-syscall flags for the *at(2) family of syscalls.
*
* These are flags that are so syscall-specific that a user passing these flags
* to the wrong syscall is so "clearly wrong" that we can safely call such
* usage "undefined behaviour".
*
* For example, the constants AT_REMOVEDIR and AT_EACCESS have the same value.
* AT_EACCESS is meaningful only to faccessat, while AT_REMOVEDIR is meaningful
* only to unlinkat. The two functions do completely different things and
* therefore, the flags can be allowed to overlap. For example, passing
* AT_REMOVEDIR to faccessat would be undefined behavior and thus treating it
* equivalent to AT_EACCESS is valid undefined behavior.
*
* Note for implementers: When picking a new per-syscall AT_* flag, try to
* reuse already existing flags first. This leaves us with as many unused bits
* as possible, so we can use them for generic bits in the future if necessary.
*/
/* Flags for renameat2(2) (must match legacy RENAME_* flags). */
#define AT_RENAME_NOREPLACE 0x0001
#define AT_RENAME_EXCHANGE 0x0002
#define AT_RENAME_WHITEOUT 0x0004
/* Flag for faccessat(2). */
#define AT_EACCESS 0x200 /* Test access permitted for
effective IDs, not real IDs. */
/* Flag for unlinkat(2). */
#define AT_REMOVEDIR 0x200 /* Remove directory instead of
unlinking file. */
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
/* Flags for name_to_handle_at(2). */
#define AT_HANDLE_FID 0x200 /* File handle is needed to compare
object identity and may not be
usable with open_by_handle_at(2). */
#define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */
#define AT_HANDLE_CONNECTABLE 0x002 /* Request a connectable file handle */
#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */
#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
/* Flags for execveat2(2). */
#define AT_EXECVE_CHECK 0x10000 /* Only perform a check if execution
would be allowed. */
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */
#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
compare object identity and may not
be usable to open_by_handle_at(2) */
#if defined(__KERNEL__)
#define AT_GETATTR_NOSEC 0x80000000
#endif
#endif /* _UAPI_LINUX_FCNTL_H */
#endif /* _LINUX_FCNTL_H */

View File

@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_IF_LINK_H
#define _UAPI_LINUX_IF_LINK_H
#ifndef _LINUX_IF_LINK_H
#define _LINUX_IF_LINK_H
#include <linux/types.h>
#include <linux/netlink.h>
@@ -377,6 +377,10 @@ enum {
IFLA_GSO_IPV4_MAX_SIZE,
IFLA_GRO_IPV4_MAX_SIZE,
IFLA_DPLL_PIN,
IFLA_MAX_PACING_OFFLOAD_HORIZON,
IFLA_NETNS_IMMUTABLE,
IFLA_HEADROOM,
IFLA_TAILROOM,
__IFLA_MAX
};
@@ -461,6 +465,286 @@ enum in6_addr_gen_mode {
/* Bridge section */
/**
* DOC: Bridge enum definition
*
* Please *note* that the timer values in the following section are expected
* in clock_t format, which is seconds multiplied by USER_HZ (generally
* defined as 100).
*
* @IFLA_BR_FORWARD_DELAY
* The bridge forwarding delay is the time spent in LISTENING state
* (before moving to LEARNING) and in LEARNING state (before moving
* to FORWARDING). Only relevant if STP is enabled.
*
* The valid values are between (2 * USER_HZ) and (30 * USER_HZ).
* The default value is (15 * USER_HZ).
*
* @IFLA_BR_HELLO_TIME
* The time between hello packets sent by the bridge, when it is a root
* bridge or a designated bridge. Only relevant if STP is enabled.
*
* The valid values are between (1 * USER_HZ) and (10 * USER_HZ).
* The default value is (2 * USER_HZ).
*
* @IFLA_BR_MAX_AGE
* The hello packet timeout is the time until another bridge in the
* spanning tree is assumed to be dead, after reception of its last hello
* message. Only relevant if STP is enabled.
*
* The valid values are between (6 * USER_HZ) and (40 * USER_HZ).
* The default value is (20 * USER_HZ).
*
* @IFLA_BR_AGEING_TIME
* Configure the bridge's FDB entries aging time. It is the time a MAC
* address will be kept in the FDB after a packet has been received from
* that address. After this time has passed, entries are cleaned up.
* Allow values outside the 802.1 standard specification for special cases:
*
* * 0 - entry never ages (all permanent)
* * 1 - entry disappears (no persistence)
*
* The default value is (300 * USER_HZ).
*
* @IFLA_BR_STP_STATE
* Turn spanning tree protocol on (*IFLA_BR_STP_STATE* > 0) or off
* (*IFLA_BR_STP_STATE* == 0) for this bridge.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_PRIORITY
* Set this bridge's spanning tree priority, used during STP root bridge
* election.
*
* The valid values are between 0 and 65535.
*
* @IFLA_BR_VLAN_FILTERING
* Turn VLAN filtering on (*IFLA_BR_VLAN_FILTERING* > 0) or off
* (*IFLA_BR_VLAN_FILTERING* == 0). When disabled, the bridge will not
* consider the VLAN tag when handling packets.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_VLAN_PROTOCOL
* Set the protocol used for VLAN filtering.
*
* The valid values are 0x8100(802.1Q) or 0x88A8(802.1AD). The default value
* is 0x8100(802.1Q).
*
* @IFLA_BR_GROUP_FWD_MASK
* The group forwarding mask. This is the bitmask that is applied to
* decide whether to forward incoming frames destined to link-local
* addresses (of the form 01:80:C2:00:00:0X).
*
* The default value is 0, which means the bridge does not forward any
* link-local frames coming on this port.
*
* @IFLA_BR_ROOT_ID
* The bridge root id, read only.
*
* @IFLA_BR_BRIDGE_ID
* The bridge id, read only.
*
* @IFLA_BR_ROOT_PORT
* The bridge root port, read only.
*
* @IFLA_BR_ROOT_PATH_COST
* The bridge root path cost, read only.
*
* @IFLA_BR_TOPOLOGY_CHANGE
* The bridge topology change, read only.
*
* @IFLA_BR_TOPOLOGY_CHANGE_DETECTED
* The bridge topology change detected, read only.
*
* @IFLA_BR_HELLO_TIMER
* The bridge hello timer, read only.
*
* @IFLA_BR_TCN_TIMER
* The bridge tcn timer, read only.
*
* @IFLA_BR_TOPOLOGY_CHANGE_TIMER
* The bridge topology change timer, read only.
*
* @IFLA_BR_GC_TIMER
* The bridge gc timer, read only.
*
* @IFLA_BR_GROUP_ADDR
* Set the MAC address of the multicast group this bridge uses for STP.
* The address must be a link-local address in standard Ethernet MAC address
* format. It is an address of the form 01:80:C2:00:00:0X, with X in [0, 4..f].
*
* The default value is 0.
*
* @IFLA_BR_FDB_FLUSH
* Flush bridge's fdb dynamic entries.
*
* @IFLA_BR_MCAST_ROUTER
* Set bridge's multicast router if IGMP snooping is enabled.
* The valid values are:
*
* * 0 - disabled.
* * 1 - automatic (queried).
* * 2 - permanently enabled.
*
* The default value is 1.
*
* @IFLA_BR_MCAST_SNOOPING
* Turn multicast snooping on (*IFLA_BR_MCAST_SNOOPING* > 0) or off
* (*IFLA_BR_MCAST_SNOOPING* == 0).
*
* The default value is 1.
*
* @IFLA_BR_MCAST_QUERY_USE_IFADDR
* If enabled use the bridge's own IP address as source address for IGMP
* queries (*IFLA_BR_MCAST_QUERY_USE_IFADDR* > 0) or the default of 0.0.0.0
* (*IFLA_BR_MCAST_QUERY_USE_IFADDR* == 0).
*
* The default value is 0 (disabled).
*
* @IFLA_BR_MCAST_QUERIER
* Enable (*IFLA_BR_MULTICAST_QUERIER* > 0) or disable
* (*IFLA_BR_MULTICAST_QUERIER* == 0) IGMP querier, ie sending of multicast
* queries by the bridge.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_MCAST_HASH_ELASTICITY
* Set multicast database hash elasticity, It is the maximum chain length in
* the multicast hash table. This attribute is *deprecated* and the value
* is always 16.
*
* @IFLA_BR_MCAST_HASH_MAX
* Set maximum size of the multicast hash table
*
* The default value is 4096, the value must be a power of 2.
*
* @IFLA_BR_MCAST_LAST_MEMBER_CNT
* The Last Member Query Count is the number of Group-Specific Queries
* sent before the router assumes there are no local members. The Last
* Member Query Count is also the number of Group-and-Source-Specific
* Queries sent before the router assumes there are no listeners for a
* particular source.
*
* The default value is 2.
*
* @IFLA_BR_MCAST_STARTUP_QUERY_CNT
* The Startup Query Count is the number of Queries sent out on startup,
* separated by the Startup Query Interval.
*
* The default value is 2.
*
* @IFLA_BR_MCAST_LAST_MEMBER_INTVL
* The Last Member Query Interval is the Max Response Time inserted into
* Group-Specific Queries sent in response to Leave Group messages, and
* is also the amount of time between Group-Specific Query messages.
*
* The default value is (1 * USER_HZ).
*
* @IFLA_BR_MCAST_MEMBERSHIP_INTVL
* The interval after which the bridge will leave a group, if no membership
* reports for this group are received.
*
* The default value is (260 * USER_HZ).
*
* @IFLA_BR_MCAST_QUERIER_INTVL
* The interval between queries sent by other routers. if no queries are
* seen after this delay has passed, the bridge will start to send its own
* queries (as if *IFLA_BR_MCAST_QUERIER_INTVL* was enabled).
*
* The default value is (255 * USER_HZ).
*
* @IFLA_BR_MCAST_QUERY_INTVL
* The Query Interval is the interval between General Queries sent by
* the Querier.
*
* The default value is (125 * USER_HZ). The minimum value is (1 * USER_HZ).
*
* @IFLA_BR_MCAST_QUERY_RESPONSE_INTVL
* The Max Response Time used to calculate the Max Resp Code inserted
* into the periodic General Queries.
*
* The default value is (10 * USER_HZ).
*
* @IFLA_BR_MCAST_STARTUP_QUERY_INTVL
* The interval between queries in the startup phase.
*
* The default value is (125 * USER_HZ) / 4. The minimum value is (1 * USER_HZ).
*
* @IFLA_BR_NF_CALL_IPTABLES
* Enable (*NF_CALL_IPTABLES* > 0) or disable (*NF_CALL_IPTABLES* == 0)
* iptables hooks on the bridge.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_NF_CALL_IP6TABLES
* Enable (*NF_CALL_IP6TABLES* > 0) or disable (*NF_CALL_IP6TABLES* == 0)
* ip6tables hooks on the bridge.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_NF_CALL_ARPTABLES
* Enable (*NF_CALL_ARPTABLES* > 0) or disable (*NF_CALL_ARPTABLES* == 0)
* arptables hooks on the bridge.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_VLAN_DEFAULT_PVID
* VLAN ID applied to untagged and priority-tagged incoming packets.
*
* The default value is 1. Setting to the special value 0 makes all ports of
* this bridge not have a PVID by default, which means that they will
* not accept VLAN-untagged traffic.
*
* @IFLA_BR_PAD
* Bridge attribute padding type for netlink message.
*
* @IFLA_BR_VLAN_STATS_ENABLED
* Enable (*IFLA_BR_VLAN_STATS_ENABLED* == 1) or disable
* (*IFLA_BR_VLAN_STATS_ENABLED* == 0) per-VLAN stats accounting.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_MCAST_STATS_ENABLED
* Enable (*IFLA_BR_MCAST_STATS_ENABLED* > 0) or disable
* (*IFLA_BR_MCAST_STATS_ENABLED* == 0) multicast (IGMP/MLD) stats
* accounting.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_MCAST_IGMP_VERSION
* Set the IGMP version.
*
* The valid values are 2 and 3. The default value is 2.
*
* @IFLA_BR_MCAST_MLD_VERSION
* Set the MLD version.
*
* The valid values are 1 and 2. The default value is 1.
*
* @IFLA_BR_VLAN_STATS_PER_PORT
* Enable (*IFLA_BR_VLAN_STATS_PER_PORT* == 1) or disable
* (*IFLA_BR_VLAN_STATS_PER_PORT* == 0) per-VLAN per-port stats accounting.
* Can be changed only when there are no port VLANs configured.
*
* The default value is 0 (disabled).
*
* @IFLA_BR_MULTI_BOOLOPT
* The multi_boolopt is used to control new boolean options to avoid adding
* new netlink attributes. You can look at ``enum br_boolopt_id`` for those
* options.
*
* @IFLA_BR_MCAST_QUERIER_STATE
* Bridge mcast querier states, read only.
*
* @IFLA_BR_FDB_N_LEARNED
* The number of dynamically learned FDB entries for the current bridge,
* read only.
*
* @IFLA_BR_FDB_MAX_LEARNED
* Set the number of max dynamically learned FDB entries for the current
* bridge.
*/
enum {
IFLA_BR_UNSPEC,
IFLA_BR_FORWARD_DELAY,
@@ -510,6 +794,8 @@ enum {
IFLA_BR_VLAN_STATS_PER_PORT,
IFLA_BR_MULTI_BOOLOPT,
IFLA_BR_MCAST_QUERIER_STATE,
IFLA_BR_FDB_N_LEARNED,
IFLA_BR_FDB_MAX_LEARNED,
__IFLA_BR_MAX,
};
@@ -520,11 +806,252 @@ struct ifla_bridge_id {
__u8 addr[6]; /* ETH_ALEN */
};
/**
* DOC: Bridge mode enum definition
*
* @BRIDGE_MODE_HAIRPIN
* Controls whether traffic may be sent back out of the port on which it
* was received. This option is also called reflective relay mode, and is
* used to support basic VEPA (Virtual Ethernet Port Aggregator)
* capabilities. By default, this flag is turned off and the bridge will
* not forward traffic back out of the receiving port.
*/
enum {
BRIDGE_MODE_UNSPEC,
BRIDGE_MODE_HAIRPIN,
};
/**
* DOC: Bridge port enum definition
*
* @IFLA_BRPORT_STATE
* The operation state of the port. Here are the valid values.
*
* * 0 - port is in STP *DISABLED* state. Make this port completely
* inactive for STP. This is also called BPDU filter and could be used
* to disable STP on an untrusted port, like a leaf virtual device.
* The traffic forwarding is also stopped on this port.
* * 1 - port is in STP *LISTENING* state. Only valid if STP is enabled
* on the bridge. In this state the port listens for STP BPDUs and
* drops all other traffic frames.
* * 2 - port is in STP *LEARNING* state. Only valid if STP is enabled on
* the bridge. In this state the port will accept traffic only for the
* purpose of updating MAC address tables.
* * 3 - port is in STP *FORWARDING* state. Port is fully active.
* * 4 - port is in STP *BLOCKING* state. Only valid if STP is enabled on
* the bridge. This state is used during the STP election process.
* In this state, port will only process STP BPDUs.
*
* @IFLA_BRPORT_PRIORITY
* The STP port priority. The valid values are between 0 and 255.
*
* @IFLA_BRPORT_COST
* The STP path cost of the port. The valid values are between 1 and 65535.
*
* @IFLA_BRPORT_MODE
* Set the bridge port mode. See *BRIDGE_MODE_HAIRPIN* for more details.
*
* @IFLA_BRPORT_GUARD
* Controls whether STP BPDUs will be processed by the bridge port. By
* default, the flag is turned off to allow BPDU processing. Turning this
* flag on will disable the bridge port if a STP BPDU packet is received.
*
* If the bridge has Spanning Tree enabled, hostile devices on the network
* may send BPDU on a port and cause network failure. Setting *guard on*
* will detect and stop this by disabling the port. The port will be
* restarted if the link is brought down, or removed and reattached.
*
* @IFLA_BRPORT_PROTECT
* Controls whether a given port is allowed to become a root port or not.
* Only used when STP is enabled on the bridge. By default the flag is off.
*
* This feature is also called root port guard. If BPDU is received from a
* leaf (edge) port, it should not be elected as root port. This could
* be used if using STP on a bridge and the downstream bridges are not fully
* trusted; this prevents a hostile guest from rerouting traffic.
*
* @IFLA_BRPORT_FAST_LEAVE
* This flag allows the bridge to immediately stop multicast traffic
* forwarding on a port that receives an IGMP Leave message. It is only used
* when IGMP snooping is enabled on the bridge. By default the flag is off.
*
* @IFLA_BRPORT_LEARNING
* Controls whether a given port will learn *source* MAC addresses from
* received traffic or not. Also controls whether dynamic FDB entries
* (which can also be added by software) will be refreshed by incoming
* traffic. By default this flag is on.
*
* @IFLA_BRPORT_UNICAST_FLOOD
* Controls whether unicast traffic for which there is no FDB entry will
* be flooded towards this port. By default this flag is on.
*
* @IFLA_BRPORT_PROXYARP
* Enable proxy ARP on this port.
*
* @IFLA_BRPORT_LEARNING_SYNC
* Controls whether a given port will sync MAC addresses learned on device
* port to bridge FDB.
*
* @IFLA_BRPORT_PROXYARP_WIFI
* Enable proxy ARP on this port which meets extended requirements by
* IEEE 802.11 and Hotspot 2.0 specifications.
*
* @IFLA_BRPORT_ROOT_ID
*
* @IFLA_BRPORT_BRIDGE_ID
*
* @IFLA_BRPORT_DESIGNATED_PORT
*
* @IFLA_BRPORT_DESIGNATED_COST
*
* @IFLA_BRPORT_ID
*
* @IFLA_BRPORT_NO
*
* @IFLA_BRPORT_TOPOLOGY_CHANGE_ACK
*
* @IFLA_BRPORT_CONFIG_PENDING
*
* @IFLA_BRPORT_MESSAGE_AGE_TIMER
*
* @IFLA_BRPORT_FORWARD_DELAY_TIMER
*
* @IFLA_BRPORT_HOLD_TIMER
*
* @IFLA_BRPORT_FLUSH
* Flush bridge ports' fdb dynamic entries.
*
* @IFLA_BRPORT_MULTICAST_ROUTER
* Configure the port's multicast router presence. A port with
* a multicast router will receive all multicast traffic.
* The valid values are:
*
* * 0 disable multicast routers on this port
* * 1 let the system detect the presence of routers (default)
* * 2 permanently enable multicast traffic forwarding on this port
* * 3 enable multicast routers temporarily on this port, not depending
* on incoming queries.
*
* @IFLA_BRPORT_PAD
*
* @IFLA_BRPORT_MCAST_FLOOD
* Controls whether a given port will flood multicast traffic for which
* there is no MDB entry. By default this flag is on.
*
* @IFLA_BRPORT_MCAST_TO_UCAST
* Controls whether a given port will replicate packets using unicast
* instead of multicast. By default this flag is off.
*
* This is done by copying the packet per host and changing the multicast
* destination MAC to a unicast one accordingly.
*
* *mcast_to_unicast* works on top of the multicast snooping feature of the
* bridge. Which means unicast copies are only delivered to hosts which
* are interested in unicast and signaled this via IGMP/MLD reports previously.
*
* This feature is intended for interface types which have a more reliable
* and/or efficient way to deliver unicast packets than broadcast ones
* (e.g. WiFi).
*
* However, it should only be enabled on interfaces where no IGMPv2/MLDv1
* report suppression takes place. IGMP/MLD report suppression issue is
* usually overcome by the network daemon (supplicant) enabling AP isolation
* and by that separating all STAs.
*
* Delivery of STA-to-STA IP multicast is made possible again by enabling
* and utilizing the bridge hairpin mode, which considers the incoming port
* as a potential outgoing port, too (see *BRIDGE_MODE_HAIRPIN* option).
* Hairpin mode is performed after multicast snooping, therefore leading
* to only deliver reports to STAs running a multicast router.
*
* @IFLA_BRPORT_VLAN_TUNNEL
* Controls whether vlan to tunnel mapping is enabled on the port.
* By default this flag is off.
*
* @IFLA_BRPORT_BCAST_FLOOD
* Controls flooding of broadcast traffic on the given port. By default
* this flag is on.
*
* @IFLA_BRPORT_GROUP_FWD_MASK
* Set the group forward mask. This is a bitmask that is applied to
* decide whether to forward incoming frames destined to link-local
* addresses. The addresses of the form are 01:80:C2:00:00:0X (defaults
* to 0, which means the bridge does not forward any link-local frames
* coming on this port).
*
* @IFLA_BRPORT_NEIGH_SUPPRESS
* Controls whether neighbor discovery (arp and nd) proxy and suppression
* is enabled on the port. By default this flag is off.
*
* @IFLA_BRPORT_ISOLATED
* Controls whether a given port will be isolated, which means it will be
* able to communicate with non-isolated ports only. By default this
* flag is off.
*
* @IFLA_BRPORT_BACKUP_PORT
* Set a backup port. If the port loses carrier all traffic will be
* redirected to the configured backup port. Set the value to 0 to disable
* it.
*
* @IFLA_BRPORT_MRP_RING_OPEN
*
* @IFLA_BRPORT_MRP_IN_OPEN
*
* @IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT
* The number of per-port EHT hosts limit. The default value is 512.
* Setting to 0 is not allowed.
*
* @IFLA_BRPORT_MCAST_EHT_HOSTS_CNT
* The current number of tracked hosts, read only.
*
* @IFLA_BRPORT_LOCKED
* Controls whether a port will be locked, meaning that hosts behind the
* port will not be able to communicate through the port unless an FDB
* entry with the unit's MAC address is in the FDB. The common use case is
* that hosts are allowed access through authentication with the IEEE 802.1X
* protocol or based on whitelists. By default this flag is off.
*
* Please note that secure 802.1X deployments should always use the
* *BR_BOOLOPT_NO_LL_LEARN* flag, to not permit the bridge to populate its
* FDB based on link-local (EAPOL) traffic received on the port.
*
* @IFLA_BRPORT_MAB
* Controls whether a port will use MAC Authentication Bypass (MAB), a
* technique through which select MAC addresses may be allowed on a locked
* port, without using 802.1X authentication. Packets with an unknown source
* MAC address generates a "locked" FDB entry on the incoming bridge port.
* The common use case is for user space to react to these bridge FDB
* notifications and optionally replace the locked FDB entry with a normal
* one, allowing traffic to pass for whitelisted MAC addresses.
*
* Setting this flag also requires *IFLA_BRPORT_LOCKED* and
* *IFLA_BRPORT_LEARNING*. *IFLA_BRPORT_LOCKED* ensures that unauthorized
* data packets are dropped, and *IFLA_BRPORT_LEARNING* allows the dynamic
* FDB entries installed by user space (as replacements for the locked FDB
* entries) to be refreshed and/or aged out.
*
* @IFLA_BRPORT_MCAST_N_GROUPS
*
* @IFLA_BRPORT_MCAST_MAX_GROUPS
* Sets the maximum number of MDB entries that can be registered for a
* given port. Attempts to register more MDB entries at the port than this
* limit allows will be rejected, whether they are done through netlink
* (e.g. the bridge tool), or IGMP or MLD membership reports. Setting a
* limit of 0 disables the limit. The default value is 0.
*
* @IFLA_BRPORT_NEIGH_VLAN_SUPPRESS
* Controls whether neighbor discovery (arp and nd) proxy and suppression is
* enabled for a given port. By default this flag is off.
*
* Note that this option only takes effect when *IFLA_BRPORT_NEIGH_SUPPRESS*
* is enabled for a given port.
*
* @IFLA_BRPORT_BACKUP_NHID
* The FDB nexthop object ID to attach to packets being redirected to a
* backup port that has VLAN tunnel mapping enabled (via the
* *IFLA_BRPORT_VLAN_TUNNEL* option). Setting a value of 0 (default) has
* the effect of not attaching any ID.
*/
enum {
IFLA_BRPORT_UNSPEC,
IFLA_BRPORT_STATE, /* Spanning tree state */
@@ -769,6 +1296,19 @@ enum netkit_mode {
NETKIT_L3,
};
/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
* the BPF program if attached. This also means the latter can
* consume the two fields if they were populated earlier.
*
* NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before
* invoking the attached BPF program when the peer device resides
* in a different network namespace. This is the default behavior.
*/
enum netkit_scrub {
NETKIT_SCRUB_NONE,
NETKIT_SCRUB_DEFAULT,
};
enum {
IFLA_NETKIT_UNSPEC,
IFLA_NETKIT_PEER_INFO,
@@ -776,6 +1316,10 @@ enum {
IFLA_NETKIT_POLICY,
IFLA_NETKIT_PEER_POLICY,
IFLA_NETKIT_MODE,
IFLA_NETKIT_SCRUB,
IFLA_NETKIT_PEER_SCRUB,
IFLA_NETKIT_HEADROOM,
IFLA_NETKIT_TAILROOM,
__IFLA_NETKIT_MAX,
};
#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
@@ -854,6 +1398,9 @@ enum {
IFLA_VXLAN_DF,
IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
IFLA_VXLAN_LOCALBYPASS,
IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
IFLA_VXLAN_RESERVED_BITS,
IFLA_VXLAN_MC_ROUTE,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
@@ -871,6 +1418,13 @@ enum ifla_vxlan_df {
VXLAN_DF_MAX = __VXLAN_DF_END - 1,
};
enum ifla_vxlan_label_policy {
VXLAN_LABEL_FIXED = 0,
VXLAN_LABEL_INHERIT = 1,
__VXLAN_LABEL_END,
VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1,
};
/* GENEVE section */
enum {
IFLA_GENEVE_UNSPEC,
@@ -888,6 +1442,7 @@ enum {
IFLA_GENEVE_TTL_INHERIT,
IFLA_GENEVE_DF,
IFLA_GENEVE_INNER_PROTO_INHERIT,
IFLA_GENEVE_PORT_RANGE,
__IFLA_GENEVE_MAX
};
#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
@@ -900,6 +1455,11 @@ enum ifla_geneve_df {
GENEVE_DF_MAX = __GENEVE_DF_END - 1,
};
struct ifla_geneve_port_range {
__be16 low;
__be16 high;
};
/* Bareudp section */
enum {
IFLA_BAREUDP_UNSPEC,
@@ -935,6 +1495,8 @@ enum {
IFLA_GTP_ROLE,
IFLA_GTP_CREATE_SOCKETS,
IFLA_GTP_RESTART_COUNT,
IFLA_GTP_LOCAL,
IFLA_GTP_LOCAL6,
__IFLA_GTP_MAX,
};
#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1)
@@ -975,6 +1537,7 @@ enum {
IFLA_BOND_MISSED_MAX,
IFLA_BOND_NS_IP6_TARGET,
IFLA_BOND_COUPLED_CONTROL,
IFLA_BOND_BROADCAST_NEIGH,
__IFLA_BOND_MAX,
};
@@ -1003,6 +1566,7 @@ enum {
IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE,
IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE,
IFLA_BOND_SLAVE_PRIO,
IFLA_BOND_SLAVE_ACTOR_PORT_PRIO,
__IFLA_BOND_SLAVE_MAX,
};
@@ -1240,6 +1804,7 @@ enum {
IFLA_HSR_PROTOCOL, /* Indicate different protocol than
* HSR. For example PRP.
*/
IFLA_HSR_INTERLINK, /* HSR interlink network device */
__IFLA_HSR_MAX,
};
@@ -1408,6 +1973,7 @@ struct ifla_rmnet_flags {
enum {
IFLA_MCTP_UNSPEC,
IFLA_MCTP_NET,
IFLA_MCTP_PHYS_BINDING,
__IFLA_MCTP_MAX,
};
@@ -1417,10 +1983,27 @@ enum {
enum {
IFLA_DSA_UNSPEC,
IFLA_DSA_MASTER,
IFLA_DSA_CONDUIT,
/* Deprecated, use IFLA_DSA_CONDUIT instead */
IFLA_DSA_MASTER = IFLA_DSA_CONDUIT,
__IFLA_DSA_MAX,
};
#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1)
#endif /* _UAPI_LINUX_IF_LINK_H */
/* OVPN section */
enum ovpn_mode {
OVPN_MODE_P2P,
OVPN_MODE_MP,
};
enum {
IFLA_OVPN_UNSPEC,
IFLA_OVPN_MODE,
__IFLA_OVPN_MAX,
};
#define IFLA_OVPN_MAX (__IFLA_OVPN_MAX - 1)
#endif /* _LINUX_IF_LINK_H */

View File

@@ -41,6 +41,10 @@
*/
#define XDP_UMEM_TX_SW_CSUM (1 << 1)
/* Request to reserve tx_metadata_len bytes of per-chunk metadata.
*/
#define XDP_UMEM_TX_METADATA_LEN (1 << 2)
struct sockaddr_xdp {
__u16 sxdp_family;
__u16 sxdp_flags;
@@ -75,6 +79,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8
#define XDP_MAX_TX_SKB_BUDGET 9
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
@@ -113,16 +118,22 @@ struct xdp_options {
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
/* Request transmit timestamp. Upon completion, put it into tx_timestamp
* field of union xsk_tx_metadata.
* field of struct xsk_tx_metadata.
*/
#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)
/* Request transmit checksum offload. Checksum start position and offset
* are communicated via csum_start and csum_offset fields of union
* are communicated via csum_start and csum_offset fields of struct
* xsk_tx_metadata.
*/
#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
/* Request launch time hardware offload. The device will schedule the packet for
* transmission at a pre-determined time called launch time. The value of
* launch time is communicated via launch_time field of struct xsk_tx_metadata.
*/
#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2)
/* AF_XDP offloads request. 'request' union member is consumed by the driver
* when the packet is being transmitted. 'completion' union member is
* filled by the driver when the transmit completion arrives.
@@ -138,6 +149,10 @@ struct xsk_tx_metadata {
__u16 csum_start;
/* Offset from csum_start where checksum should be stored. */
__u16 csum_offset;
/* XDP_TXMD_FLAGS_LAUNCH_TIME */
/* Launch time in nanosecond against the PTP HW Clock */
__u64 launch_time;
} request;
struct {

View File

@@ -2,9 +2,10 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/netdev.yaml */
/* YNL-GEN uapi header */
/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#ifndef _UAPI_LINUX_NETDEV_H
#define _UAPI_LINUX_NETDEV_H
#ifndef _LINUX_NETDEV_H
#define _LINUX_NETDEV_H
#define NETDEV_FAMILY_NAME "netdev"
#define NETDEV_FAMILY_VERSION 1
@@ -59,10 +60,13 @@ enum netdev_xdp_rx_metadata {
* by the driver.
* @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
* driver.
* @NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO: Launch time HW offload is supported
* by the driver.
*/
enum netdev_xsk_flags {
NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO = 4,
};
enum netdev_queue_type {
@@ -74,6 +78,12 @@ enum netdev_qstats_scope {
NETDEV_QSTATS_SCOPE_QUEUE = 1,
};
enum netdev_napi_threaded {
NETDEV_NAPI_THREADED_DISABLED,
NETDEV_NAPI_THREADED_ENABLED,
NETDEV_NAPI_THREADED_BUSY_POLL,
};
enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
@@ -86,6 +96,11 @@ enum {
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
};
enum {
__NETDEV_A_IO_URING_PROVIDER_INFO_MAX,
NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1)
};
enum {
NETDEV_A_PAGE_POOL_ID = 1,
NETDEV_A_PAGE_POOL_IFINDEX,
@@ -93,6 +108,8 @@ enum {
NETDEV_A_PAGE_POOL_INFLIGHT,
NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
NETDEV_A_PAGE_POOL_DETACH_TIME,
NETDEV_A_PAGE_POOL_DMABUF,
NETDEV_A_PAGE_POOL_IO_URING,
__NETDEV_A_PAGE_POOL_MAX,
NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1)
@@ -121,16 +138,28 @@ enum {
NETDEV_A_NAPI_ID,
NETDEV_A_NAPI_IRQ,
NETDEV_A_NAPI_PID,
NETDEV_A_NAPI_DEFER_HARD_IRQS,
NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
NETDEV_A_NAPI_THREADED,
__NETDEV_A_NAPI_MAX,
NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
};
enum {
__NETDEV_A_XSK_INFO_MAX,
NETDEV_A_XSK_INFO_MAX = (__NETDEV_A_XSK_INFO_MAX - 1)
};
enum {
NETDEV_A_QUEUE_ID = 1,
NETDEV_A_QUEUE_IFINDEX,
NETDEV_A_QUEUE_TYPE,
NETDEV_A_QUEUE_NAPI_ID,
NETDEV_A_QUEUE_DMABUF,
NETDEV_A_QUEUE_IO_URING,
NETDEV_A_QUEUE_XSK,
__NETDEV_A_QUEUE_MAX,
NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -146,11 +175,43 @@ enum {
NETDEV_A_QSTATS_TX_PACKETS,
NETDEV_A_QSTATS_TX_BYTES,
NETDEV_A_QSTATS_RX_ALLOC_FAIL,
NETDEV_A_QSTATS_RX_HW_DROPS,
NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS,
NETDEV_A_QSTATS_RX_CSUM_COMPLETE,
NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY,
NETDEV_A_QSTATS_RX_CSUM_NONE,
NETDEV_A_QSTATS_RX_CSUM_BAD,
NETDEV_A_QSTATS_RX_HW_GRO_PACKETS,
NETDEV_A_QSTATS_RX_HW_GRO_BYTES,
NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS,
NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES,
NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS,
NETDEV_A_QSTATS_TX_HW_DROPS,
NETDEV_A_QSTATS_TX_HW_DROP_ERRORS,
NETDEV_A_QSTATS_TX_CSUM_NONE,
NETDEV_A_QSTATS_TX_NEEDS_CSUM,
NETDEV_A_QSTATS_TX_HW_GSO_PACKETS,
NETDEV_A_QSTATS_TX_HW_GSO_BYTES,
NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS,
NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES,
NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS,
NETDEV_A_QSTATS_TX_STOP,
NETDEV_A_QSTATS_TX_WAKE,
__NETDEV_A_QSTATS_MAX,
NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
};
enum {
NETDEV_A_DMABUF_IFINDEX = 1,
NETDEV_A_DMABUF_QUEUES,
NETDEV_A_DMABUF_FD,
NETDEV_A_DMABUF_ID,
__NETDEV_A_DMABUF_MAX,
NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1)
};
enum {
NETDEV_CMD_DEV_GET = 1,
NETDEV_CMD_DEV_ADD_NTF,
@@ -164,6 +225,9 @@ enum {
NETDEV_CMD_QUEUE_GET,
NETDEV_CMD_NAPI_GET,
NETDEV_CMD_QSTATS_GET,
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
NETDEV_CMD_BIND_TX,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
@@ -172,4 +236,4 @@ enum {
#define NETDEV_MCGRP_MGMT "mgmt"
#define NETDEV_MCGRP_PAGE_POOL "page-pool"
#endif /* _UAPI_LINUX_NETDEV_H */
#endif /* _LINUX_NETDEV_H */

View File

@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI__LINUX_NETLINK_H
#define _UAPI__LINUX_NETLINK_H
#ifndef __LINUX_NETLINK_H
#define __LINUX_NETLINK_H
#include <linux/kernel.h>
#include <linux/const.h>
#include <linux/socket.h> /* for __kernel_sa_family_t */
#include <linux/types.h>
@@ -20,7 +20,7 @@
#define NETLINK_CONNECTOR 11
#define NETLINK_NETFILTER 12 /* netfilter subsystem */
#define NETLINK_IP6_FW 13
#define NETLINK_DNRTMSG 14 /* DECnet routing messages */
#define NETLINK_DNRTMSG 14 /* DECnet routing messages (obsolete) */
#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */
#define NETLINK_GENERIC 16
/* leave room for NETLINK_DM (DM Events) */
@@ -41,12 +41,20 @@ struct sockaddr_nl {
__u32 nl_groups; /* multicast groups mask */
};
/**
* struct nlmsghdr - fixed format metadata header of Netlink messages
* @nlmsg_len: Length of message including header
* @nlmsg_type: Message content type
* @nlmsg_flags: Additional flags
* @nlmsg_seq: Sequence number
* @nlmsg_pid: Sending process port ID
*/
struct nlmsghdr {
__u32 nlmsg_len; /* Length of message including header */
__u16 nlmsg_type; /* Message content */
__u16 nlmsg_flags; /* Additional flags */
__u32 nlmsg_seq; /* Sequence number */
__u32 nlmsg_pid; /* Sending process port ID */
__u32 nlmsg_len;
__u16 nlmsg_type;
__u16 nlmsg_flags;
__u32 nlmsg_seq;
__u32 nlmsg_pid;
};
/* Flags values */
@@ -54,7 +62,7 @@ struct nlmsghdr {
#define NLM_F_REQUEST 0x01 /* It is request message. */
#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */
#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */
#define NLM_F_ECHO 0x08 /* Echo this request */
#define NLM_F_ECHO 0x08 /* Receive resulting notifications */
#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */
#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */
@@ -72,6 +80,7 @@ struct nlmsghdr {
/* Modifiers to DELETE request */
#define NLM_F_NONREC 0x100 /* Do not delete recursively */
#define NLM_F_BULK 0x200 /* Delete multiple objects */
/* Flags for ACK message */
#define NLM_F_CAPPED 0x100 /* request was capped */
@@ -91,9 +100,10 @@ struct nlmsghdr {
#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN)
#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len))
#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0)))
#define NLMSG_DATA(nlh) ((void *)(((char *)nlh) + NLMSG_HDRLEN))
#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \
(struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len)))
(struct nlmsghdr *)(((char *)(nlh)) + \
NLMSG_ALIGN((nlh)->nlmsg_len)))
#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
(nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
(nlh)->nlmsg_len <= (len))
@@ -129,6 +139,11 @@ struct nlmsgerr {
* @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to
* be used - in the success case - to identify a created
* object or operation or similar (binary)
* @NLMSGERR_ATTR_POLICY: policy for a rejected attribute
* @NLMSGERR_ATTR_MISS_TYPE: type of a missing required attribute,
* %NLMSGERR_ATTR_MISS_NEST will not be present if the attribute was
* missing at the message level
* @NLMSGERR_ATTR_MISS_NEST: offset of the nest where attribute was missing
* @__NLMSGERR_ATTR_MAX: number of attributes
* @NLMSGERR_ATTR_MAX: highest attribute number
*/
@@ -137,6 +152,9 @@ enum nlmsgerr_attrs {
NLMSGERR_ATTR_MSG,
NLMSGERR_ATTR_OFFS,
NLMSGERR_ATTR_COOKIE,
NLMSGERR_ATTR_POLICY,
NLMSGERR_ATTR_MISS_TYPE,
NLMSGERR_ATTR_MISS_NEST,
__NLMSGERR_ATTR_MAX,
NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1
@@ -249,4 +267,117 @@ struct nla_bitfield32 {
__u32 selector;
};
#endif /* _UAPI__LINUX_NETLINK_H */
/*
* policy descriptions - it's specific to each family how this is used
* Normally, it should be retrieved via a dump inside another attribute
* specifying where it applies.
*/
/**
* enum netlink_attribute_type - type of an attribute
* @NL_ATTR_TYPE_INVALID: unused
* @NL_ATTR_TYPE_FLAG: flag attribute (present/not present)
* @NL_ATTR_TYPE_U8: 8-bit unsigned attribute
* @NL_ATTR_TYPE_U16: 16-bit unsigned attribute
* @NL_ATTR_TYPE_U32: 32-bit unsigned attribute
* @NL_ATTR_TYPE_U64: 64-bit unsigned attribute
* @NL_ATTR_TYPE_S8: 8-bit signed attribute
* @NL_ATTR_TYPE_S16: 16-bit signed attribute
* @NL_ATTR_TYPE_S32: 32-bit signed attribute
* @NL_ATTR_TYPE_S64: 64-bit signed attribute
* @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified
* @NL_ATTR_TYPE_STRING: string, min/max length may be specified
* @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string,
* min/max length may be specified
* @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute
* consists of sub-attributes. The nested policy and maxtype
* inside may be specified.
* @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this
* attribute contains sub-attributes whose type is irrelevant
* (just used to separate the array entries) and each such array
* entry has attributes again, the policy for those inner ones
* and the corresponding maxtype may be specified.
* @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute
* @NL_ATTR_TYPE_SINT: 32-bit or 64-bit signed attribute, aligned to 4B
* @NL_ATTR_TYPE_UINT: 32-bit or 64-bit unsigned attribute, aligned to 4B
*/
enum netlink_attribute_type {
NL_ATTR_TYPE_INVALID,
NL_ATTR_TYPE_FLAG,
NL_ATTR_TYPE_U8,
NL_ATTR_TYPE_U16,
NL_ATTR_TYPE_U32,
NL_ATTR_TYPE_U64,
NL_ATTR_TYPE_S8,
NL_ATTR_TYPE_S16,
NL_ATTR_TYPE_S32,
NL_ATTR_TYPE_S64,
NL_ATTR_TYPE_BINARY,
NL_ATTR_TYPE_STRING,
NL_ATTR_TYPE_NUL_STRING,
NL_ATTR_TYPE_NESTED,
NL_ATTR_TYPE_NESTED_ARRAY,
NL_ATTR_TYPE_BITFIELD32,
NL_ATTR_TYPE_SINT,
NL_ATTR_TYPE_UINT,
};
/**
* enum netlink_policy_type_attr - policy type attributes
* @NL_POLICY_TYPE_ATTR_UNSPEC: unused
* @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute,
* &enum netlink_attribute_type (U32)
* @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed
* integers (S64)
* @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed
* integers (S64)
* @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned
* integers (U64)
* @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned
* integers (U64)
* @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary
* attributes, no minimum if not given (U32)
* @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary
* attributes, no maximum if not given (U32)
* @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and
* nested array types (U32)
* @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy
* attribute for nested and nested array types, this can
* in theory be < the size of the policy pointed to by
* the index, if limited inside the nesting (U32)
* @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the
* bitfield32 type (U32)
* @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64)
* @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment
*
* @__NL_POLICY_TYPE_ATTR_MAX: number of attributes
* @NL_POLICY_TYPE_ATTR_MAX: highest attribute number
*/
enum netlink_policy_type_attr {
NL_POLICY_TYPE_ATTR_UNSPEC,
NL_POLICY_TYPE_ATTR_TYPE,
NL_POLICY_TYPE_ATTR_MIN_VALUE_S,
NL_POLICY_TYPE_ATTR_MAX_VALUE_S,
NL_POLICY_TYPE_ATTR_MIN_VALUE_U,
NL_POLICY_TYPE_ATTR_MAX_VALUE_U,
NL_POLICY_TYPE_ATTR_MIN_LENGTH,
NL_POLICY_TYPE_ATTR_MAX_LENGTH,
NL_POLICY_TYPE_ATTR_POLICY_IDX,
NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE,
NL_POLICY_TYPE_ATTR_BITFIELD32_MASK,
NL_POLICY_TYPE_ATTR_PAD,
NL_POLICY_TYPE_ATTR_MASK,
/* keep last */
__NL_POLICY_TYPE_ATTR_MAX,
NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1
};
#endif /* __LINUX_NETLINK_H */

View File

@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_OPENAT2_H
#define _UAPI_LINUX_OPENAT2_H
#ifndef _LINUX_OPENAT2_H
#define _LINUX_OPENAT2_H
#include <linux/types.h>
@@ -40,4 +40,4 @@ struct open_how {
return -EAGAIN if that's not
possible. */
#endif /* _UAPI_LINUX_OPENAT2_H */
#endif /* _LINUX_OPENAT2_H */

File diff suppressed because it is too large Load Diff

View File

@@ -16,9 +16,40 @@ enum {
TCA_ACT_STATS,
TCA_ACT_PAD,
TCA_ACT_COOKIE,
TCA_ACT_FLAGS,
TCA_ACT_HW_STATS,
TCA_ACT_USED_HW_STATS,
TCA_ACT_IN_HW_COUNT,
__TCA_ACT_MAX
};
/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */
#define TCA_ACT_FLAGS_NO_PERCPU_STATS (1 << 0) /* Don't use percpu allocator for
* actions stats.
*/
#define TCA_ACT_FLAGS_SKIP_HW (1 << 1) /* don't offload action to HW */
#define TCA_ACT_FLAGS_SKIP_SW (1 << 2) /* don't use action in SW */
/* tca HW stats type
* When user does not pass the attribute, he does not care.
* It is the same as if he would pass the attribute with
* all supported bits set.
* In case no bits are set, user is not interested in getting any HW statistics.
*/
#define TCA_ACT_HW_STATS_IMMEDIATE (1 << 0) /* Means that in dump, user
* gets the current HW stats
* state from the device
* queried at the dump time.
*/
#define TCA_ACT_HW_STATS_DELAYED (1 << 1) /* Means that in dump, user gets
* HW stats that might be out of date
* for some time, maybe couple of
* seconds. This is the case when
* driver polls stats updates
* periodically or when it gets async
* stats update from the device.
*/
#define TCA_ACT_MAX __TCA_ACT_MAX
#define TCA_OLD_COMPAT (TCA_ACT_MAX+1)
#define TCA_ACT_MAX_PRIO 32
@@ -63,12 +94,53 @@ enum {
#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN
/* These macros are put here for binary compatibility with userspace apps that
* make use of them. For kernel code and new userspace apps, use the TCA_ID_*
* versions.
*/
#define TCA_ACT_GACT 5
#define TCA_ACT_IPT 6 /* obsoleted, can be reused */
#define TCA_ACT_PEDIT 7
#define TCA_ACT_MIRRED 8
#define TCA_ACT_NAT 9
#define TCA_ACT_XT 10
#define TCA_ACT_SKBEDIT 11
#define TCA_ACT_VLAN 12
#define TCA_ACT_BPF 13
#define TCA_ACT_CONNMARK 14
#define TCA_ACT_SKBMOD 15
#define TCA_ACT_CSUM 16
#define TCA_ACT_TUNNEL_KEY 17
#define TCA_ACT_SIMP 22
#define TCA_ACT_IFE 25
#define TCA_ACT_SAMPLE 26
/* Action type identifiers*/
enum {
TCA_ID_UNSPEC=0,
TCA_ID_POLICE=1,
enum tca_id {
TCA_ID_UNSPEC = 0,
TCA_ID_POLICE = 1,
TCA_ID_GACT = TCA_ACT_GACT,
TCA_ID_IPT = TCA_ACT_IPT, /* Obsoleted, can be reused */
TCA_ID_PEDIT = TCA_ACT_PEDIT,
TCA_ID_MIRRED = TCA_ACT_MIRRED,
TCA_ID_NAT = TCA_ACT_NAT,
TCA_ID_XT = TCA_ACT_XT,
TCA_ID_SKBEDIT = TCA_ACT_SKBEDIT,
TCA_ID_VLAN = TCA_ACT_VLAN,
TCA_ID_BPF = TCA_ACT_BPF,
TCA_ID_CONNMARK = TCA_ACT_CONNMARK,
TCA_ID_SKBMOD = TCA_ACT_SKBMOD,
TCA_ID_CSUM = TCA_ACT_CSUM,
TCA_ID_TUNNEL_KEY = TCA_ACT_TUNNEL_KEY,
TCA_ID_SIMP = TCA_ACT_SIMP,
TCA_ID_IFE = TCA_ACT_IFE,
TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
TCA_ID_CTINFO,
TCA_ID_MPLS,
TCA_ID_CT,
TCA_ID_GATE,
/* other actions go here */
__TCA_ID_MAX=255
__TCA_ID_MAX = 255
};
#define TCA_ID_MAX __TCA_ID_MAX
@@ -120,6 +192,10 @@ enum {
TCA_POLICE_RESULT,
TCA_POLICE_TM,
TCA_POLICE_PAD,
TCA_POLICE_RATE64,
TCA_POLICE_PEAKRATE64,
TCA_POLICE_PKTRATE64,
TCA_POLICE_PKTBURST64,
__TCA_POLICE_MAX
#define TCA_POLICE_RESULT TCA_POLICE_RESULT
};
@@ -170,16 +246,19 @@ struct tc_u32_key {
};
struct tc_u32_sel {
unsigned char flags;
unsigned char offshift;
unsigned char nkeys;
/* New members MUST be added within the __struct_group() macro below. */
__struct_group(tc_u32_sel_hdr, hdr, /* no attrs */,
unsigned char flags;
unsigned char offshift;
unsigned char nkeys;
__be16 offmask;
__u16 off;
short offoff;
__be16 offmask;
__u16 off;
short offoff;
short hoff;
__be32 hmask;
short hoff;
__be32 hmask;
);
struct tc_u32_key keys[];
};
@@ -286,12 +365,19 @@ enum {
/* Basic filter */
struct tc_basic_pcnt {
__u64 rcnt;
__u64 rhit;
};
enum {
TCA_BASIC_UNSPEC,
TCA_BASIC_CLASSID,
TCA_BASIC_EMATCHES,
TCA_BASIC_ACT,
TCA_BASIC_POLICE,
TCA_BASIC_PCNT,
TCA_BASIC_PAD,
__TCA_BASIC_MAX
};
@@ -438,17 +524,79 @@ enum {
TCA_FLOWER_IN_HW_COUNT,
TCA_FLOWER_KEY_PORT_SRC_MIN, /* be16 */
TCA_FLOWER_KEY_PORT_SRC_MAX, /* be16 */
TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */
TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */
TCA_FLOWER_KEY_CT_STATE, /* u16 */
TCA_FLOWER_KEY_CT_STATE_MASK, /* u16 */
TCA_FLOWER_KEY_CT_ZONE, /* u16 */
TCA_FLOWER_KEY_CT_ZONE_MASK, /* u16 */
TCA_FLOWER_KEY_CT_MARK, /* u32 */
TCA_FLOWER_KEY_CT_MARK_MASK, /* u32 */
TCA_FLOWER_KEY_CT_LABELS, /* u128 */
TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */
TCA_FLOWER_KEY_MPLS_OPTS,
TCA_FLOWER_KEY_HASH, /* u32 */
TCA_FLOWER_KEY_HASH_MASK, /* u32 */
TCA_FLOWER_KEY_NUM_OF_VLANS, /* u8 */
TCA_FLOWER_KEY_PPPOE_SID, /* be16 */
TCA_FLOWER_KEY_PPP_PROTO, /* be16 */
TCA_FLOWER_KEY_L2TPV3_SID, /* be32 */
TCA_FLOWER_L2_MISS, /* u8 */
TCA_FLOWER_KEY_CFM, /* nested */
TCA_FLOWER_KEY_SPI, /* be32 */
TCA_FLOWER_KEY_SPI_MASK, /* be32 */
TCA_FLOWER_KEY_ENC_FLAGS, /* be32 */
TCA_FLOWER_KEY_ENC_FLAGS_MASK, /* be32 */
__TCA_FLOWER_MAX,
};
#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
enum {
TCA_FLOWER_KEY_CT_FLAGS_NEW = 1 << 0, /* Beginning of a new connection. */
TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */
TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */
TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */
TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */
TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */
__TCA_FLOWER_KEY_CT_FLAGS_MAX,
};
enum {
TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_GENEVE_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_VXLAN_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_ERSPAN_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_GTP, /* Nested
* TCA_FLOWER_KEY_ENC_OPT_GTP_
* attributes
*/
TCA_FLOWER_KEY_ENC_OPTS_PFCP, /* Nested
* TCA_FLOWER_KEY_ENC_IPT_PFCP
* attributes
*/
__TCA_FLOWER_KEY_ENC_OPTS_MAX,
};
@@ -467,17 +615,105 @@ enum {
(__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
enum {
TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */
__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1)
enum {
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */
__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1)
enum {
TCA_FLOWER_KEY_ENC_OPT_GTP_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_GTP_QFI, /* u8 */
__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_GTP_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX - 1)
enum {
TCA_FLOWER_KEY_ENC_OPT_PFCP_UNSPEC,
TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, /* u8 */
TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID, /* be64 */
__TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX,
};
#define TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX \
(__TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX - 1)
enum {
TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC,
TCA_FLOWER_KEY_MPLS_OPTS_LSE,
__TCA_FLOWER_KEY_MPLS_OPTS_MAX,
};
#define TCA_FLOWER_KEY_MPLS_OPTS_MAX (__TCA_FLOWER_KEY_MPLS_OPTS_MAX - 1)
enum {
TCA_FLOWER_KEY_MPLS_OPT_LSE_UNSPEC,
TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH,
TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL,
TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS,
TCA_FLOWER_KEY_MPLS_OPT_LSE_TC,
TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL,
__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX,
};
#define TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX \
(__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX - 1)
enum {
TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM = (1 << 2),
TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT = (1 << 3),
TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM = (1 << 4),
TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT = (1 << 5),
__TCA_FLOWER_KEY_FLAGS_MAX,
};
#define TCA_FLOWER_KEY_FLAGS_MAX (__TCA_FLOWER_KEY_FLAGS_MAX - 1)
enum {
TCA_FLOWER_KEY_CFM_OPT_UNSPEC,
TCA_FLOWER_KEY_CFM_MD_LEVEL,
TCA_FLOWER_KEY_CFM_OPCODE,
__TCA_FLOWER_KEY_CFM_OPT_MAX,
};
#define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1)
#define TCA_FLOWER_KEY_CFM_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1)
#define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */
/* Match-all classifier */
struct tc_matchall_pcnt {
__u64 rhit;
};
enum {
TCA_MATCHALL_UNSPEC,
TCA_MATCHALL_CLASSID,
TCA_MATCHALL_ACT,
TCA_MATCHALL_FLAGS,
TCA_MATCHALL_PCNT,
TCA_MATCHALL_PAD,
__TCA_MATCHALL_MAX,
};

View File

@@ -2,6 +2,7 @@
#ifndef __LINUX_PKT_SCHED_H
#define __LINUX_PKT_SCHED_H
#include <linux/const.h>
#include <linux/types.h>
/* Logical priority bands not depending on specific packet scheduler.
@@ -255,6 +256,9 @@ enum {
TCA_RED_PARMS,
TCA_RED_STAB,
TCA_RED_MAX_P,
TCA_RED_FLAGS, /* bitfield32 */
TCA_RED_EARLY_DROP_BLOCK, /* u32 */
TCA_RED_MARK_BLOCK, /* u32 */
__TCA_RED_MAX,
};
@@ -267,12 +271,28 @@ struct tc_red_qopt {
unsigned char Wlog; /* log(W) */
unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
unsigned char Scell_log; /* cell size for idle damping */
/* This field can be used for flags that a RED-like qdisc has
* historically supported. E.g. when configuring RED, it can be used for
* ECN, HARDDROP and ADAPTATIVE. For SFQ it can be used for ECN,
* HARDDROP. Etc. Because this field has not been validated, and is
* copied back on dump, any bits besides those to which a given qdisc
* has assigned a historical meaning need to be considered for free use
* by userspace tools.
*
* Any further flags need to be passed differently, e.g. through an
* attribute (such as TCA_RED_FLAGS above). Such attribute should allow
* passing both recent and historic flags in one value.
*/
unsigned char flags;
#define TC_RED_ECN 1
#define TC_RED_HARDDROP 2
#define TC_RED_ADAPTATIVE 4
#define TC_RED_NODROP 8
};
#define TC_RED_HISTORIC_FLAGS (TC_RED_ECN | TC_RED_HARDDROP | TC_RED_ADAPTATIVE)
struct tc_red_xstats {
__u32 early; /* Early drops */
__u32 pdrop; /* Drops due to queue limits */
@@ -474,6 +494,7 @@ enum {
TCA_NETEM_JITTER64,
TCA_NETEM_SLOT,
TCA_NETEM_SLOT_DIST,
TCA_NETEM_PRNG_SEED,
__TCA_NETEM_MAX,
};
@@ -590,6 +611,11 @@ enum {
#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
enum {
TC_FP_EXPRESS = 1,
TC_FP_PREEMPTIBLE = 2,
};
struct tc_mqprio_qopt {
__u8 num_tc;
__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -603,12 +629,23 @@ struct tc_mqprio_qopt {
#define TC_MQPRIO_F_MIN_RATE 0x4
#define TC_MQPRIO_F_MAX_RATE 0x8
enum {
TCA_MQPRIO_TC_ENTRY_UNSPEC,
TCA_MQPRIO_TC_ENTRY_INDEX, /* u32 */
TCA_MQPRIO_TC_ENTRY_FP, /* u32 */
/* add new constants above here */
__TCA_MQPRIO_TC_ENTRY_CNT,
TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
};
enum {
TCA_MQPRIO_UNSPEC,
TCA_MQPRIO_MODE,
TCA_MQPRIO_SHAPER,
TCA_MQPRIO_MIN_RATE64,
TCA_MQPRIO_MAX_RATE64,
TCA_MQPRIO_TC_ENTRY,
__TCA_MQPRIO_MAX,
};
@@ -698,6 +735,8 @@ struct tc_codel_xstats {
/* FQ_CODEL */
#define FQ_CODEL_QUANTUM_MAX (1 << 20)
enum {
TCA_FQ_CODEL_UNSPEC,
TCA_FQ_CODEL_TARGET,
@@ -709,6 +748,8 @@ enum {
TCA_FQ_CODEL_CE_THRESHOLD,
TCA_FQ_CODEL_DROP_BATCH_SIZE,
TCA_FQ_CODEL_MEMORY_LIMIT,
TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR,
TCA_FQ_CODEL_CE_THRESHOLD_MASK,
__TCA_FQ_CODEL_MAX
};
@@ -785,15 +826,30 @@ enum {
TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */
TCA_FQ_TIMER_SLACK, /* timer slack */
TCA_FQ_HORIZON, /* time horizon in us */
TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */
TCA_FQ_PRIOMAP, /* prio2band */
TCA_FQ_WEIGHTS, /* Weights for each band */
TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */
__TCA_FQ_MAX
};
#define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
#define FQ_BANDS 3
#define FQ_MIN_WEIGHT 16384
struct tc_fq_qd_stats {
__u64 gc_flows;
__u64 highprio_packets;
__u64 tcp_retrans;
__u64 highprio_packets; /* obsolete */
__u64 tcp_retrans; /* obsolete */
__u64 throttled;
__u64 flows_plimit;
__u64 pkts_too_long;
@@ -804,6 +860,12 @@ struct tc_fq_qd_stats {
__u32 throttled_flows;
__u32 unthrottle_latency_ns;
__u64 ce_mark; /* packets above ce_threshold */
__u64 horizon_drops;
__u64 horizon_caps;
__u64 fastpath_packets;
__u64 band_drops[FQ_BANDS];
__u32 band_pkt_count[FQ_BANDS];
__u32 pad;
};
/* Heavy-Hitter Filter */
@@ -841,19 +903,56 @@ enum {
TCA_PIE_BETA,
TCA_PIE_ECN,
TCA_PIE_BYTEMODE,
TCA_PIE_DQ_RATE_ESTIMATOR,
__TCA_PIE_MAX
};
#define TCA_PIE_MAX (__TCA_PIE_MAX - 1)
struct tc_pie_xstats {
__u32 prob; /* current probability */
__u32 delay; /* current delay in ms */
__u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */
__u32 packets_in; /* total number of packets enqueued */
__u32 dropped; /* packets dropped due to pie_action */
__u32 overlimit; /* dropped due to lack of space in queue */
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
__u64 prob; /* current probability */
__u32 delay; /* current delay in ms */
__u32 avg_dq_rate; /* current average dq_rate in
* bits/pie_time
*/
__u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */
__u32 packets_in; /* total number of packets enqueued */
__u32 dropped; /* packets dropped due to pie_action */
__u32 overlimit; /* dropped due to lack of space
* in queue
*/
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
};
/* FQ PIE */
enum {
TCA_FQ_PIE_UNSPEC,
TCA_FQ_PIE_LIMIT,
TCA_FQ_PIE_FLOWS,
TCA_FQ_PIE_TARGET,
TCA_FQ_PIE_TUPDATE,
TCA_FQ_PIE_ALPHA,
TCA_FQ_PIE_BETA,
TCA_FQ_PIE_QUANTUM,
TCA_FQ_PIE_MEMORY_LIMIT,
TCA_FQ_PIE_ECN_PROB,
TCA_FQ_PIE_ECN,
TCA_FQ_PIE_BYTEMODE,
TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
__TCA_FQ_PIE_MAX
};
#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1)
struct tc_fq_pie_xstats {
__u32 packets_in; /* total number of packets enqueued */
__u32 dropped; /* packets dropped due to fq_pie_action */
__u32 overlimit; /* dropped due to lack of space in queue */
__u32 overmemory; /* dropped due to lack of memory in queue */
__u32 ecn_mark; /* packets marked with ecn */
__u32 new_flow_count; /* count of new flows created by packets */
__u32 new_flows_len; /* count of flows in new list */
__u32 old_flows_len; /* count of flows in old list */
__u32 memory_usage; /* total memory across all queues */
};
/* CBS */
@@ -880,8 +979,9 @@ struct tc_etf_qopt {
__s32 delta;
__s32 clockid;
__u32 flags;
#define TC_ETF_DEADLINE_MODE_ON BIT(0)
#define TC_ETF_OFFLOAD_ON BIT(1)
#define TC_ETF_DEADLINE_MODE_ON _BITUL(0)
#define TC_ETF_OFFLOAD_ON _BITUL(1)
#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2)
};
enum {
@@ -913,6 +1013,7 @@ enum {
TCA_CAKE_INGRESS,
TCA_CAKE_ACK_FILTER,
TCA_CAKE_SPLIT_GSO,
TCA_CAKE_FWMARK,
__TCA_CAKE_MAX
};
#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1)
@@ -1039,6 +1140,40 @@ enum {
#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
/* The format for the admin sched (dump only):
* [TCA_TAPRIO_SCHED_ADMIN_SCHED]
* [TCA_TAPRIO_ATTR_SCHED_BASE_TIME]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES]
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL]
*/
#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST _BITUL(0)
#define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD _BITUL(1)
enum {
TCA_TAPRIO_TC_ENTRY_UNSPEC,
TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */
TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */
TCA_TAPRIO_TC_ENTRY_FP, /* u32 */
/* add new constants above here */
__TCA_TAPRIO_TC_ENTRY_CNT,
TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1)
};
enum {
TCA_TAPRIO_OFFLOAD_STATS_PAD = 1, /* u64 */
TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS, /* u64 */
TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS, /* u64 */
/* add new constants above here */
__TCA_TAPRIO_OFFLOAD_STATS_CNT,
TCA_TAPRIO_OFFLOAD_STATS_MAX = (__TCA_TAPRIO_OFFLOAD_STATS_CNT - 1)
};
enum {
TCA_TAPRIO_ATTR_UNSPEC,
TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
@@ -1047,9 +1182,101 @@ enum {
TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
TCA_TAPRIO_PAD,
TCA_TAPRIO_ATTR_PAD = TCA_TAPRIO_PAD,
TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
TCA_TAPRIO_ATTR_FLAGS, /* u32 */
TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */
__TCA_TAPRIO_ATTR_MAX,
};
#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
/* ETS */
#define TCQ_ETS_MAX_BANDS 16
enum {
TCA_ETS_UNSPEC,
TCA_ETS_NBANDS, /* u8 */
TCA_ETS_NSTRICT, /* u8 */
TCA_ETS_QUANTA, /* nested TCA_ETS_QUANTA_BAND */
TCA_ETS_QUANTA_BAND, /* u32 */
TCA_ETS_PRIOMAP, /* nested TCA_ETS_PRIOMAP_BAND */
TCA_ETS_PRIOMAP_BAND, /* u8 */
__TCA_ETS_MAX,
};
#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
/* DUALPI2 */
enum tc_dualpi2_drop_overload {
TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0,
TC_DUALPI2_DROP_OVERLOAD_DROP = 1,
__TCA_DUALPI2_DROP_OVERLOAD_MAX,
};
#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1)
enum tc_dualpi2_drop_early {
TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0,
TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1,
__TCA_DUALPI2_DROP_EARLY_MAX,
};
#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1)
enum tc_dualpi2_ecn_mask {
TC_DUALPI2_ECN_MASK_L4S_ECT = 1,
TC_DUALPI2_ECN_MASK_CLA_ECT = 2,
TC_DUALPI2_ECN_MASK_ANY_ECT = 3,
__TCA_DUALPI2_ECN_MASK_MAX,
};
#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1)
enum tc_dualpi2_split_gso {
TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0,
TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1,
__TCA_DUALPI2_SPLIT_GSO_MAX,
};
#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1)
enum {
TCA_DUALPI2_UNSPEC,
TCA_DUALPI2_LIMIT, /* Packets */
TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */
TCA_DUALPI2_TARGET, /* us */
TCA_DUALPI2_TUPDATE, /* us */
TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */
TCA_DUALPI2_BETA, /* Hz scaled up by 256 */
TCA_DUALPI2_STEP_THRESH_PKTS, /* Step threshold in packets */
TCA_DUALPI2_STEP_THRESH_US, /* Step threshold in microseconds */
TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */
TCA_DUALPI2_COUPLING, /* Coupling factor between queues */
TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */
TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */
TCA_DUALPI2_C_PROTECTION, /* Percentage */
TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */
TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */
TCA_DUALPI2_PAD,
__TCA_DUALPI2_MAX
};
#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1)
struct tc_dualpi2_xstats {
__u32 prob; /* current probability */
__u32 delay_c; /* current delay in C queue */
__u32 delay_l; /* current delay in L queue */
__u32 packets_in_c; /* number of packets enqueued in C queue */
__u32 packets_in_l; /* number of packets enqueued in L queue */
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
__u32 step_marks; /* ECN marks due to the step AQM */
__s32 credit; /* current c_protection credit */
__u32 memory_used; /* Memory used by both queues */
__u32 max_memory_used; /* Maximum used memory */
__u32 memory_limit; /* Memory limit of both queues */
};
#endif

View File

@@ -0,0 +1,81 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _LINUX_STDDEF_H
#define _LINUX_STDDEF_H
#ifdef __KERNEL__
#endif
#ifndef __always_inline
#define __always_inline inline
#endif
/* Not all C++ standards support type declarations inside an anonymous union */
#ifndef __cplusplus
#define __struct_group_tag(TAG) TAG
#else
#define __struct_group_tag(TAG)
#endif
/**
* __struct_group() - Create a mirrored named and anonyomous struct
*
* @TAG: The tag name for the named sub-struct (usually empty)
* @NAME: The identifier name of the mirrored sub-struct
* @ATTRS: Any struct attributes (usually empty)
* @MEMBERS: The member declarations for the mirrored structs
*
* Used to create an anonymous union of two structs with identical layout
* and size: one anonymous and one named. The former's members can be used
* normally without sub-struct naming, and the latter can be used to
* reason about the start, end, and size of the group of struct members.
* The named struct can also be explicitly tagged for layer reuse (C only),
* as well as both having struct attributes appended.
*/
#define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \
union { \
struct { MEMBERS } ATTRS; \
struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \
} ATTRS
#ifdef __cplusplus
/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */
#define __DECLARE_FLEX_ARRAY(T, member) \
T member[0]
#else
/**
* __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union
*
* @TYPE: The type of each flexible array element
* @NAME: The name of the flexible array member
*
* In order to have a flexible array member in a union or alone in a
* struct, it needs to be wrapped in an anonymous struct with at least 1
* named member, but that member can be empty.
*/
#define __DECLARE_FLEX_ARRAY(TYPE, NAME) \
struct { \
struct { } __empty_ ## NAME; \
TYPE NAME[]; \
}
#endif
#ifndef __counted_by
#define __counted_by(m)
#endif
#ifndef __counted_by_le
#define __counted_by_le(m)
#endif
#ifndef __counted_by_be
#define __counted_by_be(m)
#endif
#ifdef __KERNEL__
#define __kernel_nonstring __nonstring
#else
#define __kernel_nonstring
#endif
#endif /* _LINUX_STDDEF_H */

View File

@@ -35,44 +35,48 @@ if [[ "$SANITIZER" == undefined ]]; then
CXXFLAGS+=" $UBSAN_FLAGS"
fi
export SKIP_LIBELF_REBUILD=${SKIP_LIBELF_REBUILD:=''}
# Ideally libbelf should be built using release tarballs available
# at https://sourceware.org/elfutils/ftp/. Unfortunately sometimes they
# fail to compile (for example, elfutils-0.185 fails to compile with LDFLAGS enabled
# due to https://bugs.gentoo.org/794601) so let's just point the script to
# commits referring to versions of libelf that actually can be built
rm -rf elfutils
git clone https://sourceware.org/git/elfutils.git
(
cd elfutils
git checkout 67a187d4c1790058fc7fd218317851cb68bb087c
git log --oneline -1
if [[ ! -e elfutils || "$SKIP_LIBELF_REBUILD" == "" ]]; then
rm -rf elfutils
git clone https://sourceware.org/git/elfutils.git
(
cd elfutils
git checkout 67a187d4c1790058fc7fd218317851cb68bb087c
git log --oneline -1
# ASan isn't compatible with -Wl,--no-undefined: https://github.com/google/sanitizers/issues/380
sed -i 's/^\(NO_UNDEFINED=\).*/\1/' configure.ac
# ASan isn't compatible with -Wl,--no-undefined: https://github.com/google/sanitizers/issues/380
sed -i 's/^\(NO_UNDEFINED=\).*/\1/' configure.ac
# ASan isn't compatible with -Wl,-z,defs either:
# https://clang.llvm.org/docs/AddressSanitizer.html#usage
sed -i 's/^\(ZDEFS_LDFLAGS=\).*/\1/' configure.ac
# ASan isn't compatible with -Wl,-z,defs either:
# https://clang.llvm.org/docs/AddressSanitizer.html#usage
sed -i 's/^\(ZDEFS_LDFLAGS=\).*/\1/' configure.ac
if [[ "$SANITIZER" == undefined ]]; then
# That's basicaly what --enable-sanitize-undefined does to turn off unaligned access
# elfutils heavily relies on on i386/x86_64 but without changing compiler flags along the way
sed -i 's/\(check_undefined_val\)=[0-9]/\1=1/' configure.ac
if [[ "$SANITIZER" == undefined ]]; then
# That's basicaly what --enable-sanitize-undefined does to turn off unaligned access
# elfutils heavily relies on on i386/x86_64 but without changing compiler flags along the way
sed -i 's/\(check_undefined_val\)=[0-9]/\1=1/' configure.ac
fi
autoreconf -i -f
if ! ./configure --enable-maintainer-mode --disable-debuginfod --disable-libdebuginfod \
--disable-demangler --without-bzlib --without-lzma --without-zstd \
CC="$CC" CFLAGS="-Wno-error $CFLAGS" CXX="$CXX" CXXFLAGS="-Wno-error $CXXFLAGS" LDFLAGS="$CFLAGS"; then
cat config.log
exit 1
fi
make -C config -j$(nproc) V=1
make -C lib -j$(nproc) V=1
make -C libelf -j$(nproc) V=1
)
fi
autoreconf -i -f
if ! ./configure --enable-maintainer-mode --disable-debuginfod --disable-libdebuginfod \
--disable-demangler --without-bzlib --without-lzma --without-zstd \
CC="$CC" CFLAGS="-Wno-error $CFLAGS" CXX="$CXX" CXXFLAGS="-Wno-error $CXXFLAGS" LDFLAGS="$CFLAGS"; then
cat config.log
exit 1
fi
make -C config -j$(nproc) V=1
make -C lib -j$(nproc) V=1
make -C libelf -j$(nproc) V=1
)
make -C src BUILD_STATIC_ONLY=y V=1 clean
make -C src -j$(nproc) CFLAGS="-I$(pwd)/elfutils/libelf $CFLAGS" BUILD_STATIC_ONLY=y V=1

37
scripts/mailmap-update.sh Executable file
View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -eu
usage () {
echo "USAGE: ./mailmap-update.sh <libbpf-repo> <linux-repo>"
exit 1
}
LIBBPF_REPO="${1-""}"
LINUX_REPO="${2-""}"
if [ -z "${LIBBPF_REPO}" ] || [ -z "${LINUX_REPO}" ]; then
echo "Error: libbpf or linux repos are not specified"
usage
fi
LIBBPF_MAILMAP="${LIBBPF_REPO}/.mailmap"
LINUX_MAILMAP="${LINUX_REPO}/.mailmap"
tmpfile="$(mktemp)"
cleanup() {
rm -f "${tmpfile}"
}
trap cleanup EXIT
grep_lines() {
local pattern="$1"
local file="$2"
grep "${pattern}" "${file}" || true
}
while read -r email; do
grep_lines "${email}$" "${LINUX_MAILMAP}" >> "${tmpfile}"
done < <(git log --format='<%ae>' | sort -u)
sort -u "${tmpfile}" > "${LIBBPF_MAILMAP}"

View File

@@ -39,18 +39,19 @@ trap "cd ${WORKDIR}; exit" INT TERM EXIT
declare -A PATH_MAP
PATH_MAP=( \
[tools/lib/bpf]=src \
[tools/include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \
[tools/include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \
[tools/include/uapi/linux/btf.h]=include/uapi/linux/btf.h \
[tools/include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \
[tools/include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \
[tools/include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \
[tools/include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \
[tools/include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \
[tools/include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \
[tools/include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \
[tools/include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \
[include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \
[include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \
[include/uapi/linux/btf.h]=include/uapi/linux/btf.h \
[include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \
[include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \
[include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \
[include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \
[include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \
[include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \
[include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \
[include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \
[include/uapi/linux/perf_event.h]=include/uapi/linux/perf_event.h \
[include/uapi/linux/stddef.h]=include/uapi/linux/stddef.h \
[Documentation/bpf/libbpf]=docs \
)
@@ -63,6 +64,7 @@ LIBBPF_TREE_FILTER="mkdir -p __libbpf/include/uapi/linux __libbpf/include/tools
for p in "${!PATH_MAP[@]}"; do
LIBBPF_TREE_FILTER+="git mv -kf ${p} __libbpf/${PATH_MAP[${p}]} && "$'\\\n'
done
LIBBPF_TREE_FILTER+="find __libbpf/include/uapi/linux -type f -exec sed -i -e 's/_UAPI\(__\?LINUX\)/\1/g' -e 's@^#include <linux/compiler_types.h>@@' {} + && "$'\\\n'
LIBBPF_TREE_FILTER+="git rm --ignore-unmatch -f __libbpf/src/{Makefile,Build,test_libbpf.c,.gitignore} >/dev/null"
cd_to()
@@ -295,6 +297,22 @@ Latest changes to BPF helper definitions.
" -- src/bpf_helper_defs.h
fi
echo "Regenerating .mailmap..."
cd_to "${LINUX_REPO}"
git checkout "${TIP_SYM_REF}"
cd_to "${LIBBPF_REPO}"
"${LIBBPF_REPO}"/scripts/mailmap-update.sh "${LIBBPF_REPO}" "${LINUX_REPO}"
# if anything changed, commit it
mailmap_changes=$(git status --porcelain .mailmap | wc -l)
if ((${mailmap_changes} == 1)); then
git add .mailmap
git commit -s -m "sync: update .mailmap
Update .mailmap based on libbpf's list of contributors and on the latest
.mailmap version in the upstream repository.
" -- .mailmap
fi
# Use generated cover-letter as a template for "sync commit" with
# baseline and checkpoint commits from kernel repo (and leave summary
# from cover letter intact, of course)
@@ -331,7 +349,7 @@ diff -u ${TMP_DIR}/linux-view.ls ${TMP_DIR}/github-view.ls
echo "Comparing file contents..."
CONSISTENT=1
for F in $(cat ${TMP_DIR}/linux-view.ls); do
if ! diff -u "${LINUX_ABS_DIR}/${F}" "${GITHUB_ABS_DIR}/${F}"; then
if ! diff -u <(sed 's/_UAPI\(__\?LINUX\)/\1/' "${LINUX_ABS_DIR}/${F}") "${GITHUB_ABS_DIR}/${F}"; then
echo "${LINUX_ABS_DIR}/${F} and ${GITHUB_ABS_DIR}/${F} are different!"
CONSISTENT=0
fi

View File

@@ -9,7 +9,7 @@ else
endif
LIBBPF_MAJOR_VERSION := 1
LIBBPF_MINOR_VERSION := 4
LIBBPF_MINOR_VERSION := 7
LIBBPF_PATCH_VERSION := 0
LIBBPF_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).$(LIBBPF_PATCH_VERSION)
LIBBPF_MAJMIN_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).0
@@ -26,6 +26,7 @@ endef
$(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)
PKG_CONFIG ?= pkg-config
TOPDIR = ..
@@ -41,10 +42,12 @@ ALL_CFLAGS += $(CFLAGS) \
$(EXTRA_CFLAGS)
ALL_LDFLAGS += $(LDFLAGS) $(EXTRA_LDFLAGS)
ifeq ($(shell command -v $(PKG_CONFIG) 2> /dev/null),)
NO_PKG_CONFIG := 1
endif
ifdef NO_PKG_CONFIG
ALL_LDFLAGS += -lelf -lz
else
PKG_CONFIG ?= pkg-config
ALL_CFLAGS += $(shell $(PKG_CONFIG) --cflags libelf zlib)
ALL_LDFLAGS += $(shell $(PKG_CONFIG) --libs libelf zlib)
endif
@@ -52,10 +55,10 @@ endif
OBJDIR ?= .
SHARED_OBJDIR := $(OBJDIR)/sharedobjs
STATIC_OBJDIR := $(OBJDIR)/staticobjs
OBJS := bpf.o btf.o libbpf.o libbpf_errno.o netlink.o \
nlattr.o str_error.o libbpf_probes.o bpf_prog_linfo.o \
btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \
relo_core.o usdt.o zip.o elf.o features.o
OBJS := bpf.o btf.o libbpf.o libbpf_utils.o netlink.o \
nlattr.o libbpf_probes.o bpf_prog_linfo.o \
btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \
relo_core.o usdt.o zip.o elf.o features.o btf_iter.o btf_relocate.o
SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS))
STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS))
@@ -119,13 +122,13 @@ $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION): $(SHARED_OBJS)
-Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \
$^ $(ALL_LDFLAGS) -o $@
$(OBJDIR)/libbpf.pc: force
$(OBJDIR)/libbpf.pc: force | $(OBJDIR)
$(Q)sed -e "s|@PREFIX@|$(PREFIX)|" \
-e "s|@LIBDIR@|$(LIBDIR_PC)|" \
-e "s|@VERSION@|$(LIBBPF_VERSION)|" \
< libbpf.pc.template > $@
$(STATIC_OBJDIR) $(SHARED_OBJDIR):
$(OBJDIR) $(STATIC_OBJDIR) $(SHARED_OBJDIR):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@

100
src/bpf.c
View File

@@ -105,7 +105,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts)
*/
int probe_memcg_account(int token_fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd);
const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
struct bpf_insn insns[] = {
BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns),
BPF_EXIT_INSN(),
@@ -154,7 +154,7 @@ int bump_rlimit_memlock(void)
memlock_bumped = true;
/* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */
/* zero memlock_rlim disables auto-bumping RLIMIT_MEMLOCK */
if (memlock_rlim == 0)
return 0;
@@ -172,7 +172,7 @@ int bpf_map_create(enum bpf_map_type map_type,
__u32 max_entries,
const struct bpf_map_create_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd);
const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
union bpf_attr attr;
int fd;
@@ -203,6 +203,8 @@ int bpf_map_create(enum bpf_map_type map_type,
attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0);
attr.map_token_fd = OPTS_GET(opts, token_fd, 0);
attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL));
attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0);
fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
return libbpf_err_errno(fd);
@@ -238,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, size_t insn_cnt,
struct bpf_prog_load_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
const size_t attr_sz = offsetofend(union bpf_attr, keyring_id);
void *finfo = NULL, *linfo = NULL;
const char *func_info, *line_info;
__u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd;
@@ -311,6 +313,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
attr.line_info_cnt = OPTS_GET(opts, line_info_cnt, 0);
attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL));
attr.fd_array_cnt = OPTS_GET(opts, fd_array_cnt, 0);
if (log_level) {
attr.log_buf = ptr_to_u64(log_buf);
@@ -766,6 +769,7 @@ int bpf_link_create(int prog_fd, int target_fd,
return libbpf_err(-EINVAL);
break;
case BPF_TRACE_KPROBE_MULTI:
case BPF_TRACE_KPROBE_SESSION:
attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0);
attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0);
attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0));
@@ -775,6 +779,7 @@ int bpf_link_create(int prog_fd, int target_fd,
return libbpf_err(-EINVAL);
break;
case BPF_TRACE_UPROBE_MULTI:
case BPF_TRACE_UPROBE_SESSION:
attr.link_create.uprobe_multi.flags = OPTS_GET(opts, uprobe_multi.flags, 0);
attr.link_create.uprobe_multi.cnt = OPTS_GET(opts, uprobe_multi.cnt, 0);
attr.link_create.uprobe_multi.path = ptr_to_u64(OPTS_GET(opts, uprobe_multi.path, 0));
@@ -789,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd,
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
case BPF_TRACE_FSESSION:
case BPF_LSM_MAC:
attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0);
if (!OPTS_ZEROED(opts, tracing))
@@ -834,6 +840,50 @@ int bpf_link_create(int prog_fd, int target_fd,
if (!OPTS_ZEROED(opts, netkit))
return libbpf_err(-EINVAL);
break;
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
case BPF_LSM_CGROUP:
relative_fd = OPTS_GET(opts, cgroup.relative_fd, 0);
relative_id = OPTS_GET(opts, cgroup.relative_id, 0);
if (relative_fd && relative_id)
return libbpf_err(-EINVAL);
if (relative_id) {
attr.link_create.cgroup.relative_id = relative_id;
attr.link_create.flags |= BPF_F_ID;
} else {
attr.link_create.cgroup.relative_fd = relative_fd;
}
attr.link_create.cgroup.expected_revision =
OPTS_GET(opts, cgroup.expected_revision, 0);
if (!OPTS_ZEROED(opts, cgroup))
return libbpf_err(-EINVAL);
break;
default:
if (!OPTS_ZEROED(opts, flags))
return libbpf_err(-EINVAL);
@@ -1094,7 +1144,7 @@ int bpf_map_get_fd_by_id(__u32 id)
int bpf_btf_get_fd_by_id_opts(__u32 id,
const struct bpf_get_fd_by_id_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, open_flags);
const size_t attr_sz = offsetofend(union bpf_attr, fd_by_id_token_fd);
union bpf_attr attr;
int fd;
@@ -1104,6 +1154,7 @@ int bpf_btf_get_fd_by_id_opts(__u32 id,
memset(&attr, 0, attr_sz);
attr.btf_id = id;
attr.open_flags = OPTS_GET(opts, open_flags, 0);
attr.fd_by_id_token_fd = OPTS_GET(opts, token_fd, 0);
fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, attr_sz);
return libbpf_err_errno(fd);
@@ -1327,3 +1378,42 @@ int bpf_token_create(int bpffs_fd, struct bpf_token_create_opts *opts)
fd = sys_bpf_fd(BPF_TOKEN_CREATE, &attr, attr_sz);
return libbpf_err_errno(fd);
}
int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
struct bpf_prog_stream_read_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, prog_stream_read);
union bpf_attr attr;
int err;
if (!OPTS_VALID(opts, bpf_prog_stream_read_opts))
return libbpf_err(-EINVAL);
memset(&attr, 0, attr_sz);
attr.prog_stream_read.stream_buf = ptr_to_u64(buf);
attr.prog_stream_read.stream_buf_len = buf_len;
attr.prog_stream_read.stream_id = stream_id;
attr.prog_stream_read.prog_fd = prog_fd;
err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz);
return libbpf_err_errno(err);
}
int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
struct bpf_prog_assoc_struct_ops_opts *opts)
{
const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops);
union bpf_attr attr;
int err;
if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts))
return libbpf_err(-EINVAL);
memset(&attr, 0, attr_sz);
attr.prog_assoc_struct_ops.map_fd = map_fd;
attr.prog_assoc_struct_ops.prog_fd = prog_fd;
attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0);
err = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz);
return libbpf_err_errno(err);
}

View File

@@ -54,9 +54,12 @@ struct bpf_map_create_opts {
__s32 value_type_btf_obj_fd;
__u32 token_fd;
const void *excl_prog_hash;
__u32 excl_prog_hash_size;
size_t :0;
};
#define bpf_map_create_opts__last_field token_fd
#define bpf_map_create_opts__last_field excl_prog_hash_size
LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
const char *map_name,
@@ -100,16 +103,19 @@ struct bpf_prog_load_opts {
__u32 log_level;
__u32 log_size;
char *log_buf;
/* output: actual total log contents size (including termintaing zero).
/* output: actual total log contents size (including terminating zero).
* It could be both larger than original log_size (if log was
* truncated), or smaller (if log buffer wasn't filled completely).
* If kernel doesn't support this feature, log_size is left unchanged.
*/
__u32 log_true_size;
__u32 token_fd;
/* if set, provides the length of fd_array */
__u32 fd_array_cnt;
size_t :0;
};
#define bpf_prog_load_opts__last_field token_fd
#define bpf_prog_load_opts__last_field fd_array_cnt
LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type,
const char *prog_name, const char *license,
@@ -129,7 +135,7 @@ struct bpf_btf_load_opts {
char *log_buf;
__u32 log_level;
__u32 log_size;
/* output: actual total log contents size (including termintaing zero).
/* output: actual total log contents size (including terminating zero).
* It could be both larger than original log_size (if log was
* truncated), or smaller (if log buffer wasn't filled completely).
* If kernel doesn't support this feature, log_size is left unchanged.
@@ -283,6 +289,14 @@ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch,
* Update spin_lock-ed map elements. This must be
* specified if the map value contains a spinlock.
*
* **BPF_F_CPU**
* As for percpu maps, update value on the specified CPU. And the cpu
* info is embedded into the high 32 bits of **opts->elem_flags**.
*
* **BPF_F_ALL_CPUS**
* As for percpu maps, update value across all CPUs. This flag cannot
* be used with BPF_F_CPU at the same time.
*
* @param fd BPF map file descriptor
* @param keys pointer to an array of *count* keys
* @param values pointer to an array of *count* values
@@ -435,6 +449,11 @@ struct bpf_link_create_opts {
__u32 relative_id;
__u64 expected_revision;
} netkit;
struct {
__u32 relative_fd;
__u32 relative_id;
__u64 expected_revision;
} cgroup;
};
size_t :0;
};
@@ -484,9 +503,10 @@ LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id);
struct bpf_get_fd_by_id_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
__u32 open_flags; /* permissions requested for the operation on fd */
__u32 token_fd;
size_t :0;
};
#define bpf_get_fd_by_id_opts__last_field open_flags
#define bpf_get_fd_by_id_opts__last_field token_fd
LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_prog_get_fd_by_id_opts(__u32 id,
@@ -700,6 +720,48 @@ struct bpf_token_create_opts {
LIBBPF_API int bpf_token_create(int bpffs_fd,
struct bpf_token_create_opts *opts);
struct bpf_prog_stream_read_opts {
size_t sz;
size_t :0;
};
#define bpf_prog_stream_read_opts__last_field sz
/**
* @brief **bpf_prog_stream_read** reads data from the BPF stream of a given BPF
* program.
*
* @param prog_fd FD for the BPF program whose BPF stream is to be read.
* @param stream_id ID of the BPF stream to be read.
* @param buf Buffer to read data into from the BPF stream.
* @param buf_len Maximum number of bytes to read from the BPF stream.
* @param opts optional options, can be NULL
*
* @return The number of bytes read, on success; negative error code, otherwise
* (errno is also set to the error code)
*/
LIBBPF_API int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
struct bpf_prog_stream_read_opts *opts);
struct bpf_prog_assoc_struct_ops_opts {
size_t sz;
__u32 flags;
size_t :0;
};
#define bpf_prog_assoc_struct_ops_opts__last_field flags
/**
* @brief **bpf_prog_assoc_struct_ops** associates a BPF program with a
* struct_ops map.
*
* @param prog_fd FD for the BPF program
* @param map_fd FD for the struct_ops map to be associated with the BPF program
* @param opts optional options, can be NULL
*
* @return 0 on success; negative error code, otherwise (errno is also set to
* the error code)
*/
LIBBPF_API int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
struct bpf_prog_assoc_struct_ops_opts *opts);
#ifdef __cplusplus
} /* extern "C" */
#endif

View File

@@ -2,7 +2,7 @@
#ifndef __BPF_CORE_READ_H__
#define __BPF_CORE_READ_H__
#include <bpf/bpf_helpers.h>
#include "bpf_helpers.h"
/*
* enum bpf_field_info_kind is passed as a second argument into
@@ -104,6 +104,7 @@ enum bpf_enum_value_kind {
case 2: val = *(const unsigned short *)p; break; \
case 4: val = *(const unsigned int *)p; break; \
case 8: val = *(const unsigned long long *)p; break; \
default: val = 0; break; \
} \
val <<= __CORE_RELO(s, field, LSHIFT_U64); \
if (__CORE_RELO(s, field, SIGNED)) \
@@ -387,7 +388,13 @@ extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak;
#define ___arrow10(a, b, c, d, e, f, g, h, i, j) a->b->c->d->e->f->g->h->i->j
#define ___arrow(...) ___apply(___arrow, ___narg(__VA_ARGS__))(__VA_ARGS__)
#if defined(__clang__) && (__clang_major__ >= 19)
#define ___type(...) __typeof_unqual__(___arrow(__VA_ARGS__))
#elif defined(__GNUC__) && (__GNUC__ >= 14)
#define ___type(...) __typeof_unqual__(___arrow(__VA_ARGS__))
#else
#define ___type(...) typeof(___arrow(__VA_ARGS__))
#endif
#define ___read(read_fn, dst, src_type, src, accessor) \
read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor)

View File

@@ -4,6 +4,7 @@
#define __BPF_GEN_INTERNAL_H
#include "bpf.h"
#include "libbpf_internal.h"
struct ksym_relo_desc {
const char *name;
@@ -34,6 +35,7 @@ struct bpf_gen {
void *data_cur;
void *insn_start;
void *insn_cur;
bool swapped_endian;
ssize_t cleanup_label;
__u32 nr_progs;
__u32 nr_maps;
@@ -49,6 +51,7 @@ struct bpf_gen {
__u32 nr_ksyms;
int fd_array;
int nr_fd_array;
int hash_insn_offset[SHA256_DWORD_SIZE];
};
void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps);

View File

@@ -44,6 +44,14 @@ struct bpf_dynptr;
struct iphdr;
struct ipv6hdr;
#ifndef __bpf_fastcall
#if __has_attribute(bpf_fastcall)
#define __bpf_fastcall __attribute__((bpf_fastcall))
#else
#define __bpf_fastcall
#endif
#endif
/*
* bpf_map_lookup_elem
*
@@ -203,17 +211,21 @@ static __u32 (* const bpf_get_prandom_u32)(void) = (void *) 7;
* Returns
* The SMP id of the processor running the program.
*/
static __u32 (* const bpf_get_smp_processor_id)(void) = (void *) 8;
static __bpf_fastcall __u32 (* const bpf_get_smp_processor_id)(void) = (void *) 8;
/*
* bpf_skb_store_bytes
*
* Store *len* bytes from address *from* into the packet
* associated to *skb*, at *offset*. *flags* are a combination of
* **BPF_F_RECOMPUTE_CSUM** (automatically recompute the
* checksum for the packet after storing the bytes) and
* **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
* **->swhash** and *skb*\ **->l4hash** to 0).
* associated to *skb*, at *offset*. The *flags* are a combination
* of the following values:
*
* **BPF_F_RECOMPUTE_CSUM**
* Automatically update *skb*\ **->csum** after storing the
* bytes.
* **BPF_F_INVALIDATE_HASH**
* Set *skb*\ **->hash**, *skb*\ **->swhash** and *skb*\
* **->l4hash** to 0.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
@@ -273,7 +285,8 @@ static long (* const bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, _
* untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
* for updates resulting in a null checksum the value is set to
* **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
* the checksum is to be computed against a pseudo-header.
* that the modified header field is part of the pseudo-header.
* Flag **BPF_F_IPV6** should be set for IPv6 packets.
*
* This helper works in combination with **bpf_csum_diff**\ (),
* which does not update the checksum in-place, but offers more
@@ -676,7 +689,7 @@ static __u32 (* const bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24;
* into it. An example is available in file
* *samples/bpf/trace_output_user.c* in the Linux kernel source
* tree (the eBPF program counterpart is in
* *samples/bpf/trace_output_kern.c*).
* *samples/bpf/trace_output.bpf.c*).
*
* **bpf_perf_event_output**\ () achieves better performance
* than **bpf_trace_printk**\ () for sharing data with user
@@ -1224,7 +1237,7 @@ static long (* const bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *)
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
* **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**,
* **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**,
* **TCP_BPF_RTO_MIN**.
* **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports the following *optname*\ s:
* **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**.
@@ -1511,10 +1524,6 @@ static long (* const bpf_getsockopt)(void *bpf_socket, int level, int optname, v
* option, and in this case it only works on functions tagged with
* **ALLOW_ERROR_INJECTION** in the kernel code.
*
* Also, the helper is only available for the architectures having
* the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
* x86 architecture is the only one to support this feature.
*
* Returns
* 0
*/
@@ -1851,6 +1860,10 @@ static long (* const bpf_skb_load_bytes_relative)(const void *skb, __u32 offset,
* for the nexthop. If the src addr cannot be derived,
* **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
* case, *params*->dmac and *params*->smac are not set either.
* **BPF_FIB_LOOKUP_MARK**
* Use the mark present in *params*->mark for the fib lookup.
* This option should not be used with BPF_FIB_LOOKUP_DIRECT,
* as it only has meaning for full lookups.
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
@@ -3545,7 +3558,7 @@ static int (* const bpf_inode_storage_delete)(void *map, void *inode) = (void *)
* including the trailing NUL character. On error, a negative
* value.
*/
static long (* const bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147;
static long (* const bpf_d_path)(const struct path *path, char *buf, __u32 sz) = (void *) 147;
/*
* bpf_copy_from_user
@@ -3694,6 +3707,9 @@ static void *(* const bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154;
* the netns switch takes place from ingress to ingress without
* going through the CPU's backlog queue.
*
* *skb*\ **->mark** and *skb*\ **->tstamp** are not cleared during
* the netns switch.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types at the
* ingress hook and for veth and netkit target device types. The
@@ -3798,7 +3814,7 @@ static __u64 (* const bpf_ktime_get_coarse_ns)(void) = (void *) 160;
*
* Returns
* The **hash_algo** is returned on success,
* **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
* **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if
* invalid arguments are passed.
*/
static long (* const bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = (void *) 161;
@@ -4216,7 +4232,7 @@ static long (* const bpf_find_vma)(struct task_struct *task, __u64 addr, void *c
* Currently, the **flags** must be 0. Currently, nr_loops is
* limited to 1 << 23 (~8 million) loops.
*
* long (\*callback_fn)(u32 index, void \*ctx);
* long (\*callback_fn)(u64 index, void \*ctx);
*
* where **index** is the current index in the loop. The index
* is zero-indexed.
@@ -4412,7 +4428,7 @@ static long (* const bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __
*
* Returns
* The **hash_algo** is returned on success,
* **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if
* **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
* invalid arguments are passed.
*/
static long (* const bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (void *) 193;
@@ -4420,9 +4436,10 @@ static long (* const bpf_ima_file_hash)(struct file *file, void *dst, __u32 size
/*
* bpf_kptr_xchg
*
* Exchange kptr at pointer *map_value* with *ptr*, and return the
* old value. *ptr* can be NULL, otherwise it must be a referenced
* pointer which will be released when this helper is called.
* Exchange kptr at pointer *dst* with *ptr*, and return the old value.
* *dst* can be map value or local kptr. *ptr* can be NULL, otherwise
* it must be a referenced pointer which will be released when this helper
* is called.
*
* Returns
* The old value of kptr (which can be NULL). The returned pointer
@@ -4430,7 +4447,7 @@ static long (* const bpf_ima_file_hash)(struct file *file, void *dst, __u32 size
* corresponding release function, or moved into a BPF map before
* program exit.
*/
static void *(* const bpf_kptr_xchg)(void *map_value, void *ptr) = (void *) 194;
static void *(* const bpf_kptr_xchg)(void *dst, void *ptr) = (void *) 194;
/*
* bpf_map_lookup_percpu_elem
@@ -4467,7 +4484,7 @@ static struct mptcp_sock *(* const bpf_skc_to_mptcp_sock)(void *sk) = (void *) 1
* 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE,
* -EINVAL if flags is not 0.
*/
static long (* const bpf_dynptr_from_mem)(void *data, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197;
static long (* const bpf_dynptr_from_mem)(void *data, __u64 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197;
/*
* bpf_ringbuf_reserve_dynptr
@@ -4525,7 +4542,7 @@ static void (* const bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 f
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*/
static long (* const bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset, __u64 flags) = (void *) 201;
static long (* const bpf_dynptr_read)(void *dst, __u64 len, const struct bpf_dynptr *src, __u64 offset, __u64 flags) = (void *) 201;
/*
* bpf_dynptr_write
@@ -4550,7 +4567,7 @@ static long (* const bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dyn
* is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
* other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*/
static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202;
static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u64 offset, void *src, __u64 len, __u64 flags) = (void *) 202;
/*
* bpf_dynptr_data
@@ -4568,7 +4585,7 @@ static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offse
* read-only, if the dynptr is invalid, or if the offset and length
* is out of bounds.
*/
static void *(* const bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) = (void *) 203;
static void *(* const bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u64 offset, __u64 len) = (void *) 203;
/*
* bpf_tcp_raw_gen_syncookie_ipv4

View File

@@ -15,6 +15,14 @@
#define __array(name, val) typeof(val) *name[]
#define __ulong(name, val) enum { ___bpf_concat(__unique_value, __COUNTER__) = val } name
#ifndef likely
#define likely(x) (__builtin_expect(!!(x), 1))
#endif
#ifndef unlikely
#define unlikely(x) (__builtin_expect(!!(x), 0))
#endif
/*
* Helper macro to place programs, maps, license in
* different sections in elf_bpf file. Section names
@@ -137,7 +145,8 @@
/*
* Helper function to perform a tail call with a constant/immediate map slot.
*/
#if __clang_major__ >= 8 && defined(__bpf__)
#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12)
#if defined(__bpf__)
static __always_inline void
bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
{
@@ -165,6 +174,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
: "r0", "r1", "r2", "r3", "r4", "r5");
}
#endif
#endif
enum libbpf_pin_type {
LIBBPF_PIN_NONE,
@@ -183,16 +193,29 @@ enum libbpf_tristate {
#define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted")))
#define __kptr __attribute__((btf_type_tag("kptr")))
#define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr")))
#define __uptr __attribute__((btf_type_tag("uptr")))
#define bpf_ksym_exists(sym) ({ \
_Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \
!!sym; \
#if defined (__clang__)
#define bpf_ksym_exists(sym) ({ \
_Static_assert(!__builtin_constant_p(!!sym), \
#sym " should be marked as __weak"); \
!!sym; \
})
#elif __GNUC__ > 8
#define bpf_ksym_exists(sym) ({ \
_Static_assert(__builtin_has_attribute (*sym, __weak__), \
#sym " should be marked as __weak"); \
!!sym; \
})
#else
#define bpf_ksym_exists(sym) !!sym
#endif
#define __arg_ctx __attribute__((btf_decl_tag("arg:ctx")))
#define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull")))
#define __arg_nullable __attribute((btf_decl_tag("arg:nullable")))
#define __arg_trusted __attribute((btf_decl_tag("arg:trusted")))
#define __arg_untrusted __attribute((btf_decl_tag("arg:untrusted")))
#define __arg_arena __attribute((btf_decl_tag("arg:arena")))
#ifndef ___bpf_concat
@@ -292,6 +315,22 @@ enum libbpf_tristate {
___param, sizeof(___param)); \
})
extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
__u32 len__sz) __weak __ksym;
#define bpf_stream_printk(stream_id, fmt, args...) \
({ \
static const char ___fmt[] = fmt; \
unsigned long long ___param[___bpf_narg(args)]; \
\
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \
\
bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \
})
/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args
* Otherwise use __bpf_vprintk
*/
@@ -328,7 +367,7 @@ extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
* I.e., it looks almost like high-level for each loop in other languages,
* supports continue/break, and is verifiable by BPF verifier.
*
* For iterating integers, the difference betwen bpf_for_each(num, i, N, M)
* For iterating integers, the difference between bpf_for_each(num, i, N, M)
* and bpf_for(i, N, M) is in that bpf_for() provides additional proof to
* verifier that i is in [N, M) range, and in bpf_for_each() case i is `int
* *`, not just `int`. So for integers bpf_for() is more convenient.

View File

@@ -163,7 +163,7 @@
struct pt_regs___s390 {
unsigned long orig_gpr2;
};
} __attribute__((preserve_access_index));
/* s390 provides user_pt_regs instead of struct pt_regs to userspace */
#define __PT_REGS_CAST(x) ((const user_pt_regs *)(x))
@@ -179,7 +179,7 @@ struct pt_regs___s390 {
#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
#define __PT_PARM6_SYSCALL_REG gprs[7]
#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x)
#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___s390 *)(x))->__PT_PARM1_SYSCALL_REG)
#define PT_REGS_PARM1_CORE_SYSCALL(x) \
BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG)
@@ -222,7 +222,7 @@ struct pt_regs___s390 {
struct pt_regs___arm64 {
unsigned long orig_x0;
};
} __attribute__((preserve_access_index));
/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */
#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x))
@@ -241,7 +241,7 @@ struct pt_regs___arm64 {
#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x)
#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___arm64 *)(x))->__PT_PARM1_SYSCALL_REG)
#define PT_REGS_PARM1_CORE_SYSCALL(x) \
BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG)
@@ -311,7 +311,7 @@ struct pt_regs___arm64 {
#define __PT_RET_REG regs[31]
#define __PT_FP_REG __unsupported__
#define __PT_RC_REG gpr[3]
#define __PT_SP_REG sp
#define __PT_SP_REG gpr[1]
#define __PT_IP_REG nip
#elif defined(bpf_target_sparc)
@@ -351,6 +351,10 @@ struct pt_regs___arm64 {
* https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions
*/
struct pt_regs___riscv {
unsigned long orig_a0;
} __attribute__((preserve_access_index));
/* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */
#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x))
#define __PT_PARM1_REG a0
@@ -362,12 +366,15 @@ struct pt_regs___arm64 {
#define __PT_PARM7_REG a6
#define __PT_PARM8_REG a7
#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
#define __PT_PARM1_SYSCALL_REG orig_a0
#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___riscv *)(x))->__PT_PARM1_SYSCALL_REG)
#define PT_REGS_PARM1_CORE_SYSCALL(x) \
BPF_CORE_READ((const struct pt_regs___riscv *)(x), __PT_PARM1_SYSCALL_REG)
#define __PT_RET_REG ra
#define __PT_FP_REG s0
@@ -473,7 +480,7 @@ struct pt_regs;
#endif
/*
* Similarly, syscall-specific conventions might differ between function call
* conventions within each architecutre. All supported architectures pass
* conventions within each architecture. All supported architectures pass
* either 6 or 7 syscall arguments in registers.
*
* See syscall(2) manpage for succinct table with information on each arch.
@@ -515,7 +522,7 @@ struct pt_regs;
#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; })
#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP
#elif defined(bpf_target_sparc)
#elif defined(bpf_target_sparc) || defined(bpf_target_arm64)
#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); })
#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP
@@ -633,25 +640,25 @@ struct pt_regs;
#endif
#define ___bpf_ctx_cast0() ctx
#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0]
#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1]
#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2]
#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3]
#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4]
#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5]
#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6]
#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7]
#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8]
#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9]
#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10]
#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11]
#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), ctx[0]
#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), ctx[1]
#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), ctx[2]
#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), ctx[3]
#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), ctx[4]
#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), ctx[5]
#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), ctx[6]
#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), ctx[7]
#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), ctx[8]
#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9]
#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10]
#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11]
#define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
/*
* BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
* similar kinds of BPF programs, that accept input arguments as a single
* pointer to untyped u64 array, where each u64 can actually be a typed
* pointer or integer of different size. Instead of requring user to write
* pointer or integer of different size. Instead of requiring user to write
* manual casts and work with array elements by index, BPF_PROG macro
* allows user to declare a list of named and typed input arguments in the
* same syntax as for normal C function. All the casting is hidden and
@@ -786,14 +793,14 @@ ____##name(unsigned long long *ctx ___bpf_ctx_decl(args))
struct pt_regs;
#define ___bpf_kprobe_args0() ctx
#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx)
#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx)
#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx)
#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx)
#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx)
#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx)
#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx)
#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx)
#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx)
#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx)
#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx)
#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx)
#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx)
#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx)
#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx)
#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx)
#define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args)
/*
@@ -801,7 +808,7 @@ struct pt_regs;
* tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific
* low-level way of getting kprobe input arguments from struct pt_regs, and
* provides a familiar typed and named function arguments syntax and
* semantics of accessing kprobe input paremeters.
* semantics of accessing kprobe input parameters.
*
* Original struct pt_regs* context is preserved as 'ctx' argument. This might
* be necessary when using BPF helpers like bpf_perf_event_output().
@@ -821,7 +828,7 @@ static __always_inline typeof(name(0)) \
____##name(struct pt_regs *ctx, ##args)
#define ___bpf_kretprobe_args0() ctx
#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx)
#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx)
#define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args)
/*
@@ -845,24 +852,24 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
/* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */
#define ___bpf_syscall_args0() ctx
#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs)
#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs)
#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs)
#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs)
#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs)
#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs)
#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs)
#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs)
#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs)
#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs)
#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs)
#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs)
#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs)
#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs)
#define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
/* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */
#define ___bpf_syswrap_args0() ctx
#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs)
#define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args)
/*

1765
src/btf.c

File diff suppressed because it is too large Load Diff

View File

@@ -18,6 +18,7 @@ extern "C" {
#define BTF_ELF_SEC ".BTF"
#define BTF_EXT_ELF_SEC ".BTF.ext"
#define BTF_BASE_ELF_SEC ".BTF.base"
#define MAPS_ELF_SEC ".maps"
struct btf;
@@ -93,6 +94,7 @@ LIBBPF_API struct btf *btf__new_empty(void);
* @brief **btf__new_empty_split()** creates an unpopulated BTF object from an
* ELF BTF section except with a base BTF on top of which split BTF should be
* based
* @param base_btf base BTF object
* @return new BTF object instance which has to be eventually freed with
* **btf__free()**
*
@@ -107,6 +109,31 @@ LIBBPF_API struct btf *btf__new_empty(void);
*/
LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf);
/**
* @brief **btf__distill_base()** creates new versions of the split BTF
* *src_btf* and its base BTF. The new base BTF will only contain the types
* needed to improve robustness of the split BTF to small changes in base BTF.
* When that split BTF is loaded against a (possibly changed) base, this
* distilled base BTF will help update references to that (possibly changed)
* base BTF.
* @param src_btf source split BTF object
* @param new_base_btf pointer to where the new base BTF object pointer will be stored
* @param new_split_btf pointer to where the new split BTF object pointer will be stored
* @return 0 on success; negative error code, otherwise
*
* Both the new split and its associated new base BTF must be freed by
* the caller.
*
* If successful, 0 is returned and **new_base_btf** and **new_split_btf**
* will point at new base/split BTF. Both the new split and its associated
* new base BTF must be freed by the caller.
*
* A negative value is returned on error and the thread-local `errno` variable
* is set to the error code as well.
*/
LIBBPF_API int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf,
struct btf **new_split_btf);
LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext);
LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf);
LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext);
@@ -145,6 +172,9 @@ LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset);
LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size);
LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext);
LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size);
LIBBPF_API enum btf_endianness btf_ext__endianness(const struct btf_ext *btf_ext);
LIBBPF_API int btf_ext__set_endianness(struct btf_ext *btf_ext,
enum btf_endianness endian);
LIBBPF_API int btf__find_str(struct btf *btf, const char *s);
LIBBPF_API int btf__add_str(struct btf *btf, const char *s);
@@ -202,6 +232,7 @@ LIBBPF_API int btf__add_volatile(struct btf *btf, int ref_type_id);
LIBBPF_API int btf__add_const(struct btf *btf, int ref_type_id);
LIBBPF_API int btf__add_restrict(struct btf *btf, int ref_type_id);
LIBBPF_API int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id);
LIBBPF_API int btf__add_type_attr(struct btf *btf, const char *value, int ref_type_id);
/* func and func_proto construction APIs */
LIBBPF_API int btf__add_func(struct btf *btf, const char *name,
@@ -218,6 +249,8 @@ LIBBPF_API int btf__add_datasec_var_info(struct btf *btf, int var_type_id,
/* tag construction API */
LIBBPF_API int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id,
int component_idx);
LIBBPF_API int btf__add_decl_attr(struct btf *btf, const char *value, int ref_type_id,
int component_idx);
struct btf_dedup_opts {
size_t sz;
@@ -231,6 +264,65 @@ struct btf_dedup_opts {
LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
/**
* @brief **btf__relocate()** will check the split BTF *btf* for references
* to base BTF kinds, and verify those references are compatible with
* *base_btf*; if they are, *btf* is adjusted such that is re-parented to
* *base_btf* and type ids and strings are adjusted to accommodate this.
* @param btf split BTF object to relocate
* @param base_btf base BTF object
* @return 0 on success; negative error code, otherwise
*
* If successful, 0 is returned and **btf** now has **base_btf** as its
* base.
*
* A negative value is returned on error and the thread-local `errno` variable
* is set to the error code as well.
*/
LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf);
struct btf_permute_opts {
size_t sz;
/* optional .BTF.ext info along the main BTF info */
struct btf_ext *btf_ext;
size_t :0;
};
#define btf_permute_opts__last_field btf_ext
/**
* @brief **btf__permute()** rearranges BTF types in-place according to a specified ID mapping
* @param btf BTF object to permute
* @param id_map Array mapping original type IDs to new IDs
* @param id_map_cnt Number of elements in @id_map
* @param opts Optional parameters, including BTF extension data for reference updates
* @return 0 on success, negative error code on failure
*
* **btf__permute()** reorders BTF types based on the provided @id_map array,
* updating all internal type references to maintain consistency. The function
* operates in-place, modifying the BTF object directly.
*
* For **base BTF**:
* - @id_map must include all types from ID 0 to `btf__type_cnt(btf) - 1`
* - @id_map_cnt must be `btf__type_cnt(btf)`
* - Mapping is defined as `id_map[original_id] = new_id`
* - `id_map[0]` must be 0 (void type cannot be moved)
*
* For **split BTF**:
* - @id_map must include only split types (types added on top of the base BTF)
* - @id_map_cnt must be `btf__type_cnt(btf) - btf__type_cnt(btf__base_btf(btf))`
* - Mapping is defined as `id_map[original_id - start_id] = new_id`
* - `start_id` equals `btf__type_cnt(btf__base_btf(btf))`
*
* After permutation, all type references within the BTF data and optional
* BTF extension (if provided via @opts) are updated automatically.
*
* On error, returns a negative error code and sets errno:
* - `-EINVAL`: Invalid parameters or invalid ID mapping
* - `-ENOMEM`: Memory allocation failure
*/
LIBBPF_API int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt,
const struct btf_permute_opts *opts);
struct btf_dump;
struct btf_dump_opts {
@@ -250,7 +342,7 @@ LIBBPF_API void btf_dump__free(struct btf_dump *d);
LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id);
struct btf_dump_emit_type_decl_opts {
/* size of this struct, for forward/backward compatiblity */
/* size of this struct, for forward/backward compatibility */
size_t sz;
/* optional field name for type declaration, e.g.:
* - struct my_struct <FNAME>
@@ -284,9 +376,10 @@ struct btf_dump_type_data_opts {
bool compact; /* no newlines/indentation */
bool skip_names; /* skip member/type names */
bool emit_zeroes; /* show 0-valued fields */
bool emit_strings; /* print char arrays as strings */
size_t :0;
};
#define btf_dump_type_data_opts__last_field emit_zeroes
#define btf_dump_type_data_opts__last_field emit_strings
LIBBPF_API int
btf_dump__dump_type_data(struct btf_dump *d, __u32 id,

View File

@@ -67,6 +67,7 @@ struct btf_dump_data {
bool compact;
bool skip_names;
bool emit_zeroes;
bool emit_strings;
__u8 indent_lvl; /* base indent level */
char indent_str[BTF_DATA_INDENT_STR_LEN];
/* below are used during iteration */
@@ -225,6 +226,9 @@ static void btf_dump_free_names(struct hashmap *map)
size_t bkt;
struct hashmap_entry *cur;
if (!map)
return;
hashmap__for_each_entry(map, cur, bkt)
free((void *)cur->pkey);
@@ -304,7 +308,7 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id)
* definition, in which case they have to be declared inline as part of field
* type declaration; or as a top-level anonymous enum, typically used for
* declaring global constants. It's impossible to distinguish between two
* without knowning whether given enum type was referenced from other type:
* without knowing whether given enum type was referenced from other type:
* top-level anonymous enum won't be referenced by anything, while embedded
* one will.
*/
@@ -867,8 +871,8 @@ static void btf_dump_emit_bit_padding(const struct btf_dump *d,
} pads[] = {
{"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8}
};
int new_off, pad_bits, bits, i;
const char *pad_type;
int new_off = 0, pad_bits = 0, bits, i;
const char *pad_type = NULL;
if (cur_off >= next_off)
return; /* no gap */
@@ -1304,7 +1308,7 @@ static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id,
* chain, restore stack, emit warning, and try to
* proceed nevertheless
*/
pr_warn("not enough memory for decl stack:%d", err);
pr_warn("not enough memory for decl stack: %s\n", errstr(err));
d->decl_stack_cnt = stack_start;
return;
}
@@ -1493,7 +1497,10 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
case BTF_KIND_TYPE_TAG:
btf_dump_emit_mods(d, decls);
name = btf_name_of(d, t->name_off);
btf_dump_printf(d, " __attribute__((btf_type_tag(\"%s\")))", name);
if (btf_kflag(t))
btf_dump_printf(d, " __attribute__((%s))", name);
else
btf_dump_printf(d, " __attribute__((btf_type_tag(\"%s\")))", name);
break;
case BTF_KIND_ARRAY: {
const struct btf_array *a = btf_array(t);
@@ -1559,10 +1566,12 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
* Clang for BPF target generates func_proto with no
* args as a func_proto with a single void arg (e.g.,
* `int (*f)(void)` vs just `int (*f)()`). We are
* going to pretend there are no args for such case.
* going to emit valid empty args (void) syntax for
* such case. Similarly and conveniently, valid
* no args case can be special-cased here as well.
*/
if (vlen == 1 && p->type == 0) {
btf_dump_printf(d, ")");
if (vlen == 0 || (vlen == 1 && p->type == 0)) {
btf_dump_printf(d, "void)");
return;
}
@@ -1753,9 +1762,18 @@ static int btf_dump_get_bitfield_value(struct btf_dump *d,
__u16 left_shift_bits, right_shift_bits;
const __u8 *bytes = data;
__u8 nr_copy_bits;
__u8 start_bit, nr_bytes;
__u64 num = 0;
int i;
/* Calculate how many bytes cover the bitfield */
start_bit = bits_offset % 8;
nr_bytes = (start_bit + bit_sz + 7) / 8;
/* Bound check */
if (data + nr_bytes > d->typed_dump->data_end)
return -E2BIG;
/* Maximum supported bitfield size is 64 bits */
if (t->size > 8) {
pr_warn("unexpected bitfield size %d\n", t->size);
@@ -1929,6 +1947,7 @@ static int btf_dump_int_data(struct btf_dump *d,
if (d->typed_dump->is_array_terminated)
break;
if (*(char *)data == '\0') {
btf_dump_type_values(d, "'\\0'");
d->typed_dump->is_array_terminated = true;
break;
}
@@ -2021,6 +2040,52 @@ static int btf_dump_var_data(struct btf_dump *d,
return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0);
}
static int btf_dump_string_data(struct btf_dump *d,
const struct btf_type *t,
__u32 id,
const void *data)
{
const struct btf_array *array = btf_array(t);
const char *chars = data;
__u32 i;
/* Make sure it is a NUL-terminated string. */
for (i = 0; i < array->nelems; i++) {
if ((void *)(chars + i) >= d->typed_dump->data_end)
return -E2BIG;
if (chars[i] == '\0')
break;
}
if (i == array->nelems) {
/* The caller will print this as a regular array. */
return -EINVAL;
}
btf_dump_data_pfx(d);
btf_dump_printf(d, "\"");
for (i = 0; i < array->nelems; i++) {
char c = chars[i];
if (c == '\0') {
/*
* When printing character arrays as strings, NUL bytes
* are always treated as string terminators; they are
* never printed.
*/
break;
}
if (isprint(c))
btf_dump_printf(d, "%c", c);
else
btf_dump_printf(d, "\\x%02x", (__u8)c);
}
btf_dump_printf(d, "\"");
return 0;
}
static int btf_dump_array_data(struct btf_dump *d,
const struct btf_type *t,
__u32 id,
@@ -2031,6 +2096,7 @@ static int btf_dump_array_data(struct btf_dump *d,
__u32 i, elem_type_id;
__s64 elem_size;
bool is_array_member;
bool is_array_terminated;
elem_type_id = array->type;
elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL);
@@ -2047,8 +2113,13 @@ static int btf_dump_array_data(struct btf_dump *d,
* char arrays, so if size is 1 and element is
* printable as a char, we'll do that.
*/
if (elem_size == 1)
if (elem_size == 1) {
if (d->typed_dump->emit_strings &&
btf_dump_string_data(d, t, id, data) == 0) {
return 0;
}
d->typed_dump->is_array_char = true;
}
}
/* note that we increment depth before calling btf_dump_print() below;
@@ -2066,12 +2137,15 @@ static int btf_dump_array_data(struct btf_dump *d,
*/
is_array_member = d->typed_dump->is_array_member;
d->typed_dump->is_array_member = true;
is_array_terminated = d->typed_dump->is_array_terminated;
d->typed_dump->is_array_terminated = false;
for (i = 0; i < array->nelems; i++, data += elem_size) {
if (d->typed_dump->is_array_terminated)
break;
btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0);
}
d->typed_dump->is_array_member = is_array_member;
d->typed_dump->is_array_terminated = is_array_terminated;
d->typed_dump->depth--;
btf_dump_data_pfx(d);
btf_dump_type_values(d, "]");
@@ -2533,6 +2607,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
d->typed_dump->compact = OPTS_GET(opts, compact, false);
d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false);
d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false);
d->typed_dump->emit_strings = OPTS_GET(opts, emit_strings, false);
ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0);

177
src/btf_iter.c Normal file
View File

@@ -0,0 +1,177 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/* Copyright (c) 2021 Facebook */
/* Copyright (c) 2024, Oracle and/or its affiliates. */
#ifdef __KERNEL__
#include <linux/bpf.h>
#include <linux/btf.h>
#define btf_var_secinfos(t) (struct btf_var_secinfo *)btf_type_var_secinfo(t)
#else
#include "btf.h"
#include "libbpf_internal.h"
#endif
int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t,
enum btf_field_iter_kind iter_kind)
{
it->p = NULL;
it->m_idx = -1;
it->off_idx = 0;
it->vlen = 0;
switch (iter_kind) {
case BTF_FIELD_ITER_IDS:
switch (btf_kind(t)) {
case BTF_KIND_UNKN:
case BTF_KIND_INT:
case BTF_KIND_FLOAT:
case BTF_KIND_ENUM:
case BTF_KIND_ENUM64:
it->desc = (struct btf_field_desc) {};
break;
case BTF_KIND_FWD:
case BTF_KIND_CONST:
case BTF_KIND_VOLATILE:
case BTF_KIND_RESTRICT:
case BTF_KIND_PTR:
case BTF_KIND_TYPEDEF:
case BTF_KIND_FUNC:
case BTF_KIND_VAR:
case BTF_KIND_DECL_TAG:
case BTF_KIND_TYPE_TAG:
it->desc = (struct btf_field_desc) { 1, {offsetof(struct btf_type, type)} };
break;
case BTF_KIND_ARRAY:
it->desc = (struct btf_field_desc) {
2, {sizeof(struct btf_type) + offsetof(struct btf_array, type),
sizeof(struct btf_type) + offsetof(struct btf_array, index_type)}
};
break;
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
it->desc = (struct btf_field_desc) {
0, {},
sizeof(struct btf_member),
1, {offsetof(struct btf_member, type)}
};
break;
case BTF_KIND_FUNC_PROTO:
it->desc = (struct btf_field_desc) {
1, {offsetof(struct btf_type, type)},
sizeof(struct btf_param),
1, {offsetof(struct btf_param, type)}
};
break;
case BTF_KIND_DATASEC:
it->desc = (struct btf_field_desc) {
0, {},
sizeof(struct btf_var_secinfo),
1, {offsetof(struct btf_var_secinfo, type)}
};
break;
default:
return -EINVAL;
}
break;
case BTF_FIELD_ITER_STRS:
switch (btf_kind(t)) {
case BTF_KIND_UNKN:
it->desc = (struct btf_field_desc) {};
break;
case BTF_KIND_INT:
case BTF_KIND_FLOAT:
case BTF_KIND_FWD:
case BTF_KIND_ARRAY:
case BTF_KIND_CONST:
case BTF_KIND_VOLATILE:
case BTF_KIND_RESTRICT:
case BTF_KIND_PTR:
case BTF_KIND_TYPEDEF:
case BTF_KIND_FUNC:
case BTF_KIND_VAR:
case BTF_KIND_DECL_TAG:
case BTF_KIND_TYPE_TAG:
case BTF_KIND_DATASEC:
it->desc = (struct btf_field_desc) {
1, {offsetof(struct btf_type, name_off)}
};
break;
case BTF_KIND_ENUM:
it->desc = (struct btf_field_desc) {
1, {offsetof(struct btf_type, name_off)},
sizeof(struct btf_enum),
1, {offsetof(struct btf_enum, name_off)}
};
break;
case BTF_KIND_ENUM64:
it->desc = (struct btf_field_desc) {
1, {offsetof(struct btf_type, name_off)},
sizeof(struct btf_enum64),
1, {offsetof(struct btf_enum64, name_off)}
};
break;
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
it->desc = (struct btf_field_desc) {
1, {offsetof(struct btf_type, name_off)},
sizeof(struct btf_member),
1, {offsetof(struct btf_member, name_off)}
};
break;
case BTF_KIND_FUNC_PROTO:
it->desc = (struct btf_field_desc) {
1, {offsetof(struct btf_type, name_off)},
sizeof(struct btf_param),
1, {offsetof(struct btf_param, name_off)}
};
break;
default:
return -EINVAL;
}
break;
default:
return -EINVAL;
}
if (it->desc.m_sz)
it->vlen = btf_vlen(t);
it->p = t;
return 0;
}
__u32 *btf_field_iter_next(struct btf_field_iter *it)
{
if (!it->p)
return NULL;
if (it->m_idx < 0) {
if (it->off_idx < it->desc.t_off_cnt)
return it->p + it->desc.t_offs[it->off_idx++];
/* move to per-member iteration */
it->m_idx = 0;
it->p += sizeof(struct btf_type);
it->off_idx = 0;
}
/* if type doesn't have members, stop */
if (it->desc.m_sz == 0) {
it->p = NULL;
return NULL;
}
if (it->off_idx >= it->desc.m_off_cnt) {
/* exhausted this member's fields, go to the next member */
it->m_idx++;
it->p += it->desc.m_sz;
it->off_idx = 0;
}
if (it->m_idx < it->vlen)
return it->p + it->desc.m_offs[it->off_idx++];
it->p = NULL;
return NULL;
}

519
src/btf_relocate.c Normal file
View File

@@ -0,0 +1,519 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/* Copyright (c) 2024, Oracle and/or its affiliates. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#ifdef __KERNEL__
#include <linux/bpf.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/sort.h>
#include <linux/string.h>
#include <linux/bpf_verifier.h>
#define btf_type_by_id (struct btf_type *)btf_type_by_id
#define btf__type_cnt btf_nr_types
#define btf__base_btf btf_base_btf
#define btf__name_by_offset btf_name_by_offset
#define btf__str_by_offset btf_str_by_offset
#define btf_kflag btf_type_kflag
#define calloc(nmemb, sz) kvcalloc(nmemb, sz, GFP_KERNEL | __GFP_NOWARN)
#define free(ptr) kvfree(ptr)
#define qsort(base, num, sz, cmp) sort(base, num, sz, cmp, NULL)
#else
#include "btf.h"
#include "bpf.h"
#include "libbpf.h"
#include "libbpf_internal.h"
#endif /* __KERNEL__ */
struct btf;
struct btf_relocate {
struct btf *btf;
const struct btf *base_btf;
const struct btf *dist_base_btf;
unsigned int nr_base_types;
unsigned int nr_split_types;
unsigned int nr_dist_base_types;
int dist_str_len;
int base_str_len;
__u32 *id_map;
__u32 *str_map;
};
/* Set temporarily in relocation id_map if distilled base struct/union is
* embedded in a split BTF struct/union; in such a case, size information must
* match between distilled base BTF and base BTF representation of type.
*/
#define BTF_IS_EMBEDDED ((__u32)-1)
/* <name, size, id> triple used in sorting/searching distilled base BTF. */
struct btf_name_info {
const char *name;
/* set when search requires a size match */
bool needs_size: 1;
unsigned int size: 31;
__u32 id;
};
static int btf_relocate_rewrite_type_id(struct btf_relocate *r, __u32 i)
{
struct btf_type *t = btf_type_by_id(r->btf, i);
struct btf_field_iter it;
__u32 *id;
int err;
err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS);
if (err)
return err;
while ((id = btf_field_iter_next(&it)))
*id = r->id_map[*id];
return 0;
}
/* Simple string comparison used for sorting within BTF, since all distilled
* types are named. If strings match, and size is non-zero for both elements
* fall back to using size for ordering.
*/
static int cmp_btf_name_size(const void *n1, const void *n2)
{
const struct btf_name_info *ni1 = n1;
const struct btf_name_info *ni2 = n2;
int name_diff = strcmp(ni1->name, ni2->name);
if (!name_diff && ni1->needs_size && ni2->needs_size)
return ni2->size - ni1->size;
return name_diff;
}
/* Binary search with a small twist; find leftmost element that matches
* so that we can then iterate through all exact matches. So for example
* searching { "a", "bb", "bb", "c" } we would always match on the
* leftmost "bb".
*/
static struct btf_name_info *search_btf_name_size(struct btf_name_info *key,
struct btf_name_info *vals,
int nelems)
{
struct btf_name_info *ret = NULL;
int high = nelems - 1;
int low = 0;
while (low <= high) {
int mid = (low + high)/2;
struct btf_name_info *val = &vals[mid];
int diff = cmp_btf_name_size(key, val);
if (diff == 0)
ret = val;
/* even if found, keep searching for leftmost match */
if (diff <= 0)
high = mid - 1;
else
low = mid + 1;
}
return ret;
}
/* If a member of a split BTF struct/union refers to a base BTF
* struct/union, mark that struct/union id temporarily in the id_map
* with BTF_IS_EMBEDDED. Members can be const/restrict/volatile/typedef
* reference types, but if a pointer is encountered, the type is no longer
* considered embedded.
*/
static int btf_mark_embedded_composite_type_ids(struct btf_relocate *r, __u32 i)
{
struct btf_type *t = btf_type_by_id(r->btf, i);
struct btf_field_iter it;
__u32 *id;
int err;
if (!btf_is_composite(t))
return 0;
err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS);
if (err)
return err;
while ((id = btf_field_iter_next(&it))) {
__u32 next_id = *id;
while (next_id) {
t = btf_type_by_id(r->btf, next_id);
switch (btf_kind(t)) {
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_VOLATILE:
case BTF_KIND_TYPEDEF:
case BTF_KIND_TYPE_TAG:
next_id = t->type;
break;
case BTF_KIND_ARRAY: {
struct btf_array *a = btf_array(t);
next_id = a->type;
break;
}
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
if (next_id < r->nr_dist_base_types)
r->id_map[next_id] = BTF_IS_EMBEDDED;
next_id = 0;
break;
default:
next_id = 0;
break;
}
}
}
return 0;
}
/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate
* through base BTF looking up distilled type (using binary search) equivalents.
*/
static int btf_relocate_map_distilled_base(struct btf_relocate *r)
{
struct btf_name_info *info, *info_end;
struct btf_type *base_t, *dist_t;
__u8 *base_name_cnt = NULL;
int err = 0;
__u32 id;
/* generate a sort index array of name/type ids sorted by name for
* distilled base BTF to speed name-based lookups.
*/
info = calloc(r->nr_dist_base_types, sizeof(*info));
if (!info) {
err = -ENOMEM;
goto done;
}
info_end = info + r->nr_dist_base_types;
for (id = 0; id < r->nr_dist_base_types; id++) {
dist_t = btf_type_by_id(r->dist_base_btf, id);
info[id].name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off);
info[id].id = id;
info[id].size = dist_t->size;
info[id].needs_size = true;
}
qsort(info, r->nr_dist_base_types, sizeof(*info), cmp_btf_name_size);
/* Mark distilled base struct/union members of split BTF structs/unions
* in id_map with BTF_IS_EMBEDDED; this signals that these types
* need to match both name and size, otherwise embedding the base
* struct/union in the split type is invalid.
*/
for (id = r->nr_dist_base_types; id < r->nr_dist_base_types + r->nr_split_types; id++) {
err = btf_mark_embedded_composite_type_ids(r, id);
if (err)
goto done;
}
/* Collect name counts for composite types in base BTF. If multiple
* instances of a struct/union of the same name exist, we need to use
* size to determine which to map to since name alone is ambiguous.
*/
base_name_cnt = calloc(r->base_str_len, sizeof(*base_name_cnt));
if (!base_name_cnt) {
err = -ENOMEM;
goto done;
}
for (id = 1; id < r->nr_base_types; id++) {
base_t = btf_type_by_id(r->base_btf, id);
if (!btf_is_composite(base_t) || !base_t->name_off)
continue;
if (base_name_cnt[base_t->name_off] < 255)
base_name_cnt[base_t->name_off]++;
}
/* Now search base BTF for matching distilled base BTF types. */
for (id = 1; id < r->nr_base_types; id++) {
struct btf_name_info *dist_info, base_info = {};
int dist_kind, base_kind;
base_t = btf_type_by_id(r->base_btf, id);
/* distilled base consists of named types only. */
if (!base_t->name_off)
continue;
base_kind = btf_kind(base_t);
base_info.id = id;
base_info.name = btf__name_by_offset(r->base_btf, base_t->name_off);
switch (base_kind) {
case BTF_KIND_INT:
case BTF_KIND_FLOAT:
case BTF_KIND_ENUM:
case BTF_KIND_ENUM64:
/* These types should match both name and size */
base_info.needs_size = true;
base_info.size = base_t->size;
break;
case BTF_KIND_FWD:
/* No size considerations for fwds. */
break;
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
/* Size only needs to be used for struct/union if there
* are multiple types in base BTF with the same name.
* If there are multiple _distilled_ types with the same
* name (a very unlikely scenario), that doesn't matter
* unless corresponding _base_ types to match them are
* missing.
*/
base_info.needs_size = base_name_cnt[base_t->name_off] > 1;
base_info.size = base_t->size;
break;
default:
continue;
}
/* iterate over all matching distilled base types */
for (dist_info = search_btf_name_size(&base_info, info, r->nr_dist_base_types);
dist_info != NULL && dist_info < info_end &&
cmp_btf_name_size(&base_info, dist_info) == 0;
dist_info++) {
if (!dist_info->id || dist_info->id >= r->nr_dist_base_types) {
pr_warn("base BTF id [%d] maps to invalid distilled base BTF id [%d]\n",
id, dist_info->id);
err = -EINVAL;
goto done;
}
dist_t = btf_type_by_id(r->dist_base_btf, dist_info->id);
dist_kind = btf_kind(dist_t);
/* Validate that the found distilled type is compatible.
* Do not error out on mismatch as another match may
* occur for an identically-named type.
*/
switch (dist_kind) {
case BTF_KIND_FWD:
switch (base_kind) {
case BTF_KIND_FWD:
if (btf_kflag(dist_t) != btf_kflag(base_t))
continue;
break;
case BTF_KIND_STRUCT:
if (btf_kflag(base_t))
continue;
break;
case BTF_KIND_UNION:
if (!btf_kflag(base_t))
continue;
break;
default:
continue;
}
break;
case BTF_KIND_INT:
if (dist_kind != base_kind ||
btf_int_encoding(base_t) != btf_int_encoding(dist_t))
continue;
break;
case BTF_KIND_FLOAT:
if (dist_kind != base_kind)
continue;
break;
case BTF_KIND_ENUM:
/* ENUM and ENUM64 are encoded as sized ENUM in
* distilled base BTF.
*/
if (base_kind != dist_kind && base_kind != BTF_KIND_ENUM64)
continue;
break;
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
/* size verification is required for embedded
* struct/unions.
*/
if (r->id_map[dist_info->id] == BTF_IS_EMBEDDED &&
base_t->size != dist_t->size)
continue;
break;
default:
continue;
}
if (r->id_map[dist_info->id] &&
r->id_map[dist_info->id] != BTF_IS_EMBEDDED) {
/* we already have a match; this tells us that
* multiple base types of the same name
* have the same size, since for cases where
* multiple types have the same name we match
* on name and size. In this case, we have
* no way of determining which to relocate
* to in base BTF, so error out.
*/
pr_warn("distilled base BTF type '%s' [%u], size %u has multiple candidates of the same size (ids [%u, %u]) in base BTF\n",
base_info.name, dist_info->id,
base_t->size, id, r->id_map[dist_info->id]);
err = -EINVAL;
goto done;
}
/* map id and name */
r->id_map[dist_info->id] = id;
r->str_map[dist_t->name_off] = base_t->name_off;
}
}
/* ensure all distilled BTF ids now have a mapping... */
for (id = 1; id < r->nr_dist_base_types; id++) {
const char *name;
if (r->id_map[id] && r->id_map[id] != BTF_IS_EMBEDDED)
continue;
dist_t = btf_type_by_id(r->dist_base_btf, id);
name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off);
pr_warn("distilled base BTF type '%s' [%d] is not mapped to base BTF id\n",
name, id);
err = -EINVAL;
break;
}
done:
free(base_name_cnt);
free(info);
return err;
}
/* distilled base should only have named int/float/enum/fwd/struct/union types. */
static int btf_relocate_validate_distilled_base(struct btf_relocate *r)
{
unsigned int i;
for (i = 1; i < r->nr_dist_base_types; i++) {
struct btf_type *t = btf_type_by_id(r->dist_base_btf, i);
int kind = btf_kind(t);
switch (kind) {
case BTF_KIND_INT:
case BTF_KIND_FLOAT:
case BTF_KIND_ENUM:
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_FWD:
if (t->name_off)
break;
pr_warn("type [%d], kind [%d] is invalid for distilled base BTF; it is anonymous\n",
i, kind);
return -EINVAL;
default:
pr_warn("type [%d] in distilled based BTF has unexpected kind [%d]\n",
i, kind);
return -EINVAL;
}
}
return 0;
}
static int btf_relocate_rewrite_strs(struct btf_relocate *r, __u32 i)
{
struct btf_type *t = btf_type_by_id(r->btf, i);
struct btf_field_iter it;
__u32 *str_off;
int off, err;
err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS);
if (err)
return err;
while ((str_off = btf_field_iter_next(&it))) {
if (!*str_off)
continue;
if (*str_off >= r->dist_str_len) {
*str_off += r->base_str_len - r->dist_str_len;
} else {
off = r->str_map[*str_off];
if (!off) {
pr_warn("string '%s' [offset %u] is not mapped to base BTF\n",
btf__str_by_offset(r->btf, off), *str_off);
return -ENOENT;
}
*str_off = off;
}
}
return 0;
}
/* If successful, output of relocation is updated BTF with base BTF pointing
* at base_btf, and type ids, strings adjusted accordingly.
*/
int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map)
{
unsigned int nr_types = btf__type_cnt(btf);
const struct btf_header *dist_base_hdr;
const struct btf_header *base_hdr;
struct btf_relocate r = {};
int err = 0;
__u32 id, i;
r.dist_base_btf = btf__base_btf(btf);
if (!base_btf || r.dist_base_btf == base_btf)
return -EINVAL;
r.nr_dist_base_types = btf__type_cnt(r.dist_base_btf);
r.nr_base_types = btf__type_cnt(base_btf);
r.nr_split_types = nr_types - r.nr_dist_base_types;
r.btf = btf;
r.base_btf = base_btf;
r.id_map = calloc(nr_types, sizeof(*r.id_map));
r.str_map = calloc(btf_header(r.dist_base_btf)->str_len, sizeof(*r.str_map));
dist_base_hdr = btf_header(r.dist_base_btf);
base_hdr = btf_header(r.base_btf);
r.dist_str_len = dist_base_hdr->str_len;
r.base_str_len = base_hdr->str_len;
if (!r.id_map || !r.str_map) {
err = -ENOMEM;
goto err_out;
}
err = btf_relocate_validate_distilled_base(&r);
if (err)
goto err_out;
/* Split BTF ids need to be adjusted as base and distilled base
* have different numbers of types, changing the start id of split
* BTF.
*/
for (id = r.nr_dist_base_types; id < nr_types; id++)
r.id_map[id] = id + r.nr_base_types - r.nr_dist_base_types;
/* Build a map from distilled base ids to actual base BTF ids; it is used
* to update split BTF id references. Also build a str_map mapping from
* distilled base BTF names to base BTF names.
*/
err = btf_relocate_map_distilled_base(&r);
if (err)
goto err_out;
/* Next, rewrite type ids in split BTF, replacing split ids with updated
* ids based on number of types in base BTF, and base ids with
* relocated ids from base_btf.
*/
for (i = 0, id = r.nr_dist_base_types; i < r.nr_split_types; i++, id++) {
err = btf_relocate_rewrite_type_id(&r, id);
if (err)
goto err_out;
}
/* String offsets now need to be updated using the str_map. */
for (i = 0; i < r.nr_split_types; i++) {
err = btf_relocate_rewrite_strs(&r, i + r.nr_dist_base_types);
if (err)
goto err_out;
}
/* Finally reset base BTF to be base_btf */
btf_set_base_btf(btf, base_btf);
if (id_map) {
*id_map = r.id_map;
r.id_map = NULL;
}
err_out:
free(r.id_map);
free(r.str_map);
return err;
}

View File

@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include "libbpf_internal.h"
#include "str_error.h"
/* A SHT_GNU_versym section holds 16-bit words. This bit is set if
* the symbol is hidden and can only be seen when referenced using an
@@ -24,10 +23,12 @@
int elf_open(const char *binary_path, struct elf_fd *elf_fd)
{
char errmsg[STRERR_BUFSIZE];
int fd, ret;
Elf *elf;
elf_fd->elf = NULL;
elf_fd->fd = -1;
if (elf_version(EV_CURRENT) == EV_NONE) {
pr_warn("elf: failed to init libelf for %s\n", binary_path);
return -LIBBPF_ERRNO__LIBELF;
@@ -35,8 +36,7 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd)
fd = open(binary_path, O_RDONLY | O_CLOEXEC);
if (fd < 0) {
ret = -errno;
pr_warn("elf: failed to open %s: %s\n", binary_path,
libbpf_strerror_r(ret, errmsg, sizeof(errmsg)));
pr_warn("elf: failed to open %s: %s\n", binary_path, errstr(ret));
return ret;
}
elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);

View File

@@ -6,7 +6,6 @@
#include "libbpf.h"
#include "libbpf_common.h"
#include "libbpf_internal.h"
#include "str_error.h"
static inline __u64 ptr_to_u64(const void *ptr)
{
@@ -22,7 +21,7 @@ int probe_fd(int fd)
static int probe_kern_prog_name(int token_fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, prog_name);
const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
@@ -47,7 +46,6 @@ static int probe_kern_prog_name(int token_fd)
static int probe_kern_global_data(int token_fd)
{
char *cp, errmsg[STRERR_BUFSIZE];
struct bpf_insn insns[] = {
BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
@@ -67,9 +65,8 @@ static int probe_kern_global_data(int token_fd)
map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, &map_opts);
if (map < 0) {
ret = -errno;
cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
__func__, cp, -ret);
pr_warn("Error in %s(): %s. Couldn't create simple array map.\n",
__func__, errstr(ret));
return ret;
}
@@ -267,7 +264,6 @@ static int probe_kern_probe_read_kernel(int token_fd)
static int probe_prog_bind_map(int token_fd)
{
char *cp, errmsg[STRERR_BUFSIZE];
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
@@ -285,9 +281,8 @@ static int probe_prog_bind_map(int token_fd)
map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, &map_opts);
if (map < 0) {
ret = -errno;
cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
__func__, cp, -ret);
pr_warn("Error in %s(): %s. Couldn't create simple array map.\n",
__func__, errstr(ret));
return ret;
}
@@ -392,11 +387,41 @@ static int probe_uprobe_multi_link(int token_fd)
link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts);
err = -errno; /* close() can clobber errno */
if (link_fd >= 0 || err != -EBADF) {
if (link_fd >= 0)
close(link_fd);
close(prog_fd);
return 0;
}
/* Initial multi-uprobe support in kernel didn't handle PID filtering
* correctly (it was doing thread filtering, not process filtering).
* So now we'll detect if PID filtering logic was fixed, and, if not,
* we'll pretend multi-uprobes are not supported, if not.
* Multi-uprobes are used in USDT attachment logic, and we need to be
* conservative here, because multi-uprobe selection happens early at
* load time, while the use of PID filtering is known late at
* attachment time, at which point it's too late to undo multi-uprobe
* selection.
*
* Creating uprobe with pid == -1 for (invalid) '/' binary will fail
* early with -EINVAL on kernels with fixed PID filtering logic;
* otherwise -ESRCH would be returned if passed correct binary path
* (but we'll just get -BADF, of course).
*/
link_opts.uprobe_multi.pid = -1; /* invalid PID */
link_opts.uprobe_multi.path = "/"; /* invalid path */
link_opts.uprobe_multi.offsets = &offset;
link_opts.uprobe_multi.cnt = 1;
link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts);
err = -errno; /* close() can clobber errno */
if (link_fd >= 0)
close(link_fd);
close(prog_fd);
return link_fd < 0 && err == -EBADF;
return link_fd < 0 && err == -EINVAL;
}
static int probe_kern_bpf_cookie(int token_fd)
@@ -574,7 +599,8 @@ bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_
} else if (ret == 0) {
WRITE_ONCE(cache->res[feat_id], FEAT_MISSING);
} else {
pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret);
pr_warn("Detection of kernel %s support failed: %s\n",
feat->desc, errstr(ret));
WRITE_ONCE(cache->res[feat_id], FEAT_MISSING);
}
}

View File

@@ -4,6 +4,7 @@
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <asm/byteorder.h>
#include <linux/filter.h>
#include <sys/param.h>
#include "btf.h"
@@ -13,7 +14,6 @@
#include "hashmap.h"
#include "bpf_gen_internal.h"
#include "skel_internal.h"
#include <asm/byteorder.h>
#define MAX_USED_MAPS 64
#define MAX_USED_PROGS 32
@@ -109,6 +109,7 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in
static int add_data(struct bpf_gen *gen, const void *data, __u32 size);
static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off);
static void emit_signature_match(struct bpf_gen *gen);
void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps)
{
@@ -151,6 +152,8 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps
/* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */
emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7));
emit(gen, BPF_EXIT_INSN());
if (OPTS_GET(gen->opts, gen_hash, false))
emit_signature_match(gen);
}
static int add_data(struct bpf_gen *gen, const void *data, __u32 size)
@@ -367,6 +370,8 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off)
__emit_sys_close(gen);
}
static void compute_sha_update_offsets(struct bpf_gen *gen);
int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
{
int i;
@@ -393,7 +398,10 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
blob_fd_array_off(gen, i));
emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0));
emit(gen, BPF_EXIT_INSN());
pr_debug("gen: finish %d\n", gen->error);
if (OPTS_GET(gen->opts, gen_hash, false))
compute_sha_update_offsets(gen);
pr_debug("gen: finish %s\n", errstr(gen->error));
if (!gen->error) {
struct gen_loader_opts *opts = gen->opts;
@@ -401,6 +409,15 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
opts->insns_sz = gen->insn_cur - gen->insn_start;
opts->data = gen->data_start;
opts->data_sz = gen->data_cur - gen->data_start;
/* use target endianness for embedded loader */
if (gen->swapped_endian) {
struct bpf_insn *insn = (struct bpf_insn *)opts->insns;
int insn_cnt = opts->insns_sz / sizeof(struct bpf_insn);
for (i = 0; i < insn_cnt; i++)
bpf_insn_bswap(insn++);
}
}
return gen->error;
}
@@ -414,6 +431,44 @@ void bpf_gen__free(struct bpf_gen *gen)
free(gen);
}
/*
* Fields of bpf_attr are set to values in native byte-order before being
* written to the target-bound data blob, and may need endian conversion.
* This macro allows providing the correct value in situ more simply than
* writing a separate converter for *all fields* of *all records* included
* in union bpf_attr. Note that sizeof(rval) should match the assignment
* target to avoid runtime problems.
*/
#define tgt_endian(rval) ({ \
typeof(rval) _val = (rval); \
if (gen->swapped_endian) { \
switch (sizeof(_val)) { \
case 1: break; \
case 2: _val = bswap_16(_val); break; \
case 4: _val = bswap_32(_val); break; \
case 8: _val = bswap_64(_val); break; \
default: pr_warn("unsupported bswap size!\n"); \
} \
} \
_val; \
})
static void compute_sha_update_offsets(struct bpf_gen *gen)
{
__u64 sha[SHA256_DWORD_SIZE];
__u64 sha_dw;
int i;
libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, (__u8 *)sha);
for (i = 0; i < SHA256_DWORD_SIZE; i++) {
struct bpf_insn *insn =
(struct bpf_insn *)(gen->insn_start + gen->hash_insn_offset[i]);
sha_dw = tgt_endian(sha[i]);
insn[0].imm = (__u32)sha_dw;
insn[1].imm = sha_dw >> 32;
}
}
void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data,
__u32 btf_raw_size)
{
@@ -422,11 +477,12 @@ void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data,
union bpf_attr attr;
memset(&attr, 0, attr_size);
pr_debug("gen: load_btf: size %d\n", btf_raw_size);
btf_data = add_data(gen, btf_raw_data, btf_raw_size);
attr.btf_size = btf_raw_size;
attr.btf_size = tgt_endian(btf_raw_size);
btf_load_attr = add_data(gen, &attr, attr_size);
pr_debug("gen: load_btf: off %d size %d, attr: off %d size %d\n",
btf_data, btf_raw_size, btf_load_attr, attr_size);
/* populate union bpf_attr with user provided log details */
move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_level), 4,
@@ -457,28 +513,29 @@ void bpf_gen__map_create(struct bpf_gen *gen,
union bpf_attr attr;
memset(&attr, 0, attr_size);
attr.map_type = map_type;
attr.key_size = key_size;
attr.value_size = value_size;
attr.map_flags = map_attr->map_flags;
attr.map_extra = map_attr->map_extra;
attr.map_type = tgt_endian(map_type);
attr.key_size = tgt_endian(key_size);
attr.value_size = tgt_endian(value_size);
attr.map_flags = tgt_endian(map_attr->map_flags);
attr.map_extra = tgt_endian(map_attr->map_extra);
if (map_name)
libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));
attr.numa_node = map_attr->numa_node;
attr.map_ifindex = map_attr->map_ifindex;
attr.max_entries = max_entries;
attr.btf_key_type_id = map_attr->btf_key_type_id;
attr.btf_value_type_id = map_attr->btf_value_type_id;
pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n",
attr.map_name, map_idx, map_type, attr.btf_value_type_id);
attr.numa_node = tgt_endian(map_attr->numa_node);
attr.map_ifindex = tgt_endian(map_attr->map_ifindex);
attr.max_entries = tgt_endian(max_entries);
attr.btf_key_type_id = tgt_endian(map_attr->btf_key_type_id);
attr.btf_value_type_id = tgt_endian(map_attr->btf_value_type_id);
map_create_attr = add_data(gen, &attr, attr_size);
if (attr.btf_value_type_id)
pr_debug("gen: map_create: %s idx %d type %d value_type_id %d, attr: off %d size %d\n",
map_name, map_idx, map_type, map_attr->btf_value_type_id,
map_create_attr, attr_size);
if (map_attr->btf_value_type_id)
/* populate union bpf_attr with btf_fd saved in the stack earlier */
move_stack2blob(gen, attr_field(map_create_attr, btf_fd), 4,
stack_off(btf_fd));
switch (attr.map_type) {
switch (map_type) {
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
move_stack2blob(gen, attr_field(map_create_attr, inner_map_fd), 4,
@@ -498,8 +555,8 @@ void bpf_gen__map_create(struct bpf_gen *gen,
/* emit MAP_CREATE command */
emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d",
attr.map_name, map_idx, map_type, value_size,
attr.btf_value_type_id);
map_name, map_idx, map_type, value_size,
map_attr->btf_value_type_id);
emit_check_err(gen);
/* remember map_fd in the stack, if successful */
if (map_idx < 0) {
@@ -523,6 +580,29 @@ void bpf_gen__map_create(struct bpf_gen *gen,
emit_sys_close_stack(gen, stack_off(inner_map_fd));
}
static void emit_signature_match(struct bpf_gen *gen)
{
__s64 off;
int i;
for (i = 0; i < SHA256_DWORD_SIZE; i++) {
emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX,
0, 0, 0, 0));
emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, i * sizeof(__u64)));
gen->hash_insn_offset[i] = gen->insn_cur - gen->insn_start;
emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_3, 0, 0, 0, 0, 0));
off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1;
if (is_simm16(off)) {
emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL));
emit(gen, BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, off));
} else {
gen->error = -ERANGE;
emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1));
}
}
}
void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name,
enum bpf_attach_type type)
{
@@ -784,12 +864,12 @@ log:
emit_ksym_relo_log(gen, relo, kdesc->ref);
}
static __u32 src_reg_mask(void)
static __u32 src_reg_mask(struct bpf_gen *gen)
{
#if defined(__LITTLE_ENDIAN_BITFIELD)
return 0x0f; /* src_reg,dst_reg,... */
#elif defined(__BIG_ENDIAN_BITFIELD)
return 0xf0; /* dst_reg,src_reg,... */
#if defined(__LITTLE_ENDIAN_BITFIELD) /* src_reg,dst_reg,... */
return gen->swapped_endian ? 0xf0 : 0x0f;
#elif defined(__BIG_ENDIAN_BITFIELD) /* dst_reg,src_reg,... */
return gen->swapped_endian ? 0x0f : 0xf0;
#else
#error "Unsupported bit endianness, cannot proceed"
#endif
@@ -840,7 +920,7 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo,
emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 3));
clear_src_reg:
/* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */
reg_mask = src_reg_mask();
reg_mask = src_reg_mask(gen);
emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code)));
emit(gen, BPF_ALU32_IMM(BPF_AND, BPF_REG_9, reg_mask));
emit(gen, BPF_STX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, offsetofend(struct bpf_insn, code)));
@@ -931,48 +1011,94 @@ static void cleanup_relos(struct bpf_gen *gen, int insns)
cleanup_core_relo(gen);
}
/* Convert func, line, and core relo info blobs to target endianness */
static void info_blob_bswap(struct bpf_gen *gen, int func_info, int line_info,
int core_relos, struct bpf_prog_load_opts *load_attr)
{
struct bpf_func_info *fi = gen->data_start + func_info;
struct bpf_line_info *li = gen->data_start + line_info;
struct bpf_core_relo *cr = gen->data_start + core_relos;
int i;
for (i = 0; i < load_attr->func_info_cnt; i++)
bpf_func_info_bswap(fi++);
for (i = 0; i < load_attr->line_info_cnt; i++)
bpf_line_info_bswap(li++);
for (i = 0; i < gen->core_relo_cnt; i++)
bpf_core_relo_bswap(cr++);
}
void bpf_gen__prog_load(struct bpf_gen *gen,
enum bpf_prog_type prog_type, const char *prog_name,
const char *license, struct bpf_insn *insns, size_t insn_cnt,
struct bpf_prog_load_opts *load_attr, int prog_idx)
{
int func_info_tot_sz = load_attr->func_info_cnt *
load_attr->func_info_rec_size;
int line_info_tot_sz = load_attr->line_info_cnt *
load_attr->line_info_rec_size;
int core_relo_tot_sz = gen->core_relo_cnt *
sizeof(struct bpf_core_relo);
int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos;
int attr_size = offsetofend(union bpf_attr, core_relo_rec_size);
union bpf_attr attr;
memset(&attr, 0, attr_size);
pr_debug("gen: prog_load: type %d insns_cnt %zd progi_idx %d\n",
prog_type, insn_cnt, prog_idx);
/* add license string to blob of bytes */
license_off = add_data(gen, license, strlen(license) + 1);
/* add insns to blob of bytes */
insns_off = add_data(gen, insns, insn_cnt * sizeof(struct bpf_insn));
pr_debug("gen: prog_load: prog_idx %d type %d insn off %d insns_cnt %zd license off %d\n",
prog_idx, prog_type, insns_off, insn_cnt, license_off);
attr.prog_type = prog_type;
attr.expected_attach_type = load_attr->expected_attach_type;
attr.attach_btf_id = load_attr->attach_btf_id;
attr.prog_ifindex = load_attr->prog_ifindex;
/* convert blob insns to target endianness */
if (gen->swapped_endian) {
struct bpf_insn *insn = gen->data_start + insns_off;
int i;
for (i = 0; i < insn_cnt; i++, insn++)
bpf_insn_bswap(insn);
}
attr.prog_type = tgt_endian(prog_type);
attr.expected_attach_type = tgt_endian(load_attr->expected_attach_type);
attr.attach_btf_id = tgt_endian(load_attr->attach_btf_id);
attr.prog_ifindex = tgt_endian(load_attr->prog_ifindex);
attr.kern_version = 0;
attr.insn_cnt = (__u32)insn_cnt;
attr.prog_flags = load_attr->prog_flags;
attr.insn_cnt = tgt_endian((__u32)insn_cnt);
attr.prog_flags = tgt_endian(load_attr->prog_flags);
attr.func_info_rec_size = load_attr->func_info_rec_size;
attr.func_info_cnt = load_attr->func_info_cnt;
func_info = add_data(gen, load_attr->func_info,
attr.func_info_cnt * attr.func_info_rec_size);
attr.func_info_rec_size = tgt_endian(load_attr->func_info_rec_size);
attr.func_info_cnt = tgt_endian(load_attr->func_info_cnt);
func_info = add_data(gen, load_attr->func_info, func_info_tot_sz);
pr_debug("gen: prog_load: func_info: off %d cnt %d rec size %d\n",
func_info, load_attr->func_info_cnt,
load_attr->func_info_rec_size);
attr.line_info_rec_size = load_attr->line_info_rec_size;
attr.line_info_cnt = load_attr->line_info_cnt;
line_info = add_data(gen, load_attr->line_info,
attr.line_info_cnt * attr.line_info_rec_size);
attr.line_info_rec_size = tgt_endian(load_attr->line_info_rec_size);
attr.line_info_cnt = tgt_endian(load_attr->line_info_cnt);
line_info = add_data(gen, load_attr->line_info, line_info_tot_sz);
pr_debug("gen: prog_load: line_info: off %d cnt %d rec size %d\n",
line_info, load_attr->line_info_cnt,
load_attr->line_info_rec_size);
attr.core_relo_rec_size = sizeof(struct bpf_core_relo);
attr.core_relo_cnt = gen->core_relo_cnt;
core_relos = add_data(gen, gen->core_relos,
attr.core_relo_cnt * attr.core_relo_rec_size);
attr.core_relo_rec_size = tgt_endian((__u32)sizeof(struct bpf_core_relo));
attr.core_relo_cnt = tgt_endian(gen->core_relo_cnt);
core_relos = add_data(gen, gen->core_relos, core_relo_tot_sz);
pr_debug("gen: prog_load: core_relos: off %d cnt %d rec size %zd\n",
core_relos, gen->core_relo_cnt,
sizeof(struct bpf_core_relo));
/* convert all info blobs to target endianness */
if (gen->swapped_endian)
info_blob_bswap(gen, func_info, line_info, core_relos, load_attr);
libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
prog_load_attr = add_data(gen, &attr, attr_size);
pr_debug("gen: prog_load: attr: off %d size %d\n",
prog_load_attr, attr_size);
/* populate union bpf_attr with a pointer to license */
emit_rel_store(gen, attr_field(prog_load_attr, license), license_off);
@@ -1040,7 +1166,6 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue,
int zero = 0;
memset(&attr, 0, attr_size);
pr_debug("gen: map_update_elem: idx %d\n", map_idx);
value = add_data(gen, pvalue, value_size);
key = add_data(gen, &zero, sizeof(zero));
@@ -1068,6 +1193,8 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue,
emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel));
map_update_attr = add_data(gen, &attr, attr_size);
pr_debug("gen: map_update_elem: idx %d, value: off %d size %d, attr: off %d size %d\n",
map_idx, value, value_size, map_update_attr, attr_size);
move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4,
blob_fd_array_off(gen, map_idx));
emit_rel_store(gen, attr_field(map_update_attr, key), key);
@@ -1084,14 +1211,16 @@ void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slo
int attr_size = offsetofend(union bpf_attr, flags);
int map_update_attr, key;
union bpf_attr attr;
int tgt_slot;
memset(&attr, 0, attr_size);
pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n",
outer_map_idx, slot, inner_map_idx);
key = add_data(gen, &slot, sizeof(slot));
tgt_slot = tgt_endian(slot);
key = add_data(gen, &tgt_slot, sizeof(tgt_slot));
map_update_attr = add_data(gen, &attr, attr_size);
pr_debug("gen: populate_outer_map: outer %d key %d inner %d, attr: off %d size %d\n",
outer_map_idx, slot, inner_map_idx, map_update_attr, attr_size);
move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4,
blob_fd_array_off(gen, outer_map_idx));
emit_rel_store(gen, attr_field(map_update_attr, key), key);
@@ -1112,8 +1241,9 @@ void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx)
union bpf_attr attr;
memset(&attr, 0, attr_size);
pr_debug("gen: map_freeze: idx %d\n", map_idx);
map_freeze_attr = add_data(gen, &attr, attr_size);
pr_debug("gen: map_freeze: idx %d, attr: off %d size %d\n",
map_idx, map_freeze_attr, attr_size);
move_blob2blob(gen, attr_field(map_freeze_attr, map_fd), 4,
blob_fd_array_off(gen, map_idx));
/* emit MAP_FREEZE command */

View File

@@ -166,8 +166,8 @@ bool hashmap_find(const struct hashmap *map, long key, long *value);
* @bkt: integer used as a bucket loop cursor
*/
#define hashmap__for_each_entry(map, cur, bkt) \
for (bkt = 0; bkt < map->cap; bkt++) \
for (cur = map->buckets[bkt]; cur; cur = cur->next)
for (bkt = 0; bkt < (map)->cap; bkt++) \
for (cur = (map)->buckets[bkt]; cur; cur = cur->next)
/*
* hashmap__for_each_entry_safe - iterate over all entries in hashmap, safe
@@ -178,8 +178,8 @@ bool hashmap_find(const struct hashmap *map, long key, long *value);
* @bkt: integer used as a bucket loop cursor
*/
#define hashmap__for_each_entry_safe(map, cur, tmp, bkt) \
for (bkt = 0; bkt < map->cap; bkt++) \
for (cur = map->buckets[bkt]; \
for (bkt = 0; bkt < (map)->cap; bkt++) \
for (cur = (map)->buckets[bkt]; \
cur && ({tmp = cur->next; true; }); \
cur = tmp)
@@ -190,19 +190,19 @@ bool hashmap_find(const struct hashmap *map, long key, long *value);
* @key: key to iterate entries for
*/
#define hashmap__for_each_key_entry(map, cur, _key) \
for (cur = map->buckets \
? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \
for (cur = (map)->buckets \
? (map)->buckets[hash_bits((map)->hash_fn((_key), (map)->ctx), (map)->cap_bits)] \
: NULL; \
cur; \
cur = cur->next) \
if (map->equal_fn(cur->key, (_key), map->ctx))
if ((map)->equal_fn(cur->key, (_key), (map)->ctx))
#define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \
for (cur = map->buckets \
? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \
for (cur = (map)->buckets \
? (map)->buckets[hash_bits((map)->hash_fn((_key), (map)->ctx), (map)->cap_bits)] \
: NULL; \
cur && ({ tmp = cur->next; true; }); \
cur = tmp) \
if (map->equal_fn(cur->key, (_key), map->ctx))
if ((map)->equal_fn(cur->key, (_key), (map)->ctx))
#endif /* __LIBBPF_HASHMAP_H */

File diff suppressed because it is too large Load Diff

View File

@@ -24,8 +24,25 @@
extern "C" {
#endif
/**
* @brief **libbpf_major_version()** provides the major version of libbpf.
* @return An integer, the major version number
*/
LIBBPF_API __u32 libbpf_major_version(void);
/**
* @brief **libbpf_minor_version()** provides the minor version of libbpf.
* @return An integer, the minor version number
*/
LIBBPF_API __u32 libbpf_minor_version(void);
/**
* @brief **libbpf_version_string()** provides the version of libbpf in a
* human-readable form, e.g., "v1.7".
* @return Pointer to a static string containing the version
*
* The format is *not* a part of a stable API and may change in the future.
*/
LIBBPF_API const char *libbpf_version_string(void);
enum libbpf_errno {
@@ -49,6 +66,14 @@ enum libbpf_errno {
__LIBBPF_ERRNO__END,
};
/**
* @brief **libbpf_strerror()** converts the provided error code into a
* human-readable string.
* @param err The error code to convert
* @param buf Pointer to a buffer where the error message will be stored
* @param size The number of bytes in the buffer
* @return 0, on success; negative error code, otherwise
*/
LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size);
/**
@@ -98,7 +123,10 @@ typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level,
/**
* @brief **libbpf_set_print()** sets user-provided log callback function to
* be used for libbpf warnings and informational messages.
* be used for libbpf warnings and informational messages. If the user callback
* is not set, messages are logged to stderr by default. The verbosity of these
* messages can be controlled by setting the environment variable
* LIBBPF_LOG_LEVEL to either warn, info, or debug.
* @param fn The log print function. If NULL, libbpf won't print anything.
* @return Pointer to old print function.
*
@@ -149,7 +177,7 @@ struct bpf_object_open_opts {
* log_buf and log_level settings.
*
* If specified, this log buffer will be passed for:
* - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden
* - each BPF progral load (BPF_PROG_LOAD) attempt, unless overridden
* with bpf_program__set_log() on per-program level, to get
* BPF verifier log output.
* - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get
@@ -238,6 +266,19 @@ LIBBPF_API struct bpf_object *
bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
const struct bpf_object_open_opts *opts);
/**
* @brief **bpf_object__prepare()** prepares BPF object for loading:
* performs ELF processing, relocations, prepares final state of BPF program
* instructions (accessible with bpf_program__insns()), creates and
* (potentially) pins maps. Leaves BPF object in the state ready for program
* loading.
* @param obj Pointer to a valid BPF object instance returned by
* **bpf_object__open*()** API
* @return 0, on success; negative error code, otherwise, error code is
* stored in errno
*/
LIBBPF_API int bpf_object__prepare(struct bpf_object *obj);
/**
* @brief **bpf_object__load()** loads BPF object into kernel.
* @param obj Pointer to a valid BPF object instance returned by
@@ -291,6 +332,14 @@ LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj);
LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj);
LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version);
/**
* @brief **bpf_object__token_fd** is an accessor for BPF token FD associated
* with BPF object.
* @param obj Pointer to a valid BPF object
* @return BPF token FD or -1, if it wasn't set
*/
LIBBPF_API int bpf_object__token_fd(const struct bpf_object *obj);
struct btf;
LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj);
LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj);
@@ -399,7 +448,7 @@ LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path);
/**
* @brief **bpf_program__unpin()** unpins the BPF program from a file
* in the BPFFS specified by a path. This decrements the programs
* in the BPFFS specified by a path. This decrements program's in-kernel
* reference count.
*
* The file pinning the BPF program can also be unlinked by a different
@@ -432,14 +481,12 @@ LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path);
/**
* @brief **bpf_link__unpin()** unpins the BPF link from a file
* in the BPFFS specified by a path. This decrements the links
* reference count.
* in the BPFFS. This decrements link's in-kernel reference count.
*
* The file pinning the BPF link can also be unlinked by a different
* process in which case this function will return an error.
*
* @param prog BPF program to unpin
* @param path file path to the pin in a BPF file system
* @param link BPF link to unpin
* @return 0, on success; negative error code, otherwise
*/
LIBBPF_API int bpf_link__unpin(struct bpf_link *link);
@@ -452,7 +499,7 @@ LIBBPF_API int bpf_link__destroy(struct bpf_link *link);
/**
* @brief **bpf_program__attach()** is a generic function for attaching
* a BPF program based on auto-detection of program type, attach type,
* and extra paremeters, where applicable.
* and extra parameters, where applicable.
*
* @param prog BPF program to attach
* @return Reference to the newly created BPF link; or NULL is returned on error,
@@ -475,9 +522,11 @@ struct bpf_perf_event_opts {
__u64 bpf_cookie;
/* don't use BPF link when attach BPF program */
bool force_ioctl_attach;
/* don't automatically enable the event */
bool dont_enable;
size_t :0;
};
#define bpf_perf_event_opts__last_field force_ioctl_attach
#define bpf_perf_event_opts__last_field dont_enable
LIBBPF_API struct bpf_link *
bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd);
@@ -539,10 +588,14 @@ struct bpf_kprobe_multi_opts {
size_t cnt;
/* create return kprobes */
bool retprobe;
/* create session kprobes */
bool session;
/* enforce unique match */
bool unique_match;
size_t :0;
};
#define bpf_kprobe_multi_opts__last_field retprobe
#define bpf_kprobe_multi_opts__last_field unique_match
LIBBPF_API struct bpf_link *
bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
@@ -564,10 +617,12 @@ struct bpf_uprobe_multi_opts {
size_t cnt;
/* create return uprobes */
bool retprobe;
/* create session kprobes */
bool session;
size_t :0;
};
#define bpf_uprobe_multi_opts__last_field retprobe
#define bpf_uprobe_multi_opts__last_field session
/**
* @brief **bpf_program__attach_uprobe_multi()** attaches a BPF program
@@ -674,7 +729,7 @@ struct bpf_uprobe_opts {
/**
* @brief **bpf_program__attach_uprobe()** attaches a BPF program
* to the userspace function which is found by binary path and
* offset. You can optionally specify a particular proccess to attach
* offset. You can optionally specify a particular process to attach
* to. You can also optionally attach the program to the function
* exit instead of entry.
*
@@ -795,6 +850,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd);
LIBBPF_API struct bpf_link *
bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd);
LIBBPF_API struct bpf_link *
bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd);
LIBBPF_API struct bpf_link *
bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex);
LIBBPF_API struct bpf_link *
bpf_program__attach_freplace(const struct bpf_program *prog,
@@ -845,6 +902,21 @@ LIBBPF_API struct bpf_link *
bpf_program__attach_netkit(const struct bpf_program *prog, int ifindex,
const struct bpf_netkit_opts *opts);
struct bpf_cgroup_opts {
/* size of this struct, for forward/backward compatibility */
size_t sz;
__u32 flags;
__u32 relative_fd;
__u32 relative_id;
__u64 expected_revision;
size_t :0;
};
#define bpf_cgroup_opts__last_field expected_revision
LIBBPF_API struct bpf_link *
bpf_program__attach_cgroup_opts(const struct bpf_program *prog, int cgroup_fd,
const struct bpf_cgroup_opts *opts);
struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
@@ -908,6 +980,12 @@ LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_le
LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size);
LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size);
LIBBPF_API struct bpf_func_info *bpf_program__func_info(const struct bpf_program *prog);
LIBBPF_API __u32 bpf_program__func_info_cnt(const struct bpf_program *prog);
LIBBPF_API struct bpf_line_info *bpf_program__line_info(const struct bpf_program *prog);
LIBBPF_API __u32 bpf_program__line_info_cnt(const struct bpf_program *prog);
/**
* @brief **bpf_program__set_attach_target()** sets BTF-based attach target
* for supported BPF program types:
@@ -915,14 +993,35 @@ LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf,
* - fentry/fexit/fmod_ret;
* - lsm;
* - freplace.
* @param prog BPF program to set the attach type for
* @param type attach type to set the BPF map to have
* @param prog BPF program to configure; must be not yet loaded.
* @param attach_prog_fd FD of target BPF program (for freplace/extension).
* If >0 and func name omitted, defers BTF ID resolution.
* @param attach_func_name Target function name. Used either with
* attach_prog_fd to find destination BTF type ID in that BPF program, or
* alone (no attach_prog_fd) to resolve kernel (vmlinux/module) BTF ID.
* Must be provided if attach_prog_fd is 0.
* @return error code; or 0 if no error occurred.
*/
LIBBPF_API int
bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
const char *attach_func_name);
struct bpf_prog_assoc_struct_ops_opts; /* defined in bpf.h */
/**
* @brief **bpf_program__assoc_struct_ops()** associates a BPF program with a
* struct_ops map.
*
* @param prog BPF program
* @param map struct_ops map to be associated with the BPF program
* @param opts optional options, can be NULL
*
* @return 0, on success; negative error code, otherwise
*/
LIBBPF_API int
bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map,
struct bpf_prog_assoc_struct_ops_opts *opts);
/**
* @brief **bpf_object__find_map_by_name()** returns BPF map of
* the given name, if it exists within the passed BPF object
@@ -971,6 +1070,23 @@ bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map);
LIBBPF_API int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate);
LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map);
/**
* @brief **bpf_map__set_autoattach()** sets whether libbpf has to auto-attach
* map during BPF skeleton attach phase.
* @param map the BPF map instance
* @param autoattach whether to attach map during BPF skeleton attach phase
* @return 0 on success; negative error code, otherwise
*/
LIBBPF_API int bpf_map__set_autoattach(struct bpf_map *map, bool autoattach);
/**
* @brief **bpf_map__autoattach()** returns whether BPF map is configured to
* auto-attach during BPF skeleton attach phase.
* @param map the BPF map instance
* @return true if map is set to auto-attach during skeleton attach phase; false, otherwise
*/
LIBBPF_API bool bpf_map__autoattach(const struct bpf_map *map);
/**
* @brief **bpf_map__fd()** gets the file descriptor of the passed
* BPF map
@@ -1001,6 +1117,7 @@ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map);
/**
* @brief **bpf_map__set_value_size()** sets map value size.
* @param map the BPF map instance
* @param size the new value size
* @return 0, on success; negative error, otherwise
*
* There is a special case for maps with associated memory-mapped regions, like
@@ -1099,13 +1216,14 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map);
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @param value pointer to memory in which looked up value will be stored
* @param value_sz size in byte of value data memory; it has to match BPF map
* definition's **value_size**. For per-CPU BPF maps value size has to be
* a product of BPF map value size and number of possible CPUs in the system
* (could be fetched with **libbpf_num_possible_cpus()**). Note also that for
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* definition's **value_size**. For per-CPU BPF maps, value size can be
* `value_size` if either **BPF_F_CPU** or **BPF_F_ALL_CPUS** is specified
* in **flags**, otherwise a product of BPF map value size and number of
* possible CPUs in the system (could be fetched with
* **libbpf_num_possible_cpus()**). Note also that for per-CPU values value
* size has to be aligned up to closest 8 bytes, so expected size is:
* `round_up(value_size, 8) * libbpf_num_possible_cpus()`.
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__lookup_elem()** is high-level equivalent of
@@ -1122,14 +1240,8 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map,
* @param key pointer to memory containing bytes of the key
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @param value pointer to memory containing bytes of the value
* @param value_sz size in byte of value data memory; it has to match BPF map
* definition's **value_size**. For per-CPU BPF maps value size has to be
* a product of BPF map value size and number of possible CPUs in the system
* (could be fetched with **libbpf_num_possible_cpus()**). Note also that for
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param value_sz refer to **bpf_map__lookup_elem**'s description.'
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__update_elem()** is high-level equivalent of
@@ -1145,7 +1257,7 @@ LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map,
* @param map BPF map to delete element from
* @param key pointer to memory containing bytes of the key
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__delete_elem()** is high-level equivalent of
@@ -1168,7 +1280,7 @@ LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map,
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__lookup_and_delete_elem()** is high-level equivalent of
@@ -1194,6 +1306,28 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map,
*/
LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map,
const void *cur_key, void *next_key, size_t key_sz);
/**
* @brief **bpf_map__set_exclusive_program()** sets a map to be exclusive to the
* specified program. This must be called *before* the map is created.
*
* @param map BPF map to make exclusive.
* @param prog BPF program to be the exclusive user of the map. Must belong
* to the same bpf_object as the map.
* @return 0 on success; a negative error code otherwise.
*
* This function must be called after the BPF object is opened but before
* it is loaded. Once the object is loaded, only the specified program
* will be able to access the map's contents.
*/
LIBBPF_API int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog);
/**
* @brief **bpf_map__exclusive_program()** returns the exclusive program
* that is registered with the map (if any).
* @param map BPF map to which the exclusive program is registered.
* @return the registered exclusive program.
*/
LIBBPF_API struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map);
struct bpf_xdp_set_link_opts {
size_t sz;
@@ -1234,6 +1368,7 @@ enum bpf_tc_attach_point {
BPF_TC_INGRESS = 1 << 0,
BPF_TC_EGRESS = 1 << 1,
BPF_TC_CUSTOM = 1 << 2,
BPF_TC_QDISC = 1 << 3,
};
#define BPF_TC_PARENT(a, b) \
@@ -1248,9 +1383,11 @@ struct bpf_tc_hook {
int ifindex;
enum bpf_tc_attach_point attach_point;
__u32 parent;
__u32 handle;
const char *qdisc;
size_t :0;
};
#define bpf_tc_hook__last_field parent
#define bpf_tc_hook__last_field qdisc
struct bpf_tc_opts {
size_t sz;
@@ -1293,6 +1430,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd,
ring_buffer_sample_fn sample_cb, void *ctx);
LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms);
LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb);
LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n);
LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb);
/**
@@ -1367,6 +1505,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r);
*/
LIBBPF_API int ring__consume(struct ring *r);
/**
* @brief **ring__consume_n()** consumes up to a requested amount of items from
* a ringbuffer without event polling.
*
* @param r A ringbuffer object.
* @param n Maximum amount of items to consume.
* @return The number of items consumed, or a negative number if any of the
* callbacks return an error.
*/
LIBBPF_API int ring__consume_n(struct ring *r, size_t n);
struct user_ring_buffer_opts {
size_t sz; /* size of this struct, for forward/backward compatibility */
};
@@ -1503,6 +1652,7 @@ struct perf_buffer_opts {
* @param sample_cb function called on each received data record
* @param lost_cb function called when record loss has occurred
* @param ctx user-provided extra context passed into *sample_cb* and *lost_cb*
* @param opts optional parameters for the perf buffer, can be null
* @return a new instance of struct perf_buffer on success, NULL on error with
* *errno* containing an error code
*/
@@ -1557,11 +1707,11 @@ LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_i
* memory region of the ring buffer.
* This ring buffer can be used to implement a custom events consumer.
* The ring buffer starts with the *struct perf_event_mmap_page*, which
* holds the ring buffer managment fields, when accessing the header
* holds the ring buffer management fields, when accessing the header
* structure it's important to be SMP aware.
* You can refer to *perf_event_read_simple* for a simple example.
* @param pb the perf buffer structure
* @param buf_idx the buffer index to retreive
* @param buf_idx the buffer index to retrieve
* @param buf (out) gets the base pointer of the mmap()'ed memory
* @param buf_size (out) gets the size of the mmap()'ed region
* @return 0 on success, negative error code for failure
@@ -1653,6 +1803,7 @@ struct bpf_map_skeleton {
const char *name;
struct bpf_map **map;
void **mmaped;
struct bpf_link **link;
};
struct bpf_prog_skeleton {
@@ -1722,9 +1873,10 @@ struct gen_loader_opts {
const char *insns;
__u32 data_sz;
__u32 insns_sz;
bool gen_hash;
};
#define gen_loader_opts__last_field insns_sz
#define gen_loader_opts__last_field gen_hash
LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj,
struct gen_loader_opts *opts);
@@ -1749,9 +1901,14 @@ struct bpf_linker_file_opts {
struct bpf_linker;
LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts);
LIBBPF_API struct bpf_linker *bpf_linker__new_fd(int fd, struct bpf_linker_opts *opts);
LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker,
const char *filename,
const struct bpf_linker_file_opts *opts);
LIBBPF_API int bpf_linker__add_fd(struct bpf_linker *linker, int fd,
const struct bpf_linker_file_opts *opts);
LIBBPF_API int bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz,
const struct bpf_linker_file_opts *opts);
LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);

View File

@@ -416,3 +416,42 @@ LIBBPF_1.4.0 {
btf__new_split;
btf_ext__raw_data;
} LIBBPF_1.3.0;
LIBBPF_1.5.0 {
global:
btf__distill_base;
btf__relocate;
btf_ext__endianness;
btf_ext__set_endianness;
bpf_map__autoattach;
bpf_map__set_autoattach;
bpf_object__token_fd;
bpf_program__attach_sockmap;
ring__consume_n;
ring_buffer__consume_n;
} LIBBPF_1.4.0;
LIBBPF_1.6.0 {
global:
bpf_linker__add_buf;
bpf_linker__add_fd;
bpf_linker__new_fd;
bpf_object__prepare;
bpf_prog_stream_read;
bpf_program__attach_cgroup_opts;
bpf_program__func_info;
bpf_program__func_info_cnt;
bpf_program__line_info;
bpf_program__line_info_cnt;
btf__add_decl_attr;
btf__add_type_attr;
} LIBBPF_1.5.0;
LIBBPF_1.7.0 {
global:
bpf_map__set_exclusive_program;
bpf_map__exclusive_program;
bpf_prog_assoc_struct_ops;
bpf_program__assoc_struct_ops;
btf__permute;
} LIBBPF_1.6.0;

View File

@@ -1,75 +0,0 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
* Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
* Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
* Copyright (C) 2015 Huawei Inc.
* Copyright (C) 2017 Nicira, Inc.
*/
#undef _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include "libbpf.h"
#include "libbpf_internal.h"
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START)
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c)
#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START)
static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf",
[ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid",
[ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost",
[ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch",
[ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf",
[ERRCODE_OFFSET(RELOC)] = "Relocation failed",
[ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading",
[ERRCODE_OFFSET(PROG2BIG)] = "Program too big",
[ERRCODE_OFFSET(KVER)] = "Incorrect kernel version",
[ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type",
[ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message",
[ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence",
[ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing",
};
int libbpf_strerror(int err, char *buf, size_t size)
{
int ret;
if (!buf || !size)
return libbpf_err(-EINVAL);
err = err > 0 ? err : -err;
if (err < __LIBBPF_ERRNO__START) {
ret = strerror_r(err, buf, size);
buf[size - 1] = '\0';
return libbpf_err_errno(ret);
}
if (err < __LIBBPF_ERRNO__END) {
const char *msg;
msg = libbpf_strerror_table[ERRNO_OFFSET(err)];
ret = snprintf(buf, size, "%s", msg);
buf[size - 1] = '\0';
/* The length of the buf and msg is positive.
* A negative number may be returned only when the
* size exceeds INT_MAX. Not likely to appear.
*/
if (ret >= size)
return libbpf_err(-ERANGE);
return 0;
}
ret = snprintf(buf, size, "Unknown libbpf error %d", err);
buf[size - 1] = '\0';
if (ret >= size)
return libbpf_err(-ERANGE);
return libbpf_err(-ENOENT);
}

View File

@@ -10,6 +10,7 @@
#define __LIBBPF_LIBBPF_INTERNAL_H
#include <stdlib.h>
#include <byteswap.h>
#include <limits.h>
#include <errno.h>
#include <linux/err.h>
@@ -73,6 +74,8 @@
#define ELF64_ST_VISIBILITY(o) ((o) & 0x03)
#endif
#define JUMPTABLES_SEC ".jumptables"
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)
@@ -171,6 +174,16 @@ do { \
#define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__)
#define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__)
/**
* @brief **libbpf_errstr()** returns string corresponding to numeric errno
* @param err negative numeric errno
* @return pointer to string representation of the errno, that is invalidated
* upon the next call.
*/
const char *libbpf_errstr(int err);
#define errstr(err) libbpf_errstr(err)
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
@@ -234,6 +247,9 @@ struct btf_type;
struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id);
const char *btf_kind_str(const struct btf_type *t);
const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id);
const struct btf_header *btf_header(const struct btf *btf);
void btf_set_base_btf(struct btf *btf, const struct btf *base_btf);
int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map);
static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t)
{
@@ -405,6 +421,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
int btf_load_into_kernel(struct btf *btf,
char *log_buf, size_t log_sz, __u32 log_level,
int token_fd);
struct btf *btf_load_from_kernel(__u32 id, struct btf *base_btf, int token_fd);
struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf);
void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
@@ -445,11 +462,11 @@ struct btf_ext_info {
*
* The func_info subsection layout:
* record size for struct bpf_func_info in the func_info subsection
* struct btf_sec_func_info for section #1
* struct btf_ext_info_sec for section #1
* a list of bpf_func_info records for section #1
* where struct bpf_func_info mimics one in include/uapi/linux/bpf.h
* but may not be identical
* struct btf_sec_func_info for section #2
* struct btf_ext_info_sec for section #2
* a list of bpf_func_info records for section #2
* ......
*
@@ -481,6 +498,8 @@ struct btf_ext {
struct btf_ext_header *hdr;
void *data;
};
void *data_swapped;
bool swapped_endian;
struct btf_ext_info func_info;
struct btf_ext_info line_info;
struct btf_ext_info core_relo_info;
@@ -508,21 +527,64 @@ struct bpf_line_info_min {
__u32 line_col;
};
/* Functions to byte-swap info records */
typedef void (*info_rec_bswap_fn)(void *);
static inline void bpf_func_info_bswap(struct bpf_func_info *i)
{
i->insn_off = bswap_32(i->insn_off);
i->type_id = bswap_32(i->type_id);
}
static inline void bpf_line_info_bswap(struct bpf_line_info *i)
{
i->insn_off = bswap_32(i->insn_off);
i->file_name_off = bswap_32(i->file_name_off);
i->line_off = bswap_32(i->line_off);
i->line_col = bswap_32(i->line_col);
}
static inline void bpf_core_relo_bswap(struct bpf_core_relo *i)
{
i->insn_off = bswap_32(i->insn_off);
i->type_id = bswap_32(i->type_id);
i->access_str_off = bswap_32(i->access_str_off);
i->kind = bswap_32(i->kind);
}
enum btf_field_iter_kind {
BTF_FIELD_ITER_IDS,
BTF_FIELD_ITER_STRS,
};
struct btf_field_desc {
/* once-per-type offsets */
int t_off_cnt, t_offs[2];
/* member struct size, or zero, if no members */
int m_sz;
/* repeated per-member offsets */
int m_off_cnt, m_offs[1];
};
struct btf_field_iter {
struct btf_field_desc desc;
void *p;
int m_idx;
int off_idx;
int vlen;
};
int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, enum btf_field_iter_kind iter_kind);
__u32 *btf_field_iter_next(struct btf_field_iter *it);
typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx);
typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx);
int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx);
int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx);
int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx);
int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx);
__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name,
__u32 kind);
typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type,
const char *sym_name, void *ctx);
int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg);
/* handle direct returned errors */
static inline int libbpf_err(int ret)
{
@@ -568,6 +630,16 @@ static inline bool is_ldimm64_insn(struct bpf_insn *insn)
return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
}
static inline void bpf_insn_bswap(struct bpf_insn *insn)
{
__u8 tmp_reg = insn->dst_reg;
insn->dst_reg = insn->src_reg;
insn->src_reg = tmp_reg;
insn->off = bswap_16(insn->off);
insn->imm = bswap_32(insn->imm);
}
/* Unconditionally dup FD, ensuring it doesn't use [0, 2] range.
* Original FD is not closed or altered in any other way.
* Preserves original FD value, if it's invalid (negative).
@@ -602,13 +674,18 @@ static inline int ensure_good_fd(int fd)
return fd;
}
static inline int sys_dup2(int oldfd, int newfd)
static inline int sys_dup3(int oldfd, int newfd, int flags)
{
#ifdef __NR_dup2
return syscall(__NR_dup2, oldfd, newfd);
#else
return syscall(__NR_dup3, oldfd, newfd, 0);
#endif
return syscall(__NR_dup3, oldfd, newfd, flags);
}
/* Some versions of Android don't provide memfd_create() in their libc
* implementation, so avoid complications and just go straight to Linux
* syscall.
*/
static inline int sys_memfd_create(const char *name, unsigned flags)
{
return syscall(__NR_memfd_create, name, flags);
}
/* Point *fixed_fd* to the same file that *tmp_fd* points to.
@@ -619,7 +696,7 @@ static inline int reuse_fd(int fixed_fd, int tmp_fd)
{
int err;
err = sys_dup2(tmp_fd, fixed_fd);
err = sys_dup3(tmp_fd, fixed_fd, O_CLOEXEC);
err = err < 0 ? -errno : 0;
close(tmp_fd); /* clean up temporary FD */
return err;
@@ -647,6 +724,11 @@ static inline bool is_pow_of_2(size_t x)
return x && (x & (x - 1)) == 0;
}
static inline __u32 ror32(__u32 v, int bits)
{
return (v >> bits) | (v << (32 - bits));
}
#define PROG_LOAD_ATTEMPTS 5
int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts);
@@ -671,4 +753,8 @@ int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern,
int probe_fd(int fd);
#define SHA256_DIGEST_LENGTH 32
#define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64)
void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]);
#endif /* __LIBBPF_LIBBPF_INTERNAL_H */

View File

@@ -76,7 +76,7 @@ enum libbpf_strict_mode {
* first BPF program or map creation operation. This is done only if
* kernel is too old to support memcg-based memory accounting for BPF
* subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY,
* but it can be overriden with libbpf_set_memlock_rlim() API.
* but it can be overridden with libbpf_set_memlock_rlim() API.
* Note that libbpf_set_memlock_rlim() needs to be called before
* the very first bpf_prog_load(), bpf_map_create() or bpf_object__load()
* operation.
@@ -97,7 +97,7 @@ LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode);
* @brief **libbpf_get_error()** extracts the error code from the passed
* pointer
* @param ptr pointer returned from libbpf API function
* @return error code; or 0 if no error occured
* @return error code; or 0 if no error occurred
*
* Note, as of libbpf 1.0 this function is not necessary and not recommended
* to be used. Libbpf doesn't return error code embedded into the pointer

View File

@@ -364,6 +364,10 @@ static int probe_map_create(enum bpf_map_type map_type)
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
break;
case BPF_MAP_TYPE_INSN_ARRAY:
key_size = sizeof(__u32);
value_size = sizeof(struct bpf_insn_array_value);
break;
case BPF_MAP_TYPE_UNSPEC:
default:
return -EOPNOTSUPP;
@@ -448,7 +452,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe
/* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id)
* at all, it will emit something like "invalid func unknown#181".
* If BPF verifier recognizes BPF helper but it's not supported for
* given BPF program type, it will emit "unknown func bpf_sys_bpf#166".
* given BPF program type, it will emit "unknown func bpf_sys_bpf#166"
* or "program of this type cannot use helper bpf_sys_bpf#166".
* In both cases, provided combination of BPF program type and BPF
* helper is not supported by the kernel.
* In all other cases, probe_prog_load() above will either succeed (e.g.,
@@ -457,7 +462,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe
* that), or we'll get some more specific BPF verifier error about
* some unsatisfied conditions.
*/
if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ")))
if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") ||
strstr(buf, "program of this type cannot use helper ")))
return 0;
return 1; /* assume supported */
}

256
src/libbpf_utils.c Normal file
View File

@@ -0,0 +1,256 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
* Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
* Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
* Copyright (C) 2015 Huawei Inc.
* Copyright (C) 2017 Nicira, Inc.
*/
#undef _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/kernel.h>
#include "libbpf.h"
#include "libbpf_internal.h"
#ifndef ENOTSUPP
#define ENOTSUPP 524
#endif
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START)
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c)
#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START)
static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf",
[ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid",
[ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost",
[ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch",
[ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf",
[ERRCODE_OFFSET(RELOC)] = "Relocation failed",
[ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading",
[ERRCODE_OFFSET(PROG2BIG)] = "Program too big",
[ERRCODE_OFFSET(KVER)] = "Incorrect kernel version",
[ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type",
[ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message",
[ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence",
[ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing",
};
int libbpf_strerror(int err, char *buf, size_t size)
{
int ret;
if (!buf || !size)
return libbpf_err(-EINVAL);
err = err > 0 ? err : -err;
if (err < __LIBBPF_ERRNO__START) {
ret = strerror_r(err, buf, size);
buf[size - 1] = '\0';
return libbpf_err_errno(ret);
}
if (err < __LIBBPF_ERRNO__END) {
const char *msg;
msg = libbpf_strerror_table[ERRNO_OFFSET(err)];
ret = snprintf(buf, size, "%s", msg);
buf[size - 1] = '\0';
/* The length of the buf and msg is positive.
* A negative number may be returned only when the
* size exceeds INT_MAX. Not likely to appear.
*/
if (ret >= size)
return libbpf_err(-ERANGE);
return 0;
}
ret = snprintf(buf, size, "Unknown libbpf error %d", err);
buf[size - 1] = '\0';
if (ret >= size)
return libbpf_err(-ERANGE);
return libbpf_err(-ENOENT);
}
const char *libbpf_errstr(int err)
{
static __thread char buf[12];
if (err > 0)
err = -err;
switch (err) {
case -E2BIG: return "-E2BIG";
case -EACCES: return "-EACCES";
case -EADDRINUSE: return "-EADDRINUSE";
case -EADDRNOTAVAIL: return "-EADDRNOTAVAIL";
case -EAGAIN: return "-EAGAIN";
case -EALREADY: return "-EALREADY";
case -EBADF: return "-EBADF";
case -EBADFD: return "-EBADFD";
case -EBUSY: return "-EBUSY";
case -ECANCELED: return "-ECANCELED";
case -ECHILD: return "-ECHILD";
case -EDEADLK: return "-EDEADLK";
case -EDOM: return "-EDOM";
case -EEXIST: return "-EEXIST";
case -EFAULT: return "-EFAULT";
case -EFBIG: return "-EFBIG";
case -EILSEQ: return "-EILSEQ";
case -EINPROGRESS: return "-EINPROGRESS";
case -EINTR: return "-EINTR";
case -EINVAL: return "-EINVAL";
case -EIO: return "-EIO";
case -EISDIR: return "-EISDIR";
case -ELOOP: return "-ELOOP";
case -EMFILE: return "-EMFILE";
case -EMLINK: return "-EMLINK";
case -EMSGSIZE: return "-EMSGSIZE";
case -ENAMETOOLONG: return "-ENAMETOOLONG";
case -ENFILE: return "-ENFILE";
case -ENODATA: return "-ENODATA";
case -ENODEV: return "-ENODEV";
case -ENOENT: return "-ENOENT";
case -ENOEXEC: return "-ENOEXEC";
case -ENOLINK: return "-ENOLINK";
case -ENOMEM: return "-ENOMEM";
case -ENOSPC: return "-ENOSPC";
case -ENOTBLK: return "-ENOTBLK";
case -ENOTDIR: return "-ENOTDIR";
case -ENOTSUPP: return "-ENOTSUPP";
case -ENOTTY: return "-ENOTTY";
case -ENXIO: return "-ENXIO";
case -EOPNOTSUPP: return "-EOPNOTSUPP";
case -EOVERFLOW: return "-EOVERFLOW";
case -EPERM: return "-EPERM";
case -EPIPE: return "-EPIPE";
case -EPROTO: return "-EPROTO";
case -EPROTONOSUPPORT: return "-EPROTONOSUPPORT";
case -ERANGE: return "-ERANGE";
case -EROFS: return "-EROFS";
case -ESPIPE: return "-ESPIPE";
case -ESRCH: return "-ESRCH";
case -ETXTBSY: return "-ETXTBSY";
case -EUCLEAN: return "-EUCLEAN";
case -EXDEV: return "-EXDEV";
default:
snprintf(buf, sizeof(buf), "%d", err);
return buf;
}
}
static inline __u32 get_unaligned_be32(const void *p)
{
__be32 val;
memcpy(&val, p, sizeof(val));
return be32_to_cpu(val);
}
static inline void put_unaligned_be32(__u32 val, void *p)
{
__be32 be_val = cpu_to_be32(val);
memcpy(p, &be_val, sizeof(be_val));
}
#define SHA256_BLOCK_LENGTH 64
#define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define Sigma_0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
#define Sigma_1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
#define sigma_0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
#define sigma_1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
static const __u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \
{ \
__u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \
d += tmp; \
h = tmp + Sigma_0(a) + Maj(a, b, c); \
}
static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks)
{
while (nblocks--) {
__u32 a = state[0];
__u32 b = state[1];
__u32 c = state[2];
__u32 d = state[3];
__u32 e = state[4];
__u32 f = state[5];
__u32 g = state[6];
__u32 h = state[7];
__u32 w[64];
int i;
for (i = 0; i < 16; i++)
w[i] = get_unaligned_be32(&data[4 * i]);
for (; i < ARRAY_SIZE(w); i++)
w[i] = sigma_1(w[i - 2]) + w[i - 7] +
sigma_0(w[i - 15]) + w[i - 16];
for (i = 0; i < ARRAY_SIZE(w); i += 8) {
SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
data += SHA256_BLOCK_LENGTH;
}
}
void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH])
{
__u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
const __be64 bitcount = cpu_to_be64((__u64)len * 8);
__u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 };
size_t final_len = len % SHA256_BLOCK_LENGTH;
int i;
sha256_blocks(state, data, len / SHA256_BLOCK_LENGTH);
memcpy(final_data, data + len - final_len, final_len);
final_data[final_len] = 0x80;
final_len = roundup(final_len + 9, SHA256_BLOCK_LENGTH);
memcpy(&final_data[final_len - 8], &bitcount, 8);
sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH);
for (i = 0; i < ARRAY_SIZE(state); i++)
put_unaligned_be32(state[i], &out[4 * i]);
}

View File

@@ -4,6 +4,6 @@
#define __LIBBPF_VERSION_H
#define LIBBPF_MAJOR_VERSION 1
#define LIBBPF_MINOR_VERSION 4
#define LIBBPF_MINOR_VERSION 7
#endif /* __LIBBPF_VERSION_H */

View File

@@ -4,6 +4,10 @@
*
* Copyright (c) 2021 Facebook
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
@@ -16,6 +20,7 @@
#include <elf.h>
#include <libelf.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "libbpf.h"
#include "btf.h"
#include "libbpf_internal.h"
@@ -135,6 +140,7 @@ struct bpf_linker {
int fd;
Elf *elf;
Elf64_Ehdr *elf_hdr;
bool swapped_endian;
/* Output sections metadata */
struct dst_sec *secs;
@@ -150,15 +156,19 @@ struct bpf_linker {
/* global (including extern) ELF symbols */
int glob_sym_cnt;
struct glob_sym *glob_syms;
bool fd_is_owned;
};
#define pr_warn_elf(fmt, ...) \
libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1))
static int init_output_elf(struct bpf_linker *linker, const char *file);
static int init_output_elf(struct bpf_linker *linker);
static int linker_load_obj_file(struct bpf_linker *linker, const char *filename,
const struct bpf_linker_file_opts *opts,
static int bpf_linker_add_file(struct bpf_linker *linker, int fd,
const char *filename);
static int linker_load_obj_file(struct bpf_linker *linker,
struct src_obj *obj);
static int linker_sanity_check_elf(struct src_obj *obj);
static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec);
@@ -189,7 +199,7 @@ void bpf_linker__free(struct bpf_linker *linker)
if (linker->elf)
elf_end(linker->elf);
if (linker->fd >= 0)
if (linker->fd >= 0 && linker->fd_is_owned)
close(linker->fd);
strset__free(linker->strtab_strs);
@@ -231,9 +241,63 @@ struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts
if (!linker)
return errno = ENOMEM, NULL;
linker->fd = -1;
linker->filename = strdup(filename);
if (!linker->filename) {
err = -ENOMEM;
goto err_out;
}
err = init_output_elf(linker, filename);
linker->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
if (linker->fd < 0) {
err = -errno;
pr_warn("failed to create '%s': %d\n", filename, err);
goto err_out;
}
linker->fd_is_owned = true;
err = init_output_elf(linker);
if (err)
goto err_out;
return linker;
err_out:
bpf_linker__free(linker);
return errno = -err, NULL;
}
struct bpf_linker *bpf_linker__new_fd(int fd, struct bpf_linker_opts *opts)
{
struct bpf_linker *linker;
char filename[32];
int err;
if (fd < 0)
return errno = EINVAL, NULL;
if (!OPTS_VALID(opts, bpf_linker_opts))
return errno = EINVAL, NULL;
if (elf_version(EV_CURRENT) == EV_NONE) {
pr_warn_elf("libelf initialization failed");
return errno = EINVAL, NULL;
}
linker = calloc(1, sizeof(*linker));
if (!linker)
return errno = ENOMEM, NULL;
snprintf(filename, sizeof(filename), "fd:%d", fd);
linker->filename = strdup(filename);
if (!linker->filename) {
err = -ENOMEM;
goto err_out;
}
linker->fd = fd;
linker->fd_is_owned = false;
err = init_output_elf(linker);
if (err)
goto err_out;
@@ -292,23 +356,12 @@ static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx)
return sym;
}
static int init_output_elf(struct bpf_linker *linker, const char *file)
static int init_output_elf(struct bpf_linker *linker)
{
int err, str_off;
Elf64_Sym *init_sym;
struct dst_sec *sec;
linker->filename = strdup(file);
if (!linker->filename)
return -ENOMEM;
linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
if (linker->fd < 0) {
err = -errno;
pr_warn("failed to create '%s': %d\n", file, err);
return err;
}
linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL);
if (!linker->elf) {
pr_warn_elf("failed to create ELF object");
@@ -324,13 +377,8 @@ static int init_output_elf(struct bpf_linker *linker, const char *file)
linker->elf_hdr->e_machine = EM_BPF;
linker->elf_hdr->e_type = ET_REL;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB;
#else
#error "Unknown __BYTE_ORDER__"
#endif
/* Set unknown ELF endianness, assign later from input files */
linker->elf_hdr->e_ident[EI_DATA] = ELFDATANONE;
/* STRTAB */
/* initialize strset with an empty string to conform to ELF */
@@ -396,6 +444,8 @@ static int init_output_elf(struct bpf_linker *linker, const char *file)
pr_warn_elf("failed to create SYMTAB data");
return -EINVAL;
}
/* Ensure libelf translates byte-order of symbol records */
sec->data->d_type = ELF_T_SYM;
str_off = strset__add_str(linker->strtab_strs, sec->sec_name);
if (str_off < 0)
@@ -437,19 +487,16 @@ static int init_output_elf(struct bpf_linker *linker, const char *file)
return 0;
}
int bpf_linker__add_file(struct bpf_linker *linker, const char *filename,
const struct bpf_linker_file_opts *opts)
static int bpf_linker_add_file(struct bpf_linker *linker, int fd,
const char *filename)
{
struct src_obj obj = {};
int err = 0;
if (!OPTS_VALID(opts, bpf_linker_file_opts))
return libbpf_err(-EINVAL);
obj.filename = filename;
obj.fd = fd;
if (!linker->elf)
return libbpf_err(-EINVAL);
err = err ?: linker_load_obj_file(linker, filename, opts, &obj);
err = err ?: linker_load_obj_file(linker, &obj);
err = err ?: linker_append_sec_data(linker, &obj);
err = err ?: linker_append_elf_syms(linker, &obj);
err = err ?: linker_append_elf_relos(linker, &obj);
@@ -464,12 +511,91 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename,
free(obj.sym_map);
if (obj.elf)
elf_end(obj.elf);
if (obj.fd >= 0)
close(obj.fd);
return err;
}
int bpf_linker__add_file(struct bpf_linker *linker, const char *filename,
const struct bpf_linker_file_opts *opts)
{
int fd, err;
if (!OPTS_VALID(opts, bpf_linker_file_opts))
return libbpf_err(-EINVAL);
if (!linker->elf)
return libbpf_err(-EINVAL);
fd = open(filename, O_RDONLY | O_CLOEXEC);
if (fd < 0) {
err = -errno;
pr_warn("failed to open file '%s': %s\n", filename, errstr(err));
return libbpf_err(err);
}
err = bpf_linker_add_file(linker, fd, filename);
close(fd);
return libbpf_err(err);
}
int bpf_linker__add_fd(struct bpf_linker *linker, int fd,
const struct bpf_linker_file_opts *opts)
{
char filename[32];
int err;
if (!OPTS_VALID(opts, bpf_linker_file_opts))
return libbpf_err(-EINVAL);
if (!linker->elf)
return libbpf_err(-EINVAL);
if (fd < 0)
return libbpf_err(-EINVAL);
snprintf(filename, sizeof(filename), "fd:%d", fd);
err = bpf_linker_add_file(linker, fd, filename);
return libbpf_err(err);
}
int bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz,
const struct bpf_linker_file_opts *opts)
{
char filename[32];
int fd, written, ret;
if (!OPTS_VALID(opts, bpf_linker_file_opts))
return libbpf_err(-EINVAL);
if (!linker->elf)
return libbpf_err(-EINVAL);
snprintf(filename, sizeof(filename), "mem:%p+%zu", buf, buf_sz);
fd = sys_memfd_create(filename, 0);
if (fd < 0) {
ret = -errno;
pr_warn("failed to create memfd '%s': %s\n", filename, errstr(ret));
return libbpf_err(ret);
}
written = 0;
while (written < buf_sz) {
ret = write(fd, buf, buf_sz);
if (ret < 0) {
ret = -errno;
pr_warn("failed to write '%s': %s\n", filename, errstr(ret));
goto err_out;
}
written += ret;
}
ret = bpf_linker_add_file(linker, fd, filename);
err_out:
close(fd);
return libbpf_err(ret);
}
static bool is_dwarf_sec_name(const char *name)
{
/* approximation, but the actual list is too long */
@@ -535,65 +661,69 @@ static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name)
return sec;
}
static int linker_load_obj_file(struct bpf_linker *linker, const char *filename,
const struct bpf_linker_file_opts *opts,
static int linker_load_obj_file(struct bpf_linker *linker,
struct src_obj *obj)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
const int host_endianness = ELFDATA2LSB;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
const int host_endianness = ELFDATA2MSB;
#else
#error "Unknown __BYTE_ORDER__"
#endif
int err = 0;
Elf_Scn *scn;
Elf_Data *data;
Elf64_Ehdr *ehdr;
Elf64_Shdr *shdr;
struct src_sec *sec;
unsigned char obj_byteorder;
unsigned char link_byteorder = linker->elf_hdr->e_ident[EI_DATA];
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
const unsigned char host_byteorder = ELFDATA2LSB;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
const unsigned char host_byteorder = ELFDATA2MSB;
#else
#error "Unknown __BYTE_ORDER__"
#endif
pr_debug("linker: adding object file '%s'...\n", filename);
pr_debug("linker: adding object file '%s'...\n", obj->filename);
obj->filename = filename;
obj->fd = open(filename, O_RDONLY | O_CLOEXEC);
if (obj->fd < 0) {
err = -errno;
pr_warn("failed to open file '%s': %d\n", filename, err);
return err;
}
obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL);
if (!obj->elf) {
err = -errno;
pr_warn_elf("failed to parse ELF file '%s'", filename);
return err;
pr_warn_elf("failed to parse ELF file '%s'", obj->filename);
return -EINVAL;
}
/* Sanity check ELF file high-level properties */
ehdr = elf64_getehdr(obj->elf);
if (!ehdr) {
err = -errno;
pr_warn_elf("failed to get ELF header for %s", filename);
return err;
pr_warn_elf("failed to get ELF header for %s", obj->filename);
return -EINVAL;
}
if (ehdr->e_ident[EI_DATA] != host_endianness) {
/* Linker output endianness set by first input object */
obj_byteorder = ehdr->e_ident[EI_DATA];
if (obj_byteorder != ELFDATA2LSB && obj_byteorder != ELFDATA2MSB) {
err = -EOPNOTSUPP;
pr_warn_elf("unsupported byte order of ELF file %s", filename);
pr_warn("unknown byte order of ELF file %s\n", obj->filename);
return err;
}
if (link_byteorder == ELFDATANONE) {
linker->elf_hdr->e_ident[EI_DATA] = obj_byteorder;
linker->swapped_endian = obj_byteorder != host_byteorder;
pr_debug("linker: set %s-endian output byte order\n",
obj_byteorder == ELFDATA2MSB ? "big" : "little");
} else if (link_byteorder != obj_byteorder) {
err = -EOPNOTSUPP;
pr_warn("byte order mismatch with ELF file %s\n", obj->filename);
return err;
}
if (ehdr->e_type != ET_REL
|| ehdr->e_machine != EM_BPF
|| ehdr->e_ident[EI_CLASS] != ELFCLASS64) {
err = -EOPNOTSUPP;
pr_warn_elf("unsupported kind of ELF file %s", filename);
pr_warn_elf("unsupported kind of ELF file %s", obj->filename);
return err;
}
if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) {
err = -errno;
pr_warn_elf("failed to get SHSTRTAB section index for %s", filename);
return err;
pr_warn_elf("failed to get SHSTRTAB section index for %s", obj->filename);
return -EINVAL;
}
scn = NULL;
@@ -603,26 +733,23 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename,
shdr = elf64_getshdr(scn);
if (!shdr) {
err = -errno;
pr_warn_elf("failed to get section #%zu header for %s",
sec_idx, filename);
return err;
sec_idx, obj->filename);
return -EINVAL;
}
sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name);
if (!sec_name) {
err = -errno;
pr_warn_elf("failed to get section #%zu name for %s",
sec_idx, filename);
return err;
sec_idx, obj->filename);
return -EINVAL;
}
data = elf_getdata(scn, 0);
if (!data) {
err = -errno;
pr_warn_elf("failed to get section #%zu (%s) data from %s",
sec_idx, sec_name, filename);
return err;
sec_idx, sec_name, obj->filename);
return -EINVAL;
}
sec = add_src_sec(obj, sec_name);
@@ -656,7 +783,8 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename,
obj->btf = btf__new(data->d_buf, shdr->sh_size);
err = libbpf_get_error(obj->btf);
if (err) {
pr_warn("failed to parse .BTF from %s: %d\n", filename, err);
pr_warn("failed to parse .BTF from %s: %s\n",
obj->filename, errstr(err));
return err;
}
sec->skipped = true;
@@ -666,7 +794,8 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename,
obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size);
err = libbpf_get_error(obj->btf_ext);
if (err) {
pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err);
pr_warn("failed to parse .BTF.ext from '%s': %s\n",
obj->filename, errstr(err));
return err;
}
sec->skipped = true;
@@ -683,7 +812,7 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename,
break;
default:
pr_warn("unrecognized section #%zu (%s) in %s\n",
sec_idx, sec_name, filename);
sec_idx, sec_name, obj->filename);
err = -EINVAL;
return err;
}
@@ -957,19 +1086,33 @@ static int check_btf_str_off(__u32 *str_off, void *ctx)
static int linker_sanity_check_btf(struct src_obj *obj)
{
struct btf_type *t;
int i, n, err = 0;
int i, n, err;
if (!obj->btf)
return 0;
n = btf__type_cnt(obj->btf);
for (i = 1; i < n; i++) {
struct btf_field_iter it;
__u32 *type_id, *str_off;
t = btf_type_by_id(obj->btf, i);
err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf);
err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf);
err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS);
if (err)
return err;
while ((type_id = btf_field_iter_next(&it))) {
if (*type_id >= n)
return -EINVAL;
}
err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS);
if (err)
return err;
while ((str_off = btf_field_iter_next(&it))) {
if (!btf__str_by_offset(obj->btf, *str_off))
return -EINVAL;
}
}
return 0;
@@ -1095,6 +1238,24 @@ static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec
return true;
}
static bool is_exec_sec(struct dst_sec *sec)
{
if (!sec || sec->ephemeral)
return false;
return (sec->shdr->sh_type == SHT_PROGBITS) &&
(sec->shdr->sh_flags & SHF_EXECINSTR);
}
static void exec_sec_bswap(void *raw_data, int size)
{
const int insn_cnt = size / sizeof(struct bpf_insn);
struct bpf_insn *insn = raw_data;
int i;
for (i = 0; i < insn_cnt; i++, insn++)
bpf_insn_bswap(insn);
}
static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src)
{
void *tmp;
@@ -1154,6 +1315,10 @@ static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src
memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz);
/* now copy src data at a properly aligned offset */
memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size);
/* convert added bpf insns to native byte-order */
if (linker->swapped_endian && is_exec_sec(dst))
exec_sec_bswap(dst->raw_data + dst_align_sz, src->shdr->sh_size);
}
dst->sec_sz = dst_final_sz;
@@ -1210,7 +1375,7 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj
} else {
if (!secs_match(dst_sec, src_sec)) {
pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name);
return -1;
return -EINVAL;
}
/* "license" and "version" sections are deduped */
@@ -1399,7 +1564,7 @@ recur:
return true;
case BTF_KIND_PTR:
/* just validate overall shape of the referenced type, so no
* contents comparison for struct/union, and allowd fwd vs
* contents comparison for struct/union, and allowed fwd vs
* struct/union
*/
exact = false;
@@ -1860,6 +2025,9 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj,
obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx;
return 0;
}
if (strcmp(src_sec->sec_name, JUMPTABLES_SEC) == 0)
goto add_sym;
}
if (sym_bind == STB_LOCAL)
@@ -1948,7 +2116,7 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj,
/* If existing symbol is a strong resolved symbol, bail out,
* because we lost resolution battle have nothing to
* contribute. We already checked abover that there is no
* contribute. We already checked above that there is no
* strong-strong conflict. We also already tightened binding
* and visibility, so nothing else to contribute at that point.
*/
@@ -1997,7 +2165,7 @@ add_sym:
obj->sym_map[src_sym_idx] = dst_sym_idx;
if (sym_type == STT_SECTION && dst_sym) {
if (sym_type == STT_SECTION && dst_sec) {
dst_sec->sec_sym_idx = dst_sym_idx;
dst_sym->st_value = 0;
}
@@ -2057,7 +2225,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob
}
} else if (!secs_match(dst_sec, src_sec)) {
pr_warn("sections %s are not compatible\n", src_sec->sec_name);
return -1;
return -EINVAL;
}
/* shdr->sh_link points to SYMTAB */
@@ -2213,10 +2381,17 @@ static int linker_fixup_btf(struct src_obj *obj)
vi = btf_var_secinfos(t);
for (j = 0, m = btf_vlen(t); j < m; j++, vi++) {
const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type);
const char *var_name = btf__str_by_offset(obj->btf, vt->name_off);
int var_linkage = btf_var(vt)->linkage;
const char *var_name;
int var_linkage;
Elf64_Sym *sym;
/* could be a variable or function */
if (!btf_is_var(vt))
continue;
var_name = btf__str_by_offset(obj->btf, vt->name_off);
var_linkage = btf_var(vt)->linkage;
/* no need to patch up static or extern vars */
if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED)
continue;
@@ -2234,26 +2409,10 @@ static int linker_fixup_btf(struct src_obj *obj)
return 0;
}
static int remap_type_id(__u32 *type_id, void *ctx)
{
int *id_map = ctx;
int new_id = id_map[*type_id];
/* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */
if (new_id == 0 && *type_id != 0) {
pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id);
return -EINVAL;
}
*type_id = id_map[*type_id];
return 0;
}
static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj)
{
const struct btf_type *t;
int i, j, n, start_id, id;
int i, j, n, start_id, id, err;
const char *name;
if (!obj->btf)
@@ -2324,9 +2483,25 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj)
n = btf__type_cnt(linker->btf);
for (i = start_id; i < n; i++) {
struct btf_type *dst_t = btf_type_by_id(linker->btf, i);
struct btf_field_iter it;
__u32 *type_id;
if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map))
return -EINVAL;
err = btf_field_iter_init(&it, dst_t, BTF_FIELD_ITER_IDS);
if (err)
return err;
while ((type_id = btf_field_iter_next(&it))) {
int new_id = obj->btf_type_map[*type_id];
/* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */
if (new_id == 0 && *type_id != 0) {
pr_warn("failed to find new ID mapping for original BTF type ID %u\n",
*type_id);
return -EINVAL;
}
*type_id = obj->btf_type_map[*type_id];
}
}
/* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's
@@ -2394,6 +2569,10 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj)
if (glob_sym && glob_sym->var_idx >= 0) {
__s64 sz;
/* FUNCs don't have size, nothing to update */
if (btf_is_func(t))
continue;
dst_var = &dst_sec->sec_vars[glob_sym->var_idx];
/* Because underlying BTF type might have
* changed, so might its size have changed, so
@@ -2607,27 +2786,32 @@ int bpf_linker__finalize(struct bpf_linker *linker)
if (!sec->scn)
continue;
/* restore sections with bpf insns to target byte-order */
if (linker->swapped_endian && is_exec_sec(sec))
exec_sec_bswap(sec->raw_data, sec->sec_sz);
sec->data->d_buf = sec->raw_data;
}
/* Finalize ELF layout */
if (elf_update(linker->elf, ELF_C_NULL) < 0) {
err = -errno;
err = -EINVAL;
pr_warn_elf("failed to finalize ELF layout");
return libbpf_err(err);
}
/* Write out final ELF contents */
if (elf_update(linker->elf, ELF_C_WRITE) < 0) {
err = -errno;
err = -EINVAL;
pr_warn_elf("failed to write ELF contents");
return libbpf_err(err);
}
elf_end(linker->elf);
close(linker->fd);
linker->elf = NULL;
if (linker->fd_is_owned)
close(linker->fd);
linker->fd = -1;
return 0;
@@ -2675,6 +2859,7 @@ static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name,
static int finalize_btf(struct bpf_linker *linker)
{
enum btf_endianness link_endianness;
LIBBPF_OPTS(btf_dedup_opts, opts);
struct btf *btf = linker->btf;
const void *raw_data;
@@ -2708,17 +2893,24 @@ static int finalize_btf(struct bpf_linker *linker)
err = finalize_btf_ext(linker);
if (err) {
pr_warn(".BTF.ext generation failed: %d\n", err);
pr_warn(".BTF.ext generation failed: %s\n", errstr(err));
return err;
}
opts.btf_ext = linker->btf_ext;
err = btf__dedup(linker->btf, &opts);
if (err) {
pr_warn("BTF dedup failed: %d\n", err);
pr_warn("BTF dedup failed: %s\n", errstr(err));
return err;
}
/* Set .BTF and .BTF.ext output byte order */
link_endianness = linker->elf_hdr->e_ident[EI_DATA] == ELFDATA2MSB ?
BTF_BIG_ENDIAN : BTF_LITTLE_ENDIAN;
btf__set_endianness(linker->btf, link_endianness);
if (linker->btf_ext)
btf_ext__set_endianness(linker->btf_ext, link_endianness);
/* Emit .BTF section */
raw_data = btf__raw_data(linker->btf, &raw_sz);
if (!raw_data)
@@ -2726,7 +2918,7 @@ static int finalize_btf(struct bpf_linker *linker)
err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz);
if (err) {
pr_warn("failed to write out .BTF ELF section: %d\n", err);
pr_warn("failed to write out .BTF ELF section: %s\n", errstr(err));
return err;
}
@@ -2738,7 +2930,7 @@ static int finalize_btf(struct bpf_linker *linker)
err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz);
if (err) {
pr_warn("failed to write out .BTF.ext ELF section: %d\n", err);
pr_warn("failed to write out .BTF.ext ELF section: %s\n", errstr(err));
return err;
}
}
@@ -2914,7 +3106,7 @@ static int finalize_btf_ext(struct bpf_linker *linker)
err = libbpf_get_error(linker->btf_ext);
if (err) {
linker->btf_ext = NULL;
pr_warn("failed to parse final .BTF.ext data: %d\n", err);
pr_warn("failed to parse final .BTF.ext data: %s\n", errstr(err));
goto out;
}

View File

@@ -529,9 +529,9 @@ int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id)
}
typedef int (*qdisc_config_t)(struct libbpf_nla_req *req);
typedef int (*qdisc_config_t)(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook);
static int clsact_config(struct libbpf_nla_req *req)
static int clsact_config(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook)
{
req->tc.tcm_parent = TC_H_CLSACT;
req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
@@ -539,6 +539,16 @@ static int clsact_config(struct libbpf_nla_req *req)
return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact"));
}
static int qdisc_config(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook)
{
const char *qdisc = OPTS_GET(hook, qdisc, NULL);
req->tc.tcm_parent = OPTS_GET(hook, parent, TC_H_ROOT);
req->tc.tcm_handle = OPTS_GET(hook, handle, 0);
return nlattr_add(req, TCA_KIND, qdisc, strlen(qdisc) + 1);
}
static int attach_point_to_config(struct bpf_tc_hook *hook,
qdisc_config_t *config)
{
@@ -552,6 +562,9 @@ static int attach_point_to_config(struct bpf_tc_hook *hook,
return 0;
case BPF_TC_CUSTOM:
return -EOPNOTSUPP;
case BPF_TC_QDISC:
*config = &qdisc_config;
return 0;
default:
return -EINVAL;
}
@@ -596,7 +609,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags)
req.tc.tcm_family = AF_UNSPEC;
req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0);
ret = config(&req);
ret = config(&req, hook);
if (ret < 0)
return ret;
@@ -639,6 +652,7 @@ int bpf_tc_hook_destroy(struct bpf_tc_hook *hook)
case BPF_TC_INGRESS:
case BPF_TC_EGRESS:
return libbpf_err(__bpf_tc_detach(hook, NULL, true));
case BPF_TC_QDISC:
case BPF_TC_INGRESS | BPF_TC_EGRESS:
return libbpf_err(tc_qdisc_delete(hook));
case BPF_TC_CUSTOM:

View File

@@ -63,16 +63,16 @@ static int validate_nla(struct nlattr *nla, int maxtype,
minlen = nla_attr_minlen[pt->type];
if (libbpf_nla_len(nla) < minlen)
return -1;
return -EINVAL;
if (pt->maxlen && libbpf_nla_len(nla) > pt->maxlen)
return -1;
return -EINVAL;
if (pt->type == LIBBPF_NLA_STRING) {
char *data = libbpf_nla_data(nla);
if (data[libbpf_nla_len(nla) - 1] != '\0')
return -1;
return -EINVAL;
}
return 0;
@@ -118,19 +118,18 @@ int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head,
if (policy) {
err = validate_nla(nla, maxtype, policy);
if (err < 0)
goto errout;
return err;
}
if (tb[type])
if (tb[type]) {
pr_warn("Attribute of type %#x found multiple times in message, "
"previous attribute is being ignored.\n", type);
}
tb[type] = nla;
}
err = 0;
errout:
return err;
return 0;
}
/**

View File

@@ -64,7 +64,6 @@ enum libbpf_print_level {
#include "libbpf.h"
#include "bpf.h"
#include "btf.h"
#include "str_error.h"
#include "libbpf_internal.h"
#endif
@@ -683,7 +682,7 @@ static int bpf_core_calc_field_relo(const char *prog_name,
{
const struct bpf_core_accessor *acc;
const struct btf_type *t;
__u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id;
__u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id, elem_id;
const struct btf_member *m;
const struct btf_type *mt;
bool bitfield;
@@ -706,8 +705,14 @@ static int bpf_core_calc_field_relo(const char *prog_name,
if (!acc->name) {
if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) {
*val = spec->bit_offset / 8;
/* remember field size for load/store mem size */
sz = btf__resolve_size(spec->btf, acc->type_id);
/* remember field size for load/store mem size;
* note, for arrays we care about individual element
* sizes, not the overall array size
*/
t = skip_mods_and_typedefs(spec->btf, acc->type_id, &elem_id);
while (btf_is_array(t))
t = skip_mods_and_typedefs(spec->btf, btf_array(t)->type, &elem_id);
sz = btf__resolve_size(spec->btf, elem_id);
if (sz < 0)
return -EINVAL;
*field_sz = sz;
@@ -767,7 +772,17 @@ static int bpf_core_calc_field_relo(const char *prog_name,
case BPF_CORE_FIELD_BYTE_OFFSET:
*val = byte_off;
if (!bitfield) {
*field_sz = byte_sz;
/* remember field size for load/store mem size;
* note, for arrays we care about individual element
* sizes, not the overall array size
*/
t = skip_mods_and_typedefs(spec->btf, field_type_id, &elem_id);
while (btf_is_array(t))
t = skip_mods_and_typedefs(spec->btf, btf_array(t)->type, &elem_id);
sz = btf__resolve_size(spec->btf, elem_id);
if (sz < 0)
return -EINVAL;
*field_sz = sz;
*type_id = field_type_id;
}
break;
@@ -1339,7 +1354,7 @@ int bpf_core_calc_relo_insn(const char *prog_name,
cands->cands[i].id, cand_spec);
if (err < 0) {
bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec);
pr_warn("prog '%s': relo #%d: error matching candidate #%d %s: %d\n ",
pr_warn("prog '%s': relo #%d: error matching candidate #%d %s: %d\n",
prog_name, relo_idx, i, spec_buf, err);
return err;
}

View File

@@ -88,8 +88,8 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
err = bpf_map_get_info_by_fd(map_fd, &info, &len);
if (err) {
err = -errno;
pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
map_fd, err);
pr_warn("ringbuf: failed to get map info for fd=%d: %s\n",
map_fd, errstr(err));
return libbpf_err(err);
}
@@ -123,8 +123,8 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
if (tmp == MAP_FAILED) {
err = -errno;
pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
map_fd, err);
pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %s\n",
map_fd, errstr(err));
goto err_out;
}
r->consumer_pos = tmp;
@@ -142,8 +142,8 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size);
if (tmp == MAP_FAILED) {
err = -errno;
pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n",
map_fd, err);
pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %s\n",
map_fd, errstr(err));
goto err_out;
}
r->producer_pos = tmp;
@@ -156,8 +156,8 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
e->data.fd = rb->ring_cnt;
if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) {
err = -errno;
pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n",
map_fd, err);
pr_warn("ringbuf: failed to epoll add map fd=%d: %s\n",
map_fd, errstr(err));
goto err_out;
}
@@ -205,7 +205,7 @@ ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
if (rb->epoll_fd < 0) {
err = -errno;
pr_warn("ringbuf: failed to create epoll instance: %d\n", err);
pr_warn("ringbuf: failed to create epoll instance: %s\n", errstr(err));
goto err_out;
}
@@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len)
return (len + 7) / 8 * 8;
}
static int64_t ringbuf_process_ring(struct ring *r)
static int64_t ringbuf_process_ring(struct ring *r, size_t n)
{
int *len_ptr, len, err;
/* 64-bit to avoid overflow in case of extreme application behavior */
@@ -268,12 +268,42 @@ static int64_t ringbuf_process_ring(struct ring *r)
}
smp_store_release(r->consumer_pos, cons_pos);
if (cnt >= n)
goto done;
}
} while (got_new_data);
done:
return cnt;
}
/* Consume available ring buffer(s) data without event polling, up to n
* records.
*
* Returns number of records consumed across all registered ring buffers (or
* n, whichever is less), or negative number if any of the callbacks return
* error.
*/
int ring_buffer__consume_n(struct ring_buffer *rb, size_t n)
{
int64_t err, res = 0;
int i;
for (i = 0; i < rb->ring_cnt; i++) {
struct ring *ring = rb->rings[i];
err = ringbuf_process_ring(ring, n);
if (err < 0)
return libbpf_err(err);
res += err;
n -= err;
if (n == 0)
break;
}
return res > INT_MAX ? INT_MAX : res;
}
/* Consume available ring buffer(s) data without event polling.
* Returns number of records consumed across all registered ring buffers (or
* INT_MAX, whichever is less), or negative number if any of the callbacks
@@ -287,13 +317,15 @@ int ring_buffer__consume(struct ring_buffer *rb)
for (i = 0; i < rb->ring_cnt; i++) {
struct ring *ring = rb->rings[i];
err = ringbuf_process_ring(ring);
err = ringbuf_process_ring(ring, INT_MAX);
if (err < 0)
return libbpf_err(err);
res += err;
if (res > INT_MAX) {
res = INT_MAX;
break;
}
}
if (res > INT_MAX)
return INT_MAX;
return res;
}
@@ -314,13 +346,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
__u32 ring_id = rb->events[i].data.fd;
struct ring *ring = rb->rings[ring_id];
err = ringbuf_process_ring(ring);
err = ringbuf_process_ring(ring, INT_MAX);
if (err < 0)
return libbpf_err(err);
res += err;
}
if (res > INT_MAX)
return INT_MAX;
res = INT_MAX;
return res;
}
@@ -371,17 +403,22 @@ int ring__map_fd(const struct ring *r)
return r->map_fd;
}
int ring__consume(struct ring *r)
int ring__consume_n(struct ring *r, size_t n)
{
int64_t res;
res = ringbuf_process_ring(r);
res = ringbuf_process_ring(r, n);
if (res < 0)
return libbpf_err(res);
return res > INT_MAX ? INT_MAX : res;
}
int ring__consume(struct ring *r)
{
return ring__consume_n(r, INT_MAX);
}
static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb)
{
if (rb->consumer_pos) {
@@ -421,7 +458,8 @@ static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
err = bpf_map_get_info_by_fd(map_fd, &info, &len);
if (err) {
err = -errno;
pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err);
pr_warn("user ringbuf: failed to get map info for fd=%d: %s\n",
map_fd, errstr(err));
return err;
}
@@ -437,8 +475,8 @@ static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0);
if (tmp == MAP_FAILED) {
err = -errno;
pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
map_fd, err);
pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %s\n",
map_fd, errstr(err));
return err;
}
rb->consumer_pos = tmp;
@@ -457,8 +495,8 @@ static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
map_fd, rb->page_size);
if (tmp == MAP_FAILED) {
err = -errno;
pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n",
map_fd, err);
pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %s\n",
map_fd, errstr(err));
return err;
}
@@ -469,7 +507,7 @@ static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
rb_epoll->events = EPOLLOUT;
if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) {
err = -errno;
pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err);
pr_warn("user ringbuf: failed to epoll add map fd=%d: %s\n", map_fd, errstr(err));
return err;
}
@@ -494,7 +532,7 @@ user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts)
rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
if (rb->epoll_fd < 0) {
err = -errno;
pr_warn("user ringbuf: failed to create epoll instance: %d\n", err);
pr_warn("user ringbuf: failed to create epoll instance: %s\n", errstr(err));
goto err_out;
}

View File

@@ -13,10 +13,15 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <linux/keyctl.h>
#include <stdlib.h>
#include "bpf.h"
#endif
#ifndef SHA256_DIGEST_LENGTH
#define SHA256_DIGEST_LENGTH 32
#endif
#ifndef __NR_bpf
# if defined(__mips__) && defined(_ABIO32)
# define __NR_bpf 4355
@@ -64,6 +69,11 @@ struct bpf_load_and_run_opts {
__u32 data_sz;
__u32 insns_sz;
const char *errstr;
void *signature;
__u32 signature_sz;
__s32 keyring_id;
void *excl_prog_hash;
__u32 excl_prog_hash_sz;
};
long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size);
@@ -107,7 +117,7 @@ static inline void skel_free(const void *p)
* The loader program will perform probe_read_kernel() from maps.rodata.initial_value.
* skel_finalize_map_data() sets skel->rodata to point to actual value in a bpf map and
* does maps.rodata.initial_value = ~0ULL to signal skel_free_map_data() that kvfree
* is not nessary.
* is not necessary.
*
* For user space:
* skel_prep_map_data() mmaps anon memory into skel->rodata that can be accessed directly.
@@ -220,14 +230,19 @@ static inline int skel_map_create(enum bpf_map_type map_type,
const char *map_name,
__u32 key_size,
__u32 value_size,
__u32 max_entries)
__u32 max_entries,
const void *excl_prog_hash,
__u32 excl_prog_hash_sz)
{
const size_t attr_sz = offsetofend(union bpf_attr, map_extra);
const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
union bpf_attr attr;
memset(&attr, 0, attr_sz);
attr.map_type = map_type;
attr.excl_prog_hash = (unsigned long) excl_prog_hash;
attr.excl_prog_hash_size = excl_prog_hash_sz;
strncpy(attr.map_name, map_name, sizeof(attr.map_name));
attr.key_size = key_size;
attr.value_size = value_size;
@@ -300,6 +315,35 @@ static inline int skel_link_create(int prog_fd, int target_fd,
return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz);
}
static inline int skel_obj_get_info_by_fd(int fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, info);
__u8 sha[SHA256_DIGEST_LENGTH];
struct bpf_map_info info;
__u32 info_len = sizeof(info);
union bpf_attr attr;
memset(&info, 0, sizeof(info));
info.hash = (long) &sha;
info.hash_size = SHA256_DIGEST_LENGTH;
memset(&attr, 0, attr_sz);
attr.info.bpf_fd = fd;
attr.info.info = (long) &info;
attr.info.info_len = info_len;
return skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz);
}
static inline int skel_map_freeze(int fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, map_fd);
union bpf_attr attr;
memset(&attr, 0, attr_sz);
attr.map_fd = fd;
return skel_sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz);
}
#ifdef __KERNEL__
#define set_err
#else
@@ -308,12 +352,13 @@ static inline int skel_link_create(int prog_fd, int target_fd,
static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
{
const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array);
const size_t prog_load_attr_sz = offsetofend(union bpf_attr, keyring_id);
const size_t test_run_attr_sz = offsetofend(union bpf_attr, test);
int map_fd = -1, prog_fd = -1, key = 0, err;
union bpf_attr attr;
err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1);
err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1,
opts->excl_prog_hash, opts->excl_prog_hash_sz);
if (map_fd < 0) {
opts->errstr = "failed to create loader map";
set_err;
@@ -327,11 +372,34 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
goto out;
}
#ifndef __KERNEL__
err = skel_map_freeze(map_fd);
if (err < 0) {
opts->errstr = "failed to freeze map";
set_err;
goto out;
}
err = skel_obj_get_info_by_fd(map_fd);
if (err < 0) {
opts->errstr = "failed to fetch obj info";
set_err;
goto out;
}
#endif
memset(&attr, 0, prog_load_attr_sz);
attr.prog_type = BPF_PROG_TYPE_SYSCALL;
attr.insns = (long) opts->insns;
attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn);
attr.license = (long) "Dual BSD/GPL";
#ifndef __KERNEL__
attr.signature = (long) opts->signature;
attr.signature_size = opts->signature_sz;
#else
if (opts->signature || opts->signature_sz)
pr_warn("signatures are not supported from bpf_preload\n");
#endif
attr.keyring_id = opts->keyring_id;
memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog"));
attr.fd_array = (long) &map_fd;
attr.log_level = opts->ctx->log_level;
@@ -351,10 +419,11 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
attr.test.ctx_size_in = opts->ctx->sz;
err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz);
if (err < 0 || (int)attr.test.retval < 0) {
opts->errstr = "failed to execute loader prog";
if (err < 0) {
opts->errstr = "failed to execute loader prog";
set_err;
} else {
opts->errstr = "error returned by loader prog";
err = (int)attr.test.retval;
#ifndef __KERNEL__
errno = -err;

View File

@@ -1,21 +0,0 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
#undef _GNU_SOURCE
#include <string.h>
#include <stdio.h>
#include "str_error.h"
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
/*
* Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl
* libc, while checking strerror_r() return to avoid having to check this in
* all places calling it.
*/
char *libbpf_strerror_r(int err, char *dst, int len)
{
int ret = strerror_r(err < 0 ? -err : err, dst, len);
if (ret)
snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret);
return dst;
}

View File

@@ -1,9 +0,0 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __LIBBPF_STR_ERROR_H
#define __LIBBPF_STR_ERROR_H
#define STRERR_BUFSIZE 128
char *libbpf_strerror_r(int err, char *dst, int len);
#endif /* __LIBBPF_STR_ERROR_H */

View File

@@ -34,13 +34,32 @@ enum __bpf_usdt_arg_type {
BPF_USDT_ARG_CONST,
BPF_USDT_ARG_REG,
BPF_USDT_ARG_REG_DEREF,
BPF_USDT_ARG_SIB,
};
/*
* This struct layout is designed specifically to be backwards/forward
* compatible between libbpf versions for ARG_CONST, ARG_REG, and
* ARG_REG_DEREF modes. ARG_SIB requires libbpf v1.7+.
*/
struct __bpf_usdt_arg_spec {
/* u64 scalar interpreted depending on arg_type, see below */
__u64 val_off;
/* arg location case, see bpf_udst_arg() for details */
enum __bpf_usdt_arg_type arg_type;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/* arg location case, see bpf_usdt_arg() for details */
enum __bpf_usdt_arg_type arg_type: 8;
/* index register offset within struct pt_regs */
__u16 idx_reg_off: 12;
/* scale factor for index register (1, 2, 4, or 8) */
__u16 scale_bitshift: 4;
/* reserved for future use, keeps reg_off offset stable */
__u8 __reserved: 8;
#else
__u8 __reserved: 8;
__u16 idx_reg_off: 12;
__u16 scale_bitshift: 4;
enum __bpf_usdt_arg_type arg_type: 8;
#endif
/* offset of referenced register within struct pt_regs */
short reg_off;
/* whether arg should be interpreted as signed value */
@@ -108,6 +127,38 @@ int bpf_usdt_arg_cnt(struct pt_regs *ctx)
return spec->arg_cnt;
}
/* Returns the size in bytes of the #*arg_num* (zero-indexed) USDT argument.
* Returns negative error if argument is not found or arg_num is invalid.
*/
static __always_inline
int bpf_usdt_arg_size(struct pt_regs *ctx, __u64 arg_num)
{
struct __bpf_usdt_arg_spec *arg_spec;
struct __bpf_usdt_spec *spec;
int spec_id;
spec_id = __bpf_usdt_spec_id(ctx);
if (spec_id < 0)
return -ESRCH;
spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id);
if (!spec)
return -ESRCH;
if (arg_num >= BPF_USDT_MAX_ARG_CNT)
return -ENOENT;
barrier_var(arg_num);
if (arg_num >= spec->arg_cnt)
return -ENOENT;
arg_spec = &spec->args[arg_num];
/* arg_spec->arg_bitshift = 64 - arg_sz * 8
* so: arg_sz = (64 - arg_spec->arg_bitshift) / 8
*/
return (unsigned int)(64 - arg_spec->arg_bitshift) / 8;
}
/* Fetch USDT argument #*arg_num* (zero-indexed) and put its value into *res.
* Returns 0 on success; negative error, otherwise.
* On error *res is guaranteed to be set to zero.
@@ -117,7 +168,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
{
struct __bpf_usdt_spec *spec;
struct __bpf_usdt_arg_spec *arg_spec;
unsigned long val;
unsigned long val, idx;
int err, spec_id;
*res = 0;
@@ -170,6 +221,27 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
return err;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
val >>= arg_spec->arg_bitshift;
#endif
break;
case BPF_USDT_ARG_SIB:
/* Arg is in memory addressed by SIB (Scale-Index-Base) mode
* (e.g., "-1@-96(%rbp,%rax,8)" in USDT arg spec). We first
* fetch the base register contents and the index register
* contents from pt_regs. Then we calculate the final address
* as base + (index * scale) + offset, and do a user-space
* probe read to fetch the argument value.
*/
err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
if (err)
return err;
err = bpf_probe_read_kernel(&idx, sizeof(idx), (void *)ctx + arg_spec->idx_reg_off);
if (err)
return err;
err = bpf_probe_read_user(&val, sizeof(val), (void *)(val + (idx << arg_spec->scale_bitshift) + arg_spec->val_off));
if (err)
return err;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
val >>= arg_spec->arg_bitshift;
#endif
break;
default:
@@ -214,18 +286,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx)
/* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */
#define ___bpf_usdt_args0() ctx
#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; })
#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; })
#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; })
#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; })
#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; })
#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; })
#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; })
#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; })
#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; })
#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; })
#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; })
#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; })
#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; })
#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; })
#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; })
#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; })
#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; })
#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; })
#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; })
#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; })
#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; })
#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; })
#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; })
#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; })
#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args)
/*

View File

@@ -58,7 +58,7 @@
*
* STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y);
*
* USDT is identified by it's <provider-name>:<probe-name> pair of names. Each
* USDT is identified by its <provider-name>:<probe-name> pair of names. Each
* individual USDT has a fixed number of arguments (3 in the above example)
* and specifies values of each argument as if it was a function call.
*
@@ -80,7 +80,7 @@
* NOP instruction that kernel can replace with an interrupt instruction to
* trigger instrumentation code (BPF program for all that we care about).
*
* Semaphore above is and optional feature. It records an address of a 2-byte
* Semaphore above is an optional feature. It records an address of a 2-byte
* refcount variable (normally in '.probes' ELF section) used for signaling if
* there is anything that is attached to USDT. This is useful for user
* applications if, for example, they need to prepare some arguments that are
@@ -120,7 +120,7 @@
* a uprobe BPF program (which for kernel, at least currently, is just a kprobe
* program, so BPF_PROG_TYPE_KPROBE program type). With the only difference
* that uprobe is usually attached at the function entry, while USDT will
* normally will be somewhere inside the function. But it should always be
* normally be somewhere inside the function. But it should always be
* pointing to NOP instruction, which makes such uprobes the fastest uprobe
* kind.
*
@@ -150,7 +150,7 @@
* libbpf sets to spec ID during attach time, or, if kernel is too old to
* support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such
* case. The latter means that some modes of operation can't be supported
* without BPF cookie. Such mode is attaching to shared library "generically",
* without BPF cookie. Such a mode is attaching to shared library "generically",
* without specifying target process. In such case, it's impossible to
* calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode
* is not supported without BPF cookie support.
@@ -184,7 +184,7 @@
* as even if USDT spec string is the same, USDT cookie value can be
* different. It was deemed excessive to try to deduplicate across independent
* USDT attachments by taking into account USDT spec string *and* USDT cookie
* value, which would complicated spec ID accounting significantly for little
* value, which would complicate spec ID accounting significantly for little
* gain.
*/
@@ -199,12 +199,23 @@ enum usdt_arg_type {
USDT_ARG_CONST,
USDT_ARG_REG,
USDT_ARG_REG_DEREF,
USDT_ARG_SIB,
};
/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */
struct usdt_arg_spec {
__u64 val_off;
enum usdt_arg_type arg_type;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
enum usdt_arg_type arg_type: 8;
__u16 idx_reg_off: 12;
__u16 scale_bitshift: 4;
__u8 __reserved: 8; /* keep reg_off offset stable */
#else
__u8 __reserved: 8; /* keep reg_off offset stable */
__u16 idx_reg_off: 12;
__u16 scale_bitshift: 4;
enum usdt_arg_type arg_type: 8;
#endif
short reg_off;
bool arg_signed;
char arg_bitshift;
@@ -465,8 +476,8 @@ static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs,
goto proceed;
if (!realpath(lib_path, path)) {
pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n",
lib_path, -errno);
pr_warn("usdt: failed to get absolute path of '%s' (err %s), using path as is...\n",
lib_path, errstr(-errno));
libbpf_strlcpy(path, lib_path, sizeof(path));
}
@@ -475,8 +486,8 @@ proceed:
f = fopen(line, "re");
if (!f) {
err = -errno;
pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n",
line, lib_path, err);
pr_warn("usdt: failed to open '%s' to get base addr of '%s': %s\n",
line, lib_path, errstr(err));
return err;
}
@@ -569,9 +580,8 @@ static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long o
return NULL;
}
static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr,
const char *data, size_t name_off, size_t desc_off,
struct usdt_note *usdt_note);
static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off,
size_t desc_off, struct usdt_note *usdt_note);
static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie);
@@ -606,7 +616,8 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *
err = parse_elf_segs(elf, path, &segs, &seg_cnt);
if (err) {
pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err);
pr_warn("usdt: failed to process ELF program segments for '%s': %s\n",
path, errstr(err));
goto err_out;
}
@@ -624,7 +635,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *
struct elf_seg *seg = NULL;
void *tmp;
err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, &note);
err = parse_usdt_note(&nhdr, data->d_buf, name_off, desc_off, &note);
if (err)
goto err_out;
@@ -659,7 +670,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *
* [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
*/
usdt_abs_ip = note.loc_addr;
if (base_addr)
if (base_addr && note.base_addr)
usdt_abs_ip += base_addr - note.base_addr;
/* When attaching uprobes (which is what USDTs basically are)
@@ -708,8 +719,8 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *
if (vma_seg_cnt == 0) {
err = parse_vma_segs(pid, path, &vma_segs, &vma_seg_cnt);
if (err) {
pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n",
pid, path, err);
pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %s\n",
pid, path, errstr(err));
goto err_out;
}
}
@@ -1047,8 +1058,8 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct
if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) {
err = -errno;
pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n",
spec_id, usdt_provider, usdt_name, path, err);
pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %s\n",
spec_id, usdt_provider, usdt_name, path, errstr(err));
goto err_out;
}
if (!man->has_bpf_cookie &&
@@ -1058,9 +1069,9 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct
pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n",
spec_id, usdt_provider, usdt_name, path);
} else {
pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n",
pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %s\n",
target->abs_ip, spec_id, usdt_provider, usdt_name,
path, err);
path, errstr(err));
}
goto err_out;
}
@@ -1076,8 +1087,8 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct
target->rel_ip, &opts);
err = libbpf_get_error(uprobe_link);
if (err) {
pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n",
i, usdt_provider, usdt_name, path, err);
pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %s\n",
i, usdt_provider, usdt_name, path, errstr(err));
goto err_out;
}
@@ -1099,8 +1110,8 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct
NULL, &opts_multi);
if (!link->multi_link) {
err = -errno;
pr_warn("usdt: failed to attach uprobe multi for '%s:%s' in '%s': %d\n",
usdt_provider, usdt_name, path, err);
pr_warn("usdt: failed to attach uprobe multi for '%s:%s' in '%s': %s\n",
usdt_provider, usdt_name, path, errstr(err));
goto err_out;
}
@@ -1130,8 +1141,7 @@ err_out:
/* Parse out USDT ELF note from '.note.stapsdt' section.
* Logic inspired by perf's code.
*/
static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr,
const char *data, size_t name_off, size_t desc_off,
static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off,
struct usdt_note *note)
{
const char *provider, *name, *args;
@@ -1281,11 +1291,51 @@ static int calc_pt_regs_off(const char *reg_name)
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz)
{
char reg_name[16];
int len, reg_off;
long off;
char reg_name[16] = {0}, idx_reg_name[16] = {0};
int len, reg_off, idx_reg_off, scale = 1;
long off = 0;
if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) {
if (sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^,] , %d ) %n",
arg_sz, &off, reg_name, idx_reg_name, &scale, &len) == 5 ||
sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^,] , %d ) %n",
arg_sz, reg_name, idx_reg_name, &scale, &len) == 4 ||
sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^)] ) %n",
arg_sz, &off, reg_name, idx_reg_name, &len) == 4 ||
sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^)] ) %n",
arg_sz, reg_name, idx_reg_name, &len) == 3
) {
/*
* Scale Index Base case:
* 1@-96(%rbp,%rax,8)
* 1@(%rbp,%rax,8)
* 1@-96(%rbp,%rax)
* 1@(%rbp,%rax)
*/
arg->arg_type = USDT_ARG_SIB;
arg->val_off = off;
reg_off = calc_pt_regs_off(reg_name);
if (reg_off < 0)
return reg_off;
arg->reg_off = reg_off;
idx_reg_off = calc_pt_regs_off(idx_reg_name);
if (idx_reg_off < 0)
return idx_reg_off;
arg->idx_reg_off = idx_reg_off;
/* validate scale factor and set fields directly */
switch (scale) {
case 1: arg->scale_bitshift = 0; break;
case 2: arg->scale_bitshift = 1; break;
case 4: arg->scale_bitshift = 2; break;
case 8: arg->scale_bitshift = 3; break;
default:
pr_warn("usdt: invalid SIB scale %d, expected 1, 2, 4, 8\n", scale);
return -EINVAL;
}
} else if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n",
arg_sz, &off, reg_name, &len) == 3) {
/* Memory dereference case, e.g., -4@-20(%rbp) */
arg->arg_type = USDT_ARG_REG_DEREF;
arg->val_off = off;
@@ -1304,6 +1354,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec
} else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) {
/* Register read case, e.g., -4@%eax */
arg->arg_type = USDT_ARG_REG;
/* register read has no memory offset */
arg->val_off = 0;
reg_off = calc_pt_regs_off(reg_name);
@@ -1325,8 +1376,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec
#elif defined(__s390x__)
/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz)
{
unsigned int reg;

View File

@@ -223,7 +223,7 @@ struct zip_archive *zip_archive_open(const char *path)
if (!archive) {
munmap(data, size);
return ERR_PTR(-ENOMEM);
};
}
archive->data = data;
archive->size = size;