// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include #include #include #include #include #include #include #include #include #include "bpf.h" #include "libbpf.h" #include "libbpf_common.h" #include "libbpf_internal.h" #include "hashmap.h" /* libbpf's USDT support consists of BPF-side state/code and user-space * state/code working together in concert. BPF-side parts are defined in * usdt.bpf.h header library. User-space state is encapsulated by struct * usdt_manager and all the supporting code centered around usdt_manager. * * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map * and IP-to-spec-ID map, which is auxiliary map necessary for kernels that * don't support BPF cookie (see below). These two maps are implicitly * embedded into user's end BPF object file when user's code included * usdt.bpf.h. This means that libbpf doesn't do anything special to create * these USDT support maps. They are created by normal libbpf logic of * instantiating BPF maps when opening and loading BPF object. * * As such, libbpf is basically unaware of the need to do anything * USDT-related until the very first call to bpf_program__attach_usdt(), which * can be called by user explicitly or happen automatically during skeleton * attach (or, equivalently, through generic bpf_program__attach() call). At * this point, libbpf will instantiate and initialize struct usdt_manager and * store it in bpf_object. USDT manager is per-BPF object construct, as each * independent BPF object might or might not have USDT programs, and thus all * the expected USDT-related state. There is no coordination between two * bpf_object in parts of USDT attachment, they are oblivious of each other's * existence and libbpf is just oblivious, dealing with bpf_object-specific * USDT state. * * Quick crash course on USDTs. 
 *
 * From user-space application's point of view, USDT is essentially just
 * a slightly special function call that normally has zero overhead, unless it
 * is being traced by some external entity (e.g., BPF-based tool). Here's how
 * a typical application can trigger USDT probe:
 *
 * #include <sys/sdt.h>  // provided by systemtap-sdt-devel package
 * // folly also provides similar functionality in folly/tracing/StaticTracepoint.h
 *
 * STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y);
 *
 * USDT is identified by its <provider-name>:<probe-name> pair of names. Each
 * individual USDT has a fixed number of arguments (3 in the above example)
 * and specifies values of each argument as if it was a function call.
 *
 * USDT call is actually not a function call, but is instead replaced by
 * a single NOP instruction (thus zero overhead, effectively). But in addition
 * to that, those USDT macros generate special SHT_NOTE ELF records in
 * .note.stapsdt ELF section. Here's an example USDT definition as emitted by
 * `readelf -n <binary>`:
 *
 *   stapsdt              0x00000089       NT_STAPSDT (SystemTap probe descriptors)
 *   Provider: test
 *   Name: usdt12
 *   Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e
 *   Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil
 *
 * In this case we have USDT test:usdt12 with 12 arguments.
 *
 * Location and base are offsets used to calculate absolute IP address of that
 * NOP instruction that kernel can replace with an interrupt instruction to
 * trigger instrumentation code (BPF program for all that we care about).
 *
 * Semaphore above is an optional feature. It records an address of a 2-byte
 * refcount variable (normally in '.probes' ELF section) used for signaling if
 * there is anything that is attached to USDT. This is useful for user
 * applications if, for example, they need to prepare some arguments that are
 * passed only to USDTs and preparation is expensive.
 * By checking if USDT is
 * "activated", an application can avoid paying those costs unnecessarily.
 * Recent enough kernel has built-in support for automatically managing this
 * refcount, which libbpf expects and relies on. If USDT is defined without
 * associated semaphore, this value will be zero. See selftests for semaphore
 * examples.
 *
 * Arguments is the most interesting part. This USDT specification string is
 * providing information about all the USDT arguments and their locations. The
 * part before @ sign defines byte size of the argument (1, 2, 4, or 8) and
 * whether the argument is signed or unsigned (negative size means signed).
 * The part after @ sign is assembly-like definition of argument location
 * (see [0] for more details). Technically, assembler can provide some pretty
 * advanced definitions, but libbpf is currently supporting three most common
 * cases:
 *   1) immediate constant, see 5th and 9th args above (-4@$5 and -4@$-9);
 *   2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer
 *      whose value is in register %rdx";
 *   3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which
 *      specifies signed 32-bit integer stored at offset -1204 bytes from
 *      memory address stored in %rbp.
 *
 * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
 *
 * During attachment, libbpf parses all the relevant USDT specifications and
 * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side
 * code through spec map. This allows BPF applications to quickly fetch the
 * actual value at runtime using a simple BPF-side code.
 *
 * With basics out of the way, let's go over less immediately obvious aspects
 * of supporting USDTs.
 *
 * First, there is no special USDT BPF program type. It is actually just
 * a uprobe BPF program (which for kernel, at least currently, is just a kprobe
 * program, so BPF_PROG_TYPE_KPROBE program type).
 * With the only difference
 * that uprobe is usually attached at the function entry, while USDT will
 * normally be somewhere inside the function. But it should always be
 * pointing to NOP instruction, which makes such uprobes the fastest uprobe
 * kind.
 *
 * Second, it's important to realize that such STAP_PROBEn(provider, name, ...)
 * macro invocations can end up being inlined many-many times, depending on
 * specifics of each individual user application. So single conceptual USDT
 * (identified by provider:name pair of identifiers) is, generally speaking,
 * multiple uprobe locations (USDT call sites) in different places in user
 * application. Further, again due to inlining, each USDT call site might end
 * up having the same argument #N be located in a different place. In one call
 * site it could be a constant, in another will end up in a register, and in
 * yet another could be some other register or even somewhere on the stack.
 *
 * As such, "attaching to USDT" means (in general case) attaching the same
 * uprobe BPF program to multiple target locations in user application, each
 * potentially having a completely different USDT spec associated with it.
 * To wire all this up together libbpf allocates a unique integer spec ID for
 * each unique USDT spec. Spec IDs are allocated as sequential small integers
 * so that they can be used as keys in array BPF map (for performance reasons).
 * Spec ID allocation and accounting is a big part of what usdt_manager is
 * about. This state has to be maintained per-BPF object and coordinate
 * between different USDT attachments within the same BPF object.
 *
 * Spec ID is the key in spec BPF map, value is the actual USDT spec laid out
 * as struct usdt_spec. Each invocation of BPF program at runtime needs to
 * know its associated spec ID. It gets it either through BPF cookie, which
 * libbpf sets to spec ID during attach time, or, if kernel is too old to
 * support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such
 * case.
 * The latter means that some modes of operation can't be supported
 * without BPF cookie. Such mode is attaching to shared library "generically",
 * without specifying target process. In such case, it's impossible to
 * calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode
 * is not supported without BPF cookie support.
 *
 * Note that libbpf is using BPF cookie functionality for its own internal
 * needs, so user itself can't rely on BPF cookie feature. To that end, libbpf
 * provides conceptually equivalent USDT cookie support. It's still u64
 * user-provided value that can be associated with USDT attachment. Note that
 * this will be the same value for all USDT call sites within the same single
 * *logical* USDT attachment. This makes sense because to user attaching to
 * USDT is a single BPF program triggered for singular USDT probe. The fact
 * that this is done at multiple actual locations is a mostly hidden
 * implementation detail. This USDT cookie value can be fetched with
 * bpf_usdt_cookie(ctx) API provided by usdt.bpf.h.
 *
 * Lastly, while single USDT can have tons of USDT call sites, it doesn't
 * necessarily have that many different USDT specs. It very well might be
 * that 1000 USDT call sites only need 5 different USDT specs, because all the
 * arguments are typically contained in a small set of registers or stack
 * locations. As such, it's wasteful to allocate as many USDT spec IDs as
 * there are USDT call sites. So libbpf tries to be frugal and performs
 * on-the-fly deduplication during a single USDT attachment to only allocate
 * the minimal required amount of unique USDT specs (and thus spec IDs). This
 * is trivially achieved by using USDT spec string (Arguments string from USDT
 * note) as a lookup key in a hashmap. USDT spec string uniquely defines
 * everything about how to fetch USDT arguments, so two USDT call sites
 * sharing USDT spec string can safely share the same USDT spec and spec ID.
* Note, this spec string deduplication is happening only during the same USDT * attachment, so each USDT spec shares the same USDT cookie value. This is * not generally true for other USDT attachments within the same BPF object, * as even if USDT spec string is the same, USDT cookie value can be * different. It was deemed excessive to try to deduplicate across independent * USDT attachments by taking into account USDT spec string *and* USDT cookie * value, which would complicated spec ID accounting significantly for little * gain. */ struct usdt_target { long abs_ip; long rel_ip; long sema_off; }; struct usdt_manager { struct bpf_map *specs_map; struct bpf_map *ip_to_spec_id_map; bool has_bpf_cookie; bool has_sema_refcnt; }; struct usdt_manager *usdt_manager_new(struct bpf_object *obj) { static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; struct usdt_manager *man; struct bpf_map *specs_map, *ip_to_spec_id_map; specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); if (!specs_map || !ip_to_spec_id_map) { pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); return ERR_PTR(-ESRCH); } man = calloc(1, sizeof(*man)); if (!man) return ERR_PTR(-ENOMEM); man->specs_map = specs_map; man->ip_to_spec_id_map = ip_to_spec_id_map; /* Detect if BPF cookie is supported for kprobes. * We don't need IP-to-ID mapping if we can use BPF cookies. * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") */ man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); /* Detect kernel support for automatic refcounting of USDT semaphore. * If this is not supported, USDTs with semaphores will not be supported. 
* Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") */ man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; return man; } void usdt_manager_free(struct usdt_manager *man) { if (IS_ERR_OR_NULL(man)) return; free(man); } static int sanity_check_usdt_elf(Elf *elf, const char *path) { GElf_Ehdr ehdr; int endianness; if (elf_kind(elf) != ELF_K_ELF) { pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); return -EBADF; } switch (gelf_getclass(elf)) { case ELFCLASS64: if (sizeof(void *) != 8) { pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); return -EBADF; } break; case ELFCLASS32: if (sizeof(void *) != 4) { pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); return -EBADF; } break; default: pr_warn("usdt: unsupported ELF class for '%s'\n", path); return -EBADF; } if (!gelf_getehdr(elf, &ehdr)) return -EINVAL; if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", path, ehdr.e_type); return -EBADF; } #if __BYTE_ORDER == __LITTLE_ENDIAN endianness = ELFDATA2LSB; #elif __BYTE_ORDER == __BIG_ENDIAN endianness = ELFDATA2MSB; #else # error "Unrecognized __BYTE_ORDER__" #endif if (endianness != ehdr.e_ident[EI_DATA]) { pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); return -EBADF; } return 0; } static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, const char *usdt_provider, const char *usdt_name, long usdt_cookie, struct usdt_target **out_targets, size_t *out_target_cnt) { return -ENOTSUP; } struct bpf_link_usdt { struct bpf_link link; struct usdt_manager *usdt_man; size_t uprobe_cnt; struct { long abs_ip; struct bpf_link *link; } *uprobes; }; static int bpf_link_usdt_detach(struct bpf_link *link) { struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); int i; for (i = 0; i < 
usdt_link->uprobe_cnt; i++) { /* detach underlying uprobe link */ bpf_link__destroy(usdt_link->uprobes[i].link); } return 0; } static void bpf_link_usdt_dealloc(struct bpf_link *link) { struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); free(usdt_link->uprobes); free(usdt_link); } struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, pid_t pid, const char *path, const char *usdt_provider, const char *usdt_name, long usdt_cookie) { LIBBPF_OPTS(bpf_uprobe_opts, opts); struct bpf_link_usdt *link = NULL; struct usdt_target *targets = NULL; size_t target_cnt; int i, fd, err; Elf *elf; /* TODO: perform path resolution similar to uprobe's */ fd = open(path, O_RDONLY); if (fd < 0) { err = -errno; pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); return libbpf_err_ptr(err); } elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); if (!elf) { err = -EBADF; pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); goto err_out; } err = sanity_check_usdt_elf(elf, path); if (err) goto err_out; /* normalize PID filter */ if (pid < 0) pid = -1; else if (pid == 0) pid = getpid(); /* discover USDT in given binary, optionally limiting * activations to a given PID, if pid > 0 */ err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, usdt_cookie, &targets, &target_cnt); if (err <= 0) { err = (err == 0) ? 
-ENOENT : err; goto err_out; } link = calloc(1, sizeof(*link)); if (!link) { err = -ENOMEM; goto err_out; } link->usdt_man = man; link->link.detach = &bpf_link_usdt_detach; link->link.dealloc = &bpf_link_usdt_dealloc; link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); if (!link->uprobes) { err = -ENOMEM; goto err_out; } for (i = 0; i < target_cnt; i++) { struct usdt_target *target = &targets[i]; struct bpf_link *uprobe_link; opts.ref_ctr_offset = target->sema_off; uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, target->rel_ip, &opts); err = libbpf_get_error(uprobe_link); if (err) { pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", i, usdt_provider, usdt_name, path, err); goto err_out; } link->uprobes[i].link = uprobe_link; link->uprobes[i].abs_ip = target->abs_ip; link->uprobe_cnt++; } elf_end(elf); close(fd); return &link->link; err_out: bpf_link__destroy(&link->link); if (elf) elf_end(elf); close(fd); return libbpf_err_ptr(err); }