diff --git a/src/bpf_helper_defs.h b/src/bpf_helper_defs.h index acc89a6..7f8a65c 100644 --- a/src/bpf_helper_defs.h +++ b/src/bpf_helper_defs.h @@ -18,6 +18,7 @@ struct pt_regs; struct sk_reuseport_md; struct sockaddr; struct tcphdr; +struct seq_file; struct __sk_buff; struct sk_msg_md; struct xdp_md; @@ -71,8 +72,8 @@ static int (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. * - * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() - * instead. + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. * * Returns * 0 on success, or a negative error in case of failure. @@ -84,7 +85,7 @@ static int (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (v * * Return the time elapsed since system boot, in nanoseconds. * Does not include time the system was suspended. - * See: clock_gettime(CLOCK_MONOTONIC) + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) * * Returns * Current *ktime*. @@ -1106,11 +1107,11 @@ static int (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 4 * bpf_probe_read_str * * Copy a NUL terminated string from an unsafe kernel address - * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for * more details. * - * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() - * instead. + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. * * Returns * On success, the strictly positive length of the string, @@ -1169,6 +1170,12 @@ static __u32 (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. 
* + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1184,7 +1191,7 @@ static __u32 (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; * Returns * 0 on success, or a negative error in case of failure. */ -static int (*bpf_setsockopt)(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; +static int (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; /* * bpf_skb_adjust_room @@ -1241,12 +1248,12 @@ static int (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 m * * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be - * one of the XDP program return codes up to XDP_TX, as chosen by - * the caller. Any higher bits in the *flags* argument must be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. Any higher bits in the *flags* argument must be * unset. * - * See also bpf_redirect(), which only supports redirecting to an - * ifindex, but doesn't require a map to do so. + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. * * Returns * **XDP_REDIRECT** on success, or the value of the two lower bits @@ -1370,7 +1377,7 @@ static int (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 5 * the time running for event since last normalization. The * enabled and running times are accumulated since the perf event * open. 
To achieve scaling factor between two invocations of an - * eBPF program, users can can use CPU id as the key (which is + * eBPF program, users can use CPU id as the key (which is * typical for perf array usage model) to remember the previous * value and do the calculation inside the eBPF program. * @@ -1404,6 +1411,12 @@ static int (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct b * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * @@ -1415,7 +1428,7 @@ static int (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct b * Returns * 0 on success, or a negative error in case of failure. */ -static int (*bpf_getsockopt)(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; +static int (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; /* * bpf_override_return @@ -1425,7 +1438,7 @@ static int (*bpf_getsockopt)(struct bpf_sock_ops *bpf_socket, int level, int opt * The first argument is the context *regs* on which the kprobe * works. * - * This helper works by setting setting the PC (program counter) + * This helper works by setting the PC (program counter) * to an override function which is run in place of the original * probed function. This means the probed function is not run at * all. The replacement function just returns with the required @@ -1618,10 +1631,11 @@ static int (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, _ * * This helper works for IPv4 and IPv6, TCP and UDP sockets. 
The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass a zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as the 4-tuple is unique. Passing a non-zero port might + * lead to degraded performance. * * Returns * 0 on success, or a negative error in case of failure. @@ -1632,8 +1646,8 @@ static int (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int add * bpf_xdp_adjust_tail * * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrinking is done by passing a negative *delta*. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -1971,7 +1985,7 @@ static int (*bpf_rc_repeat)(void *ctx) = (void *) 77; * **bpf_rc_keydown**\ () again with the same values, or calling * **bpf_rc_repeat**\ (). * - * Some protocols include a toggle bit, in case the button was + * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into @@ -2407,7 +2421,6 @@ static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple * * *th* points to the start of the TCP header, while *th_len* * contains **sizeof**\ (**struct tcphdr**). * - * * Returns * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise.
@@ -2630,7 +2643,6 @@ static int (*bpf_send_signal)(__u32 sig) = (void *) 109; * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header. * - * * Returns * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, @@ -2728,7 +2740,7 @@ static int (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_pt * // size, after checking its boundaries. * } * - * In comparison, using **bpf_probe_read_user()** helper here + * In comparison, using **bpf_probe_read_user**\ () helper here * instead to read the string would require to estimate the length * at compile time, and would often result in copying more memory * than necessary. @@ -2750,10 +2762,10 @@ static int (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ * bpf_probe_read_kernel_str * * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* - * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. * * Returns - * On success, the strictly positive length of the string, including + * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. */ static int (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; @@ -2761,7 +2773,7 @@ static int (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsaf /* * bpf_tcp_send_ack * - * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. 
* * Returns @@ -2801,8 +2813,8 @@ static __u64 (*bpf_jiffies64)(void) = (void *) 118; * bpf_read_branch_records * * For an eBPF program attached to a perf event, retrieve the - * branch records (struct perf_branch_entry) associated to *ctx* - * and store it in the buffer pointed by *buf* up to size + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size * *size* bytes. * * Returns @@ -2810,11 +2822,11 @@ static __u64 (*bpf_jiffies64)(void) = (void *) 118; * negative value. * * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to - * instead return the number of bytes required to store all the + * instead return the number of bytes required to store all the * branch entries. If this flag is set, *buf* may be NULL. * * **-EINVAL** if arguments invalid or **size** not a multiple - * of sizeof(struct perf_branch_entry). + * of **sizeof**\ (**struct perf_branch_entry**\ ). * * **-ENOENT** if architecture does not support branch records. */ @@ -2826,13 +2838,13 @@ static int (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. * - * On failure, the returned value is one of the following: + * Returns + * 0 on success, or one of the following in case of failure: * * **-EINVAL** if dev and inum supplied don't match dev_t and inode number * with nsfs of current task, or if dev conversion to dev_t lost high bits. * * **-ENOENT** if pidns does not exists for the current task. - * */ static int (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; @@ -2873,8 +2885,8 @@ static int (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u6 * a global identifier that can be assumed unique. If *ctx* is * NULL, then the helper returns the cookie for the initial * network namespace. 
The cookie itself is very similar to that - * of bpf_get_socket_cookie() helper, but for network namespaces - * instead of sockets. + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. * * Returns * A 8-byte long opaque number. @@ -2918,14 +2930,19 @@ static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void * * The *flags* argument must be zero. * * Returns - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EINVAL** Unsupported flags specified. - * * **-ENOENT** Socket is unavailable for assignment. - * * **-ENETUNREACH** Socket is unreachable (wrong netns). - * * **-EOPNOTSUPP** Unsupported operation, for example a - * call from outside of TC ingress. - * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). */ static int (*bpf_sk_assign)(struct __sk_buff *skb, struct bpf_sock *sk, __u64 flags) = (void *) 124; @@ -2934,11 +2951,98 @@ static int (*bpf_sk_assign)(struct __sk_buff *skb, struct bpf_sock *sk, __u64 fl * * Return the time elapsed since system boot, in nanoseconds. * Does include the time the system was suspended. - * See: clock_gettime(CLOCK_BOOTTIME) + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) * * Returns * Current *ktime*. */ static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; +/* + * bpf_seq_printf + * + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. 
The *data* and *data_len* are format string + * arguments. The *data* is a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** require reading kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static int (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; + +/* + * bpf_seq_write + * + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static int (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; + +/* + * bpf_sk_cgroup_id + * + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g.
one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_cgroup_id)(struct bpf_sock *sk) = (void *) 128; + +/* + * bpf_sk_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_ancestor_cgroup_id)(struct bpf_sock *sk, int ancestor_level) = (void *) 129; +