Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [RFC bpf-next v2 1/8] bpf: add script and prepare bpf.h for new helpers documentation
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Remove previous "overview" of eBPF helpers from user bpf.h header.
Replace it by a comment explaining how to process the new documentation
(to come in following patches) with a Python script to produce RST, then
man page documentation.

Also add the aforementioned Python script under scripts/. It is used to
process include/uapi/linux/bpf.h and to extract helper descriptions, to
turn it into a RST document that can further be processed with rst2man
to produce a man page. The script takes one "--filename <path/to/file>"
option. If the script is launched from scripts/ in the kernel root
directory, it should be able to find the location of the header to
parse, and "--filename <path/to/file>" is then optional. If it cannot
find the file, then the option becomes mandatory. RST-formatted
documentation is printed to standard output.

Typical workflow for producing the final man page would be:

    $ ./scripts/bpf_helpers_doc.py \
            --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

Note that the tool kernel-doc cannot be used to document eBPF helpers,
whose signatures are not available directly in the header files
(pre-processor directives are used to produce them at the beginning of
the compilation process).

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h   | 406 ++------------------------------------------
 scripts/bpf_helpers_doc.py | 414 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 430 insertions(+), 390 deletions(-)
 create mode 100755 scripts/bpf_helpers_doc.py

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..45f77f01e672 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -365,396 +365,22 @@ union bpf_attr {
 	} raw_tracepoint;
 } __attribute__((aligned(8)));
 
-/* BPF helper function descriptions:
- *
- * void *bpf_map_lookup_elem(&map, &key)
- *     Return: Map value or NULL
- *
- * int bpf_map_update_elem(&map, &key, &value, flags)
- *     Return: 0 on success or negative error
- *
- * int bpf_map_delete_elem(&map, &key)
- *     Return: 0 on success or negative error
- *
- * int bpf_probe_read(void *dst, int size, void *src)
- *     Return: 0 on success or negative error
- *
- * u64 bpf_ktime_get_ns(void)
- *     Return: current ktime
- *
- * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
- *     Return: length of buffer written or negative error
- *
- * u32 bpf_prandom_u32(void)
- *     Return: random value
- *
- * u32 bpf_raw_smp_processor_id(void)
- *     Return: SMP processor ID
- *
- * int bpf_skb_store_bytes(skb, offset, from, len, flags)
- *     store bytes into packet
- *     @skb: pointer to skb
- *     @offset: offset within packet from skb->mac_header
- *     @from: pointer where to copy bytes from
- *     @len: number of bytes to store into packet
- *     @flags: bit 0 - if true, recompute skb->csum
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l3_csum_replace(skb, offset, from, to, flags)
- *     recompute IP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where IP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l4_csum_replace(skb, offset, from, to, flags)
- *     recompute TCP/UDP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where TCP/UDP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             bit 4 - is pseudo header
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_tail_call(ctx, prog_array_map, index)
- *     jump into another BPF program
- *     @ctx: context pointer passed to next program
- *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: 32-bit index inside array that selects specific program to run
- *     Return: 0 on success or negative error
- *
- * int bpf_clone_redirect(skb, ifindex, flags)
- *     redirect to another netdev
- *     @skb: pointer to skb
- *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * u64 bpf_get_current_pid_tgid(void)
- *     Return: current->tgid << 32 | current->pid
- *
- * u64 bpf_get_current_uid_gid(void)
- *     Return: current_gid << 32 | current_uid
- *
- * int bpf_get_current_comm(char *buf, int size_of_buf)
- *     stores current->comm into buf
- *     Return: 0 on success or negative error
- *
- * u32 bpf_get_cgroup_classid(skb)
- *     retrieve a proc's classid
- *     @skb: pointer to skb
- *     Return: classid if != 0
- *
- * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_vlan_pop(skb)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_get_tunnel_key(skb, key, size, flags)
- * int bpf_skb_set_tunnel_key(skb, key, size, flags)
- *     retrieve or populate tunnel metadata
- *     @skb: pointer to skb
- *     @key: pointer to 'struct bpf_tunnel_key'
- *     @size: size of 'struct bpf_tunnel_key'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
- *
- * u64 bpf_perf_event_read(map, flags)
- *     read perf event counter value
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     Return: value of perf event counter read or error code
- *
- * int bpf_redirect(ifindex, flags)
- *     redirect to another netdev
- *     @ifindex: ifindex of the net device
- *     @flags:
- *	  cls_bpf:
- *          bit 0 - if set, redirect to ingress instead of egress
- *          other bits - reserved
- *	  xdp_bpf:
- *	    all bits - reserved
- *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
- *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
- * int bpf_redirect_map(map, key, flags)
- *     redirect to endpoint in map
- *     @map: pointer to dev map
- *     @key: index in map to lookup
- *     @flags: --
- *     Return: XDP_REDIRECT on success or XDP_ABORT on error
- *
- * u32 bpf_get_route_realm(skb)
- *     retrieve a dst's tclassid
- *     @skb: pointer to skb
- *     Return: realm if != 0
- *
- * int bpf_perf_event_output(ctx, map, flags, data, size)
- *     output perf raw sample
- *     @ctx: struct pt_regs*
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @data: data on stack to be output as raw data
- *     @size: size of data
- *     Return: 0 on success or negative error
- *
- * int bpf_get_stackid(ctx, map, flags)
- *     walk user or kernel stack and return id
- *     @ctx: struct pt_regs*
- *     @map: pointer to stack_trace map
- *     @flags: bits 0-7 - numer of stack frames to skip
- *             bit 8 - collect user stack instead of kernel
- *             bit 9 - compare stacks by hash only
- *             bit 10 - if two different stacks hash into the same stackid
- *                      discard old
- *             other bits - reserved
- *     Return: >= 0 stackid on success or negative error
- *
- * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
- *     calculate csum diff
- *     @from: raw from buffer
- *     @from_size: length of from buffer
- *     @to: raw to buffer
- *     @to_size: length of to buffer
- *     @seed: optional seed
- *     Return: csum result or negative error code
- *
- * int bpf_skb_get_tunnel_opt(skb, opt, size)
- *     retrieve tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: option size
- *
- * int bpf_skb_set_tunnel_opt(skb, opt, size)
- *     populate tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_proto(skb, proto, flags)
- *     Change protocol of the skb. Currently supported is v4 -> v6,
- *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
- *     program is expected to fill the new headers via skb_store_bytes
- *     and lX_csum_replace.
- *     @skb: pointer to skb
- *     @proto: new skb->protocol type
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_type(skb, type)
- *     Change packet type of skb.
- *     @skb: pointer to skb
- *     @type: new skb->pkt_type type
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_under_cgroup(skb, map, index)
- *     Check cgroup2 membership of skb
- *     @skb: pointer to skb
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 skb failed the cgroup2 descendant test
- *       == 1 skb succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * u32 bpf_get_hash_recalc(skb)
- *     Retrieve and possibly recalculate skb->hash.
- *     @skb: pointer to skb
- *     Return: hash
- *
- * u64 bpf_get_current_task(void)
- *     Returns current task_struct
- *     Return: current
- *
- * int bpf_probe_write_user(void *dst, void *src, int len)
- *     safely attempt to write to a location
- *     @dst: destination address in userspace
- *     @src: source address on stack
- *     @len: number of bytes to copy
- *     Return: 0 on success or negative error
- *
- * int bpf_current_task_under_cgroup(map, index)
- *     Check cgroup2 membership of current task
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 current failed the cgroup2 descendant test
- *       == 1 current succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * int bpf_skb_change_tail(skb, len, flags)
- *     The helper will resize the skb to the given new size, to be used f.e.
- *     with control messages.
- *     @skb: pointer to skb
- *     @len: new skb length
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_pull_data(skb, len)
- *     The helper will pull in non-linear data in case the skb is non-linear
- *     and not all of len are part of the linear section. Only needed for
- *     read/write with direct packet access.
- *     @skb: pointer to skb
- *     @len: len to make read/writeable
- *     Return: 0 on success or negative error
- *
- * s64 bpf_csum_update(skb, csum)
- *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
- *     @skb: pointer to skb
- *     @csum: csum to add
- *     Return: csum on success or negative error
- *
- * void bpf_set_hash_invalid(skb)
- *     Invalidate current skb->hash.
- *     @skb: pointer to skb
- *
- * int bpf_get_numa_node_id()
- *     Return: Id of current NUMA node.
- *
- * int bpf_skb_change_head()
- *     Grows headroom of skb and adjusts MAC header offset accordingly.
- *     Will extends/reallocae as required automatically.
- *     May change skb data pointer and will thus invalidate any check
- *     performed for direct packet access.
- *     @skb: pointer to skb
- *     @len: length of header to be pushed in front
- *     @flags: Flags (unused for now)
- *     Return: 0 on success or negative error
- *
- * int bpf_xdp_adjust_head(xdp_md, delta)
- *     Adjust the xdp_md.data by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data
- *     Return: 0 on success or negative on error
- *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
- *     Copy a NUL terminated string from unsafe address. In case the string
- *     length is smaller than size, the target is not padded with further NUL
- *     bytes. In case the string length is larger than size, just count-1
- *     bytes are copied and the last byte is set to NUL.
- *     @dst: destination address
- *     @size: maximum number of bytes to copy, including the trailing NUL
- *     @unsafe_ptr: unsafe address
- *     Return:
- *       > 0 length of the string including the trailing NUL on success
- *       < 0 error
- *
- * u64 bpf_get_socket_cookie(skb)
- *     Get the cookie for the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: 8 Bytes non-decreasing number on success or 0 if the socket
- *     field is missing inside sk_buff
- *
- * u32 bpf_get_socket_uid(skb)
- *     Get the owner uid of the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or overflowuid if failed.
- *
- * u32 bpf_set_hash(skb, hash)
- *     Set full skb->hash.
- *     @skb: pointer to skb
- *     @hash: hash to set
- *
- * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls setsockopt. Not all opts are available, only those with
- *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls getsockopt. Not all opts are available.
- *     Supported levels: IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
- *     Set callback flags for sock_ops
- *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
- *     @flags: flags value
- *     Return: 0 for no error
- *             -EINVAL if there is no full tcp socket
- *             bits in flags that are not supported by current kernel
- *
- * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
- *     Grow or shrink room in sk_buff.
- *     @skb: pointer to skb
- *     @len_diff: (signed) amount of room to grow/shrink
- *     @mode: operation mode (enum bpf_adj_room_mode)
- *     @flags: reserved for future use
- *     Return: 0 on success or negative error code
- *
- * int bpf_sk_redirect_map(map, key, flags)
- *     Redirect skb to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_sock_map_update(skops, map, key, flags)
- *	@skops: pointer to bpf_sock_ops
- *	@map: pointer to sockmap to update
- *	@key: key to insert/update sock in map
- *	@flags: same flags as map update elem
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- *     Adjust the xdp_md.data_meta by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data_meta
- *     Return: 0 on success or negative on error
- *
- * int bpf_perf_event_read_value(map, flags, buf, buf_size)
- *     read perf event counter value and perf event enabled/running time
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return: 0 on success or negative error code
- *
- * int bpf_perf_prog_read_value(ctx, buf, buf_size)
- *     read perf prog attached perf event counter and enabled/running time
- *     @ctx: pointer to ctx
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
- *
- * int bpf_msg_redirect_map(map, key, flags)
- *     Redirect msg to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_bind(ctx, addr, addr_len)
- *     Bind socket to address. Only binding to IP is supported, no port can be
- *     set in addr.
- *     @ctx: pointer to context of type bpf_sock_addr
- *     @addr: pointer to struct sockaddr to bind socket to
- *     @addr_len: length of sockaddr structure
- *     Return: 0 on success or negative error code
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
new file mode 100755
index 000000000000..3a15ba3f0a83
--- /dev/null
+++ b/scripts/bpf_helpers_doc.py
@@ -0,0 +1,414 @@
+#!/usr/bin/python3
+#
+# Copyright (C) 2018 Netronome Systems, Inc.
+#
+# This software is licensed under the GNU General License Version 2,
+# June 1991 as shown in the file COPYING in the top-level directory of this
+# source tree.
+#
+# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+# In case user attempts to run with Python 2.
+from __future__ import print_function
+
+import argparse
+import re
+import sys, os
+
+class NoHelperFound(BaseException):
+    pass
+
+class ParsingError(BaseException):
+    def __init__(self, line='<line not provided>', reader=None):
+        if reader:
+            BaseException.__init__(self,
+                                   'Error at file offset %d, parsing line: %s' %
+                                   (reader.tell(), line))
+        else:
+            BaseException.__init__(self, 'Error parsing line: %s' % line)
+
+class Helper(object):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    """
+    def __init__(self, proto='', desc='', ret=''):
+        self.proto = proto
+        self.desc = desc
+        self.ret = ret
+
+    def proto_break_down(self):
+        """
+        Break down helper function protocol into smaller chunks: return type,
+        name, distincts arguments.
+        """
+        arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
+        res = {}
+        proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+
+        capture = proto_re.match(self.proto)
+        res['ret_type'] = capture.group(1)
+        res['ret_star'] = capture.group(2)
+        res['name']     = capture.group(3)
+        res['args'] = []
+
+        args    = capture.group(4).split(', ')
+        for a in args:
+            capture = arg_re.match(a)
+            res['args'].append({
+                'type' : capture.group(1),
+                'star' : capture.group(6),
+                'name' : capture.group(7)
+            })
+
+        return res
+
+class HeaderParser(object):
+    """
+    An object used to parse a file in order to extract the documentation of a
+    list of eBPF helper functions. All the helpers that can be retrieved are
+    stored as Helper object, in the self.helpers() array.
+    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
+               kernel tree
+    """
+    def __init__(self, filename):
+        self.reader = open(filename, 'r')
+        self.line = ''
+        self.helpers = []
+
+    def parse_helper(self):
+        proto    = self.parse_proto()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        return Helper(proto=proto, desc=desc, ret=ret)
+
+    def parse_proto(self):
+        # Argument can be of shape:
+        #   - "void"
+        #   - "type  name"
+        #   - "type *name"
+        #   - Same as above, with "const" and/or "struct" in front of type
+        #   - "..." (undefined number of arguments, for bpf_trace_printk())
+        # There is at least one term ("void"), and at most five arguments.
+        p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoHelperFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_desc(self):
+        p = re.compile('^ \* \tDescription$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty description and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Description can be several lines, some of them possibly empty, and it
+        # stops when another subsection title is met.
+        desc = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                desc += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    desc += capture.group(1) + '\n'
+                else:
+                    break
+        return desc
+
+    def parse_ret(self):
+        p = re.compile('^ \* \tReturn$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty retval and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Return value description can be several lines, some of them possibly
+        # empty, and it stops when another subsection title is met.
+        ret = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                ret += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    ret += capture.group(1) + '\n'
+                else:
+                    break
+        return ret
+
+    def run(self):
+        # Advance to start of helper function descriptions.
+        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
+        if offset == -1:
+            raise Exception('Could not find start of eBPF helper descriptions list')
+        self.reader.seek(offset)
+        self.reader.readline()
+        self.reader.readline()
+        self.line = self.reader.readline()
+
+        while True:
+            try:
+                helper = self.parse_helper()
+                self.helpers.append(helper)
+            except NoHelperFound:
+                break
+
+        self.reader.close()
+        print('Parsed description of %d helper function(s)' % len(self.helpers),
+              file=sys.stderr)
+
+###############################################################################
+
+class Printer(object):
+    """
+    A generic class for printers. Printers should be created with an array of
+    Helper objects, and implement a way to print them in the desired fashion.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def __init__(self, helpers):
+        self.helpers = helpers
+
+    def print_header(self):
+        pass
+
+    def print_footer(self):
+        pass
+
+    def print_one(self, helper):
+        pass
+
+    def print_all(self):
+        self.print_header()
+        for helper in self.helpers:
+            self.print_one(helper)
+        self.print_footer()
+
+class PrinterRST(Printer):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def print_header(self):
+        header = '''\
+.. Copyright (C) 2018 Netronome Systems, Inc.
+.. 
+.. %%%LICENSE_START(VERBATIM)
+.. Permission is granted to make and distribute verbatim copies of this
+.. manual provided the copyright notice and this permission notice are
+.. preserved on all copies.
+.. 
+.. Permission is granted to copy and distribute modified versions of this
+.. manual under the conditions for verbatim copying, provided that the
+.. entire resulting derived work is distributed under the terms of a
+.. permission notice identical to this one.
+.. 
+.. Since the Linux kernel and libraries are constantly changing, this
+.. manual page may be incorrect or out-of-date.  The author(s) assume no
+.. responsibility for errors or omissions, or for damages resulting from
+.. the use of the information contained herein.  The author(s) may not
+.. have taken the same level of care in the production of this manual,
+.. which is licensed free of charge, as they might when working
+.. professionally.
+.. 
+.. Formatted or processed versions of this manual, if unaccompanied by
+.. the source, must acknowledge the copyright and authors of this work.
+.. %%%LICENSE_END
+.. 
+.. Please do not edit this file. It was generated from the documentation
+.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
+.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
+.. repository (header and footer).
+
+===========
+BPF-HELPERS
+===========
+-------------------------------------------------------------------------------
+list of eBPF helper functions
+-------------------------------------------------------------------------------
+
+:Manual section: 7
+
+DESCRIPTION
+===========
+
+The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
+written in a pseudo-assembly language, then attached to one of the several
+kernel hooks and run in reaction of specific events. This framework differs
+from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
+the ability to call special functions (or "helpers") from within a program. For
+security reasons, these functions are restricted to a white-list of helpers
+defined in the kernel.
+
+These helpers are used by eBPF programs to interact with the system, or with
+the context in which they work. For instance, they can be used to print
+debugging messages, to get the time since the system was booted, to interact
+with eBPF maps, or to manipulate network packets metadata. Since there are
+several eBPF program types, and that they do not run in the same context, each
+program type can only call a subset of those helpers.
+
+Due to eBPF conventions, a helper can not have more than five arguments.
+
+This document is an attempt to list and document the helpers available to eBPF
+developers. They are sorted by chronological order (the oldest helpers in the
+kernel at the top).
+
+HELPERS
+=======
+'''
+        print(header)
+
+    def print_footer(self):
+        footer = '''
+NOTES
+=====
+
+On the performance side, eBPF programs move to the stack all arguments to pass
+to the helpers, and call directly into the compiled helper functions without
+requiring any foreign-function interface. As a result, calling helpers
+introduce very little overhead.
+
+EXAMPLES
+========
+
+Example usage for most of the eBPF helpers listed in this manual page are
+available within the Linux kernel sources, at the following locations:
+
+* *samples/bpf/*
+* *tools/testing/selftests/bpf/*
+
+IMPLEMENTATION
+==============
+
+This manual page is an effort to document the existing eBPF helper functions.
+But as of this writing, the BPF sub-system is under heavy development. New eBPF
+program or map types are added, along with new helper functions. Some helpers
+are occasionally made available for additional program types. So in spite of
+the efforts of the community, this page might not be up-to-date. If you want to
+check by yourself what helper functions exist in your kernel, or what types of
+programs they can support, here are some files among the kernel tree that you
+may be interested in:
+
+* *include/uapi/linux/bpf.h* contains the full list of all helper functions.
+* *net/core/filter.c* contains the definition of most network-related helper
+  functions, and the list of program types from which they can be used.
+* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
+  helpers.
+* *kernel/bpf/verifier.c* contains the functions used to check that valid types
+  of eBPF maps are used with a given helper function.
+* *kernel/bpf/* directory contains other files in which additional helpers are
+  defined (for cgroups, sockmaps, etc.).
+
+Compatibility between helper functions and program types can generally be found
+in the files where helper functions are defined. Look for the **struct
+bpf_func_proto** objects and for functions returning them: these functions
+contain a list of helpers that a given program type can call. Note that the
+**default:** label of the **switch ... case** used to filter helpers can call
+other functions, themselves allowing access to additional helpers. The
+requirement for GPL license is also in those **struct bpf_func_proto**.
+
+Compatibility between helper functions and map types can be found in the
+**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
+
+Helper functions that invalidate the checks on **data** and **data_end**
+pointers for network processing are listed in function
+**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
+
+SEE ALSO
+========
+
+**bpf**\ (2),
+**cgroups**\ (7),
+**ip**\ (8),
+**perf_event_open**\ (2),
+**sendmsg**\ (2),
+**socket**\ (7),
+**tc-bpf**\ (8)'''
+        print(footer)
+
+    def print_proto(self, helper):
+        """
+        Format function protocol with bold and italics markers. This makes RST
+        file less readable, but gives nice results in the manual page.
+        """
+        proto = helper.proto_break_down()
+
+        print('**%s %s%s(' % (proto['ret_type'],
+                              proto['ret_star'].replace('*', '\\*'),
+                              proto['name']),
+              end='')
+
+        comma = ''
+        for a in proto['args']:
+            one_arg = '{}{}'.format(comma, a['type'])
+            if a['name']:
+                if a['star']:
+                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
+                else:
+                    one_arg += '** '
+                one_arg += '*{}*\\ **'.format(a['name'])
+            comma = ', '
+            print(one_arg, end='')
+
+        print(')**')
+
+    def print_one(self, helper):
+        self.print_proto(helper)
+
+        if (helper.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (helper.ret):
+            print('\tReturn')
+            for line in helper.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+
+###############################################################################
+
+# If script is launched from scripts/ from kernel tree and can access
+# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
+# otherwise the --filename argument will be required from the command line.
+script = os.path.abspath(sys.argv[0])
+linuxRoot = os.path.dirname(os.path.dirname(script))
+bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
+
+argParser = argparse.ArgumentParser(description="""
+Parse eBPF header file and generate documentation for eBPF helper functions.
+The RST-formatted output produced can be turned into a manual page with the
+rst2man utility.
+""")
+if (os.path.isfile(bpfh)):
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
+                           default=bpfh)
+else:
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+args = argParser.parse_args()
+
+# Parse file.
+headerParser = HeaderParser(args.filename)
+headerParser.run()
+
+# Print formatted output to standard output.
+printer = PrinterRST(headerParser.helpers)
+printer.print_all()
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 6/8] bpf: add documentation for eBPF helpers (42-50)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast
  Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man,
	Kaixu Xia, Martin KaFai Lau, Sargun Dhillon, Thomas Graf,
	Gianluca Borello, Chenbo Feng
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helper from Kaixu:
- bpf_perf_event_read()

Helpers from Martin:
- bpf_skb_under_cgroup()
- bpf_xdp_adjust_head()

Helpers from Sargun:
- bpf_probe_write_user()
- bpf_current_task_under_cgroup()

Helper from Thomas:
- bpf_skb_change_head()

Helper from Gianluca:
- bpf_probe_read_str()

Helpers from Chenbo:
- bpf_get_socket_cookie()
- bpf_get_socket_uid()

Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Gianluca Borello <g.borello@gmail.com>
Cc: Chenbo Feng <fengc@google.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 158 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index af429ec79f50..15d9ccafebbe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -701,6 +701,25 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The
+ * 		nature of the perf event counter is selected at the creation of
+ * 		the *map*. The *map* is an array whose size is the number of
+ * 		available CPU cores, and each cell contains a value relative to
+ * 		one core. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the core to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU core should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ *
  * int bpf_redirect(u32 ifindex, u64 flags)
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*.
@@ -939,6 +958,17 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ *
  * u32 bpf_get_hash_recalc(struct sk_buff *skb)
  * 	Description
  * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
@@ -959,6 +989,37 @@ union bpf_attr {
  * 	Return
  * 		A pointer to the current task struct.
  *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run is the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* task belongs to the cgroup2.
+ * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ *
  * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
  * 	Description
  * 		Resize (trim or grow) the packet associated to *skb* to the
@@ -1043,6 +1104,103 @@ union bpf_attr {
  * 	Return
  * 		The id of current NUMA node.
  *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the socket cookie generated by the kernel from a
+ * 		**struct sk_buff** with a known socket. If none has been set
+ * 		yet, generate a new cookie. This helper can be useful for
+ * 		monitoring per socket networking traffic statistics as it
+ * 		provides a unique socket identifier per namespace.
+ * 	Return
+ * 		A 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ *
  * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
  * 	Description
  * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 7/8] bpf: add documentation for eBPF helpers (51-57)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast
  Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man,
	Lawrence Brakmo, Yonghong Song, Josef Bacik, Andrey Ignatov
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helpers from Lawrence:
- bpf_setsockopt()
- bpf_getsockopt()
- bpf_sock_ops_cb_flags_set()

Helpers from Yonghong:
- bpf_perf_event_read_value()
- bpf_perf_prog_read_value()

Helper from Josef:
- bpf_override_return()

Helper from Andrey:
- bpf_bind()

Cc: Lawrence Brakmo <brakmo@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 184 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 15d9ccafebbe..7343af4196c8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1208,6 +1208,28 @@ union bpf_attr {
  * 	Return
  * 		0
  *
+ * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
@@ -1255,6 +1277,168 @@ union bpf_attr {
  * 		performed again.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf
+ * 		event counter is selected at the creation of the *map*. The
+ * 		*map* is an array whose size is the number of available CPU
+ * 		cores, and each cell contains a value relative to one core. The
+ * 		value to retrieve is indicated by *flags*, that contains the
+ * 		index of the core to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU core should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For en eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*opval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock_ops* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ *
+ * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ *
+ * 		As for the remote end, both parts of it can be overridden,
+ * 		remote IP and remote port. This can be useful if an application
+ * 		inside a cgroup wants to connect to another application inside
+ * 		the same cgroup or to itself, but knows nothing about the IP
+ * 		address assigned to the cgroup.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 5/8] bpf: add documentation for eBPF helpers (33-41)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_hash_recalc()
- bpf_skb_change_tail()
- bpf_skb_pull_data()
- bpf_csum_update()
- bpf_set_hash_invalid()
- bpf_get_numa_node_id()
- bpf_set_hash()
- bpf_skb_adjust_room()
- bpf_xdp_adjust_meta()

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d147d9dd6a83..af429ec79f50 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -939,9 +939,164 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		prototype with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
+ *
  * u64 bpf_get_current_task(void)
  * 	Return
  * 		A pointer to the current task struct.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, when testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**)
+ * 		fails, programs just bail out, or, in the direct read case, use
+ * 		**bpf_skb_load_bytes()** as an alternative to overcome this
+ * 		limitation. If such data sits in non-linear parts, it is
+ * 		possible to pull them in once with the new helper, retest and
+ * 		eventually access them.
+ *
+ * 		At the same time, this also makes sure the skb is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the skb from the
+ * 		very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver fed us an IP checksum. Return an error otherwise. This
+ * 		header is intended to be used in combination with
+ * 		**bpf_csum_diff()** helper, in particular when the checksum
+ * 		needs to be updated after data has been written into the packet
+ * 		through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash.
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**).
+ * 	Return
+ * 		The id of current NUMA node.
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v11 0/4] set VSESR_EL2 by user space and support NOTIFY_SEI notification
From: James Morse @ 2018-04-10 14:15 UTC (permalink / raw)
  To: Dongjiu Geng
  Cc: rkrcmar, corbet, christoffer.dall, marc.zyngier, linux,
	catalin.marinas, rjw, bp, lenb, kvm, linux-doc, linux-kernel,
	linux-arm-kernel, kvmarm, linux-acpi, devel, huangshaoyu,
	zhengxiang9
In-Reply-To: <1523309796-36423-1-git-send-email-gengdongjiu@huawei.com>

Hi Dongjiu Geng,

On 09/04/18 22:36, Dongjiu Geng wrote:
> 1. Detect whether KVM can set set guest SError syndrome
> 2. Support to Set VSESR_EL2 and inject SError by user space.
> 3. Support live migration to keep SError pending state and VSESR_EL2 value.
> 4. ACPI 6.1 adds support for NOTIFY_SEI as a GHES notification mechanism, so support this
>    notification in software, KVM or kernel ARCH code call handle_guest_sei() to let ACP driver
>    to handle this notification.

Please don't post code during the merge-window, will this apply to v4.17-rc1? We
can't know until its tagged.

This series is doing two separate things, please split it into two series.

But on the ACPI front: I don't see how any OS can support your NOTIFY_SEI when
firmware is ignoring the normal world's PSTATE.A.

The latest lobe of that discussion was on the list here:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1611496.html

As it is, we would need to spot SError being delivered while SError is masked,
spray nasty messages about firmware being horrifically buggy, then panic(). For
a corrected error, this looks bad, but its preferable to letting firmware
silently overwrite the exception registers, causing linux to spin through the
vectors 'eret' with all exceptions masked.
I still think its best to wait for firmware that does the right thing.

Thanks,

James
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v11 2/4] arm/arm64: KVM: Add KVM_GET/SET_VCPU_EVENTS
From: James Morse @ 2018-04-10 14:15 UTC (permalink / raw)
  To: Dongjiu Geng, devel
  Cc: rkrcmar, corbet, christoffer.dall, marc.zyngier, linux,
	catalin.marinas, rjw, bp, lenb, kvm, linux-doc, linux-kernel,
	linux-arm-kernel, kvmarm, linux-acpi, huangshaoyu, zhengxiang9
In-Reply-To: <1523309796-36423-3-git-send-email-gengdongjiu@huawei.com>

Hi Dongjiu Geng,

On 09/04/18 22:36, Dongjiu Geng wrote:
> This new IOCTL exports user-invisible states related to SError.
> Together with appropriate user space changes, it can inject
> SError with specified syndrome to guest by setup kvm_vcpu_events
> value.

> Also it can support live migration.

Could you explain what user-space is expected to do for this?
(this is also relevant for snapshot-ing/suspending VMs)

It's probably worth noting that this solves an existing problem: KVM may make an
SError pending, but user-space has no way to discover/migrate this.


> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 8a3d708..45719b4 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -819,11 +819,13 @@ struct kvm_clock_data {
>  
>  Capability: KVM_CAP_VCPU_EVENTS
>  Extended by: KVM_CAP_INTR_SHADOW
> -Architectures: x86
> +Architectures: x86, arm, arm64
>  Type: vm ioctl
>  Parameters: struct kvm_vcpu_event (out)
>  Returns: 0 on success, -1 on error
>  
> +X86:
> +
>  Gets currently pending exceptions, interrupts, and NMIs as well as related
>  states of the vcpu.
>  
> @@ -865,15 +867,31 @@ Only two fields are defined in the flags field:
>  - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
>    smi contains a valid state.
>  
> +ARM, ARM64:
> +
> +Gets currently pending SError exceptions as well as related states of the vcpu.
> +
> +struct kvm_vcpu_events {
> +	struct {
> +		__u8 serror_pending;
> +		__u8 serror_has_esr;
> +		/* Align it to 4 bytes */
> +		__u8 pad[2];
> +		__u64 serror_esr;
> +	} exception;
> +};
> +

I'm not convinced we should change this struct from the layout/size x86 has. Its
confusing for the documentation, is this API call really the same on all
architectures?

What if we want to add some future interrupt, NMI or related state? We've found
ourselves needing to add this API, it seems odd to remove its other uses on x86.
We can't put them back in the future.

Having a different layout would force user-space to ifdef/duplicate any code
that accesses this between architectures.



The compiler will want that __u64 to be naturally aligned to 8-bytes, so your
4-byte padding still causes some secret compiler-padding to be inserted.
Different versions of the compiler may put it in different places.


>  4.32 KVM_SET_VCPU_EVENTS
>  
>  Capability: KVM_CAP_VCPU_EVENTS
>  Extended by: KVM_CAP_INTR_SHADOW
> -Architectures: x86
> +Architectures: x86, arm, arm64
>  Type: vm ioctl
>  Parameters: struct kvm_vcpu_event (in)
>  Returns: 0 on success, -1 on error
>  
> +X86:
> +
>  Set pending exceptions, interrupts, and NMIs as well as related states of the
>  vcpu.
>  
> @@ -894,6 +912,12 @@ shall be written into the VCPU.
>  
>  KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
>  
> +ARM, ARM64:
> +
> +Set pending SError exceptions as well as related states of the vcpu.
> +
> +See KVM_GET_VCPU_EVENTS for the data structure.
> +
>  
>  4.33 KVM_GET_DEBUGREGS
>  


> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 9abbf30..855cc9a 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -39,6 +39,7 @@
>  #define __KVM_HAVE_GUEST_DEBUG
>  #define __KVM_HAVE_IRQ_LINE
>  #define __KVM_HAVE_READONLY_MEM
> +#define __KVM_HAVE_VCPU_EVENTS
>  
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
>  
> @@ -153,6 +154,17 @@ struct kvm_sync_regs {
>  struct kvm_arch_memory_slot {
>  };
>  
> +/* for KVM_GET/SET_VCPU_EVENTS */
> +struct kvm_vcpu_events {
> +	struct {
> +		__u8 serror_pending;
> +		__u8 serror_has_esr;

> +		/* Align it to 4 bytes */
> +		__u8 pad[2];

(padding noted above)


> +		__u64 serror_esr;
> +	} exception;
> +};
> +
>  /* If you need to interpret the index values, here is the key: */
>  #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
>  #define KVM_REG_ARM_COPROC_SHIFT	16


> diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
> index 5c7f657..42e1222 100644
> --- a/arch/arm64/kvm/guest.c
> +++ b/arch/arm64/kvm/guest.c
> @@ -277,6 +277,37 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
>  	return -EINVAL;
>  }
>  
> +int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
> +			struct kvm_vcpu_events *events)
> +{
> +	events->exception.serror_pending = (vcpu_get_hcr(vcpu) & HCR_VSE);
> +	events->exception.serror_has_esr =
> +			cpus_have_const_cap(ARM64_HAS_RAS_EXTN) &&
> +					(!!vcpu_get_vsesr(vcpu));

> +	events->exception.serror_esr = vcpu_get_vsesr(vcpu);

This will return a stale ESR even if nothing is pending. On systems without the
RAS extensions it will return 'ESR_ELx_ISV' if kvm_inject_vabt() has ever been
called for this CPU.

Could we hide this behind (pending && has_esr), setting it to 0 otherwise. This
is just to avoid exposing the stale value.


> +
> +	return 0;
> +}

> +int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
> +			struct kvm_vcpu_events *events)
> +{
> +	bool injected = events->exception.serror_pending;
> +	bool has_esr = events->exception.serror_has_esr;
> +
> +	if (injected && has_esr) {
> +		if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
> +			return -EINVAL;
> +
> +		kvm_set_sei_esr(vcpu, events->exception.serror_esr);
> +
> +	} else if (injected) {
> +		kvm_inject_vabt(vcpu);

Nit: looks like 'injected' is misnamed.


> +	}
> +
> +	return 0;
> +}
> +
>  int __attribute_const__ kvm_target_cpu(void)
>  {
>  	unsigned long implementor = read_cpuid_implementor();


> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index 38c8a64..20e919a 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -82,6 +82,7 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  		break;
>  	case KVM_CAP_SET_GUEST_DEBUG:
>  	case KVM_CAP_VCPU_ATTRIBUTES:
> +	case KVM_CAP_VCPU_EVENTS:
>  		r = 1;
>  		break;
>  	default:

> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
> index 7e3941f..945655d 100644
> --- a/virt/kvm/arm/arm.c
> +++ b/virt/kvm/arm/arm.c
> @@ -1051,6 +1051,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  			return -EFAULT;
>  		return kvm_arm_vcpu_has_attr(vcpu, &attr);
>  	}
> +	case KVM_GET_VCPU_EVENTS: {
> +		struct kvm_vcpu_events events;
> +
> +		memset(&events, 0, sizeof(struct kvm_vcpu_events));

sizeof(events) is the normal style here, it means if someone changes event's
type we don't get any surprises...


> +		if (kvm_arm_vcpu_get_events(vcpu, &events))
> +			return -EINVAL;
> +
> +		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))

sizeof(events)


> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case KVM_SET_VCPU_EVENTS: {
> +		struct kvm_vcpu_events events;
> +
> +		if (copy_from_user(&events, argp,
> +				sizeof(struct kvm_vcpu_events)))
> +			return -EFAULT;
> +
> +		return kvm_arm_vcpu_set_events(vcpu, &events);
> +	}
>  	default:
>  		return -EINVAL;
>  	}
> 

Despite KVM_CAP_VCPU_EVENTS not being advertised on 32bit, any attempt to call
it will still end up in here, but will always fail as the {g,s}et_events() calls
always return -EINVAL. I don't think this will cause us any problems.


Thanks,

James
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v11 1/4] arm64: KVM: export the capability to set guest SError syndrome
From: James Morse @ 2018-04-10 14:15 UTC (permalink / raw)
  To: Dongjiu Geng
  Cc: rkrcmar, corbet, christoffer.dall, marc.zyngier, linux,
	catalin.marinas, rjw, bp, lenb, kvm, linux-doc, linux-kernel,
	linux-arm-kernel, kvmarm, linux-acpi, devel, huangshaoyu,
	zhengxiang9
In-Reply-To: <1523309796-36423-2-git-send-email-gengdongjiu@huawei.com>

Hi Dongjiu Geng,

On 09/04/18 22:36, Dongjiu Geng wrote:
> Before user space injects a SError, it needs to know whether it can
> specify the guest Exception Syndrome, so KVM should tell user space
> whether it has such capability.

(you could improve the commit message by briefly explaining how/why user-space
would want to do this. As this is patch 1, you don't have the context of the
previous patch to say that some systems can provide an ESR with virtual-SError)


> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index fc3ae95..8a3d708 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -4415,3 +4415,14 @@ Parameters: none
>  This capability indicates if the flic device will be able to get/set the
>  AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
>  to discover this without having to create a flic device.
> +
> +8.14 KVM_CAP_ARM_SET_SERROR_ESR
> +
> +Architectures: arm, arm64
> +
> +This capability indicates that userspace can specify syndrome value reported to

(Nit: 'the syndrome value')

> +guest OS when guest takes a virtual SError interrupt exception.

(Nit: 'the guest')

> +If KVM has this capability, userspace can only specify the ISS field for the ESR
> +syndrome, can not specify the EC field which is not under control by KVM.

(Nit: 'it can not specify...')

> +If this virtual SError is taken to EL1 using AArch64, this value will be reported
> +into ISS filed of ESR_EL1.

(Nit: 'in the ISS field')


> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index 3256b92..38c8a64 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -77,6 +77,9 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_ARM_PMU_V3:
>  		r = kvm_arm_support_pmu_v3();
>  		break;
> +	case KVM_CAP_ARM_INJECT_SERROR_ESR:
> +		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
> +		break;
>  	case KVM_CAP_SET_GUEST_DEBUG:
>  	case KVM_CAP_VCPU_ATTRIBUTES:
>  		r = 1;

'dev_ioctl' feels a bit weird, but we already have cpu_has_32bit_el1() in here.


> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 8fb90a0..3587b33 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -934,6 +934,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_AIS_MIGRATION 150
>  #define KVM_CAP_PPC_GET_CPU_CHAR 151
>  #define KVM_CAP_S390_BPB 152
> +#define KVM_CAP_ARM_INJECT_SERROR_ESR 153
>  
>  #ifdef KVM_CAP_IRQ_ROUTING

(patch 1&2 should probably be swapped around, as on its own this does thing).

Reviewed-by: James Morse <james.morse@arm.com>


Thanks,

James
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V2 3/9] dt-bindings: Tegra186 tachometer device tree bindings
From: Guenter Roeck @ 2018-04-10 13:43 UTC (permalink / raw)
  To: Rob Herring, Mikko Perttunen
  Cc: Rajkumar Rampelli, Mark Rutland, Thierry Reding, Jon Hunter,
	Jean Delvare, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Kate Stewart, Greg Kroah-Hartman, Philippe Ombredanne,
	Manikanta Maddireddy, Mikko Perttunen, Arnd Bergmann, Timur Tabi,
	Andy Gross, Wei Xu, Alex Elder, heiko@sntech.de,
	Krzysztof Kozlowski, Ard Biesheuvel, devicetree,
	linux-kernel@vger.kernel.org, Linux PWM List, linux-tegra,
	Linux HWMON List, linux-doc,
	moderated list:ARM/FREESCALE IMX / MXC ARM ARCHITECTURE,
	Laxman Dewangan
In-Reply-To: <CAL_JsqJ-ZJtprXcLxVhRGKp30DTFRO1NzNoqpeSne0marcnmBQ@mail.gmail.com>

On 04/10/2018 06:30 AM, Rob Herring wrote:
> On Mon, Apr 9, 2018 at 9:37 AM, Mikko Perttunen <cyndis@kapsi.fi> wrote:
>>
>>
>> On 04/09/2018 04:21 PM, Rob Herring wrote:
>>>
>>> On Mon, Apr 9, 2018 at 12:38 AM, Mikko Perttunen <cyndis@kapsi.fi> wrote:
>>>>
>>>> Rob,
>>>
>>>
>>> Please don't top post to lists.
>>>
>>>> this binding is for a specific IP block (for measuring/aggregating input
>>>> pulses) on the Tegra186 SoC, so I don't think it fits into any generic
>>>> binding.
>>>
>>>
>>> What is it hooked up to to measure? You only mention "fan" five times
>>> in the doc.
>>
>>
>> In practice, fans.
>>
>>>
>>> You have #pwm-cells too, so this block has PWM output as well? If not,
>>> then where's the PWM for the fan control because there is no point in
>>> having fan tach without some control mechanism.
>>
>>
>> It doesn't provide a PWM output. The (Linux) PWM framework provides
>> functionality in both directions - control and capture. But if the device
>> tree #pwm-cells/pwms properties are only for control, we may need to
>> introduce a new #capture-pwm-cells/capture-pwms or similar.
> 
> Yes, perhaps. But there is no point in having
> #capture-pwm-cells/capture-pwms if you aren't describing the
> connection between the fan and the fan controller.
> 
>> The idea is that the generic fan node can then specify two pwms, one for
>> control and one for capture, to enable e.g. closed-loop control (I'm not
>> personally familiar with the usecase for this but I could imagine something
>> like that). The control PWM can be something completely different, maybe not
>> a PWM in the first place (e.g. some fixed voltage).
> 
> Yes. As you can have different types of fans (3-wire, 4-wire, etc.)
> they would have different compatibles and differing properties
> associated with them.
> 
>>> There's only so many ways to control fans and types of fans, so yes,
>>> the interface of control and feedback lines between a fan and its
>>> controller should absolutely be generic.
>>
>>
>> I'm not quite getting what you mean by this. Clearly we need a custom
>> compatibility string for the tachometer as it's a different hardware block
>> with different programming than others.
> 
> Yes, of course. It's the interface between fan controllers and fans
> that I'm concerned about.
> 
>> Or are you complaining about the
>> nvidia,pulse-per-rev/capture-window-len properties?
> 
> Well, those sound like properties of a fan (at least the first one),
> so they belong in a fan node.
> 
> The aspeed fan controller is probably the closest thing we have to a
> fan binding. Look at that if you haven't already.
> 

FWIW, this is a fan speed (tachometer) counter which is modeled as pwm input.
This, in my opinion, and as stated before, is conceptually wrong. The pwm
subsystem should not (need to) know anything about fans, much less about
specifics such as the number of pulses per revolution.

Guenter
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V2 3/9] dt-bindings: Tegra186 tachometer device tree bindings
From: Rob Herring @ 2018-04-10 13:30 UTC (permalink / raw)
  To: Mikko Perttunen
  Cc: Rajkumar Rampelli, Mark Rutland, Thierry Reding, Jon Hunter,
	Jean Delvare, Guenter Roeck, Jonathan Corbet, Catalin Marinas,
	Will Deacon, Kate Stewart, Greg Kroah-Hartman,
	Philippe Ombredanne, Manikanta Maddireddy, Mikko Perttunen,
	Arnd Bergmann, Timur Tabi, Andy Gross, Wei Xu, Alex Elder,
	heiko@sntech.de, Krzysztof Kozlowski, Ard Biesheuvel, devicetree,
	linux-kernel@vger.kernel.org, Linux PWM List, linux-tegra,
	Linux HWMON List, linux-doc,
	moderated list:ARM/FREESCALE IMX / MXC ARM ARCHITECTURE,
	Laxman Dewangan
In-Reply-To: <615d6771-00c3-a4a7-a99f-ec4f6e667c8f@kapsi.fi>

On Mon, Apr 9, 2018 at 9:37 AM, Mikko Perttunen <cyndis@kapsi.fi> wrote:
>
>
> On 04/09/2018 04:21 PM, Rob Herring wrote:
>>
>> On Mon, Apr 9, 2018 at 12:38 AM, Mikko Perttunen <cyndis@kapsi.fi> wrote:
>>>
>>> Rob,
>>
>>
>> Please don't top post to lists.
>>
>>> this binding is for a specific IP block (for measuring/aggregating input
>>> pulses) on the Tegra186 SoC, so I don't think it fits into any generic
>>> binding.
>>
>>
>> What is it hooked up to to measure? You only mention "fan" five times
>> in the doc.
>
>
> In practice, fans.
>
>>
>> You have #pwm-cells too, so this block has PWM output as well? If not,
>> then where's the PWM for the fan control because there is no point in
>> having fan tach without some control mechanism.
>
>
> It doesn't provide a PWM output. The (Linux) PWM framework provides
> functionality in both directions - control and capture. But if the device
> tree #pwm-cells/pwms properties are only for control, we may need to
> introduce a new #capture-pwm-cells/capture-pwms or similar.

Yes, perhaps. But there is no point in having
#capture-pwm-cells/capture-pwms if you aren't describing the
connection between the fan and the fan controller.

> The idea is that the generic fan node can then specify two pwms, one for
> control and one for capture, to enable e.g. closed-loop control (I'm not
> personally familiar with the usecase for this but I could imagine something
> like that). The control PWM can be something completely different, maybe not
> a PWM in the first place (e.g. some fixed voltage).

Yes. As you can have different types of fans (3-wire, 4-wire, etc.)
they would have different compatibles and differing properties
associated with them.

>> There's only so many ways to control fans and types of fans, so yes,
>> the interface of control and feedback lines between a fan and its
>> controller should absolutely be generic.
>
>
> I'm not quite getting what you mean by this. Clearly we need a custom
> compatibility string for the tachometer as it's a different hardware block
> with different programming than others.

Yes, of course. It's the interface between fan controllers and fans
that I'm concerned about.

> Or are you complaining about the
> nvidia,pulse-per-rev/capture-window-len properties?

Well, those sound like properties of a fan (at least the first one),
so they belong in a fan node.

The aspeed fan controller is probably the closest thing we have to a
fan binding. Look at that if you haven't already.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] gpiolib: add hogs support for machine code
From: Bartosz Golaszewski @ 2018-04-10 12:40 UTC (permalink / raw)
  To: Linus Walleij, Jonathan Corbet
  Cc: linux-gpio, linux-doc, linux-kernel, Bartosz Golaszewski

Board files constitute a significant part of the users of the legacy
GPIO framework. In many cases they only export a line and set its
desired value. We could use GPIO hogs for that like we do for DT and
ACPI but there's no support for that in machine code.

This patch proposes to extend the machine.h API with support for
registering hog tables in board files.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 Documentation/driver-api/gpio/board.rst | 16 ++++++
 drivers/gpio/gpiolib.c                  | 67 +++++++++++++++++++++++++
 include/linux/gpio/machine.h            | 31 ++++++++++++
 3 files changed, 114 insertions(+)

diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst
index 25d62b2e9fd0..2c112553df84 100644
--- a/Documentation/driver-api/gpio/board.rst
+++ b/Documentation/driver-api/gpio/board.rst
@@ -177,3 +177,19 @@ mapping and is thus transparent to GPIO consumers.
 
 A set of functions such as gpiod_set_value() is available to work with
 the new descriptor-oriented interface.
+
+Boards using platform data can also hog GPIO lines by defining GPIO hog tables.
+
+.. code-block:: c
+
+        struct gpiod_hog gpio_hog_table[] = {
+                GPIO_HOG("gpio.0", 10, "foo", GPIO_ACTIVE_LOW, GPIOD_OUT_HIGH),
+                { }
+        };
+
+And the table can be added to the board code as follows::
+
+        gpiod_add_hogs(gpio_hog_table);
+
+The line will be hogged as soon as the gpiochip is created or - in case the
+chip was created earlier - when the hog table is registered.
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 43aeb07343ec..547adc149b62 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -71,6 +71,9 @@ static DEFINE_MUTEX(gpio_lookup_lock);
 static LIST_HEAD(gpio_lookup_list);
 LIST_HEAD(gpio_devices);
 
+static DEFINE_MUTEX(gpio_machine_hogs_mutex);
+static LIST_HEAD(gpio_machine_hogs);
+
 static void gpiochip_free_hogs(struct gpio_chip *chip);
 static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
 				struct lock_class_key *lock_key,
@@ -1171,6 +1174,41 @@ static int gpiochip_setup_dev(struct gpio_device *gdev)
 	return status;
 }
 
+static void gpiochip_machine_hog(struct gpio_chip *chip, struct gpiod_hog *hog)
+{
+	struct gpio_desc *desc;
+	int rv;
+
+	desc = gpiochip_get_desc(chip, hog->chip_hwnum);
+	if (IS_ERR(desc)) {
+		pr_err("%s: unable to get GPIO desc: %ld\n",
+		       __func__, PTR_ERR(desc));
+		return;
+	}
+
+	if (desc->flags & FLAG_IS_HOGGED)
+		return;
+
+	rv = gpiod_hog(desc, hog->line_name, hog->lflags, hog->dflags);
+	if (rv)
+		pr_err("%s: unable to hog GPIO line (%s:%u): %d\n",
+		       __func__, chip->label, hog->chip_hwnum, rv);
+}
+
+static void machine_gpiochip_add(struct gpio_chip *chip)
+{
+	struct gpiod_hog *hog;
+
+	mutex_lock(&gpio_machine_hogs_mutex);
+
+	list_for_each_entry(hog, &gpio_machine_hogs, list) {
+		if (!strcmp(chip->label, hog->chip_label))
+			gpiochip_machine_hog(chip, hog);
+	}
+
+	mutex_unlock(&gpio_machine_hogs_mutex);
+}
+
 static void gpiochip_setup_devs(void)
 {
 	struct gpio_device *gdev;
@@ -1326,6 +1364,8 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
 
 	acpi_gpiochip_add(chip);
 
+	machine_gpiochip_add(chip);
+
 	/*
 	 * By first adding the chardev, and then adding the device,
 	 * we get a device node entry in sysfs under
@@ -3462,6 +3502,33 @@ void gpiod_remove_lookup_table(struct gpiod_lookup_table *table)
 }
 EXPORT_SYMBOL_GPL(gpiod_remove_lookup_table);
 
+/**
+ * gpiod_add_hogs() - register a set of GPIO hogs from machine code
+ * @hogs: table of gpio hog entries with a zeroed sentinel at the end
+ */
+void gpiod_add_hogs(struct gpiod_hog *hogs)
+{
+	struct gpio_chip *chip;
+	struct gpiod_hog *hog;
+
+	mutex_lock(&gpio_machine_hogs_mutex);
+
+	for (hog = &hogs[0]; hog->chip_label; hog++) {
+		list_add_tail(&hog->list, &gpio_machine_hogs);
+
+		/*
+		 * The chip may have been registered earlier, so check if it
+		 * exists and, if so, try to hog the line now.
+		 */
+		chip = find_chip_by_name(hog->chip_label);
+		if (chip)
+			gpiochip_machine_hog(chip, hog);
+	}
+
+	mutex_unlock(&gpio_machine_hogs_mutex);
+}
+EXPORT_SYMBOL_GPL(gpiod_add_hogs);
+
 static struct gpiod_lookup_table *gpiod_find_lookup_table(struct device *dev)
 {
 	const char *dev_id = dev ? dev_name(dev) : NULL;
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index b2f2dc638463..517957d6b168 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -39,6 +39,23 @@ struct gpiod_lookup_table {
 	struct gpiod_lookup table[];
 };
 
+/**
+ * struct gpiod_hog - GPIO line hog table
+ * @chip_label: name of the chip the GPIO belongs to
+ * @chip_hwnum: hardware number (i.e. relative to the chip) of the GPIO
+ * @line_name: consumer name for the hogged line
+ * @lflags: mask of GPIO lookup flags
+ * @dflags: GPIO flags used to specify the direction and value
+ */
+struct gpiod_hog {
+	struct list_head list;
+	const char *chip_label;
+	u16 chip_hwnum;
+	const char *line_name;
+	enum gpio_lookup_flags lflags;
+	enum gpiod_flags dflags;
+};
+
 /*
  * Simple definition of a single GPIO under a con_id
  */
@@ -59,10 +76,23 @@ struct gpiod_lookup_table {
 	.flags = _flags,                                                  \
 }
 
+/*
+ * Simple definition of a single GPIO hog in an array.
+ */
+#define GPIO_HOG(_chip_label, _chip_hwnum, _line_name, _lflags, _dflags)  \
+{                                                                         \
+	.chip_label = _chip_label,                                        \
+	.chip_hwnum = _chip_hwnum,                                        \
+	.line_name = _line_name,                                          \
+	.lflags = _lflags,                                                \
+	.dflags = _dflags,                                                \
+}
+
 #ifdef CONFIG_GPIOLIB
 void gpiod_add_lookup_table(struct gpiod_lookup_table *table);
 void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n);
 void gpiod_remove_lookup_table(struct gpiod_lookup_table *table);
+void gpiod_add_hogs(struct gpiod_hog *hogs);
 #else
 static inline
 void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {}
@@ -70,6 +100,7 @@ static inline
 void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {}
 static inline
 void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {}
+static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {}
 #endif
 
 #endif /* __LINUX_GPIO_MACHINE_H */
-- 
2.17.0

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v2 7/9] trace_uprobe/sdt: Fix multiple update of same reference counter
From: Oleg Nesterov @ 2018-04-10 11:06 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, peterz, srikar, rostedt, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	jolsa, kan.liang, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, milian.wolff, mingo, namhyung, naveen.n.rao, pc, tglx,
	yao.jin, fengguang.wu, jglisse
In-Reply-To: <84c1e60f-8aad-a0ce-59af-4fcb3f77df94@linux.vnet.ibm.com>

Hi Ravi,

On 04/10, Ravi Bangoria wrote:
>
> > and what if __mmu_notifier_register() fails simply because signal_pending() == T?
> > see mm_take_all_locks().
> >
> > at first glance this all look suspicious and sub-optimal,
>
> Yes. I should have added checks for failure cases.
> Will fix them in v3.

And what can you do if it fails? Nothing except report the problem. But
signal_pending() is not the unlikely or error condition, it should not
cause the tracing errors.

Plus mm_take_all_locks() is very heavy... BTW, uprobe_mmap_callback() is
called unconditionally. Whatever it does, can we at least move it after
the no_uprobe_events() check? Can't we also check MMF_HAS_UPROBES?

Either way, I do not feel that mmu_notifier is the right tool... Did you
consider the uprobe_clear_state() hook we already have?

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 00/32] docs/vm: convert to ReST format
From: Mike Rapoport @ 2018-04-10 10:23 UTC (permalink / raw)
  To: Jonathan Corbet, Andrew Morton
  Cc: Andrey Ryabinin, Richard Henderson, Ivan Kokshaysky, Matt Turner,
	Tony Luck, Fenghua Yu, Ralf Baechle, James Hogan,
	Michael Ellerman, Alexander Viro, linux-kernel, linux-doc,
	kasan-dev, linux-alpha, linux-ia64, linux-mips, linuxppc-dev,
	linux-fsdevel, linux-mm
In-Reply-To: <20180401063857.GA3357@rapoport-lnx>

Jon, Andrew,

How do you suggest to continue with this?

On Sun, Apr 01, 2018 at 09:38:58AM +0300, Mike Rapoport wrote:
> (added akpm)
> 
> On Thu, Mar 29, 2018 at 03:46:07PM -0600, Jonathan Corbet wrote:
> > On Wed, 21 Mar 2018 21:22:16 +0200
> > Mike Rapoport <rppt@linux.vnet.ibm.com> wrote:
> > 
> > > These patches convert files in Documentation/vm to ReST format, add an
> > > initial index and link it to the top level documentation.
> > > 
> > > There are no contents changes in the documentation, except few spelling
> > > fixes. The relatively large diffstat stems from the indentation and
> > > paragraph wrapping changes.
> > > 
> > > I've tried to keep the formatting as consistent as possible, but I could
> > > miss some places that needed markup and add some markup where it was not
> > > necessary.
> > 
> > So I've been pondering on these for a bit.  It looks like a reasonable and
> > straightforward RST conversion, no real complaints there.  But I do have a
> > couple of concerns...
> > 
> > One is that, as we move documentation into RST, I'm really trying to
> > organize it a bit so that it is better tuned to the various audiences we
> > have.  For example, ksm.txt is going to be of interest to sysadmin types,
> > who might want to tune it.  mmu_notifier.txt is of interest to ...
> > somebody, but probably nobody who is thinking in user space.  And so on.
> > 
> > So I would really like to see this material split up and put into the
> > appropriate places in the RST hierarchy - admin-guide for administrative
> > stuff, core-api for kernel development topics, etc.  That, of course,
> > could be done separately from the RST conversion, but I suspect I know
> > what will (or will not) happen if we agree to defer that for now :)
> 
> Well, I was actually planning on doing that ;-)
> 
> My thinking was to start with mechanical RST conversion and then to start
> working on the contents and ordering of the documentation. Some of the
> existing files, e.g. ksm.txt, can be moved as is into the appropriate
> places, others, like transhuge.txt should be at least split into admin/user
> and developer guides.
> 
> Another problem with many of the existing mm docs is that they are rather
> developer notes and it wouldn't be really straight forward to assign them
> to a particular topic.
> 
> I believe that keeping the mm docs together will give better visibility of
> what (little) mm documentation we have and will make the updates easier.
> The documents that fit well into a certain topic could be linked there. For
> instance:
> 
> -------------------------
> diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
> index 5bb9161..8f6c6e6 100644
> --- a/Documentation/admin-guide/index.rst
> +++ b/Documentation/admin-guide/index.rst
> @@ -63,6 +63,7 @@ configure specific aspects of kernel behavior to your liking.
>     pm/index
>     thunderbolt
>     LSM/index
> +   vm/index
> 
>  .. only::  subproject and html
> 
> diff --git a/Documentation/admin-guide/vm/index.rst b/Documentation/admin-guide/vm/index.rst
> new file mode 100644
> index 0000000..d86f1c8
> --- /dev/null
> +++ b/Documentation/admin-guide/vm/index.rst
> @@ -0,0 +1,5 @@
> +==============================================
> +Knobs and Buttons for Memory Management Tuning
> +==============================================
> +
> +* :ref:`ksm <ksm>`
> -------------------------
> 
> > The other is the inevitable merge conflicts that changing that many doc
> > files will create.  Sending the patches through Andrew could minimize
> > that, I guess, or at least make it his problem.  Alternatively, we could
> > try to do it as an end-of-merge-window sort of thing.  I can try to manage
> > that, but an ack or two from the mm crowd would be nice to have.
> 
> I can rebase on top of Andrew's tree if that would help to minimize the
> merge conflicts.
> 
> > Thanks,
> > 
> > jon
> > 
> 
> -- 
> Sincerely yours,
> Mike.
> 

-- 
Sincerely yours,
Mike.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC bpf-next] bpf: document eBPF helpers and add a script to generate man page
From: Quentin Monnet @ 2018-04-10 10:21 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, ast, netdev, oss-drivers, linux-doc, linux-man
In-Reply-To: <20180410014751.mqwqdyujvybir6g5@ast-mbp.dhcp.thefacebook.com>

2018-04-09 18:47 UTC-0700 ~ Alexei Starovoitov
<alexei.starovoitov@gmail.com>
> On Mon, Apr 09, 2018 at 02:25:26PM +0100, Quentin Monnet wrote:
>>
>> Anyway, I am fine with keeping just signatures, descriptions and return
>> values for now. I will submit a new version with only those items.
> 
> Thank you.
> 
> Could you also split it into few patches?
>  include/uapi/linux/bpf.h   | 2237 ++++++++++++++++++++++++++++++++++++--------
>  scripts/bpf_helpers_doc.py |  568 +++++++++++
>  2 files changed, 2429 insertions(+), 376 deletions(-)
> 
> replying back and forth on a single patch of such size will be tedious
> for others to follow.
> May be document ~10 helpers at a time ? Total of ~7 patches and extra
> patch for .py ?
> 

Sure, I'll do that. And I'll try to group helpers in a patch by author,
it should also help for reviewing the descriptions.

Quentin
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 7/9] trace_uprobe/sdt: Fix multiple update of same reference counter
From: Ravi Bangoria @ 2018-04-10  8:19 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: mhiramat, peterz, srikar, rostedt, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	jolsa, kan.liang, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, milian.wolff, mingo, namhyung, naveen.n.rao, pc, tglx,
	yao.jin, fengguang.wu, jglisse, Ravi Bangoria
In-Reply-To: <20180409132928.GA25722@redhat.com>

Hi Oleg,

On 04/09/2018 06:59 PM, Oleg Nesterov wrote:
> On 04/04, Ravi Bangoria wrote:
>> +static void sdt_add_mm_list(struct trace_uprobe *tu, struct mm_struct *mm)
>> +{
>> +	struct mmu_notifier *mn;
>> +	struct sdt_mm_list *sml = kzalloc(sizeof(*sml), GFP_KERNEL);
>> +
>> +	if (!sml)
>> +		return;
>> +	sml->mm = mm;
>> +	list_add(&(sml->list), &(tu->sml.list));
>> +
>> +	/* Register mmu_notifier for this mm. */
>> +	mn = kzalloc(sizeof(*mn), GFP_KERNEL);
>> +	if (!mn)
>> +		return;
>> +
>> +	mn->ops = &sdt_mmu_notifier_ops;
>> +	__mmu_notifier_register(mn, mm);
>> +}
> and what if __mmu_notifier_register() fails simply because signal_pending() == T?
> see mm_take_all_locks().
>
> at first glance this all look suspicious and sub-optimal,

Yes. I should have added checks for failure cases.
Will fix them in v3.

Thanks for the review,
Ravi

>  but let me repeat that
> I didn't read this version yet.
>
> Oleg.
>

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2] squashfs: Add posix acl support
From: Geliang Tang @ 2018-04-10  2:24 UTC (permalink / raw)
  To: Phillip Lougher, Jonathan Corbet, kbuild test robot
  Cc: Geliang Tang, linux-doc, linux-kernel
In-Reply-To: <201804100217.d2VDp6Fv%fengguang.wu@intel.com>

Add posix acl (Access Control Lists) support for squashfs, which is
marked as a todo item in squashfs' documentation.  This patch implements
the squashfs_get_acl function to read file's acl information from its
xattr lists.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
---
Changes in v2:
 - fix build error, set squashfs_get_acl to NULL when CONFIG_SQUASHFS_POSIX_ACL is not selected.
---
 Documentation/filesystems/squashfs.txt |  2 -
 fs/squashfs/Kconfig                    | 11 ++++++
 fs/squashfs/Makefile                   |  1 +
 fs/squashfs/acl.c                      | 69 ++++++++++++++++++++++++++++++++++
 fs/squashfs/acl.h                      | 31 +++++++++++++++
 fs/squashfs/inode.c                    |  4 +-
 fs/squashfs/namei.c                    |  6 ++-
 fs/squashfs/squashfs_fs.h              | 12 +++---
 fs/squashfs/super.c                    |  3 ++
 fs/squashfs/symlink.c                  |  6 ++-
 fs/squashfs/xattr.c                    | 13 ++++++-
 fs/squashfs/xattr.h                    |  8 ++++
 12 files changed, 153 insertions(+), 13 deletions(-)
 create mode 100644 fs/squashfs/acl.c
 create mode 100644 fs/squashfs/acl.h

diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index e5274f84dc56..539fad6b4db0 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -235,8 +235,6 @@ list using a second xattr id lookup table.
 4.1 Todo list
 -------------
 
-Implement ACL support.
-
 4.2 Squashfs internal cache
 ---------------------------
 
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1adb3346b9d6..f9587bcf9dd9 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -107,6 +107,17 @@ config SQUASHFS_XATTR
 
 	  If unsure, say N.
 
+config SQUASHFS_POSIX_ACL
+	bool "Squashfs POSIX ACL support"
+	depends on SQUASHFS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Saying Y here includes support for Access Control Lists (acls).
+	  Acls are used to define more fine-grained discretionary access
+	  rights for files and directories (see the acl(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_ZLIB
 	bool "Include support for ZLIB compressed file systems"
 	depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7bd9b8b856d0..73bc1c8a8df6 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -12,6 +12,7 @@ squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_POSIX_ACL) += acl.o
 squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/acl.c b/fs/squashfs/acl.c
new file mode 100644
index 000000000000..1c9eb2d13c2b
--- /dev/null
+++ b/fs/squashfs/acl.c
@@ -0,0 +1,69 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2018
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * acl.c
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "squashfs_fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+struct posix_acl *squashfs_get_acl(struct inode *inode, int type)
+{
+	int name_index;
+	char *name;
+	struct posix_acl *acl = NULL;
+	char *value = NULL;
+	int retval;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = SQUASHFS_XATTR_POSIX_ACL_ACCESS;
+		name = XATTR_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = SQUASHFS_XATTR_POSIX_ACL_DEFAULT;
+		name = XATTR_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+	retval = squashfs_xattr_get(inode, name_index, name, NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = squashfs_xattr_get(inode, name_index, name, value, retval);
+	}
+	if (retval > 0)
+		acl = posix_acl_from_xattr(&init_user_ns, value, retval);
+	else if (retval == -ENODATA || retval == -ENOSYS)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
diff --git a/fs/squashfs/acl.h b/fs/squashfs/acl.h
new file mode 100644
index 000000000000..06f704e05450
--- /dev/null
+++ b/fs/squashfs/acl.h
@@ -0,0 +1,31 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2018
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * acl.h
+ */
+
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+
+#ifdef CONFIG_SQUASHFS_POSIX_ACL
+extern struct posix_acl *squashfs_get_acl(struct inode *inode, int type);
+#else
+#define squashfs_get_acl	NULL
+#endif
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index e9793b1e49a5..2035a1acffd7 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -48,6 +48,7 @@
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "xattr.h"
+#include "acl.h"
 
 /*
  * Initialise VFS inode with the base inode information common to all
@@ -425,6 +426,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 
 
 const struct inode_operations squashfs_inode_ops = {
-	.listxattr = squashfs_listxattr
+	.listxattr	= squashfs_listxattr,
+	.get_acl	= squashfs_get_acl
 };
 
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 40c10d9974c9..33ad74780040 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -64,6 +64,7 @@
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "xattr.h"
+#include "acl.h"
 
 /*
  * Lookup name in the directory index, returning the location of the metadata
@@ -246,6 +247,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
 
 
 const struct inode_operations squashfs_dir_inode_ops = {
-	.lookup = squashfs_lookup,
-	.listxattr = squashfs_listxattr
+	.lookup		= squashfs_lookup,
+	.listxattr	= squashfs_listxattr,
+	.get_acl	= squashfs_get_acl
 };
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 24d12fd14177..c7ac9fc4f8f4 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -107,11 +107,13 @@
 #define SQUASHFS_MAX_DIR_TYPE		7
 
 /* Xattr types */
-#define SQUASHFS_XATTR_USER             0
-#define SQUASHFS_XATTR_TRUSTED          1
-#define SQUASHFS_XATTR_SECURITY         2
-#define SQUASHFS_XATTR_VALUE_OOL        256
-#define SQUASHFS_XATTR_PREFIX_MASK      0xff
+#define SQUASHFS_XATTR_USER			0
+#define SQUASHFS_XATTR_POSIX_ACL_ACCESS		1
+#define SQUASHFS_XATTR_POSIX_ACL_DEFAULT	2
+#define SQUASHFS_XATTR_TRUSTED			3
+#define SQUASHFS_XATTR_SECURITY			4
+#define SQUASHFS_XATTR_VALUE_OOL		256
+#define SQUASHFS_XATTR_PREFIX_MASK		0xff
 
 /* Flag whether block is compressed or uncompressed, bit is set if block is
  * uncompressed */
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 8a73b97217c8..beea564f1063 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -196,6 +196,9 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_flags |= SB_RDONLY;
+#ifdef CONFIG_SQUASHFS_POSIX_ACL
+	sb->s_flags |= SB_POSIXACL;
+#endif
 	sb->s_op = &squashfs_super_ops;
 
 	err = -ENOMEM;
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index befeba0fa70a..a7f30d890905 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -42,6 +42,7 @@
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "xattr.h"
+#include "acl.h"
 
 static int squashfs_symlink_readpage(struct file *file, struct page *page)
 {
@@ -118,7 +119,8 @@ const struct address_space_operations squashfs_symlink_aops = {
 };
 
 const struct inode_operations squashfs_symlink_inode_ops = {
-	.get_link = page_get_link,
-	.listxattr = squashfs_listxattr
+	.get_link	= page_get_link,
+	.listxattr	= squashfs_listxattr,
+	.get_acl	= squashfs_get_acl
 };
 
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 1548b3784548..a1d773b5b0bc 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -33,6 +33,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "acl.h"
 
 static const struct xattr_handler *squashfs_xattr_handler(int);
 
@@ -115,7 +116,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
 }
 
 
-static int squashfs_xattr_get(struct inode *inode, int name_index,
+int squashfs_xattr_get(struct inode *inode, int name_index,
 	const char *name, void *buffer, size_t buffer_size)
 {
 	struct super_block *sb = inode->i_sb;
@@ -265,6 +266,12 @@ static const struct xattr_handler *squashfs_xattr_handler(int type)
 	switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
 	case SQUASHFS_XATTR_USER:
 		return &squashfs_xattr_user_handler;
+#ifdef CONFIG_SQUASHFS_POSIX_ACL
+	case SQUASHFS_XATTR_POSIX_ACL_ACCESS:
+		return &posix_acl_access_xattr_handler;
+	case SQUASHFS_XATTR_POSIX_ACL_DEFAULT:
+		return &posix_acl_default_xattr_handler;
+#endif
 	case SQUASHFS_XATTR_TRUSTED:
 		return &squashfs_xattr_trusted_handler;
 	case SQUASHFS_XATTR_SECURITY:
@@ -277,6 +284,10 @@ static const struct xattr_handler *squashfs_xattr_handler(int type)
 
 const struct xattr_handler *squashfs_xattr_handlers[] = {
 	&squashfs_xattr_user_handler,
+#ifdef CONFIG_SQUASHFS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
 	&squashfs_xattr_trusted_handler,
 	&squashfs_xattr_security_handler,
 	NULL
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index afe70f815e3d..ac08650c08cc 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -26,6 +26,8 @@ extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
 		u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
 		unsigned int *, unsigned long long *);
+extern int squashfs_xattr_get(struct inode *inode, int name_index,
+	const char *name, void *buffer, size_t buffer_size);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 		u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -41,6 +43,12 @@ static inline int squashfs_xattr_lookup(struct super_block *sb,
 {
 	return 0;
 }
+
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+	const char *name, void *buffer, size_t buffer_size)
+{
+	return 0;
+}
 #define squashfs_listxattr NULL
 #define squashfs_xattr_handlers NULL
 #endif
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [RFC bpf-next] bpf: document eBPF helpers and add a script to generate man page
From: Alexei Starovoitov @ 2018-04-10  1:47 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: Daniel Borkmann, ast, netdev, oss-drivers, linux-doc, linux-man
In-Reply-To: <16d4d67a-ab36-3e58-1082-52f0898546e5@netronome.com>

On Mon, Apr 09, 2018 at 02:25:26PM +0100, Quentin Monnet wrote:
> 
> Anyway, I am fine with keeping just signatures, descriptions and return
> values for now. I will submit a new version with only those items.

Thank you.

Could you also split it into few patches?
 include/uapi/linux/bpf.h   | 2237 ++++++++++++++++++++++++++++++++++++--------
 scripts/bpf_helpers_doc.py |  568 +++++++++++
 2 files changed, 2429 insertions(+), 376 deletions(-)

replying back and forth on a single patch of such size will be tedious
for others to follow.
May be document ~10 helpers at a time ? Total of ~7 patches and extra
patch for .py ?

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/2] perf: riscv: preliminary RISC-V support
From: Palmer Dabbelt @ 2018-04-09 21:03 UTC (permalink / raw)
  To: alankao
  Cc: albert, peterz, mingo, acme, alexander.shishkin, jolsa, namhyung,
	sols, corbet, linux-riscv, linux-doc, linux-kernel, nickhu,
	greentime
In-Reply-To: <20180409070710.GA3844@andestech.com>

On Mon, 09 Apr 2018 00:07:11 PDT (-0700), alankao@andestech.com wrote:
> On Thu, Apr 05, 2018 at 09:47:50AM -0700, Palmer Dabbelt wrote:
>> On Mon, 26 Mar 2018 00:57:54 PDT (-0700), alankao@andestech.com wrote:
>> >This patch provide a basic PMU, riscv_base_pmu, which supports two
>> >general hardware event, instructions and cycles.  Furthermore, this
>> >PMU serves as a reference implementation to ease the portings in
>> >the future.
>> >
>> >riscv_base_pmu should be able to run on any RISC-V machine that
>> >conforms to the Priv-Spec.  Note that the latest qemu model hasn't
>> >fully support a proper behavior of Priv-Spec 1.10 yet, but work
>> >around should be easy with very small fixes.  Please check
>> >https://github.com/riscv/riscv-qemu/pull/115 for future updates.
>> >
>> >Cc: Nick Hu <nickhu@andestech.com>
>> >Cc: Greentime Hu <greentime@andestech.com>
>> >Signed-off-by: Alan Kao <alankao@andestech.com>
>>
>> We should really be able to detect PMU types at runtime (via a device tree
>> entry) rather than requiring that a single PMU is built in to the kernel.
>> This will require a handful of modifications to how this patch works, which
>> I'll try to list below.
>
>> >+menu "PMU type"
>> >+	depends on PERF_EVENTS
>> >+
>> >+config RISCV_BASE_PMU
>> >+	bool "Base Performance Monitoring Unit"
>> >+	def_bool y
>> >+	help
>> >+	  A base PMU that serves as a reference implementation and has limited
>> >+	  feature of perf.
>> >+
>> >+endmenu
>> >+
>>
>> Rather than a menu where a single PMU can be selected, there should be
>> options to enable or disable support for each PMU type -- this is just like
>> how all our other drivers work.
>>
>
> I see.  Sure.  The descriptions and implementation will be refined in v3.
>
>> >+struct pmu * __weak __init riscv_init_platform_pmu(void)
>> >+{
>> >+	riscv_pmu = &riscv_base_pmu;
>> >+	return riscv_pmu->pmu;
>> >+}
>>
>> Rather than relying on a weak symbol that gets overridden by other PMU
>> types, this should look through the device tree for a compatible PMU (in the
>> case of just the base PMU it could be any RISC-V hart) and install a PMU
>> handler for it.  There'd probably be some sort of priority scheme here, like
>> there are for other driver subsystems, where we'd pick the best PMU driver
>> that's compatible with the PMUs on every hart.
>>
>> >+
>> >+int __init init_hw_perf_events(void)
>> >+{
>> >+	struct pmu *pmu = riscv_init_platform_pmu();
>> >+
>> >+	perf_irq = NULL;
>> >+	perf_pmu_register(pmu, "cpu", PERF_TYPE_RAW);
>> >+	return 0;
>> >+}
>> >+arch_initcall(init_hw_perf_events);
>>
>> Since we only have a single PMU type right now this isn't critical to handle
>> right away, but we will have to refactor this before adding another PMU.
>
> I see.  My rough plan is to do the device tree parsing here, and if no specific
> PMU string is found then just register the base PMU proposed in this patch.
> How about this idea?

Sounds good.  We know the generic PMU will work on all RISC-V harts, so there's 
no need to add an explicit device tree entry for it.  Then we can figure out 
how to add device tree entries for custom performance monitors later :)
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] squashfs: Add posix acl support
From: kbuild test robot @ 2018-04-09 20:35 UTC (permalink / raw)
  To: Geliang Tang
  Cc: kbuild-all, Phillip Lougher, Jonathan Corbet, Geliang Tang,
	linux-doc, linux-kernel
In-Reply-To: <af77c1f80e2725c4cf1bf106d6add820b3b0eed5.1523276963.git.geliangtang@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 844 bytes --]

Hi Geliang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v4.16]
[also build test ERROR on next-20180409]
[cannot apply to squashfs/master]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Geliang-Tang/squashfs-Add-posix-acl-support/20180410-013038
config: i386-randconfig-s0-201814 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

>> ERROR: "squashfs_get_acl" [fs/squashfs/squashfs.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 35602 bytes --]

^ permalink raw reply

* Re: [PATCH v4 10/10] dt-bindings: gpio: Add bindings for Cadence I3C gpio expander
From: Rob Herring @ 2018-04-09 20:26 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Wolfram Sang, linux-i2c, Jonathan Corbet, linux-doc,
	Greg Kroah-Hartman, Arnd Bergmann, Przemyslaw Sroka,
	Arkadiusz Golec, Alan Douglas, Bartosz Folta, Damian Kos,
	Alicja Jurasik-Urbaniak, Cyprian Wronka, Suresh Punnoose,
	Rafal Ciepiela, Thomas Petazzoni, Nishanth Menon, Pawel Moll,
	Mark Rutland, Ian Campbell, Kumar Gala, devicetree, linux-kernel,
	Vitor Soares, Geert Uytterhoeven, Linus Walleij, Xiang Lin,
	linux-gpio
In-Reply-To: <20180330074751.25987-11-boris.brezillon@bootlin.com>

On Fri, Mar 30, 2018 at 09:47:51AM +0200, Boris Brezillon wrote:
> Document the Cadence I3C gpio expander bindings.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
> ---
> Changes in v4:
> - Use GPIO_ and IRQ_TYPE_ macros instead of raw numbers
> - Fix the unit-address in the example
> ---
>  .../devicetree/bindings/gpio/gpio-cdns-i3c.txt     | 39 ++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/gpio/gpio-cdns-i3c.txt

Reviewed-by: Rob Herring <robh@kernel.org>

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 08/10] dt-bindings: i3c: Document Cadence I3C master bindings
From: Rob Herring @ 2018-04-09 20:25 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Wolfram Sang, linux-i2c, Jonathan Corbet, linux-doc,
	Greg Kroah-Hartman, Arnd Bergmann, Przemyslaw Sroka,
	Arkadiusz Golec, Alan Douglas, Bartosz Folta, Damian Kos,
	Alicja Jurasik-Urbaniak, Cyprian Wronka, Suresh Punnoose,
	Rafal Ciepiela, Thomas Petazzoni, Nishanth Menon, Pawel Moll,
	Mark Rutland, Ian Campbell, Kumar Gala, devicetree, linux-kernel,
	Vitor Soares, Geert Uytterhoeven, Linus Walleij, Xiang Lin,
	linux-gpio
In-Reply-To: <20180330074751.25987-9-boris.brezillon@bootlin.com>

On Fri, Mar 30, 2018 at 09:47:49AM +0200, Boris Brezillon wrote:
> Document Cadence I3C master DT bindings.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
> ---
> Changes in v4:
> - Fix example to match the new representation
> ---
>  .../devicetree/bindings/i3c/cdns,i3c-master.txt    | 44 ++++++++++++++++++++++
>  1 file changed, 44 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt

Reviewed-by: Rob Herring <robh@kernel.org>

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 04/10] dt-bindings: i3c: Document core bindings
From: Rob Herring @ 2018-04-09 20:24 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Wolfram Sang, linux-i2c, Jonathan Corbet, linux-doc,
	Greg Kroah-Hartman, Arnd Bergmann, Przemyslaw Sroka,
	Arkadiusz Golec, Alan Douglas, Bartosz Folta, Damian Kos,
	Alicja Jurasik-Urbaniak, Cyprian Wronka, Suresh Punnoose,
	Rafal Ciepiela, Thomas Petazzoni, Nishanth Menon, Pawel Moll,
	Mark Rutland, Ian Campbell, Kumar Gala, devicetree, linux-kernel,
	Vitor Soares, Geert Uytterhoeven, Linus Walleij, Xiang Lin,
	linux-gpio
In-Reply-To: <20180330074751.25987-5-boris.brezillon@bootlin.com>

On Fri, Mar 30, 2018 at 09:47:45AM +0200, Boris Brezillon wrote:
> A new I3C subsystem has been added and a generic description has been
> created to represent the I3C bus and the devices connected on it.
> 
> Document this generic representation.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
> ---
> Changes in v4:
> - Clarify the fact that static address == I3C address and dynamic
>   address == I3C address
> - Use i2c-scl-hz in the example
> 
> Changes in v3:
> - Rename {i2c,i3c}-scl-frequency DT prop into {i2c,i3c}-scl-hz
> - Rework the way we expose the provisional ID and LVR information
> - Rename dynamic-address into assigned-address
> - Enforce the I3C master node name
> 
> Changes in v2:
> - Define how to describe I3C devices in the DT and when it should be
>   used. Note that the parsing of I3C devices is not yet implemented in
>   the framework. Will be added when someone really needs it.
> ---
>  Documentation/devicetree/bindings/i3c/i3c.txt | 140 ++++++++++++++++++++++++++
>  1 file changed, 140 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/i3c/i3c.txt

Reviewed-by: Rob Herring <robh@kernel.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 2/3] mm: replace __HAVE_ARCH_PTE_SPECIAL
From: David Rientjes @ 2018-04-09 20:08 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Laurent Dufour, linux-kernel, linux-mm, linuxppc-dev, x86,
	linux-doc, linux-snps-arc, linux-arm-kernel, linux-riscv,
	linux-s390, linux-sh, sparclinux, Jerome Glisse, mhocko,
	aneesh.kumar, akpm, mpe, benh, paulus, Jonathan Corbet,
	Catalin Marinas, Will Deacon, Yoshinori Sato, Rich Felker,
	David S . Miller, Thomas Gleixner, Ingo Molnar, Vineet Gupta,
	Palmer Dabbelt, Albert Ou, Martin Schwidefsky, Heiko Carstens
In-Reply-To: <20180409175757.GA12938@infradead.org>

On Mon, 9 Apr 2018, Christoph Hellwig wrote:

> > -#ifdef __HAVE_ARCH_PTE_SPECIAL
> > +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
> >  # define HAVE_PTE_SPECIAL 1
> >  #else
> >  # define HAVE_PTE_SPECIAL 0
> 
> I'd say kill this odd indirection and just use the
> CONFIG_ARCH_HAS_PTE_SPECIAL symbol directly.
> 
> 

Agree, and I think it would be easier to audit/review if patches 1 and 3 
were folded together to see the relationship between the newly added 
selects and what #define's it is replacing.  Otherwise, looks good!
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] squashfs: Add posix acl support
From: kbuild test robot @ 2018-04-09 19:39 UTC (permalink / raw)
  To: Geliang Tang
  Cc: kbuild-all, Phillip Lougher, Jonathan Corbet, Geliang Tang,
	linux-doc, linux-kernel
In-Reply-To: <af77c1f80e2725c4cf1bf106d6add820b3b0eed5.1523276963.git.geliangtang@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1207 bytes --]

Hi Geliang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v4.16]
[also build test ERROR on next-20180409]
[cannot apply to squashfs/master]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Geliang-Tang/squashfs-Add-posix-acl-support/20180410-013038
config: arm-multi_v7_defconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm 

All errors (new ones prefixed by >>):

   fs/squashfs/inode.o: In function `.LANCHOR0':
>> inode.c:(.rodata+0xc): undefined reference to `squashfs_get_acl'
   fs/squashfs/namei.o:(.rodata+0xc): undefined reference to `squashfs_get_acl'
   fs/squashfs/symlink.o:(.rodata+0xc): undefined reference to `squashfs_get_acl'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 43430 bytes --]

^ permalink raw reply

* Re: [PATCH 2/3] mm: replace __HAVE_ARCH_PTE_SPECIAL
From: Christoph Hellwig @ 2018-04-09 17:57 UTC (permalink / raw)
  To: Laurent Dufour
  Cc: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens
In-Reply-To: <1523282229-20731-3-git-send-email-ldufour@linux.vnet.ibm.com>

> -#ifdef __HAVE_ARCH_PTE_SPECIAL
> +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
>  # define HAVE_PTE_SPECIAL 1
>  #else
>  # define HAVE_PTE_SPECIAL 0

I'd say kill this odd indirection and just use the
CONFIG_ARCH_HAS_PTE_SPECIAL symbol directly.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] sched/fair: add support to tune PELT ramp/decay timings
From: Patrick Bellasi @ 2018-04-09 16:51 UTC (permalink / raw)
  To: linux-kernel, linux-pm
  Cc: Ingo Molnar, Peter Zijlstra, Rafael J . Wysocki, Viresh Kumar,
	Vincent Guittot, Juri Lelli, Joel Fernandes, Steve Muckle,
	Dietmar Eggemann, Morten Rasmussen, Jonathan Corbet, Paul Turner,
	linux-doc

The PELT half-life is the time [ms] required by the PELT signal to build
up a 50% load/utilization, starting from zero. This time is currently
hardcoded to be 32ms, a value which seems to make sense for most of the
workloads.

However, 32ms has been verified to be too long for certain classes of
workloads. For example, in the mobile space many tasks affecting the
user-experience run with a 16ms or 8ms cadence, since they need to match
the common 60Hz or 120Hz refresh rate of the graphics pipeline.
This contributed so fare to the idea that "PELT is too slow" to properly
track the utilization of interactive mobile workloads, especially
compared to alternative load tracking solutions which provides a
better representation of tasks demand in the range of 10-20ms.

A faster PELT ramp-up time could give some advantages to speed-up the
time required for the signal to stabilize and thus to better represent
task demands in the mobile space. As a downside, it also reduces the
decay time, and thus we forget the load/utilization of sleeping tasks
(or idle CPUs) faster.

Fortunately, since the integration of the utilization estimation
support in mainline kernel:

   commit 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT")

a fast decay time is no longer an issue for tasks utilization estimation.
Although estimated utilization does not slow down the decay of blocked
utilization on idle CPUs, for mobile workloads this seems not to be a
major concern compared to the benefits in interactivity responsiveness.

Let's add a compile time option to choose the PELT speed which better
fits for a specific system. By default the current 32ms half-life is
used, but we can also compile a kernel to use a faster ramp-up time of
either 16ms or 8ms. These two configurations have been verified to give
PELT a further improvement in performance, compared to other out-of-tree
load tracking solutions, when it comes to track interactive workloads
thus better supporting both tasks placements and frequencies selections.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Paul Turner <pjt@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 Documentation/scheduler/sched-pelt.c | 45 ++++++++++++++++++++++--------------
 init/Kconfig                         | 44 +++++++++++++++++++++++++++++++++++
 kernel/sched/sched-pelt.h            | 39 ++++++++++++++++++++++++++-----
 3 files changed, 105 insertions(+), 23 deletions(-)

diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c
index e4219139386a..e0ae21616188 100644
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
@@ -10,34 +10,35 @@
 #include <math.h>
 #include <stdio.h>
 
-#define HALFLIFE 32
+#define HALFLIFE { 32, 16, 8 }
 #define SHIFT 32
 
 double y;
 
-void calc_runnable_avg_yN_inv(void)
+void calc_runnable_avg_yN_inv(const int halflife)
 {
 	int i;
 	unsigned int x;
 
 	printf("static const u32 runnable_avg_yN_inv[] = {");
-	for (i = 0; i < HALFLIFE; i++) {
+	for (i = 0; i < halflife; i++) {
 		x = ((1UL<<32)-1)*pow(y, i);
 
-		if (i % 6 == 0) printf("\n\t");
-		printf("0x%8x, ", x);
+		if (i % 4 == 0)
+			printf("\n\t");
+		printf("0x%8x,", x);
 	}
 	printf("\n};\n\n");
 }
 
 int sum = 1024;
 
-void calc_runnable_avg_yN_sum(void)
+void calc_runnable_avg_yN_sum(const int halflife)
 {
 	int i;
 
 	printf("static const u32 runnable_avg_yN_sum[] = {\n\t    0,");
-	for (i = 1; i <= HALFLIFE; i++) {
+	for (i = 1; i <= halflife; i++) {
 		if (i == 1)
 			sum *= y;
 		else
@@ -55,7 +56,7 @@ int n = -1;
 /* first period */
 long max = 1024;
 
-void calc_converged_max(void)
+void calc_converged_max(const int halflife)
 {
 	long last = 0, y_inv = ((1UL<<32)-1)*y;
 
@@ -73,17 +74,17 @@ void calc_converged_max(void)
 		last = max;
 	}
 	n--;
-	printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE);
+	printf("#define LOAD_AVG_PERIOD %d\n", halflife);
 	printf("#define LOAD_AVG_MAX %ld\n", max);
-//	printf("#define LOAD_AVG_MAX_N %d\n\n", n);
+	/* printf("#define LOAD_AVG_MAX_N %d\n\n", n); */
 }
 
-void calc_accumulated_sum_32(void)
+void calc_accumulated_sum_32(const int halflife)
 {
 	int i, x = sum;
 
 	printf("static const u32 __accumulated_sum_N32[] = {\n\t     0,");
-	for (i = 1; i <= n/HALFLIFE+1; i++) {
+	for (i = 1; i <= n / halflife + 1; i++) {
 		if (i > 1)
 			x = x/2 + sum;
 
@@ -97,12 +98,22 @@ void calc_accumulated_sum_32(void)
 
 void main(void)
 {
+	int hl_value[] = HALFLIFE;
+	int hl_count = sizeof(hl_value) / sizeof(int);
+	int hl_idx, halflife;
+
 	printf("/* Generated by Documentation/scheduler/sched-pelt; do not modify. */\n\n");
 
-	y = pow(0.5, 1/(double)HALFLIFE);
+	for (hl_idx = 0; hl_idx < hl_count; ++hl_idx) {
+		halflife = hl_value[hl_idx];
 
-	calc_runnable_avg_yN_inv();
-//	calc_runnable_avg_yN_sum();
-	calc_converged_max();
-//	calc_accumulated_sum_32();
+		y = pow(0.5, 1 / (double)halflife);
+
+		printf("\n#ifdef CONFIG_PELT_HALFLIFE_%d\n", halflife);
+		calc_runnable_avg_yN_inv(halflife);
+		/* calc_runnable_avg_yN_sum(halflife); */
+		calc_converged_max(halflife);
+		/* calc_accumulated_sum_32(halflife); */
+		printf("#endif\n");
+	}
 }
diff --git a/init/Kconfig b/init/Kconfig
index e37f4b2a6445..6fd13887d2bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -585,6 +585,50 @@ config HAVE_UNSTABLE_SCHED_CLOCK
 config GENERIC_SCHED_CLOCK
 	bool
 
+menu "Scheduler features"
+
+choice
+	bool "Configure PELT speed for load/utilization tracking"
+	default PELT_HALFLIFE_32
+	help
+	  Allows to choose one of the possible values for the PELT half-life to
+	  be used for the update of the load and utilization of tasks and CPUs.
+	  The half-life is the amount of [ms] required by the PELT signal to
+	  build up to 50% load/utilization.
+	  The higher the half-life the longer it takes for a task to be
+	  represented as a big one.
+
+	  If not sure, use the default of 32 ms.
+
+config PELT_HALFLIFE_32
+	bool "32 ms, default"
+	help
+	  A 32ms PELT half-life is the default value usually suitable for
+	  server/enterprise class of workloads where tasks can normally
+	  runs for tens or hundreds of milliseconds.
+
+	  If not sure, use this option
+
+config PELT_HALFLIFE_16
+	bool "16 ms, faster"
+	help
+	  A 16ms PELT half-life is suggested for mobile/interactive workloads
+	  where tasks usually run with a 60Hz activation cadence.
+
+	  If not sure, use the default of 32 ms
+
+config PELT_HALFLIFE_8
+	bool "8 ms, very fast"
+	help
+	  An 8ms PELT half-life is suggested for mobile/interactive workloads
+	  where tasks usually run with a 120Hz activation cadence.
+
+	  If not sure, use the default of 32 ms
+
+endchoice
+
+endmenu # Scheduler features"
+
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
 # balancing logic:
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c978fe03f788 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,14 +1,41 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */
 
+#ifdef CONFIG_PELT_HALFLIFE_32
 static const u32 runnable_avg_yN_inv[] = {
-	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
-	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
-	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
-	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
-	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
-	0x85aac367, 0x82cd8698,
+	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 
+	0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 
+	0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 
+	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 
+	0xb504f333, 0xb123f581, 0xad583ee9, 0xa9a15ab4, 
+	0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, 
+	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 
+	0x8b95c1e3, 0x88980e80, 0x85aac367, 0x82cd8698, 
 };
 
 #define LOAD_AVG_PERIOD 32
 #define LOAD_AVG_MAX 47742
+#endif
+
+#ifdef CONFIG_PELT_HALFLIFE_16
+static const u32 runnable_avg_yN_inv[] = {
+	0xffffffff, 0xf5257d14, 0xeac0c6e6, 0xe0ccdeeb, 
+	0xd744fcc9, 0xce248c14, 0xc5672a10, 0xbd08a39e, 
+	0xb504f333, 0xad583ee9, 0xa5fed6a9, 0x9ef5325f, 
+	0x9837f050, 0x91c3d373, 0x8b95c1e3, 0x85aac367, 
+};
+
+#define LOAD_AVG_PERIOD 16
+#define LOAD_AVG_MAX 24152
+#endif
+
+#ifdef CONFIG_PELT_HALFLIFE_8
+static const u32 runnable_avg_yN_inv[] = {
+	0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10, 
+	0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3, 
+};
+
+#define LOAD_AVG_PERIOD 8
+#define LOAD_AVG_MAX 12337
+#endif
+
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox