Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [RFC bpf-next v2 1/8] bpf: add script and prepare bpf.h for new helpers documentation
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Remove previous "overview" of eBPF helpers from user bpf.h header.
Replace it by a comment explaining how to process the new documentation
(to come in following patches) with a Python script to produce RST, then
man page documentation.

Also add the aforementioned Python script under scripts/. It is used to
process include/uapi/linux/bpf.h and to extract helper descriptions, to
turn them into an RST document that can further be processed with rst2man
to produce a man page. The script takes one "--filename <path/to/file>"
option. If the script is launched from scripts/ in the kernel root
directory, it should be able to find the location of the header to
parse, and "--filename <path/to/file>" is then optional. If it cannot
find the file, then the option becomes mandatory. RST-formatted
documentation is printed to standard output.

Typical workflow for producing the final man page would be:

    $ ./scripts/bpf_helpers_doc.py \
            --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

Note that the tool kernel-doc cannot be used to document eBPF helpers,
whose signatures are not available directly in the header files
(pre-processor directives are used to produce them at the beginning of
the compilation process).

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h   | 406 ++------------------------------------------
 scripts/bpf_helpers_doc.py | 414 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 430 insertions(+), 390 deletions(-)
 create mode 100755 scripts/bpf_helpers_doc.py

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..45f77f01e672 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -365,396 +365,22 @@ union bpf_attr {
 	} raw_tracepoint;
 } __attribute__((aligned(8)));
 
-/* BPF helper function descriptions:
- *
- * void *bpf_map_lookup_elem(&map, &key)
- *     Return: Map value or NULL
- *
- * int bpf_map_update_elem(&map, &key, &value, flags)
- *     Return: 0 on success or negative error
- *
- * int bpf_map_delete_elem(&map, &key)
- *     Return: 0 on success or negative error
- *
- * int bpf_probe_read(void *dst, int size, void *src)
- *     Return: 0 on success or negative error
- *
- * u64 bpf_ktime_get_ns(void)
- *     Return: current ktime
- *
- * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
- *     Return: length of buffer written or negative error
- *
- * u32 bpf_prandom_u32(void)
- *     Return: random value
- *
- * u32 bpf_raw_smp_processor_id(void)
- *     Return: SMP processor ID
- *
- * int bpf_skb_store_bytes(skb, offset, from, len, flags)
- *     store bytes into packet
- *     @skb: pointer to skb
- *     @offset: offset within packet from skb->mac_header
- *     @from: pointer where to copy bytes from
- *     @len: number of bytes to store into packet
- *     @flags: bit 0 - if true, recompute skb->csum
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l3_csum_replace(skb, offset, from, to, flags)
- *     recompute IP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where IP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l4_csum_replace(skb, offset, from, to, flags)
- *     recompute TCP/UDP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where TCP/UDP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             bit 4 - is pseudo header
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_tail_call(ctx, prog_array_map, index)
- *     jump into another BPF program
- *     @ctx: context pointer passed to next program
- *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: 32-bit index inside array that selects specific program to run
- *     Return: 0 on success or negative error
- *
- * int bpf_clone_redirect(skb, ifindex, flags)
- *     redirect to another netdev
- *     @skb: pointer to skb
- *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * u64 bpf_get_current_pid_tgid(void)
- *     Return: current->tgid << 32 | current->pid
- *
- * u64 bpf_get_current_uid_gid(void)
- *     Return: current_gid << 32 | current_uid
- *
- * int bpf_get_current_comm(char *buf, int size_of_buf)
- *     stores current->comm into buf
- *     Return: 0 on success or negative error
- *
- * u32 bpf_get_cgroup_classid(skb)
- *     retrieve a proc's classid
- *     @skb: pointer to skb
- *     Return: classid if != 0
- *
- * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_vlan_pop(skb)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_get_tunnel_key(skb, key, size, flags)
- * int bpf_skb_set_tunnel_key(skb, key, size, flags)
- *     retrieve or populate tunnel metadata
- *     @skb: pointer to skb
- *     @key: pointer to 'struct bpf_tunnel_key'
- *     @size: size of 'struct bpf_tunnel_key'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
- *
- * u64 bpf_perf_event_read(map, flags)
- *     read perf event counter value
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     Return: value of perf event counter read or error code
- *
- * int bpf_redirect(ifindex, flags)
- *     redirect to another netdev
- *     @ifindex: ifindex of the net device
- *     @flags:
- *	  cls_bpf:
- *          bit 0 - if set, redirect to ingress instead of egress
- *          other bits - reserved
- *	  xdp_bpf:
- *	    all bits - reserved
- *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
- *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
- * int bpf_redirect_map(map, key, flags)
- *     redirect to endpoint in map
- *     @map: pointer to dev map
- *     @key: index in map to lookup
- *     @flags: --
- *     Return: XDP_REDIRECT on success or XDP_ABORT on error
- *
- * u32 bpf_get_route_realm(skb)
- *     retrieve a dst's tclassid
- *     @skb: pointer to skb
- *     Return: realm if != 0
- *
- * int bpf_perf_event_output(ctx, map, flags, data, size)
- *     output perf raw sample
- *     @ctx: struct pt_regs*
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @data: data on stack to be output as raw data
- *     @size: size of data
- *     Return: 0 on success or negative error
- *
- * int bpf_get_stackid(ctx, map, flags)
- *     walk user or kernel stack and return id
- *     @ctx: struct pt_regs*
- *     @map: pointer to stack_trace map
- *     @flags: bits 0-7 - numer of stack frames to skip
- *             bit 8 - collect user stack instead of kernel
- *             bit 9 - compare stacks by hash only
- *             bit 10 - if two different stacks hash into the same stackid
- *                      discard old
- *             other bits - reserved
- *     Return: >= 0 stackid on success or negative error
- *
- * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
- *     calculate csum diff
- *     @from: raw from buffer
- *     @from_size: length of from buffer
- *     @to: raw to buffer
- *     @to_size: length of to buffer
- *     @seed: optional seed
- *     Return: csum result or negative error code
- *
- * int bpf_skb_get_tunnel_opt(skb, opt, size)
- *     retrieve tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: option size
- *
- * int bpf_skb_set_tunnel_opt(skb, opt, size)
- *     populate tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_proto(skb, proto, flags)
- *     Change protocol of the skb. Currently supported is v4 -> v6,
- *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
- *     program is expected to fill the new headers via skb_store_bytes
- *     and lX_csum_replace.
- *     @skb: pointer to skb
- *     @proto: new skb->protocol type
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_type(skb, type)
- *     Change packet type of skb.
- *     @skb: pointer to skb
- *     @type: new skb->pkt_type type
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_under_cgroup(skb, map, index)
- *     Check cgroup2 membership of skb
- *     @skb: pointer to skb
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 skb failed the cgroup2 descendant test
- *       == 1 skb succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * u32 bpf_get_hash_recalc(skb)
- *     Retrieve and possibly recalculate skb->hash.
- *     @skb: pointer to skb
- *     Return: hash
- *
- * u64 bpf_get_current_task(void)
- *     Returns current task_struct
- *     Return: current
- *
- * int bpf_probe_write_user(void *dst, void *src, int len)
- *     safely attempt to write to a location
- *     @dst: destination address in userspace
- *     @src: source address on stack
- *     @len: number of bytes to copy
- *     Return: 0 on success or negative error
- *
- * int bpf_current_task_under_cgroup(map, index)
- *     Check cgroup2 membership of current task
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 current failed the cgroup2 descendant test
- *       == 1 current succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * int bpf_skb_change_tail(skb, len, flags)
- *     The helper will resize the skb to the given new size, to be used f.e.
- *     with control messages.
- *     @skb: pointer to skb
- *     @len: new skb length
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_pull_data(skb, len)
- *     The helper will pull in non-linear data in case the skb is non-linear
- *     and not all of len are part of the linear section. Only needed for
- *     read/write with direct packet access.
- *     @skb: pointer to skb
- *     @len: len to make read/writeable
- *     Return: 0 on success or negative error
- *
- * s64 bpf_csum_update(skb, csum)
- *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
- *     @skb: pointer to skb
- *     @csum: csum to add
- *     Return: csum on success or negative error
- *
- * void bpf_set_hash_invalid(skb)
- *     Invalidate current skb->hash.
- *     @skb: pointer to skb
- *
- * int bpf_get_numa_node_id()
- *     Return: Id of current NUMA node.
- *
- * int bpf_skb_change_head()
- *     Grows headroom of skb and adjusts MAC header offset accordingly.
- *     Will extends/reallocae as required automatically.
- *     May change skb data pointer and will thus invalidate any check
- *     performed for direct packet access.
- *     @skb: pointer to skb
- *     @len: length of header to be pushed in front
- *     @flags: Flags (unused for now)
- *     Return: 0 on success or negative error
- *
- * int bpf_xdp_adjust_head(xdp_md, delta)
- *     Adjust the xdp_md.data by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data
- *     Return: 0 on success or negative on error
- *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
- *     Copy a NUL terminated string from unsafe address. In case the string
- *     length is smaller than size, the target is not padded with further NUL
- *     bytes. In case the string length is larger than size, just count-1
- *     bytes are copied and the last byte is set to NUL.
- *     @dst: destination address
- *     @size: maximum number of bytes to copy, including the trailing NUL
- *     @unsafe_ptr: unsafe address
- *     Return:
- *       > 0 length of the string including the trailing NUL on success
- *       < 0 error
- *
- * u64 bpf_get_socket_cookie(skb)
- *     Get the cookie for the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: 8 Bytes non-decreasing number on success or 0 if the socket
- *     field is missing inside sk_buff
- *
- * u32 bpf_get_socket_uid(skb)
- *     Get the owner uid of the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or overflowuid if failed.
- *
- * u32 bpf_set_hash(skb, hash)
- *     Set full skb->hash.
- *     @skb: pointer to skb
- *     @hash: hash to set
- *
- * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls setsockopt. Not all opts are available, only those with
- *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls getsockopt. Not all opts are available.
- *     Supported levels: IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
- *     Set callback flags for sock_ops
- *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
- *     @flags: flags value
- *     Return: 0 for no error
- *             -EINVAL if there is no full tcp socket
- *             bits in flags that are not supported by current kernel
- *
- * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
- *     Grow or shrink room in sk_buff.
- *     @skb: pointer to skb
- *     @len_diff: (signed) amount of room to grow/shrink
- *     @mode: operation mode (enum bpf_adj_room_mode)
- *     @flags: reserved for future use
- *     Return: 0 on success or negative error code
- *
- * int bpf_sk_redirect_map(map, key, flags)
- *     Redirect skb to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_sock_map_update(skops, map, key, flags)
- *	@skops: pointer to bpf_sock_ops
- *	@map: pointer to sockmap to update
- *	@key: key to insert/update sock in map
- *	@flags: same flags as map update elem
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- *     Adjust the xdp_md.data_meta by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data_meta
- *     Return: 0 on success or negative on error
- *
- * int bpf_perf_event_read_value(map, flags, buf, buf_size)
- *     read perf event counter value and perf event enabled/running time
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return: 0 on success or negative error code
- *
- * int bpf_perf_prog_read_value(ctx, buf, buf_size)
- *     read perf prog attached perf event counter and enabled/running time
- *     @ctx: pointer to ctx
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
- *
- * int bpf_msg_redirect_map(map, key, flags)
- *     Redirect msg to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_bind(ctx, addr, addr_len)
- *     Bind socket to address. Only binding to IP is supported, no port can be
- *     set in addr.
- *     @ctx: pointer to context of type bpf_sock_addr
- *     @addr: pointer to struct sockaddr to bind socket to
- *     @addr_len: length of sockaddr structure
- *     Return: 0 on success or negative error code
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
new file mode 100755
index 000000000000..3a15ba3f0a83
--- /dev/null
+++ b/scripts/bpf_helpers_doc.py
@@ -0,0 +1,414 @@
+#!/usr/bin/python3
+#
+# Copyright (C) 2018 Netronome Systems, Inc.
+#
+# This software is licensed under the GNU General Public License Version 2,
+# June 1991 as shown in the file COPYING in the top-level directory of this
+# source tree.
+#
+# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+# In case user attempts to run with Python 2.
+from __future__ import print_function
+
+import argparse
+import re
+import sys, os
+
+class NoHelperFound(BaseException):
+    pass
+
+class ParsingError(BaseException):
+    def __init__(self, line='<line not provided>', reader=None):
+        if reader:
+            BaseException.__init__(self,
+                                   'Error at file offset %d, parsing line: %s' %
+                                   (reader.tell(), line))
+        else:
+            BaseException.__init__(self, 'Error parsing line: %s' % line)
+
+class Helper(object):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    """
+    def __init__(self, proto='', desc='', ret=''):
+        self.proto = proto
+        self.desc = desc
+        self.ret = ret
+
+    def proto_break_down(self):
+        """
+        Break down helper function prototype into smaller chunks: return type,
+        name, distinct arguments.
+        """
+        arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
+        res = {}
+        proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+
+        capture = proto_re.match(self.proto)
+        res['ret_type'] = capture.group(1)
+        res['ret_star'] = capture.group(2)
+        res['name']     = capture.group(3)
+        res['args'] = []
+
+        args    = capture.group(4).split(', ')
+        for a in args:
+            capture = arg_re.match(a)
+            res['args'].append({
+                'type' : capture.group(1),
+                'star' : capture.group(6),
+                'name' : capture.group(7)
+            })
+
+        return res
+
+class HeaderParser(object):
+    """
+    An object used to parse a file in order to extract the documentation of a
+    list of eBPF helper functions. All the helpers that can be retrieved are
+    stored as Helper objects, in the self.helpers array.
+    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
+               kernel tree
+    """
+    def __init__(self, filename):
+        self.reader = open(filename, 'r')
+        self.line = ''
+        self.helpers = []
+
+    def parse_helper(self):
+        proto    = self.parse_proto()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        return Helper(proto=proto, desc=desc, ret=ret)
+
+    def parse_proto(self):
+        # Argument can be of shape:
+        #   - "void"
+        #   - "type  name"
+        #   - "type *name"
+        #   - Same as above, with "const" and/or "struct" in front of type
+        #   - "..." (undefined number of arguments, for bpf_trace_printk())
+        # There is at least one term ("void"), and at most five arguments.
+        p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoHelperFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_desc(self):
+        p = re.compile('^ \* \tDescription$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty description and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Description can be several lines, some of them possibly empty, and it
+        # stops when another subsection title is met.
+        desc = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                desc += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    desc += capture.group(1) + '\n'
+                else:
+                    break
+        return desc
+
+    def parse_ret(self):
+        p = re.compile('^ \* \tReturn$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty retval and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Return value description can be several lines, some of them possibly
+        # empty, and it stops when another subsection title is met.
+        ret = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                ret += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    ret += capture.group(1) + '\n'
+                else:
+                    break
+        return ret
+
+    def run(self):
+        # Advance to start of helper function descriptions.
+        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
+        if offset == -1:
+            raise Exception('Could not find start of eBPF helper descriptions list')
+        self.reader.seek(offset)
+        self.reader.readline()
+        self.reader.readline()
+        self.line = self.reader.readline()
+
+        while True:
+            try:
+                helper = self.parse_helper()
+                self.helpers.append(helper)
+            except NoHelperFound:
+                break
+
+        self.reader.close()
+        print('Parsed description of %d helper function(s)' % len(self.helpers),
+              file=sys.stderr)
+
+###############################################################################
+
+class Printer(object):
+    """
+    A generic class for printers. Printers should be created with an array of
+    Helper objects, and implement a way to print them in the desired fashion.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def __init__(self, helpers):
+        self.helpers = helpers
+
+    def print_header(self):
+        pass
+
+    def print_footer(self):
+        pass
+
+    def print_one(self, helper):
+        pass
+
+    def print_all(self):
+        self.print_header()
+        for helper in self.helpers:
+            self.print_one(helper)
+        self.print_footer()
+
+class PrinterRST(Printer):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def print_header(self):
+        header = '''\
+.. Copyright (C) 2018 Netronome Systems, Inc.
+.. 
+.. %%%LICENSE_START(VERBATIM)
+.. Permission is granted to make and distribute verbatim copies of this
+.. manual provided the copyright notice and this permission notice are
+.. preserved on all copies.
+.. 
+.. Permission is granted to copy and distribute modified versions of this
+.. manual under the conditions for verbatim copying, provided that the
+.. entire resulting derived work is distributed under the terms of a
+.. permission notice identical to this one.
+.. 
+.. Since the Linux kernel and libraries are constantly changing, this
+.. manual page may be incorrect or out-of-date.  The author(s) assume no
+.. responsibility for errors or omissions, or for damages resulting from
+.. the use of the information contained herein.  The author(s) may not
+.. have taken the same level of care in the production of this manual,
+.. which is licensed free of charge, as they might when working
+.. professionally.
+.. 
+.. Formatted or processed versions of this manual, if unaccompanied by
+.. the source, must acknowledge the copyright and authors of this work.
+.. %%%LICENSE_END
+.. 
+.. Please do not edit this file. It was generated from the documentation
+.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
+.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
+.. repository (header and footer).
+
+===========
+BPF-HELPERS
+===========
+-------------------------------------------------------------------------------
+list of eBPF helper functions
+-------------------------------------------------------------------------------
+
+:Manual section: 7
+
+DESCRIPTION
+===========
+
+The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
+written in a pseudo-assembly language, then attached to one of the several
+kernel hooks and run in reaction of specific events. This framework differs
+from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
+the ability to call special functions (or "helpers") from within a program. For
+security reasons, these functions are restricted to a white-list of helpers
+defined in the kernel.
+
+These helpers are used by eBPF programs to interact with the system, or with
+the context in which they work. For instance, they can be used to print
+debugging messages, to get the time since the system was booted, to interact
+with eBPF maps, or to manipulate network packets metadata. Since there are
+several eBPF program types, and that they do not run in the same context, each
+program type can only call a subset of those helpers.
+
+Due to eBPF conventions, a helper can not have more than five arguments.
+
+This document is an attempt to list and document the helpers available to eBPF
+developers. They are sorted by chronological order (the oldest helpers in the
+kernel at the top).
+
+HELPERS
+=======
+'''
+        print(header)
+
+    def print_footer(self):
+        footer = '''
+NOTES
+=====
+
+On the performance side, eBPF programs move to the stack all arguments to pass
+to the helpers, and call directly into the compiled helper functions without
+requiring any foreign-function interface. As a result, calling helpers
+introduce very little overhead.
+
+EXAMPLES
+========
+
+Example usage for most of the eBPF helpers listed in this manual page are
+available within the Linux kernel sources, at the following locations:
+
+* *samples/bpf/*
+* *tools/testing/selftests/bpf/*
+
+IMPLEMENTATION
+==============
+
+This manual page is an effort to document the existing eBPF helper functions.
+But as of this writing, the BPF sub-system is under heavy development. New eBPF
+program or map types are added, along with new helper functions. Some helpers
+are occasionally made available for additional program types. So in spite of
+the efforts of the community, this page might not be up-to-date. If you want to
+check by yourself what helper functions exist in your kernel, or what types of
+programs they can support, here are some files among the kernel tree that you
+may be interested in:
+
+* *include/uapi/linux/bpf.h* contains the full list of all helper functions.
+* *net/core/filter.c* contains the definition of most network-related helper
+  functions, and the list of program types from which they can be used.
+* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
+  helpers.
+* *kernel/bpf/verifier.c* contains the functions used to check that valid types
+  of eBPF maps are used with a given helper function.
+* *kernel/bpf/* directory contains other files in which additional helpers are
+  defined (for cgroups, sockmaps, etc.).
+
+Compatibility between helper functions and program types can generally be found
+in the files where helper functions are defined. Look for the **struct
+bpf_func_proto** objects and for functions returning them: these functions
+contain a list of helpers that a given program type can call. Note that the
+**default:** label of the **switch ... case** used to filter helpers can call
+other functions, themselves allowing access to additional helpers. The
+requirement for GPL license is also in those **struct bpf_func_proto**.
+
+Compatibility between helper functions and map types can be found in the
+**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
+
+Helper functions that invalidate the checks on **data** and **data_end**
+pointers for network processing are listed in function
+**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
+
+SEE ALSO
+========
+
+**bpf**\ (2),
+**cgroups**\ (7),
+**ip**\ (8),
+**perf_event_open**\ (2),
+**sendmsg**\ (2),
+**socket**\ (7),
+**tc-bpf**\ (8)'''
+        print(footer)
+
+    def print_proto(self, helper):
+        """
+        Format function prototype with bold and italics markers. This makes RST
+        file less readable, but gives nice results in the manual page.
+        """
+        proto = helper.proto_break_down()
+
+        print('**%s %s%s(' % (proto['ret_type'],
+                              proto['ret_star'].replace('*', '\\*'),
+                              proto['name']),
+              end='')
+
+        comma = ''
+        for a in proto['args']:
+            one_arg = '{}{}'.format(comma, a['type'])
+            if a['name']:
+                if a['star']:
+                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
+                else:
+                    one_arg += '** '
+                one_arg += '*{}*\\ **'.format(a['name'])
+            comma = ', '
+            print(one_arg, end='')
+
+        print(')**')
+
+    def print_one(self, helper):
+        self.print_proto(helper)
+
+        if (helper.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (helper.ret):
+            print('\tReturn')
+            for line in helper.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+
+###############################################################################
+
+# If script is launched from scripts/ from kernel tree and can access
+# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
+# otherwise the --filename argument will be required from the command line.
+script = os.path.abspath(sys.argv[0])
+linuxRoot = os.path.dirname(os.path.dirname(script))
+bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
+
+argParser = argparse.ArgumentParser(description="""
+Parse eBPF header file and generate documentation for eBPF helper functions.
+The RST-formatted output produced can be turned into a manual page with the
+rst2man utility.
+""")
+if (os.path.isfile(bpfh)):
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
+                           default=bpfh)
+else:
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+args = argParser.parse_args()
+
+# Parse file.
+headerParser = HeaderParser(args.filename)
+headerParser.run()
+
+# Print formatted output to standard output.
+printer = PrinterRST(headerParser.helpers)
+printer.print_all()
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 0/8] bpf: document eBPF helpers and add a script to generate man page
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man

eBPF helper functions can be called from within eBPF programs to perform
a variety of tasks that would be otherwise hard or impossible to do with
eBPF itself. There is a growing number of such helper functions in the
kernel, but documentation is scarce. The main user space header file
does contain a short commented description of most helpers, but it is
somewhat outdated and not complete. It is more a "cheat sheet" than a
real documentation accessible to new eBPF developers.

This commit attempts to improve the situation by replacing the existing
overview for the helpers with a more developed description. Furthermore,
a Python script is added to generate a manual page for eBPF helpers. The
workflow is the following, and requires the rst2man utility:

    $ ./scripts/bpf_helpers_doc.py \
            --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

The objective is to keep all documentation related to the helpers in a
single place, and to be able to generate from here a manual page that
could be packaged in the man-pages repository and shipped with most
distributions.

Additionally, parsing the prototypes of the helper functions could
hopefully be reused, with a different Printer object, to generate
header files needed in some eBPF-related projects.

Regarding the description of each helper, it comprises several items:

- The function prototype.
- A description of the function and of its arguments (except for a
  couple of cases, when there are no arguments and the return value
  makes the function usage really obvious).
- A description of return values (if not void).

Additional items such as the list of compatible eBPF program and map
types for each helper, Linux kernel version that introduced the helper,
GPL-only restriction, and commit hash could be added in the future, but
it was decided on the mailing list to leave them aside for now.

For several helpers, descriptions are inspired (at times, nearly copied)
from the commit logs introducing them in the kernel--Many thanks to
their respective authors! They were completed as much as possible, the
objective being to have something easily accessible even for people just
starting with eBPF. There is probably a bit more work to do in this
direction for some helpers.

Some RST formatting is used in the descriptions (not in function
prototypes, to keep them readable, but the Python script provided in
order to generate the RST for the manual page does add formatting to
prototypes, to produce something pretty) to get "bold" and "italics" in
manual pages. Hopefully, the descriptions in bpf.h file remain
perfectly readable. Note that the few trailing white spaces are
intentional, removing them would break paragraphs for rst2man.

The descriptions should ideally be updated each time someone adds a new
helper, or updates the behaviour (new socket option supported, ...) or
the interface (new flags available, ...) of existing ones.

The second RFC for this set splits the documentation into several patches.
Ideally all helper descriptions should be reviewed by the respective
authors of the functions they describe. Please do not hesitate to suggest
improvements to make descriptions more complete or accessible.

v2:
- Remove "For" (compatible program and map types), "Since" (minimal
  Linux kernel version required), "GPL only" sections and commit hashes
  for the helpers.
- Add comment on top of the description list to explain how this
  documentation is supposed to be processed.
- Update Python script accordingly (remove the same sections, and remove
  paragraphs on program types and GPL restrictions from man page
  header).
- Split series into several patches.

Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>

Quentin Monnet (8):
  bpf: add script and prepare bpf.h for new helpers documentation
  bpf: add documentation for eBPF helpers (01-11)
  bpf: add documentation for eBPF helpers (12-22)
  bpf: add documentation for eBPF helpers (23-32)
  bpf: add documentation for eBPF helpers (33-41)
  bpf: add documentation for eBPF helpers (42-50)
  bpf: add documentation for eBPF helpers (51-57)
  bpf: add documentation for eBPF helpers (58-64)

 include/uapi/linux/bpf.h   | 1580 +++++++++++++++++++++++++++++++++-----------
 scripts/bpf_helpers_doc.py |  414 ++++++++++++
 2 files changed, 1616 insertions(+), 378 deletions(-)
 create mode 100755 scripts/bpf_helpers_doc.py

-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [RFC bpf-next v2 8/8] bpf: add documentation for eBPF helpers (58-64)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast
  Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man,
	John Fastabend
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by John:

- bpf_redirect_map()
- bpf_sk_redirect_map()
- bpf_sock_map_update()
- bpf_msg_redirect_map()
- bpf_msg_apply_bytes()
- bpf_msg_cork_bytes()
- bpf_msg_pull_data()

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 140 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7343af4196c8..db090ad03626 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1250,6 +1250,51 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, this *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is not fully implemented as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. The only flag
+ * 		supported for now is **BPF_F_INGRESS**, which indicates the
+ * 		packet is to be redirected to the ingress side of the socket
+ * 		instead of (by default) egress.
+ *
+ * 		All other values for *flags* are reserved for future usage, and
+ * 		must be left at zero.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
  * 	Description
  * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
@@ -1417,6 +1462,101 @@ union bpf_attr {
  * 		be set is returned (which comes down to 0 if all bits were set
  * 		as required).
  *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. The only flag
+ * 		supported for now is **BPF_F_INGRESS**, which indicates the
+ * 		packet is to be redirected to the ingress side of the socket
+ * 		instead of (by default) egress.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program from being called again until *bytes*
+ * 		have been accumulated.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* byte offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
  * 	Description
  * 		Bind the socket associated to *ctx* to the address pointed by
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 2/8] bpf: add documentation for eBPF helpers (01-11)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_map_lookup_elem()
- bpf_map_update_elem()
- bpf_map_delete_elem()
- bpf_probe_read()
- bpf_ktime_get_ns()
- bpf_trace_printk()
- bpf_skb_store_bytes()
- bpf_l3_csum_replace()
- bpf_l4_csum_replace()
- bpf_tail_call()
- bpf_clone_redirect()

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 199 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45f77f01e672..2bc653a3a20f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -381,6 +381,205 @@ union bpf_attr {
  * intentional, removing them would break paragraphs for rst2man.
  *
  * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		These flags are only useful for maps of type
+ * 		**BPF_MAP_TYPE_HASH**. For all other map types, **BPF_ANY**
+ * 		should be used.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as an eBPF helper, the total number of arguments is
+ * 		limited to five). Each time the helper is called, it appends a
+ * 		line that looks like the following:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: BPF command: 2
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``BPF command: 2`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will silently fail if it encounters an unknown
+ * 		specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For passing values to user
+ * 		space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the IP checksum for the packet associated to *skb*.
+ * 		Computation is incremental, so the helper must know the former
+ * 		value of the header field that was modified (*from*), the new
+ * 		value of this field (*to*), and the number of bytes (2 or 4)
+ * 		for this field, stored in *size*. Alternatively, it is possible
+ * 		to store the difference between the previous and the new values
+ * 		of the header field in *to*, by setting *from* and *size* to 0.
+ * 		For both methods, *offset* indicates the location of the IP
+ * 		checksum within the packet.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the TCP or UDP checksum for the packet associated to
+ * 		*skb*. Computation is incremental, so the helper must know the
+ * 		former value of the header field that was modified (*from*),
+ * 		the new value of this field (*to*), and the number of bytes (2
+ * 		or 4) for this field, stored on the lowest four bits of
+ * 		*flags*. Alternatively, it is possible to store the difference
+ * 		between the previous and the new values of the header field in
+ * 		*to*, by setting *from* and the four lowest bits of *flags* to
+ * 		0. For both methods, *offset* indicates the location of the L4
+ * 		checksum within the packet. In addition to the size of the
+ * 		field, actual flags can be added (bitwise OR) to *flags*. With
+ * 		**BPF_F_MARK_MANGLED_0**, a null checksum is left untouched
+ * 		(unless **BPF_F_MARK_ENFORCE** is added as well), and for
+ * 		updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR**
+ * 		indicates the checksum is to be computed against a
+ * 		pseudo-header.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The contents of
+ * 		eBPF registers and stack are not modified, the new program
+ * 		"inherits" them from the caller. This mechanism allows for
+ * 		program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never goes back to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its own instructions. A call can fail if the destination
+ * 		program for the jump does not exist (i.e. *index* is superior
+ * 		to the number of entries in *prog_array_map*), or if the
+ * 		maximum number of tail calls has been reached for this chain of
+ * 		programs. This limit is defined in the kernel by the macro
+ * 		**MAX_TAIL_CALL_CNT** (not accessible to user space), which
+ * 		is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. The only flag supported for now
+ * 		is **BPF_F_INGRESS**, which indicates the packet is to be
+ * 		redirected to the ingress interface instead of (by default)
+ * 		egress.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 4/8] bpf: add documentation for eBPF helpers (23-32)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_prandom_u32()
- bpf_get_smp_processor_id()
- bpf_get_cgroup_classid()
- bpf_get_route_realm()
- bpf_skb_load_bytes()
- bpf_csum_diff()
- bpf_skb_get_tunnel_opt()
- bpf_skb_set_tunnel_opt()
- bpf_skb_change_proto()
- bpf_skb_change_type()

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 125 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f3ea8824efbc..d147d9dd6a83 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -473,6 +473,14 @@ union bpf_attr {
  * 		The number of bytes written to the buffer, or a negative error
  * 		in case of failure.
  *
+ * u32 bpf_get_prandom_u32(void)
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Return
+ * 		The SMP (Symmetric multiprocessing) processor id.
+ *
  * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
  * 	Description
  * 		Store *len* bytes from address *from* into the packet
@@ -604,6 +612,13 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the
+ * 		net_cls (network classifier) cgroup to which *skb* belongs.
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ *
  * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
  * 	Description
  * 		Push a *vlan_tci* (VLAN tag control information) of protocol
@@ -703,6 +718,14 @@ union bpf_attr {
  * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
  * 		error.
  *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm of the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*.
+ * 	Return
+ * 		The realm of the route for the packet associated to *skb*, or 0
+ * 		if none was found.
+ *
  * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  * 	Description
  * 		Write perf raw sample into a perf event held by *map* of type
@@ -779,6 +802,21 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, this helper is deprecated in favor of
+ * 		"direct packet access", enabling packet data to be manipulated
+ * 		with *skb*\ **->data** and *skb*\ **->data_end** pointing
+ * 		respectively to the first byte of packet data and to the byte
+ * 		after the last byte of packet data.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
  * 	Description
  * 		Walk a user or a kernel stack and return its id. To achieve
@@ -814,6 +852,93 @@ union bpf_attr {
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
  *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value.
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ * 	Return
+ * 		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**bpf_skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		().
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * u64 bpf_get_current_task(void)
  * 	Return
  * 		A pointer to the current task struct.
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC bpf-next v2 3/8] bpf: add documentation for eBPF helpers (12-22)
From: Quentin Monnet @ 2018-04-10 14:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, oss-drivers, quentin.monnet, linux-doc, linux-man
In-Reply-To: <20180410144157.4831-1-quentin.monnet@netronome.com>

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_get_current_pid_tgid()
- bpf_get_current_uid_gid()
- bpf_get_current_comm()
- bpf_skb_vlan_push()
- bpf_skb_vlan_pop()
- bpf_skb_get_tunnel_key()
- bpf_skb_set_tunnel_key()
- bpf_redirect()
- bpf_perf_event_output()
- bpf_get_stackid()
- bpf_get_current_task()

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/uapi/linux/bpf.h | 237 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2bc653a3a20f..f3ea8824efbc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -580,6 +580,243 @@ union bpf_attr {
  * 		performed again.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change data from the
+ * 		packet. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of *size*, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		This is typically used on the receive path to perform a lookup
+ * 		or a packet redirection based on the value of *key*:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key = {};
+ * 			bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			// lookup or redirect based on key ...
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb*. The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			// populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		For hooks other than XDP, *flags* can be set to
+ * 		**BPF_F_INGRESS**, which indicates the packet is to be
+ * 		redirected to the ingress interface instead of (by default)
+ * 		egress. Currently, XDP does not support any flag.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORTED** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ *
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write perf raw sample into a perf event held by *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf event must
+ * 		have the following attributes: **PERF_SAMPLE_RAW** as
+ * 		**sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* also needs to be passed to the
+ * 		helper, and will get interpreted as a pointer to a **struct
+ * 		pt_regs**.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*). It looks like the
+ * 		following snippet:
+ *
+ * 		::
+ *
+ * 			volatile struct perf_event_mmap_page *header;
+ * 			struct perf_event_attr attr = {
+ * 			        .sample_type = PERF_SAMPLE_RAW,
+ * 			        .type = PERF_TYPE_SOFTWARE,
+ * 			        .config = PERF_COUNT_SW_BPF_OUTPUT,
+ * 			};
+ * 			int page_size;
+ * 			int mmap_size;
+ * 			int key = 0;
+ * 			int pmu_fd;
+ * 			void *base;
+ * 			
+ * 			if (load_bpf_file(filename))
+ * 			        return -1;
+ * 			
+ * 			pmu_fd = sys_perf_event_open(&attr,
+ * 			                             -1, // pid
+ * 			                              0, // cpu
+ * 			                             -1, // group_fd
+ * 			                              0);
+ * 			
+ * 			assert(pmu_fd >= 0);
+ * 			assert(bpf_map_update_elem(map_fd[0], &key,
+ * 			                           &pmu_fd, BPF_ANY) == 0);
+ * 			assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0);
+ * 			
+ * 			page_size = getpagesize();
+ * 			mmap_size = page_size * (page_cnt + 1);
+ * 			
+ * 			base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ * 			            MAP_SHARED, pmu_fd, 0);
+ * 			if (base == MAP_FAILED)
+ * 			        return -1;
+ * 			
+ * 			header = base;
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suited for streaming data from eBPF
+ * 		programs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames.
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ *
+ * u64 bpf_get_current_task(void)
+ * 	Return
+ * 		A pointer to the current task struct.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
2.14.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH 2/3] mm: replace __HAVE_ARCH_PTE_SPECIAL
From: Laurent Dufour @ 2018-04-10 15:10 UTC (permalink / raw)
  To: David Rientjes, Christoph Hellwig
  Cc: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens
In-Reply-To: <alpine.DEB.2.21.1804091307480.56406@chino.kir.corp.google.com>

On 09/04/2018 22:08, David Rientjes wrote:
> On Mon, 9 Apr 2018, Christoph Hellwig wrote:
> 
>>> -#ifdef __HAVE_ARCH_PTE_SPECIAL
>>> +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
>>>  # define HAVE_PTE_SPECIAL 1
>>>  #else
>>>  # define HAVE_PTE_SPECIAL 0
>>
>> I'd say kill this odd indirection and just use the
>> CONFIG_ARCH_HAS_PTE_SPECIAL symbol directly.
>>
>>
> 
> Agree, and I think it would be easier to audit/review if patches 1 and 3 
> were folded together to see the relationship between the newly added 
> selects and what #define's it is replacing.  Otherwise, looks good!
>

Ok I will fold the 3 patches and introduce a new one removing HAVE_PTE_SPECIAL.

Thanks,
Laurent.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2 0/2] move __HAVE_ARCH_PTE_SPECIAL in Kconfig
From: Laurent Dufour @ 2018-04-10 15:25 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens, David Rientjes

The per architecture __HAVE_ARCH_PTE_SPECIAL is defined statically in the
per architecture header files. This doesn't allow other configuration
options to depend on it.

The first patch of this series is replacing __HAVE_ARCH_PTE_SPECIAL by
CONFIG_ARCH_HAS_PTE_SPECIAL defined into the Kconfig files,
setting it automatically when the architecture was already setting it in
its header file.

The second patch is removing the odd define HAVE_PTE_SPECIAL which is a
duplicate of CONFIG_ARCH_HAS_PTE_SPECIAL.

There is no functional change introduced by this series.

Laurent Dufour (2):
  mm: introduce ARCH_HAS_PTE_SPECIAL
  mm: remove odd HAVE_PTE_SPECIAL

 .../features/vm/pte_special/arch-support.txt       |  2 +-
 arch/arc/Kconfig                                   |  1 +
 arch/arc/include/asm/pgtable.h                     |  2 --
 arch/arm/Kconfig                                   |  1 +
 arch/arm/include/asm/pgtable-3level.h              |  1 -
 arch/arm64/Kconfig                                 |  1 +
 arch/arm64/include/asm/pgtable.h                   |  2 --
 arch/powerpc/Kconfig                               |  1 +
 arch/powerpc/include/asm/book3s/64/pgtable.h       |  3 ---
 arch/powerpc/include/asm/pte-common.h              |  3 ---
 arch/riscv/Kconfig                                 |  1 +
 arch/s390/Kconfig                                  |  1 +
 arch/s390/include/asm/pgtable.h                    |  1 -
 arch/sh/Kconfig                                    |  1 +
 arch/sh/include/asm/pgtable.h                      |  2 --
 arch/sparc/Kconfig                                 |  1 +
 arch/sparc/include/asm/pgtable_64.h                |  3 ---
 arch/x86/Kconfig                                   |  1 +
 arch/x86/include/asm/pgtable_types.h               |  1 -
 include/linux/pfn_t.h                              |  4 ++--
 mm/Kconfig                                         |  3 +++
 mm/gup.c                                           |  4 ++--
 mm/memory.c                                        | 23 ++++++++++------------
 23 files changed, 27 insertions(+), 36 deletions(-)

-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2 2/2] mm: remove odd HAVE_PTE_SPECIAL
From: Laurent Dufour @ 2018-04-10 15:25 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens, David Rientjes
In-Reply-To: <1523373951-10981-1-git-send-email-ldufour@linux.vnet.ibm.com>

Remove the additional define HAVE_PTE_SPECIAL and rely directly on
CONFIG_ARCH_HAS_PTE_SPECIAL.

There is no functional change introduced by this patch.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
 mm/memory.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 96910c625daa..53b6344a90d2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -817,19 +817,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  * PFNMAP mappings in order to support COWable mappings.
  *
  */
-#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-# define HAVE_PTE_SPECIAL 1
-#else
-# define HAVE_PTE_SPECIAL 0
-#endif
 struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte, bool with_public_device)
 {
 	unsigned long pfn = pte_pfn(pte);
 
-	if (HAVE_PTE_SPECIAL) {
-		if (likely(!pte_special(pte)))
-			goto check_pfn;
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
+	if (unlikely(pte_special(pte))) {
 		if (vma->vm_ops && vma->vm_ops->find_special_page)
 			return vma->vm_ops->find_special_page(vma, addr);
 		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
@@ -862,7 +856,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		return NULL;
 	}
 
-	/* !HAVE_PTE_SPECIAL case follows: */
+#else	/* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
@@ -881,7 +875,8 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 
 	if (is_zero_pfn(pfn))
 		return NULL;
-check_pfn:
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
+
 	if (unlikely(pfn > highest_memmap_pfn)) {
 		print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
@@ -891,7 +886,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	 * NOTE! We still have PageReserved() pages in the page tables.
 	 * eg. VDSO mappings can cause them to exist.
 	 */
-out:
+out: __maybe_unused
 	return pfn_to_page(pfn);
 }
 
@@ -904,7 +899,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 	/*
 	 * There is no pmd_special() but there may be special pmds, e.g.
 	 * in a direct-access (dax) mapping, so let's just replicate the
-	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
 	 */
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
@@ -1926,6 +1921,7 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 
 	track_pfn_insert(vma, &pgprot, pfn);
 
+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
 	/*
 	 * If we don't have pte special, then we have to use the pfn_valid()
 	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1933,7 +1929,7 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
 	 * without pte special, it would there be refcounted as a normal page.
 	 */
-	if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+	if (!pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
 		struct page *page;
 
 		/*
@@ -1944,6 +1940,7 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 		page = pfn_to_page(pfn_t_to_pfn(pfn));
 		return insert_page(vma, addr, page, pgprot);
 	}
+#endif
 	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
 }
 
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v2 1/2] mm: introduce ARCH_HAS_PTE_SPECIAL
From: Laurent Dufour @ 2018-04-10 15:25 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens, David Rientjes
In-Reply-To: <1523373951-10981-1-git-send-email-ldufour@linux.vnet.ibm.com>

Currently the PTE special support is turned on in per architecture header
files. Most of the time, it is defined in arch/*/include/asm/pgtable.h
depending or not on some other per architecture static definition.

This patch introduces a new configuration variable to manage this directly
in the Kconfig files. It would later replace __HAVE_ARCH_PTE_SPECIAL.

Here are notes for some architectures where the definition of
__HAVE_ARCH_PTE_SPECIAL is not obvious:

arm
 __HAVE_ARCH_PTE_SPECIAL which is currently defined in
arch/arm/include/asm/pgtable-3level.h which is included by
arch/arm/include/asm/pgtable.h when CONFIG_ARM_LPAE is set.
So select ARCH_HAS_PTE_SPECIAL if ARM_LPAE.

powerpc
__HAVE_ARCH_PTE_SPECIAL is defined in 2 files:
 - arch/powerpc/include/asm/book3s/64/pgtable.h
 - arch/powerpc/include/asm/pte-common.h
The first one is included if (PPC_BOOK3S & PPC64) while the second is
included in all the other cases.
So select ARCH_HAS_PTE_SPECIAL all the time.

sparc:
__HAVE_ARCH_PTE_SPECIAL is defined if defined(__sparc__) &&
defined(__arch64__) which are defined through the compiler in
sparc/Makefile if !SPARC32 which I assume to be if SPARC64.
So select ARCH_HAS_PTE_SPECIAL if SPARC64

There is no functional change introduced by this patch.

Suggested-by: Jerome Glisse <jglisse@redhat.com>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
 Documentation/features/vm/pte_special/arch-support.txt | 2 +-
 arch/arc/Kconfig                                       | 1 +
 arch/arc/include/asm/pgtable.h                         | 2 --
 arch/arm/Kconfig                                       | 1 +
 arch/arm/include/asm/pgtable-3level.h                  | 1 -
 arch/arm64/Kconfig                                     | 1 +
 arch/arm64/include/asm/pgtable.h                       | 2 --
 arch/powerpc/Kconfig                                   | 1 +
 arch/powerpc/include/asm/book3s/64/pgtable.h           | 3 ---
 arch/powerpc/include/asm/pte-common.h                  | 3 ---
 arch/riscv/Kconfig                                     | 1 +
 arch/s390/Kconfig                                      | 1 +
 arch/s390/include/asm/pgtable.h                        | 1 -
 arch/sh/Kconfig                                        | 1 +
 arch/sh/include/asm/pgtable.h                          | 2 --
 arch/sparc/Kconfig                                     | 1 +
 arch/sparc/include/asm/pgtable_64.h                    | 3 ---
 arch/x86/Kconfig                                       | 1 +
 arch/x86/include/asm/pgtable_types.h                   | 1 -
 include/linux/pfn_t.h                                  | 4 ++--
 mm/Kconfig                                             | 3 +++
 mm/gup.c                                               | 4 ++--
 mm/memory.c                                            | 2 +-
 23 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt
index 055004f467d2..cd05924ea875 100644
--- a/Documentation/features/vm/pte_special/arch-support.txt
+++ b/Documentation/features/vm/pte_special/arch-support.txt
@@ -1,6 +1,6 @@
 #
 # Feature name:          pte_special
-#         Kconfig:       __HAVE_ARCH_PTE_SPECIAL
+#         Kconfig:       ARCH_HAS_PTE_SPECIAL
 #         description:   arch supports the pte_special()/pte_mkspecial() VM APIs
 #
     -----------------------
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d76bf4a83740..8516e2b0239a 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -44,6 +44,7 @@ config ARC
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZMA
+	select ARCH_HAS_PTE_SPECIAL
 
 config MIGHT_HAVE_PCI
 	bool
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 08fe33830d4b..8ec5599a0957 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec,	|= (_PAGE_EXECUTE));
 PTE_BIT_FUNC(mkspecial,	|= (_PAGE_SPECIAL));
 PTE_BIT_FUNC(mkhuge,	|= (_PAGE_HW_SZ));
 
-#define __HAVE_ARCH_PTE_SPECIAL
-
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a7f8e7f4b88f..c088c851b235 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
+	select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 2a4836087358..6d50a11d7793 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 	pte_val(pte) |= L_PTE_SPECIAL;
 	return pte;
 }
-#define	__HAVE_ARCH_PTE_SPECIAL
 
 #define pmd_write(pmd)		(pmd_isclear((pmd), L_PMD_SECT_RDONLY))
 #define pmd_dirty(pmd)		(pmd_isset((pmd), L_PMD_SECT_DIRTY))
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf4938f6d..9a3f1b1ab50c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -17,6 +17,7 @@ config ARM64
 	select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
+	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7e2c27e63cd8..b96c8a186908 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
 #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-#define __HAVE_ARCH_PTE_SPECIAL
-
 static inline pte_t pgd_pte(pgd_t pgd)
 {
 	return __pte(pgd_val(pgd));
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c32a181a7cbb..f7415fe25c07 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
 	select ARCH_HAS_SG_CHAIN
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 47b5ffc8715d..b3ac8948b257 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -319,9 +319,6 @@ extern unsigned long pci_io_base;
 /* Advertise special mapping type for AGP */
 #define HAVE_PAGE_AGP
 
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
 #ifndef __ASSEMBLY__
 
 /*
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index c4a72c7a8c83..03dfddb1f49a 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -216,9 +216,6 @@ static inline bool pte_user(pte_t pte)
 #define PAGE_AGP		(PAGE_KERNEL_NC)
 #define HAVE_PAGE_AGP
 
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
 #ifndef _PAGE_READ
 /* if not defined, we should not find _PAGE_WRITE too */
 #define _PAGE_READ 0
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5287c1441d66..b01c183836e5 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -34,6 +34,7 @@ config RISCV
 	select THREAD_INFO_IN_TASK
 	select RISCV_TIMER
 	select GENERIC_IRQ_MULTI_HANDLER
+	select ARCH_HAS_PTE_SPECIAL
 
 config MMU
 	def_bool y
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 32a0d5b958bf..5f1f4997e7e9 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -72,6 +72,7 @@ config S390
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
 	select ARCH_HAS_KCOV
+	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2d24d33bf188..9809694e1389 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr)
 #define _PAGE_WRITE	0x020		/* SW pte write bit */
 #define _PAGE_SPECIAL	0x040		/* SW associated with special page */
 #define _PAGE_UNUSED	0x080		/* SW bit for pgste usage state */
-#define __HAVE_ARCH_PTE_SPECIAL
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SOFT_DIRTY 0x002		/* SW pte soft dirty bit */
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 97fe29316476..a6c75b6806d2 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -50,6 +50,7 @@ config SUPERH
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select HAVE_NMI
+	select ARCH_HAS_PTE_SPECIAL
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 89c513a982fc..f6abfe2bca93 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end,
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-#define __HAVE_ARCH_PTE_SPECIAL
-
 #include <asm-generic/pgtable.h>
 
 #endif /* __ASM_SH_PGTABLE_H */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8767e45f1b2b..6b5a4f05dcb2 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -86,6 +86,7 @@ config SPARC64
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select GENERIC_TIME_VSYSCALL
 	select ARCH_CLOCKSOURCE_DATA
+	select ARCH_HAS_PTE_SPECIAL
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 44d6ac47e035..1393a8ac596b 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr);
 #define _PAGE_PMD_HUGE    _AC(0x0100000000000000,UL) /* Huge page            */
 #define _PAGE_PUD_HUGE    _PAGE_PMD_HUGE
 
-/* Advertise support for _PAGE_SPECIAL */
-#define __HAVE_ARCH_PTE_SPECIAL
-
 /* SUN4U pte bits... */
 #define _PAGE_SZ4MB_4U	  _AC(0x6000000000000000,UL) /* 4MB Page             */
 #define _PAGE_SZ512K_4U	  _AC(0x4000000000000000,UL) /* 512K Page            */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bf4ddea48e61..3f5fb25486bf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -56,6 +56,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index acfe755562a6..3e195728d7d1 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -65,7 +65,6 @@
 #define _PAGE_PKEY_BIT2	(_AT(pteval_t, 0))
 #define _PAGE_PKEY_BIT3	(_AT(pteval_t, 0))
 #endif
-#define __HAVE_ARCH_PTE_SPECIAL
 
 #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
 			 _PAGE_PKEY_BIT1 | \
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index a03c2642a87c..21713dc14ce2 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud);
 #endif
 #endif /* __HAVE_ARCH_PTE_DEVMAP */
 
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline bool pfn_t_special(pfn_t pfn)
 {
 	return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL;
@@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn)
 {
 	return false;
 }
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 #endif /* _LINUX_PFN_T_H_ */
diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..1ea3e4a6a123 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -752,3 +752,6 @@ config GUP_BENCHMARK
 	  performance of get_user_pages_fast().
 
 	  See tools/testing/selftests/vm/gup_benchmark.c
+
+config ARCH_HAS_PTE_SPECIAL
+	bool
diff --git a/mm/gup.c b/mm/gup.c
index f296df6cf666..b044a3d14dc5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1351,7 +1351,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
 	}
 }
 
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 int write, struct page **pages, int *nr)
 {
@@ -1427,7 +1427,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 {
 	return 0;
 }
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
 #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index a1f990e33e38..96910c625daa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -817,7 +817,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  * PFNMAP mappings in order to support COWable mappings.
  *
  */
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 # define HAVE_PTE_SPECIAL 1
 #else
 # define HAVE_PTE_SPECIAL 0
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v2 2/2] mm: remove odd HAVE_PTE_SPECIAL
From: Robin Murphy @ 2018-04-10 15:58 UTC (permalink / raw)
  To: Laurent Dufour, linux-kernel, linux-mm, linuxppc-dev, x86,
	linux-doc, linux-snps-arc, linux-arm-kernel, linux-riscv,
	linux-s390, linux-sh, sparclinux, Jerome Glisse, mhocko,
	aneesh.kumar, akpm, mpe, benh, paulus, Jonathan Corbet,
	Catalin Marinas, Will Deacon, Yoshinori Sato, Rich Felker,
	David S . Miller, Thomas Gleixner, Ingo Molnar, Vineet Gupta,
	Palmer Dabbelt, Albert Ou, Martin Schwidefsky, Heiko Carstens,
	David Rientjes
In-Reply-To: <1523373951-10981-3-git-send-email-ldufour@linux.vnet.ibm.com>

On 10/04/18 16:25, Laurent Dufour wrote:
> Remove the additional define HAVE_PTE_SPECIAL and rely directly on
> CONFIG_ARCH_HAS_PTE_SPECIAL.
> 
> There is no functional change introduced by this patch
> 
> Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
> ---
>   mm/memory.c | 23 ++++++++++-------------
>   1 file changed, 10 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 96910c625daa..53b6344a90d2 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -817,19 +817,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
>    * PFNMAP mappings in order to support COWable mappings.
>    *
>    */
> -#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
> -# define HAVE_PTE_SPECIAL 1
> -#else
> -# define HAVE_PTE_SPECIAL 0
> -#endif
>   struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>   			     pte_t pte, bool with_public_device)
>   {
>   	unsigned long pfn = pte_pfn(pte);
>   
> -	if (HAVE_PTE_SPECIAL) {
> -		if (likely(!pte_special(pte)))
> -			goto check_pfn;
> +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL

Nit: Couldn't you use IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) within the 
existing code structure to avoid having to add these #ifdefs?

Robin.

> +	if (unlikely(pte_special(pte))) {
>   		if (vma->vm_ops && vma->vm_ops->find_special_page)
>   			return vma->vm_ops->find_special_page(vma, addr);
>   		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
> @@ -862,7 +856,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>   		return NULL;
>   	}
>   
> -	/* !HAVE_PTE_SPECIAL case follows: */
> +#else	/* CONFIG_ARCH_HAS_PTE_SPECIAL */
>   
>   	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
>   		if (vma->vm_flags & VM_MIXEDMAP) {
> @@ -881,7 +875,8 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>   
>   	if (is_zero_pfn(pfn))
>   		return NULL;
> -check_pfn:
> +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
> +
>   	if (unlikely(pfn > highest_memmap_pfn)) {
>   		print_bad_pte(vma, addr, pte, NULL);
>   		return NULL;
> @@ -891,7 +886,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>   	 * NOTE! We still have PageReserved() pages in the page tables.
>   	 * eg. VDSO mappings can cause them to exist.
>   	 */
> -out:
> +out: __maybe_unused
>   	return pfn_to_page(pfn);
>   }
>   
> @@ -904,7 +899,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
>   	/*
>   	 * There is no pmd_special() but there may be special pmds, e.g.
>   	 * in a direct-access (dax) mapping, so let's just replicate the
> -	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
> +	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
>   	 */
>   	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
>   		if (vma->vm_flags & VM_MIXEDMAP) {
> @@ -1926,6 +1921,7 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
>   
>   	track_pfn_insert(vma, &pgprot, pfn);
>   
> +#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
>   	/*
>   	 * If we don't have pte special, then we have to use the pfn_valid()
>   	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
> @@ -1933,7 +1929,7 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
>   	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
>   	 * without pte special, it would there be refcounted as a normal page.
>   	 */
> -	if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
> +	if (!pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
>   		struct page *page;
>   
>   		/*
> @@ -1944,6 +1940,7 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
>   		page = pfn_to_page(pfn_t_to_pfn(pfn));
>   		return insert_page(vma, addr, page, pgprot);
>   	}
> +#endif
>   	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
>   }
>   
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 1/2] mm: introduce ARCH_HAS_PTE_SPECIAL
From: Matthew Wilcox @ 2018-04-10 16:09 UTC (permalink / raw)
  To: Laurent Dufour
  Cc: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens, David Rientjes
In-Reply-To: <1523373951-10981-2-git-send-email-ldufour@linux.vnet.ibm.com>

On Tue, Apr 10, 2018 at 05:25:50PM +0200, Laurent Dufour wrote:
>  arch/powerpc/include/asm/pte-common.h                  | 3 ---
>  arch/riscv/Kconfig                                     | 1 +
>  arch/s390/Kconfig                                      | 1 +

You forgot to delete __HAVE_ARCH_PTE_SPECIAL from
arch/riscv/include/asm/pgtable-bits.h
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 1/2] mm: introduce ARCH_HAS_PTE_SPECIAL
From: Laurent Dufour @ 2018-04-10 16:42 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-kernel, linux-mm, linuxppc-dev, x86, linux-doc,
	linux-snps-arc, linux-arm-kernel, linux-riscv, linux-s390,
	linux-sh, sparclinux, Jerome Glisse, mhocko, aneesh.kumar, akpm,
	mpe, benh, paulus, Jonathan Corbet, Catalin Marinas, Will Deacon,
	Yoshinori Sato, Rich Felker, David S . Miller, Thomas Gleixner,
	Ingo Molnar, Vineet Gupta, Palmer Dabbelt, Albert Ou,
	Martin Schwidefsky, Heiko Carstens, David Rientjes
In-Reply-To: <20180410160932.GB3614@bombadil.infradead.org>

On 10/04/2018 18:09, Matthew Wilcox wrote:
> On Tue, Apr 10, 2018 at 05:25:50PM +0200, Laurent Dufour wrote:
>>  arch/powerpc/include/asm/pte-common.h                  | 3 ---
>>  arch/riscv/Kconfig                                     | 1 +
>>  arch/s390/Kconfig                                      | 1 +
> 
> You forgot to delete __HAVE_ARCH_PTE_SPECIAL from
> arch/riscv/include/asm/pgtable-bits.h

Damned !
Thanks for catching it.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 2/2] mm: remove odd HAVE_PTE_SPECIAL
From: Laurent Dufour @ 2018-04-10 16:44 UTC (permalink / raw)
  To: Robin Murphy, linux-kernel, linux-mm, linuxppc-dev, x86,
	linux-doc, linux-snps-arc, linux-arm-kernel, linux-riscv,
	linux-s390, linux-sh, sparclinux, Jerome Glisse, mhocko,
	aneesh.kumar, akpm, mpe, benh, paulus, Jonathan Corbet,
	Catalin Marinas, Will Deacon, Yoshinori Sato, Rich Felker,
	David S . Miller, Thomas Gleixner, Ingo Molnar, Vineet Gupta,
	Palmer Dabbelt, Albert Ou, Martin Schwidefsky, Heiko Carstens,
	David Rientjes
In-Reply-To: <3f20ac8b-20b8-f052-bc44-dcc0316354ca@arm.com>



On 10/04/2018 17:58, Robin Murphy wrote:
> On 10/04/18 16:25, Laurent Dufour wrote:
>> Remove the additional define HAVE_PTE_SPECIAL and rely directly on
>> CONFIG_ARCH_HAS_PTE_SPECIAL.
>>
>> There is no functional change introduced by this patch
>>
>> Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
>> ---
>>   mm/memory.c | 23 ++++++++++-------------
>>   1 file changed, 10 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 96910c625daa..53b6344a90d2 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -817,19 +817,13 @@ static void print_bad_pte(struct vm_area_struct *vma,
>> unsigned long addr,
>>    * PFNMAP mappings in order to support COWable mappings.
>>    *
>>    */
>> -#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
>> -# define HAVE_PTE_SPECIAL 1
>> -#else
>> -# define HAVE_PTE_SPECIAL 0
>> -#endif
>>   struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>>                    pte_t pte, bool with_public_device)
>>   {
>>       unsigned long pfn = pte_pfn(pte);
>>   -    if (HAVE_PTE_SPECIAL) {
>> -        if (likely(!pte_special(pte)))
>> -            goto check_pfn;
>> +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
> 
> Nit: Couldn't you use IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) within the
> existing code structure to avoid having to add these #ifdefs?

I agree, that would be better. I didn't think about this option.
Thanks for reporting this.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC bpf-next v2 7/8] bpf: add documentation for eBPF helpers (51-57)
From: Yonghong Song @ 2018-04-10 16:58 UTC (permalink / raw)
  To: Quentin Monnet, daniel, ast
  Cc: netdev, oss-drivers, linux-doc, linux-man, Lawrence Brakmo,
	Josef Bacik, Andrey Ignatov
In-Reply-To: <20180410144157.4831-8-quentin.monnet@netronome.com>



On 4/10/18 7:41 AM, Quentin Monnet wrote:
> Add documentation for eBPF helper functions to bpf.h user header file.
> This documentation can be parsed with the Python script provided in
> another commit of the patch series, in order to provide a RST document
> that can later be converted into a man page.
> 
> The objective is to make the documentation easily understandable and
> accessible to all eBPF developers, including beginners.
> 
> This patch contains descriptions for the following helper functions:
> 
> Helpers from Lawrence:
> - bpf_setsockopt()
> - bpf_getsockopt()
> - bpf_sock_ops_cb_flags_set()
> 
> Helpers from Yonghong:
> - bpf_perf_event_read_value()
> - bpf_perf_prog_read_value()
> 
> Helper from Josef:
> - bpf_override_return()
> 
> Helper from Andrey:
> - bpf_bind()
> 
> Cc: Lawrence Brakmo <brakmo@fb.com>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: Josef Bacik <jbacik@fb.com>
> Cc: Andrey Ignatov <rdna@fb.com>
> Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
> ---
>   include/uapi/linux/bpf.h | 184 +++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 184 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 15d9ccafebbe..7343af4196c8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1208,6 +1208,28 @@ union bpf_attr {
>    * 	Return
>    * 		0
>    *
> + * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
> + * 	Description
> + * 		Emulate a call to **setsockopt()** on the socket associated to
> + * 		*bpf_socket*, which must be a full socket. The *level* at
> + * 		which the option resides and the name *optname* of the option
> + * 		must be specified, see **setsockopt(2)** for more information.
> + * 		The option value of length *optlen* is pointed by *optval*.
> + *
> + * 		This helper actually implements a subset of **setsockopt()**.
> + * 		It supports the following *level*\ s:
> + *
> + * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
> + * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
> + * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
> + * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
> + * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
> + * 		  **TCP_BPF_SNDCWND_CLAMP**.
> + * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
> + * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
>    * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
>    * 	Description
>    * 		Grow or shrink the room for data in the packet associated to
> @@ -1255,6 +1277,168 @@ union bpf_attr {
>    * 		performed again.
>    * 	Return
>    * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
> + * 	Description
> + * 		Read the value of a perf event counter, and store it into *buf*
> + * 		of size *buf_size*. This helper relies on a *map* of type
> + * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf
> + * 		event counter is selected at the creation of the *map*. The

The nature of the perf event counter is selected when *map* is updated 
with perf_event fd's.

> + * 		*map* is an array whose size is the number of available CPU
> + * 		cores, and each cell contains a value relative to one core. The

It is confusing to mix core/cpu here. Maybe just use perf_event 
convention, always using cpu?

> + * 		value to retrieve is indicated by *flags*, that contains the
> + * 		index of the core to look up, masked with
> + * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
> + * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
> + * 		current CPU core should be retrieved.
> + *
> + * 		This helper behaves in a way close to
> + * 		**bpf_perf_event_read**\ () helper, save that instead of
> + * 		just returning the value observed, it fills the *buf*
> + * 		structure. This allows for additional data to be retrieved: in
> + * 		particular, the enabled and running times (in *buf*\
> + * 		**->enabled** and *buf*\ **->running**, respectively) are
> + * 		copied.
> + *
> + * 		These values are interesting, because hardware PMU (Performance
> + * 		Monitoring Unit) counters are limited resources. When there are
> + * 		more PMU based perf events opened than available counters,
> + * 		kernel will multiplex these events so each event gets certain
> + * 		percentage (but not all) of the PMU time. In case that
> + * 		multiplexing happens, the number of samples or counter value
> + * 		will not reflect the case compared to when no multiplexing
> + * 		occurs. This makes comparison between different runs difficult.
> + * 		Typically, the counter value should be normalized before
> + * 		comparing to other experiments. The usual normalization is done
> + * 		as follows.
> + *
> + * 		::
> + *
> + * 			normalized_counter = counter * t_enabled / t_running
> + *
> + * 		Where t_enabled is the time enabled for event and t_running is
> + * 		the time running for event since last normalization. The
> + * 		enabled and running times are accumulated since the perf event
> + * 		open. To achieve scaling factor between two invocations of an
> + * 		eBPF program, users can use CPU id as the key (which is
> + * 		typical for perf array usage model) to remember the previous
> + * 		value and do the calculation inside the eBPF program.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
> + * 	Description
> + * 		For an eBPF program attached to a perf event, retrieve the
> + * 		value of the event counter associated to *ctx* and store it in
> + * 		the structure pointed by *buf* and of size *buf_size*. Enabled
> + * 		and running times are also stored in the structure (see
> + * 		description of helper **bpf_perf_event_read_value**\ () for
> + * 		more details).
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
> + * 	Description
> + * 		Emulate a call to **getsockopt()** on the socket associated to
> + * 		*bpf_socket*, which must be a full socket. The *level* at
> + * 		which the option resides and the name *optname* of the option
> + * 		must be specified, see **getsockopt(2)** for more information.
> + * 		The retrieved value is stored in the structure pointed by
> + * 		*optval* and of length *optlen*.
> + *
> + * 		This helper actually implements a subset of **getsockopt()**.
> + * 		It supports the following *level*\ s:
> + *
> + * 		* **IPPROTO_TCP**, which supports *optname*
> + * 		  **TCP_CONGESTION**.
> + * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
> + * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_override_return(struct pt_regs *regs, u64 rc)
> + * 	Description
> + * 		Used for error injection, this helper uses kprobes to override
> + * 		the return value of the probed function, and to set it to *rc*.
> + * 		The first argument is the context *regs* on which the kprobe
> + * 		works.
> + *
> + * 		This helper works by setting the PC (program counter)
> + * 		to an override function which is run in place of the original
> + * 		probed function. This means the probed function is not run at
> + * 		all. The replacement function just returns with the required
> + * 		value.
> + *
> + * 		This helper has security implications, and thus is subject to
> + * 		restrictions. It is only available if the kernel was compiled
> + * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
> + * 		option, and in this case it only works on functions tagged with
> + * 		**ALLOW_ERROR_INJECTION** in the kernel code.
> + *
> + * 		Also, the helper is only available for the architectures having
> + * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
> + * 		x86 architecture is the only one to support this feature.
> + * 	Return
> + * 		0
> + *
> + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
> + * 	Description
> + * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
> + * 		for the full TCP socket associated to *bpf_sock* to
> + * 		*argval*.
> + *
> + * 		The primary use of this field is to determine if there should
> + * 		be calls to eBPF programs of type
> + * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
> + * 		code. A program of the same type can change its value, per
> + * 		connection and as necessary, when the connection is
> + * 		established. This field is directly accessible for reading, but
> + * 		this helper must be used for updates in order to return an
> + * 		error if an eBPF program tries to set a callback that is not
> + * 		supported in the current kernel.
> + *
> + * 		The supported callback values that *argval* can combine are:
> + *
> + * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
> + * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
> + * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
> + *
> + * 		Here are some examples of where one could call such eBPF
> + * 		program:
> + *
> + * 		* When RTO fires.
> + * 		* When a packet is retransmitted.
> + * 		* When the connection terminates.
> + * 		* When a packet is sent.
> + * 		* When a packet is received.
> + * 	Return
> + * 		Code **-EINVAL** if the socket is not a full TCP socket;
> + * 		otherwise, a positive number containing the bits that could not
> + * 		be set is returned (which comes down to 0 if all bits were set
> + * 		as required).
> + *
> + * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
> + * 	Description
> + * 		Bind the socket associated to *ctx* to the address pointed by
> + * 		*addr*, of length *addr_len*. This allows for making outgoing
> + * 		connection from the desired IP address, which can be useful for
> + * 		example when all processes inside a cgroup should use one
> + * 		single IP address on a host that has multiple IPs configured.
> + *
> + * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
> + * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
> + * 		**AF_INET6**). Looking for a free port to bind to can be
> + * 		expensive, therefore binding to port is not permitted by the
> + * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
> + * 		must be set to zero.
> + *
> + * 		As for the remote end, both parts of it can be overridden,
> + * 		remote IP and remote port. This can be useful if an application
> + * 		inside a cgroup wants to connect to another application inside
> + * 		the same cgroup or to itself, but knows nothing about the IP
> + * 		address assigned to the cgroup.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] gpiolib: add hogs support for machine code
From: kbuild test robot @ 2018-04-10 17:05 UTC (permalink / raw)
  To: Bartosz Golaszewski
  Cc: kbuild-all, Linus Walleij, Jonathan Corbet, linux-gpio, linux-doc,
	linux-kernel, Bartosz Golaszewski
In-Reply-To: <20180410124029.2915-1-brgl@bgdev.pl>

[-- Attachment #1: Type: text/plain, Size: 1708 bytes --]

Hi Bartosz,

I love your patch! Yet something to improve:

[auto build test ERROR on gpio/for-next]
[also build test ERROR on v4.16 next-20180410]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Bartosz-Golaszewski/gpiolib-add-hogs-support-for-machine-code/20180410-232047
base:   https://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git for-next
config: i386-randconfig-a0-201814 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from drivers//mfd/sm501.c:23:0:
>> include/linux/gpio/machine.h:56:19: error: field 'dflags' has incomplete type
     enum gpiod_flags dflags;
                      ^

vim +/dflags +56 include/linux/gpio/machine.h

    41	
    42	/**
    43	 * struct gpiod_hog - GPIO line hog table
    44	 * @chip_label: name of the chip the GPIO belongs to
    45	 * @chip_hwnum: hardware number (i.e. relative to the chip) of the GPIO
    46	 * @line_name: consumer name for the hogged line
    47	 * @lflags: mask of GPIO lookup flags
    48	 * @dflags: GPIO flags used to specify the direction and value
    49	 */
    50	struct gpiod_hog {
    51		struct list_head list;
    52		const char *chip_label;
    53		u16 chip_hwnum;
    54		const char *line_name;
    55		enum gpio_lookup_flags lflags;
  > 56		enum gpiod_flags dflags;
    57	};
    58	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31760 bytes --]

^ permalink raw reply

* [RFC 00/10] Adds pcitest tool support for MSI-X
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel

This patch set depends on the following series:
https://lkml.org/lkml/2018/4/10/421

This series aims to add pcitest tool support for MSI-X. 

Includes new callback methods and handlers to trigger the MSI-X
interrupts on the EP Designware IP driver.

Provides new methods in the pci_epf_test driver that allow setting/getting
the EP maximum number of MSI-X entries (similar to the set/get MSI methods).

Reworks on MSI set/get and triggering methods on EP Designware IP driver
to be more generic and flexible.

Adds a new input parameter (msix) and replicates the whole MSI mechanism
applied to the MSI-X feature on pcitest tool. Also updates the pcitest
script with the new test set applied to this new feature.

Gustavo Pimentel (10):
  PCI: dwc: Add MSI-X callbacks handler
  PCI: cadence: Update cdns_pcie_ep_raise_irq function signature
  PCI: endpoint: Add MSI-X interfaces
  PCI: dwc: MSI callbacks handler rework
  PCI: dwc: Add legacy interrupt callback handler
  misc: pci_endpoint_test: Add MSI-X support
  misc: pci_endpoint_test: Replace lower into upper case characters
  PCI: endpoint: functions/pci-epf-test: Add MSI-X support
  PCI: endpoint: functions/pci-epf-test: Replace lower into upper case
    characters
  tools: PCI: Add MSI-X support

 Documentation/misc-devices/pci-endpoint-test.txt |   3 +
 drivers/misc/pci_endpoint_test.c                 | 120 ++++++++++----
 drivers/pci/cadence/pcie-cadence-ep.c            |   2 +-
 drivers/pci/dwc/pci-dra7xx.c                     |   2 +-
 drivers/pci/dwc/pcie-artpec6.c                   |   2 +-
 drivers/pci/dwc/pcie-designware-ep.c             | 201 +++++++++++++++++++++--
 drivers/pci/dwc/pcie-designware-plat.c           |   9 +-
 drivers/pci/dwc/pcie-designware.h                |  40 +++--
 drivers/pci/endpoint/functions/pci-epf-test.c    | 113 +++++++++----
 drivers/pci/endpoint/pci-ep-cfs.c                |  24 +++
 drivers/pci/endpoint/pci-epc-core.c              |  60 ++++++-
 include/linux/pci-epc.h                          |  11 +-
 include/linux/pci-epf.h                          |   1 +
 include/uapi/linux/pcitest.h                     |   1 +
 tools/pci/pcitest.c                              |  18 +-
 tools/pci/pcitest.sh                             |  25 +++
 16 files changed, 528 insertions(+), 104 deletions(-)

-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [RFC 02/10] PCI: cadence: Update cdns_pcie_ep_raise_irq function signature
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Changes the cdns_pcie_ep_raise_irq function signature, namely the
interrupt_num variable type from u8 to u16 to accommodate the MSI-X maximum
interrupts of 2048.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 drivers/pci/cadence/pcie-cadence-ep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/cadence/pcie-cadence-ep.c b/drivers/pci/cadence/pcie-cadence-ep.c
index 3d8283e..6d6322c 100644
--- a/drivers/pci/cadence/pcie-cadence-ep.c
+++ b/drivers/pci/cadence/pcie-cadence-ep.c
@@ -363,7 +363,7 @@ static int cdns_pcie_ep_send_msi_irq(struct cdns_pcie_ep *ep, u8 fn,
 }
 
 static int cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn,
-				  enum pci_epc_irq_type type, u8 interrupt_num)
+				  enum pci_epc_irq_type type, u16 interrupt_num)
 {
 	struct cdns_pcie_ep *ep = epc_get_drvdata(epc);
 
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 04/10] PCI: dwc: MSI callbacks handler rework
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Adds a check to the pci_epc_set_msi function that rejects requests for
more than 32 interrupts.

Removes the duplicate defines from the pcie-designware.h file and uses
the defines available in include/uapi/linux/pci_regs.h instead.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 drivers/pci/dwc/pcie-designware-ep.c | 46 +++++++++++++++++++++++-------------
 drivers/pci/dwc/pcie-designware.h    | 11 ---------
 drivers/pci/endpoint/pci-epc-core.c  |  3 ++-
 3 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/drivers/pci/dwc/pcie-designware-ep.c b/drivers/pci/dwc/pcie-designware-ep.c
index 874d4c2..e352786 100644
--- a/drivers/pci/dwc/pcie-designware-ep.c
+++ b/drivers/pci/dwc/pcie-designware-ep.c
@@ -251,29 +251,38 @@ static int dw_pcie_ep_map_addr(struct pci_epc *epc, u8 func_no,
 
 static int dw_pcie_ep_get_msi(struct pci_epc *epc, u8 func_no)
 {
-	int val;
 	struct dw_pcie_ep *ep = epc_get_drvdata(epc);
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+	u32 val, reg;
+
+	if (ep->cap_addr.msi_addr == 0)
+		return 0;
 
-	val = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL);
-	if (!(val & MSI_CAP_MSI_EN_MASK))
+	reg = ep->cap_addr.msi_addr + PCI_MSI_FLAGS;
+	val = dw_pcie_readw_dbi(pci, reg);
+	if (!(val & PCI_MSI_FLAGS_ENABLE))
 		return -EINVAL;
 
-	val = (val & MSI_CAP_MME_MASK) >> MSI_CAP_MME_SHIFT;
+	val = (val & PCI_MSI_FLAGS_QSIZE) >> 4;
+
 	return val;
 }
 
-static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 encode_int)
+static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts)
 {
-	int val;
 	struct dw_pcie_ep *ep = epc_get_drvdata(epc);
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+	u32 val, reg;
+
+	if (ep->cap_addr.msi_addr == 0)
+		return 0;
 
-	val = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL);
-	val &= ~MSI_CAP_MMC_MASK;
-	val |= (encode_int << MSI_CAP_MMC_SHIFT) & MSI_CAP_MMC_MASK;
+	reg = ep->cap_addr.msi_addr + PCI_MSI_FLAGS;
+	val = dw_pcie_readw_dbi(pci, reg);
+	val &= ~PCI_MSI_FLAGS_QMASK;
+	val |= (interrupts << 1) & PCI_MSI_FLAGS_QMASK;
 	dw_pcie_dbi_ro_wr_en(pci);
-	dw_pcie_writew_dbi(pci, MSI_MESSAGE_CONTROL, val);
+	dw_pcie_writew_dbi(pci, reg, val);
 	dw_pcie_dbi_ro_wr_dis(pci);
 
 	return 0;
@@ -372,21 +381,26 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 	struct pci_epc *epc = ep->epc;
 	u16 msg_ctrl, msg_data;
-	u32 msg_addr_lower, msg_addr_upper;
+	u32 msg_addr_lower, msg_addr_upper, reg;
 	u64 msg_addr;
 	bool has_upper;
 	int ret;
 
 	/* Raise MSI per the PCI Local Bus Specification Revision 3.0, 6.8.1. */
-	msg_ctrl = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL);
+	reg = ep->cap_addr.msi_addr + PCI_MSI_FLAGS;
+	msg_ctrl = dw_pcie_readw_dbi(pci, reg);
 	has_upper = !!(msg_ctrl & PCI_MSI_FLAGS_64BIT);
-	msg_addr_lower = dw_pcie_readl_dbi(pci, MSI_MESSAGE_ADDR_L32);
+	reg = ep->cap_addr.msi_addr + PCI_MSI_ADDRESS_LO;
+	msg_addr_lower = dw_pcie_readl_dbi(pci, reg);
 	if (has_upper) {
-		msg_addr_upper = dw_pcie_readl_dbi(pci, MSI_MESSAGE_ADDR_U32);
-		msg_data = dw_pcie_readw_dbi(pci, MSI_MESSAGE_DATA_64);
+		reg = ep->cap_addr.msi_addr + PCI_MSI_ADDRESS_HI;
+		msg_addr_upper = dw_pcie_readl_dbi(pci, reg);
+		reg = ep->cap_addr.msi_addr + PCI_MSI_DATA_64;
+		msg_data = dw_pcie_readw_dbi(pci, reg);
 	} else {
 		msg_addr_upper = 0;
-		msg_data = dw_pcie_readw_dbi(pci, MSI_MESSAGE_DATA_32);
+		reg = ep->cap_addr.msi_addr + PCI_MSI_DATA_32;
+		msg_data = dw_pcie_readw_dbi(pci, reg);
 	}
 	msg_addr = ((u64) msg_addr_upper) << 32 | msg_addr_lower;
 	ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr,
diff --git a/drivers/pci/dwc/pcie-designware.h b/drivers/pci/dwc/pcie-designware.h
index 456fd94..2acf18b0 100644
--- a/drivers/pci/dwc/pcie-designware.h
+++ b/drivers/pci/dwc/pcie-designware.h
@@ -96,17 +96,6 @@
 #define PCIE_GET_ATU_INB_UNR_REG_OFFSET(region)				\
 			((0x3 << 20) | ((region) << 9) | (0x1 << 8))
 
-#define MSI_MESSAGE_CONTROL		0x52
-#define MSI_CAP_MMC_SHIFT		1
-#define MSI_CAP_MMC_MASK		(7 << MSI_CAP_MMC_SHIFT)
-#define MSI_CAP_MME_SHIFT		4
-#define MSI_CAP_MSI_EN_MASK		0x1
-#define MSI_CAP_MME_MASK		(7 << MSI_CAP_MME_SHIFT)
-#define MSI_MESSAGE_ADDR_L32		0x54
-#define MSI_MESSAGE_ADDR_U32		0x58
-#define MSI_MESSAGE_DATA_32		0x58
-#define MSI_MESSAGE_DATA_64		0x5C
-
 #define MAX_MSI_IRQS			256
 #define MAX_MSI_IRQS_PER_CTRL		32
 #define MAX_MSI_CTRLS			(MAX_MSI_IRQS / MAX_MSI_IRQS_PER_CTRL)
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 294a383..dbd17e4 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -201,7 +201,8 @@ int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts)
 	u8 encode_int;
 	unsigned long flags;
 
-	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions)
+	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions ||
+	    interrupts > 32)
 		return -EINVAL;
 
 	if (!epc->ops->set_msi)
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 10/10] tools: PCI: Add MSI-X support
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Adds MSI-X support to the pcitest tool and modifies the pcitest.sh script
to accommodate this new type of interrupt test.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 include/uapi/linux/pcitest.h |  1 +
 tools/pci/pcitest.c          | 18 +++++++++++++++++-
 tools/pci/pcitest.sh         | 25 +++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index 953cf03..d746fb1 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -16,5 +16,6 @@
 #define PCITEST_WRITE		_IOW('P', 0x4, unsigned long)
 #define PCITEST_READ		_IOW('P', 0x5, unsigned long)
 #define PCITEST_COPY		_IOW('P', 0x6, unsigned long)
+#define PCITEST_MSIX		_IOW('P', 0x7, int)
 
 #endif /* __UAPI_LINUX_PCITEST_H */
diff --git a/tools/pci/pcitest.c b/tools/pci/pcitest.c
index 9074b47..9d145a3 100644
--- a/tools/pci/pcitest.c
+++ b/tools/pci/pcitest.c
@@ -37,6 +37,7 @@ struct pci_test {
 	char		barnum;
 	bool		legacyirq;
 	unsigned int	msinum;
+	unsigned int	msixnum;
 	bool		read;
 	bool		write;
 	bool		copy;
@@ -83,6 +84,15 @@ static int run_test(struct pci_test *test)
 			fprintf(stdout, "%s\n", result[ret]);
 	}
 
+	if (test->msixnum > 0 && test->msixnum <= 2048) {
+		ret = ioctl(fd, PCITEST_MSIX, test->msixnum);
+		fprintf(stdout, "MSI-X%d:\t\t", test->msixnum);
+		if (ret < 0)
+			fprintf(stdout, "TEST FAILED\n");
+		else
+			fprintf(stdout, "%s\n", result[ret]);
+	}
+
 	if (test->write) {
 		ret = ioctl(fd, PCITEST_WRITE, test->size);
 		fprintf(stdout, "WRITE (%7ld bytes):\t\t", test->size);
@@ -133,7 +143,7 @@ int main(int argc, char **argv)
 	/* set default endpoint device */
 	test->device = "/dev/pci-endpoint-test.0";
 
-	while ((c = getopt(argc, argv, "D:b:m:lrwcs:")) != EOF)
+	while ((c = getopt(argc, argv, "D:b:m:x:lrwcs:")) != EOF)
 	switch (c) {
 	case 'D':
 		test->device = optarg;
@@ -151,6 +161,11 @@ int main(int argc, char **argv)
 		if (test->msinum < 1 || test->msinum > 32)
 			goto usage;
 		continue;
+	case 'x':
+		test->msixnum = atoi(optarg);
+		if (test->msixnum < 1 || test->msixnum > 2048)
+			goto usage;
+		continue;
 	case 'r':
 		test->read = true;
 		continue;
@@ -173,6 +188,7 @@ int main(int argc, char **argv)
 			"\t-D <dev>		PCI endpoint test device {default: /dev/pci-endpoint-test.0}\n"
 			"\t-b <bar num>		BAR test (bar number between 0..5)\n"
 			"\t-m <msi num>		MSI test (msi number between 1..32)\n"
+			"\t-x <msix num>	MSI-X test (msix number between 1..2048)\n"
 			"\t-l			Legacy IRQ test\n"
 			"\t-r			Read buffer test\n"
 			"\t-w			Write buffer test\n"
diff --git a/tools/pci/pcitest.sh b/tools/pci/pcitest.sh
index 77e8c85..86709a2 100644
--- a/tools/pci/pcitest.sh
+++ b/tools/pci/pcitest.sh
@@ -4,6 +4,8 @@
 echo "BAR tests"
 echo
 
+modprobe pci_endpoint_test
+sleep 2
 bar=0
 
 while [ $bar -lt 6 ]
@@ -16,7 +18,14 @@ echo
 echo "Interrupt tests"
 echo
 
+rmmod pci_endpoint_test
+sleep 2
+modprobe pci_endpoint_test irq_type=0
 pcitest -l
+
+rmmod pci_endpoint_test
+sleep 2
+modprobe pci_endpoint_test irq_type=1
 msi=1
 
 while [ $msi -lt 33 ]
@@ -26,9 +35,25 @@ do
 done
 echo
 
+rmmod pci_endpoint_test
+sleep 2
+modprobe pci_endpoint_test irq_type=2
+msix=1
+
+while [ $msix -lt 2049 ]
+do
+        pcitest -x $msix
+        msix=`expr $msix + 1`
+done
+echo
+
 echo "Read Tests"
 echo
 
+rmmod pci_endpoint_test
+sleep 2
+modprobe pci_endpoint_test irq_type=1
+
 pcitest -r -s 1
 pcitest -r -s 1024
 pcitest -r -s 1025
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 06/10] misc: pci_endpoint_test: Add MSI-X support
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Adds the MSI-X support and updates driver documentation accordingly.

Changes the driver parameter to allow selection of the interrupt
type.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 Documentation/misc-devices/pci-endpoint-test.txt |   3 +
 drivers/misc/pci_endpoint_test.c                 | 102 +++++++++++++++++------
 2 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/Documentation/misc-devices/pci-endpoint-test.txt b/Documentation/misc-devices/pci-endpoint-test.txt
index 4ebc359..fdfa0f6 100644
--- a/Documentation/misc-devices/pci-endpoint-test.txt
+++ b/Documentation/misc-devices/pci-endpoint-test.txt
@@ -10,6 +10,7 @@ The PCI driver for the test device performs the following tests
 	*) verifying addresses programmed in BAR
 	*) raise legacy IRQ
 	*) raise MSI IRQ
+	*) raise MSI-X IRQ
 	*) read data
 	*) write data
 	*) copy data
@@ -25,6 +26,8 @@ ioctl
  PCITEST_LEGACY_IRQ: Tests legacy IRQ
  PCITEST_MSI: Tests message signalled interrupts. The MSI number
 	      to be tested should be passed as argument.
+ PCITEST_MSIX: Tests message signalled interrupts. The MSI-X number
+	      to be tested should be passed as argument.
  PCITEST_WRITE: Perform write tests. The size of the buffer should be passed
 		as argument.
  PCITEST_READ: Perform read tests. The size of the buffer should be passed
diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 37db0fc..a7d9354 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -42,11 +42,16 @@
 #define PCI_ENDPOINT_TEST_COMMAND	0x4
 #define COMMAND_RAISE_LEGACY_IRQ	BIT(0)
 #define COMMAND_RAISE_MSI_IRQ		BIT(1)
-#define MSI_NUMBER_SHIFT		2
-/* 6 bits for MSI number */
-#define COMMAND_READ                    BIT(8)
-#define COMMAND_WRITE                   BIT(9)
-#define COMMAND_COPY                    BIT(10)
+#define COMMAND_RAISE_MSIX_IRQ		BIT(2)
+#define IRQ_TYPE_SHIFT			3
+#define IRQ_TYPE_LEGACY			0
+#define IRQ_TYPE_MSI			1
+#define IRQ_TYPE_MSIX			2
+#define MSI_NUMBER_SHIFT		5
+/* 12 bits for MSI number */
+#define COMMAND_READ                    BIT(17)
+#define COMMAND_WRITE                   BIT(18)
+#define COMMAND_COPY                    BIT(19)
 
 #define PCI_ENDPOINT_TEST_STATUS	0x8
 #define STATUS_READ_SUCCESS             BIT(0)
@@ -73,9 +78,9 @@ static DEFINE_IDA(pci_endpoint_test_ida);
 #define to_endpoint_test(priv) container_of((priv), struct pci_endpoint_test, \
 					    miscdev)
 
-static bool no_msi;
-module_param(no_msi, bool, 0444);
-MODULE_PARM_DESC(no_msi, "Disable MSI interrupt in pci_endpoint_test");
+static int irq_type = IRQ_TYPE_MSIX;
+module_param(irq_type, int, 0444);
+MODULE_PARM_DESC(irq_type, "IRQ mode selection in pci_endpoint_test (0 - Legacy, 1 - MSI, 2 - MSI-X)");
 
 enum pci_barno {
 	BAR_0,
@@ -103,7 +108,7 @@ struct pci_endpoint_test {
 struct pci_endpoint_test_data {
 	enum pci_barno test_reg_bar;
 	size_t alignment;
-	bool no_msi;
+	int irq_type;
 };
 
 static inline u32 pci_endpoint_test_readl(struct pci_endpoint_test *test,
@@ -177,10 +182,10 @@ static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
 
 static bool pci_endpoint_test_legacy_irq(struct pci_endpoint_test *test)
 {
-	u32 val;
+	u32 val = COMMAND_RAISE_LEGACY_IRQ;
 
-	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 COMMAND_RAISE_LEGACY_IRQ);
+	val |= (IRQ_TYPE_LEGACY << IRQ_TYPE_SHIFT);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, val);
 	val = wait_for_completion_timeout(&test->irq_raised,
 					  msecs_to_jiffies(1000));
 	if (!val)
@@ -192,12 +197,12 @@ static bool pci_endpoint_test_legacy_irq(struct pci_endpoint_test *test)
 static bool pci_endpoint_test_msi_irq(struct pci_endpoint_test *test,
 				      u8 msi_num)
 {
-	u32 val;
+	u32 val = COMMAND_RAISE_MSI_IRQ;
 	struct pci_dev *pdev = test->pdev;
 
-	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 msi_num << MSI_NUMBER_SHIFT |
-				 COMMAND_RAISE_MSI_IRQ);
+	val |= (msi_num << MSI_NUMBER_SHIFT);
+	val |= (IRQ_TYPE_MSI << IRQ_TYPE_SHIFT);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, val);
 	val = wait_for_completion_timeout(&test->irq_raised,
 					  msecs_to_jiffies(1000));
 	if (!val)
@@ -209,6 +214,26 @@ static bool pci_endpoint_test_msi_irq(struct pci_endpoint_test *test,
 	return false;
 }
 
+static bool pci_endpoint_test_msix_irq(struct pci_endpoint_test *test,
+				       u16 msix_num)
+{
+	u32 val = COMMAND_RAISE_MSIX_IRQ;
+	struct pci_dev *pdev = test->pdev;
+
+	val |= (msix_num << MSI_NUMBER_SHIFT);
+	val |= (IRQ_TYPE_MSIX << IRQ_TYPE_SHIFT);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, val);
+	val = wait_for_completion_timeout(&test->irq_raised,
+					  msecs_to_jiffies(1000));
+	if (!val)
+		return false;
+
+	if (test->last_irq - pdev->irq == msix_num - 1)
+		return true;
+
+	return false;
+}
+
 static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size)
 {
 	bool ret = false;
@@ -226,6 +251,7 @@ static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size)
 	size_t alignment = test->alignment;
 	u32 src_crc32;
 	u32 dst_crc32;
+	u32 val = COMMAND_COPY;
 
 	if (size > SIZE_MAX - alignment)
 		goto err;
@@ -281,8 +307,9 @@ static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size)
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_SIZE,
 				 size);
 
-	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 1 << MSI_NUMBER_SHIFT | COMMAND_COPY);
+	val |= (1 << MSI_NUMBER_SHIFT);
+	val |= (irq_type << IRQ_TYPE_SHIFT);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, val);
 
 	wait_for_completion(&test->irq_raised);
 
@@ -314,6 +341,7 @@ static bool pci_endpoint_test_write(struct pci_endpoint_test *test, size_t size)
 	size_t offset;
 	size_t alignment = test->alignment;
 	u32 crc32;
+	u32 val = COMMAND_READ;
 
 	if (size > SIZE_MAX - alignment)
 		goto err;
@@ -348,8 +376,9 @@ static bool pci_endpoint_test_write(struct pci_endpoint_test *test, size_t size)
 
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_SIZE, size);
 
-	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 1 << MSI_NUMBER_SHIFT | COMMAND_READ);
+	val |= (1 << MSI_NUMBER_SHIFT);
+	val |= (irq_type << IRQ_TYPE_SHIFT);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, val);
 
 	wait_for_completion(&test->irq_raised);
 
@@ -375,6 +404,7 @@ static bool pci_endpoint_test_read(struct pci_endpoint_test *test, size_t size)
 	size_t offset;
 	size_t alignment = test->alignment;
 	u32 crc32;
+	u32 val = COMMAND_WRITE;
 
 	if (size > SIZE_MAX - alignment)
 		goto err;
@@ -403,8 +433,9 @@ static bool pci_endpoint_test_read(struct pci_endpoint_test *test, size_t size)
 
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_SIZE, size);
 
-	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 1 << MSI_NUMBER_SHIFT | COMMAND_WRITE);
+	val |= (1 << MSI_NUMBER_SHIFT);
+	val |= (irq_type << IRQ_TYPE_SHIFT);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, val);
 
 	wait_for_completion(&test->irq_raised);
 
@@ -438,6 +469,9 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 	case PCITEST_MSI:
 		ret = pci_endpoint_test_msi_irq(test, arg);
 		break;
+	case PCITEST_MSIX:
+		ret = pci_endpoint_test_msix_irq(test, arg);
+		break;
 	case PCITEST_WRITE:
 		ret = pci_endpoint_test_write(test, arg);
 		break;
@@ -490,7 +524,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 	if (data) {
 		test_reg_bar = data->test_reg_bar;
 		test->alignment = data->alignment;
-		no_msi = data->no_msi;
+		irq_type = data->irq_type;
 	}
 
 	init_completion(&test->irq_raised);
@@ -510,11 +544,24 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 
 	pci_set_master(pdev);
 
-	if (!no_msi) {
+	switch (irq_type) {
+	case IRQ_TYPE_LEGACY:
+		break;
+	case IRQ_TYPE_MSI:
 		irq = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSI);
 		if (irq < 0)
 			dev_err(dev, "failed to get MSI interrupts\n");
 		test->num_irqs = irq;
+		break;
+	case IRQ_TYPE_MSIX:
+		irq = pci_alloc_irq_vectors(pdev, 1, 2048, PCI_IRQ_MSIX);
+		if (irq < 0)
+			dev_err(dev, "Failed to get MSI-X interrupts\n");
+		test->num_irqs = irq;
+		break;
+	default:
+		dev_err(dev, "Invalid IRQ type selected\n");
+		goto err_disable_msi;
 	}
 
 	err = devm_request_irq(dev, pdev->irq, pci_endpoint_test_irqhandler,
@@ -529,8 +576,9 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 				       pci_endpoint_test_irqhandler,
 				       IRQF_SHARED, DRV_MODULE_NAME, test);
 		if (err)
-			dev_err(dev, "failed to request IRQ %d for MSI %d\n",
-				pdev->irq + i, i + 1);
+			dev_err(dev, "Failed to request IRQ %d for MSI%s %d\n",
+				pdev->irq + i,
+				irq_type == IRQ_TYPE_MSIX ? "-X" : "", i + 1);
 	}
 
 	for (bar = BAR_0; bar <= BAR_5; bar++) {
@@ -596,6 +644,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 
 err_disable_msi:
 	pci_disable_msi(pdev);
+	pci_disable_msix(pdev);
 	pci_release_regions(pdev);
 
 err_disable_pdev:
@@ -627,6 +676,7 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev)
 	for (i = 0; i < test->num_irqs; i++)
 		devm_free_irq(&pdev->dev, pdev->irq + i, test);
 	pci_disable_msi(pdev);
+	pci_disable_msix(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 }
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 09/10] PCI: endpoint: functions/pci-epf-test: Replace lower into upper case characters
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Replaces lower case characters with upper case ones in comments and debug printks.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 5997c6e..e3d4af0 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -94,7 +94,7 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test)
 
 	src_addr = pci_epc_mem_alloc_addr(epc, &src_phys_addr, reg->size);
 	if (!src_addr) {
-		dev_err(dev, "failed to allocate source address\n");
+		dev_err(dev, "Failed to allocate source address\n");
 		reg->status = STATUS_SRC_ADDR_INVALID;
 		ret = -ENOMEM;
 		goto err;
@@ -103,14 +103,14 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test)
 	ret = pci_epc_map_addr(epc, epf->func_no, src_phys_addr, reg->src_addr,
 			       reg->size);
 	if (ret) {
-		dev_err(dev, "failed to map source address\n");
+		dev_err(dev, "Failed to map source address\n");
 		reg->status = STATUS_SRC_ADDR_INVALID;
 		goto err_src_addr;
 	}
 
 	dst_addr = pci_epc_mem_alloc_addr(epc, &dst_phys_addr, reg->size);
 	if (!dst_addr) {
-		dev_err(dev, "failed to allocate destination address\n");
+		dev_err(dev, "Failed to allocate destination address\n");
 		reg->status = STATUS_DST_ADDR_INVALID;
 		ret = -ENOMEM;
 		goto err_src_map_addr;
@@ -119,7 +119,7 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test)
 	ret = pci_epc_map_addr(epc, epf->func_no, dst_phys_addr, reg->dst_addr,
 			       reg->size);
 	if (ret) {
-		dev_err(dev, "failed to map destination address\n");
+		dev_err(dev, "Failed to map destination address\n");
 		reg->status = STATUS_DST_ADDR_INVALID;
 		goto err_dst_addr;
 	}
@@ -156,7 +156,7 @@ static int pci_epf_test_read(struct pci_epf_test *epf_test)
 
 	src_addr = pci_epc_mem_alloc_addr(epc, &phys_addr, reg->size);
 	if (!src_addr) {
-		dev_err(dev, "failed to allocate address\n");
+		dev_err(dev, "Failed to allocate address\n");
 		reg->status = STATUS_SRC_ADDR_INVALID;
 		ret = -ENOMEM;
 		goto err;
@@ -165,7 +165,7 @@ static int pci_epf_test_read(struct pci_epf_test *epf_test)
 	ret = pci_epc_map_addr(epc, epf->func_no, phys_addr, reg->src_addr,
 			       reg->size);
 	if (ret) {
-		dev_err(dev, "failed to map address\n");
+		dev_err(dev, "Failed to map address\n");
 		reg->status = STATUS_SRC_ADDR_INVALID;
 		goto err_addr;
 	}
@@ -208,7 +208,7 @@ static int pci_epf_test_write(struct pci_epf_test *epf_test)
 
 	dst_addr = pci_epc_mem_alloc_addr(epc, &phys_addr, reg->size);
 	if (!dst_addr) {
-		dev_err(dev, "failed to allocate address\n");
+		dev_err(dev, "Failed to allocate address\n");
 		reg->status = STATUS_DST_ADDR_INVALID;
 		ret = -ENOMEM;
 		goto err;
@@ -217,7 +217,7 @@ static int pci_epf_test_write(struct pci_epf_test *epf_test)
 	ret = pci_epc_map_addr(epc, epf->func_no, phys_addr, reg->dst_addr,
 			       reg->size);
 	if (ret) {
-		dev_err(dev, "failed to map address\n");
+		dev_err(dev, "Failed to map address\n");
 		reg->status = STATUS_DST_ADDR_INVALID;
 		goto err_addr;
 	}
@@ -422,7 +422,7 @@ static int pci_epf_test_set_bar(struct pci_epf *epf)
 		ret = pci_epc_set_bar(epc, epf->func_no, epf_bar);
 		if (ret) {
 			pci_epf_free_space(epf, epf_test->reg[bar], bar);
-			dev_err(dev, "failed to set BAR%d\n", bar);
+			dev_err(dev, "Failed to set BAR%d\n", bar);
 			if (bar == test_reg_bar)
 				return ret;
 		}
@@ -449,7 +449,7 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf)
 	base = pci_epf_alloc_space(epf, sizeof(struct pci_epf_test_reg),
 				   test_reg_bar);
 	if (!base) {
-		dev_err(dev, "failed to allocated register space\n");
+		dev_err(dev, "Failed to allocated register space\n");
 		return -ENOMEM;
 	}
 	epf_test->reg[test_reg_bar] = base;
@@ -459,7 +459,7 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf)
 			continue;
 		base = pci_epf_alloc_space(epf, bar_size[bar], bar);
 		if (!base)
-			dev_err(dev, "failed to allocate space for BAR%d\n",
+			dev_err(dev, "Failed to allocate space for BAR%d\n",
 				bar);
 		epf_test->reg[bar] = base;
 	}
@@ -480,7 +480,7 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 
 	ret = pci_epc_write_header(epc, epf->func_no, header);
 	if (ret) {
-		dev_err(dev, "configuration header write failed\n");
+		dev_err(dev, "Configuration header write failed\n");
 		return ret;
 	}
 
@@ -578,7 +578,7 @@ static int __init pci_epf_test_init(void)
 					     WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	ret = pci_epf_register_driver(&test_driver);
 	if (ret) {
-		pr_err("failed to register pci epf test driver --> %d\n", ret);
+		pr_err("Failed to register pci epf test driver --> %d\n", ret);
 		return ret;
 	}
 
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 08/10] PCI: endpoint: functions/pci-epf-test: Add MSI-X support
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Adds driver's MSI-X support.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 87 +++++++++++++++++++++------
 1 file changed, 69 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 63dca44..5997c6e 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -20,11 +20,18 @@
 
 #define COMMAND_RAISE_LEGACY_IRQ	BIT(0)
 #define COMMAND_RAISE_MSI_IRQ		BIT(1)
-#define MSI_NUMBER_SHIFT		2
+#define COMMAND_RAISE_MSIX_IRQ		BIT(2)
+#define IRQ_TYPE_SHIFT			3
+#define MSI_NUMBER_SHIFT		5
+#define IRQ_TYPE_MASK			(0x3 << IRQ_TYPE_SHIFT)
+#define IRQ_TYPE_LEGACY			0
+#define IRQ_TYPE_MSI			1
+#define IRQ_TYPE_MSIX			2
 #define MSI_NUMBER_MASK			(0x3f << MSI_NUMBER_SHIFT)
-#define COMMAND_READ			BIT(8)
-#define COMMAND_WRITE			BIT(9)
-#define COMMAND_COPY			BIT(10)
+#define MSIX_NUMBER_MASK		(0xfff << MSI_NUMBER_SHIFT)
+#define COMMAND_READ			BIT(17)
+#define COMMAND_WRITE			BIT(18)
+#define COMMAND_COPY			BIT(19)
 
 #define STATUS_READ_SUCCESS		BIT(0)
 #define STATUS_READ_FAIL		BIT(1)
@@ -244,31 +251,44 @@ static int pci_epf_test_write(struct pci_epf_test *epf_test)
 	return ret;
 }
 
-static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq)
+static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq_type,
+				   u16 irq)
 {
-	u8 msi_count;
 	struct pci_epf *epf = epf_test->epf;
+	struct device *dev = &epf->dev;
 	struct pci_epc *epc = epf->epc;
 	enum pci_barno test_reg_bar = epf_test->test_reg_bar;
 	struct pci_epf_test_reg *reg = epf_test->reg[test_reg_bar];
 
 	reg->status |= STATUS_IRQ_RAISED;
-	msi_count = pci_epc_get_msi(epc, epf->func_no);
-	if (irq > msi_count || msi_count <= 0)
-		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_LEGACY, 0);
-	else
+
+	switch (irq_type) {
+	case IRQ_TYPE_LEGACY:
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_LEGACY, irq);
+		break;
+	case IRQ_TYPE_MSI:
 		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSI, irq);
+		break;
+	case IRQ_TYPE_MSIX:
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX, irq);
+		break;
+	default:
+		dev_err(dev, "Failed to raise IRQ, unknown type\n");
+		break;
+	}
 }
 
 static void pci_epf_test_cmd_handler(struct work_struct *work)
 {
 	int ret;
-	u8 irq;
-	u8 msi_count;
+	u16 irq;
+	u8 irq_type;
+	u16 msi_count;
 	u32 command;
 	struct pci_epf_test *epf_test = container_of(work, struct pci_epf_test,
 						     cmd_handler.work);
 	struct pci_epf *epf = epf_test->epf;
+	struct device *dev = &epf->dev;
 	struct pci_epc *epc = epf->epc;
 	enum pci_barno test_reg_bar = epf_test->test_reg_bar;
 	struct pci_epf_test_reg *reg = epf_test->reg[test_reg_bar];
@@ -280,11 +300,25 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 	reg->command = 0;
 	reg->status = 0;
 
-	irq = (command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT;
+	irq_type = (command & IRQ_TYPE_MASK) >> IRQ_TYPE_SHIFT;
+	switch (irq_type) {
+	case IRQ_TYPE_LEGACY:
+		irq = 0;
+		break;
+	case IRQ_TYPE_MSI:
+		irq = (command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT;
+		break;
+	case IRQ_TYPE_MSIX:
+		irq = (command & MSIX_NUMBER_MASK) >> MSI_NUMBER_SHIFT;
+		break;
+	default:
+		dev_err(dev, "Failed to detect IRQ type\n");
+		goto reset_handler;
+	}
 
 	if (command & COMMAND_RAISE_LEGACY_IRQ) {
 		reg->status = STATUS_IRQ_RAISED;
-		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_LEGACY, 0);
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_LEGACY, irq);
 		goto reset_handler;
 	}
 
@@ -294,7 +328,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 			reg->status |= STATUS_WRITE_FAIL;
 		else
 			reg->status |= STATUS_WRITE_SUCCESS;
-		pci_epf_test_raise_irq(epf_test, irq);
+		pci_epf_test_raise_irq(epf_test, irq_type, irq);
 		goto reset_handler;
 	}
 
@@ -304,7 +338,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 			reg->status |= STATUS_READ_SUCCESS;
 		else
 			reg->status |= STATUS_READ_FAIL;
-		pci_epf_test_raise_irq(epf_test, irq);
+		pci_epf_test_raise_irq(epf_test, irq_type, irq);
 		goto reset_handler;
 	}
 
@@ -314,7 +348,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 			reg->status |= STATUS_COPY_SUCCESS;
 		else
 			reg->status |= STATUS_COPY_FAIL;
-		pci_epf_test_raise_irq(epf_test, irq);
+		pci_epf_test_raise_irq(epf_test, irq_type, irq);
 		goto reset_handler;
 	}
 
@@ -327,6 +361,15 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 		goto reset_handler;
 	}
 
+	if (command & COMMAND_RAISE_MSIX_IRQ) {
+		msi_count = pci_epc_get_msix(epc, epf->func_no);
+		if (irq > msi_count || msi_count <= 0)
+			goto reset_handler;
+		reg->status = STATUS_IRQ_RAISED;
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX, irq);
+		goto reset_handler;
+	}
+
 reset_handler:
 	queue_delayed_work(kpcitest_workqueue, &epf_test->cmd_handler,
 			   msecs_to_jiffies(1));
@@ -450,8 +493,16 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 		return ret;
 
 	ret = pci_epc_set_msi(epc, epf->func_no, epf->msi_interrupts);
-	if (ret)
+	if (ret) {
+		dev_err(dev, "MSI configuration failed\n");
 		return ret;
+	}
+
+	ret = pci_epc_set_msix(epc, epf->func_no, epf->msix_interrupts);
+	if (ret) {
+		dev_err(dev, "MSI-X configuration failed\n");
+		return ret;
+	}
 
 	if (!epf_test->linkup_notifier)
 		queue_work(kpcitest_workqueue, &epf_test->cmd_handler.work);
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 07/10] misc: pci_endpoint_test: Replace lower into upper case characters
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Replaces lower case characters with upper case ones in comments and debug printks.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 drivers/misc/pci_endpoint_test.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index a7d9354..7212a7d 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -259,7 +259,7 @@ static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size)
 	orig_src_addr = dma_alloc_coherent(dev, size + alignment,
 					   &orig_src_phys_addr, GFP_KERNEL);
 	if (!orig_src_addr) {
-		dev_err(dev, "failed to allocate source buffer\n");
+		dev_err(dev, "Failed to allocate source buffer\n");
 		ret = false;
 		goto err;
 	}
@@ -285,7 +285,7 @@ static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size)
 	orig_dst_addr = dma_alloc_coherent(dev, size + alignment,
 					   &orig_dst_phys_addr, GFP_KERNEL);
 	if (!orig_dst_addr) {
-		dev_err(dev, "failed to allocate destination address\n");
+		dev_err(dev, "Failed to allocate destination address\n");
 		ret = false;
 		goto err_orig_src_addr;
 	}
@@ -349,7 +349,7 @@ static bool pci_endpoint_test_write(struct pci_endpoint_test *test, size_t size)
 	orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr,
 				       GFP_KERNEL);
 	if (!orig_addr) {
-		dev_err(dev, "failed to allocate address\n");
+		dev_err(dev, "Failed to allocate address\n");
 		ret = false;
 		goto err;
 	}
@@ -412,7 +412,7 @@ static bool pci_endpoint_test_read(struct pci_endpoint_test *test, size_t size)
 	orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr,
 				       GFP_KERNEL);
 	if (!orig_addr) {
-		dev_err(dev, "failed to allocate destination address\n");
+		dev_err(dev, "Failed to allocate destination address\n");
 		ret = false;
 		goto err;
 	}
@@ -550,7 +550,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 	case IRQ_TYPE_MSI:
 		irq = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSI);
 		if (irq < 0)
-			dev_err(dev, "failed to get MSI interrupts\n");
+			dev_err(dev, "Failed to get MSI interrupts\n");
 		test->num_irqs = irq;
 		break;
 	case IRQ_TYPE_MSIX:
@@ -567,7 +567,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 	err = devm_request_irq(dev, pdev->irq, pci_endpoint_test_irqhandler,
 			       IRQF_SHARED, DRV_MODULE_NAME, test);
 	if (err) {
-		dev_err(dev, "failed to request IRQ %d\n", pdev->irq);
+		dev_err(dev, "Failed to request IRQ %d\n", pdev->irq);
 		goto err_disable_msi;
 	}
 
@@ -585,7 +585,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 		if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) {
 			base = pci_ioremap_bar(pdev, bar);
 			if (!base) {
-				dev_err(dev, "failed to read BAR%d\n", bar);
+				dev_err(dev, "Failed to read BAR%d\n", bar);
 				WARN_ON(bar == test_reg_bar);
 			}
 			test->bar[bar] = base;
@@ -605,7 +605,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 	id = ida_simple_get(&pci_endpoint_test_ida, 0, 0, GFP_KERNEL);
 	if (id < 0) {
 		err = id;
-		dev_err(dev, "unable to get id\n");
+		dev_err(dev, "Unable to get id\n");
 		goto err_iounmap;
 	}
 
@@ -621,7 +621,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 
 	err = misc_register(misc_device);
 	if (err) {
-		dev_err(dev, "failed to register device\n");
+		dev_err(dev, "Failed to register device\n");
 		goto err_kfree_name;
 	}
 
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC 03/10] PCI: endpoint: Add MSI-X interfaces
From: Gustavo Pimentel @ 2018-04-10 17:14 UTC (permalink / raw)
  To: bhelgaas, lorenzo.pieralisi, Joao.Pinto, jingoohan1, kishon,
	adouglas, niklas.cassel, jesper.nilsson
  Cc: linux-pci, linux-doc, linux-kernel, gustavo.pimentel
In-Reply-To: <cover.1523379766.git.gustavo.pimentel@synopsys.com>

Implements the generic method for calling the get/set callbacks.

Adds the PCI_EPC_IRQ_MSIX type.

Adds the MSI-X callback signatures to the ops structure.

Adds a configfs interface for altering the number of MSI-X entries.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
---
 drivers/pci/endpoint/pci-ep-cfs.c   | 24 ++++++++++++++++
 drivers/pci/endpoint/pci-epc-core.c | 57 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-epc.h             | 11 ++++++-
 include/linux/pci-epf.h             |  1 +
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 018ea34..d1288a0 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -286,6 +286,28 @@ static ssize_t pci_epf_msi_interrupts_show(struct config_item *item,
 		       to_pci_epf_group(item)->epf->msi_interrupts);
 }
 
+static ssize_t pci_epf_msix_interrupts_store(struct config_item *item,
+					     const char *page, size_t len)
+{
+	u16 val;
+	int ret;
+
+	ret = kstrtou16(page, 0, &val);
+	if (ret)
+		return ret;
+
+	to_pci_epf_group(item)->epf->msix_interrupts = val;
+
+	return len;
+}
+
+static ssize_t pci_epf_msix_interrupts_show(struct config_item *item,
+					    char *page)
+{
+	return sprintf(page, "%d\n",
+		       to_pci_epf_group(item)->epf->msix_interrupts);
+}
+
 PCI_EPF_HEADER_R(vendorid)
 PCI_EPF_HEADER_W_u16(vendorid)
 
@@ -327,6 +349,7 @@ CONFIGFS_ATTR(pci_epf_, subsys_vendor_id);
 CONFIGFS_ATTR(pci_epf_, subsys_id);
 CONFIGFS_ATTR(pci_epf_, interrupt_pin);
 CONFIGFS_ATTR(pci_epf_, msi_interrupts);
+CONFIGFS_ATTR(pci_epf_, msix_interrupts);
 
 static struct configfs_attribute *pci_epf_attrs[] = {
 	&pci_epf_attr_vendorid,
@@ -340,6 +363,7 @@ static struct configfs_attribute *pci_epf_attrs[] = {
 	&pci_epf_attr_subsys_id,
 	&pci_epf_attr_interrupt_pin,
 	&pci_epf_attr_msi_interrupts,
+	&pci_epf_attr_msix_interrupts,
 	NULL,
 };
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index b0ee427..294a383 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -218,6 +218,63 @@ int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts)
 EXPORT_SYMBOL_GPL(pci_epc_set_msi);
 
 /**
+ * pci_epc_get_msix() - get the number of MSI-X interrupt numbers allocated
+ * @epc: the EPC device to which MSI-X interrupts were requested
+ * @func_no: the endpoint function number in the EPC device
+ *
+ * Invoke to get the number of MSI-X interrupts allocated by the RC; 0 on error
+ */
+int pci_epc_get_msix(struct pci_epc *epc, u8 func_no)
+{
+	int interrupt;
+	unsigned long flags;
+
+	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions)
+		return 0;
+
+	if (!epc->ops->get_msix)
+		return 0;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	interrupt = epc->ops->get_msix(epc, func_no);
+	spin_unlock_irqrestore(&epc->lock, flags);
+	/* A negative callback result is reported to the caller as "none". */
+	if (interrupt < 0)
+		return 0;
+	/* Callback presumably reports N-1, as set_msix() is passed N-1. */
+	return interrupt + 1;
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_msix);
+
+/**
+ * pci_epc_set_msix() - set the number of MSI-X interrupt numbers required
+ * @epc: the EPC device on which MSI-X has to be configured
+ * @func_no: the endpoint function number in the EPC device
+ * @interrupts: number of MSI-X interrupts required by the EPF
+ *
+ * Invoke to set the required number of MSI-X interrupts.
+ */
+int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts)
+{
+	int ret;
+	unsigned long flags;
+
+	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions ||
+	    interrupts < 1 || interrupts > 2048)
+		return -EINVAL;
+
+	if (!epc->ops->set_msix)
+		return 0;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	ret = epc->ops->set_msix(epc, func_no, interrupts - 1);
+	spin_unlock_irqrestore(&epc->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epc_set_msix);
+
+/**
  * pci_epc_unmap_addr() - unmap CPU address from PCI address
  * @epc: the EPC device on which address is allocated
  * @func_no: the endpoint function number in the EPC device
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index af657ca..32e8961 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -17,6 +17,7 @@ enum pci_epc_irq_type {
 	PCI_EPC_IRQ_UNKNOWN,
 	PCI_EPC_IRQ_LEGACY,
 	PCI_EPC_IRQ_MSI,
+	PCI_EPC_IRQ_MSIX,
 };
 
 /**
@@ -30,6 +31,10 @@ enum pci_epc_irq_type {
  *	     capability register
  * @get_msi: ops to get the number of MSI interrupts allocated by the RC from
  *	     the MSI capability register
+ * @set_msix: ops to set the requested number of MSI-X interrupts in the
+ *	     MSI-X capability register
+ * @get_msix: ops to get the number of MSI-X interrupts allocated by the RC
+ *	     from the MSI-X capability register
  * @raise_irq: ops to raise a legacy or MSI interrupt
  * @start: ops to start the PCI link
  * @stop: ops to stop the PCI link
@@ -48,8 +53,10 @@ struct pci_epc_ops {
 			      phys_addr_t addr);
 	int	(*set_msi)(struct pci_epc *epc, u8 func_no, u8 interrupts);
 	int	(*get_msi)(struct pci_epc *epc, u8 func_no);
+	int	(*set_msix)(struct pci_epc *epc, u8 func_no, u16 interrupts);
+	int	(*get_msix)(struct pci_epc *epc, u8 func_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no,
-			     enum pci_epc_irq_type type, u8 interrupt_num);
+			     enum pci_epc_irq_type type, u16 interrupt_num);
 	int	(*start)(struct pci_epc *epc);
 	void	(*stop)(struct pci_epc *epc);
 	struct module *owner;
@@ -136,6 +143,8 @@ void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no,
 			phys_addr_t phys_addr);
 int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts);
 int pci_epc_get_msi(struct pci_epc *epc, u8 func_no);
+int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts);
+int pci_epc_get_msix(struct pci_epc *epc, u8 func_no);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 		      enum pci_epc_irq_type type, u8 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index f7d6f48..9bb1f31 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -119,6 +119,7 @@ struct pci_epf {
 	struct pci_epf_header	*header;
 	struct pci_epf_bar	bar[6];
 	u8			msi_interrupts;
+	u16			msix_interrupts;
 	u8			func_no;
 
 	struct pci_epc		*epc;
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox