Netdev List

Netdev List
 help / color / mirror / Atom feed

* Make CONFIG_NET and CONFIG_SECCOMP_FILTER independent of CONFIG_NET
From: Norbert Manthey @ 2018-06-06 13:52 UTC (permalink / raw)
  To: linux-kernel, kvm, netdev, x86

Dear all,

currently, KVM and SECCOMP rely on functionality of CONFIG_NET, and hence the
latter has to be enabled when building the kernel for the first two
configurations. However, there exists scenarios where the system does not need
networking, but KVM and SECCOMP filters. To reduce the kernel image size for
these scenarios, and to be able to drop active code, this commit series allows
to enable CONFIG_KVM and CONFIG_SECCOMP_FILTER without using CONFIG_NET.

The functionality that is required for seccomp filters is kept in the same
files and - after reordering the source code - is guarded with a single ifdef
per file.

I hope these changes are useful for other scenarios than the one I currently
face.

Best,
Norbert

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

^ permalink raw reply

* [less-CONFIG_NET 1/7] net: reorder filter code
From: Norbert Manthey @ 2018-06-06 13:53 UTC (permalink / raw)
  Cc: Norbert Manthey, Alexei Starovoitov, Daniel Borkmann,
	David S. Miller, netdev, linux-kernel
In-Reply-To: <1528293127-23825-1-git-send-email-nmanthey@amazon.de>

This commit reorders the definition of functions and struct in the
file filter.c, such that in the next step we can easily cut the file
into a commonly used part, as well as a part that is only required in
case CONFIG_NET is actually set.

This is part of the effort to split CONFIG_SECCOMP_FILTER and
CONFIG_NET.

Signed-off-by: Norbert Manthey <nmanthey@amazon.de>
---
 net/core/filter.c | 330 +++++++++++++++++++++++++++---------------------------
 1 file changed, 165 insertions(+), 165 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 201ff36b..0d980e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,58 +59,6 @@
 #include <net/tcp.h>
 #include <linux/bpf_trace.h>
 
-/**
- *	sk_filter_trim_cap - run a packet through a socket filter
- *	@sk: sock associated with &sk_buff
- *	@skb: buffer to filter
- *	@cap: limit on how short the eBPF program may trim the packet
- *
- * Run the eBPF program and then cut skb->data to correct size returned by
- * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
- * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
- * be accepted or -EPERM if the packet should be tossed.
- *
- */
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
-{
-	int err;
-	struct sk_filter *filter;
-
-	/*
-	 * If the skb was allocated from pfmemalloc reserves, only
-	 * allow SOCK_MEMALLOC sockets to use it as this socket is
-	 * helping free memory
-	 */
-	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
-		return -ENOMEM;
-	}
-	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
-	if (err)
-		return err;
-
-	err = security_sock_rcv_skb(sk, skb);
-	if (err)
-		return err;
-
-	rcu_read_lock();
-	filter = rcu_dereference(sk->sk_filter);
-	if (filter) {
-		struct sock *save_sk = skb->sk;
-		unsigned int pkt_len;
-
-		skb->sk = sk;
-		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
-		skb->sk = save_sk;
-		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
-	}
-	rcu_read_unlock();
-
-	return err;
-}
-EXPORT_SYMBOL(sk_filter_trim_cap);
-
 BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
 {
 	return skb_get_poff(skb);
@@ -165,12 +113,6 @@ BPF_CALL_0(__get_raw_cpu_id)
 	return raw_smp_processor_id();
 }
 
-static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
-	.func		= __get_raw_cpu_id,
-	.gpl_only	= false,
-	.ret_type	= RET_INTEGER,
-};
-
 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
 			      struct bpf_insn *insn_buf)
 {
@@ -954,71 +896,6 @@ static void __bpf_prog_release(struct bpf_prog *prog)
 	}
 }
 
-static void __sk_filter_release(struct sk_filter *fp)
-{
-	__bpf_prog_release(fp->prog);
-	kfree(fp);
-}
-
-/**
- * 	sk_filter_release_rcu - Release a socket filter by rcu_head
- *	@rcu: rcu_head that contains the sk_filter to free
- */
-static void sk_filter_release_rcu(struct rcu_head *rcu)
-{
-	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
-
-	__sk_filter_release(fp);
-}
-
-/**
- *	sk_filter_release - release a socket filter
- *	@fp: filter to remove
- *
- *	Remove a filter from a socket and release its resources.
- */
-static void sk_filter_release(struct sk_filter *fp)
-{
-	if (refcount_dec_and_test(&fp->refcnt))
-		call_rcu(&fp->rcu, sk_filter_release_rcu);
-}
-
-void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
-{
-	u32 filter_size = bpf_prog_size(fp->prog->len);
-
-	atomic_sub(filter_size, &sk->sk_omem_alloc);
-	sk_filter_release(fp);
-}
-
-/* try to charge the socket memory if there is space available
- * return true on success
- */
-static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
-{
-	u32 filter_size = bpf_prog_size(fp->prog->len);
-
-	/* same check as in sock_kmalloc() */
-	if (filter_size <= sysctl_optmem_max &&
-	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
-		atomic_add(filter_size, &sk->sk_omem_alloc);
-		return true;
-	}
-	return false;
-}
-
-bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
-{
-	if (!refcount_inc_not_zero(&fp->refcnt))
-		return false;
-
-	if (!__sk_filter_charge(sk, fp)) {
-		sk_filter_release(fp);
-		return false;
-	}
-	return true;
-}
-
 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 {
 	struct sock_filter *old_prog;
@@ -1127,19 +1004,22 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
 }
 
 /**
- *	bpf_prog_create - create an unattached filter
+ *	bpf_prog_create_from_user - create an unattached filter from user buffer
  *	@pfp: the unattached filter that is created
  *	@fprog: the filter program
+ *	@trans: post-classic verifier transformation handler
+ *	@save_orig: save classic BPF program
  *
- * Create a filter independent of any socket. We first run some
- * sanity checks on it to make sure it does not explode on us later.
- * If an error occurs or there is insufficient memory for the filter
- * a negative errno code is returned. On success the return is zero.
+ * This function effectively does the same as bpf_prog_create(), only
+ * that it builds up its insns buffer from user space provided buffer.
+ * It also allows for passing a bpf_aux_classic_check_t handler.
  */
-int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+			      bpf_aux_classic_check_t trans, bool save_orig)
 {
 	unsigned int fsize = bpf_classic_proglen(fprog);
 	struct bpf_prog *fp;
+	int err;
 
 	/* Make sure new filter is there and in the right amounts. */
 	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
@@ -1149,44 +1029,177 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
 	if (!fp)
 		return -ENOMEM;
 
-	memcpy(fp->insns, fprog->filter, fsize);
+	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+		__bpf_prog_free(fp);
+		return -EFAULT;
+	}
 
 	fp->len = fprog->len;
-	/* Since unattached filters are not copied back to user
-	 * space through sk_get_filter(), we do not need to hold
-	 * a copy here, and can spare us the work.
-	 */
 	fp->orig_prog = NULL;
 
+	if (save_orig) {
+		err = bpf_prog_store_orig_filter(fp, fprog);
+		if (err) {
+			__bpf_prog_free(fp);
+			return -ENOMEM;
+		}
+	}
+
 	/* bpf_prepare_filter() already takes care of freeing
 	 * memory in case something goes wrong.
 	 */
-	fp = bpf_prepare_filter(fp, NULL);
+	fp = bpf_prepare_filter(fp, trans);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
 	*pfp = fp;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_create);
+EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
+
+void bpf_prog_destroy(struct bpf_prog *fp)
+{
+	__bpf_prog_release(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_destroy);
 
 /**
- *	bpf_prog_create_from_user - create an unattached filter from user buffer
+ *	sk_filter_trim_cap - run a packet through a socket filter
+ *	@sk: sock associated with &sk_buff
+ *	@skb: buffer to filter
+ *	@cap: limit on how short the eBPF program may trim the packet
+ *
+ * Run the eBPF program and then cut skb->data to correct size returned by
+ * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
+ *
+ */
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+{
+	int err;
+	struct sk_filter *filter;
+
+	/*
+	 * If the skb was allocated from pfmemalloc reserves, only
+	 * allow SOCK_MEMALLOC sockets to use it as this socket is
+	 * helping free memory
+	 */
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+		return -ENOMEM;
+	}
+	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+	if (err)
+		return err;
+
+	err = security_sock_rcv_skb(sk, skb);
+	if (err)
+		return err;
+
+	rcu_read_lock();
+	filter = rcu_dereference(sk->sk_filter);
+	if (filter) {
+		struct sock *save_sk = skb->sk;
+		unsigned int pkt_len;
+
+		skb->sk = sk;
+		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+		skb->sk = save_sk;
+		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(sk_filter_trim_cap);
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+	.func		= __get_raw_cpu_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+static void __sk_filter_release(struct sk_filter *fp)
+{
+	__bpf_prog_release(fp->prog);
+	kfree(fp);
+}
+
+/**
+ * 	sk_filter_release_rcu - Release a socket filter by rcu_head
+ *	@rcu: rcu_head that contains the sk_filter to free
+ */
+static void sk_filter_release_rcu(struct rcu_head *rcu)
+{
+	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+
+	__sk_filter_release(fp);
+}
+
+/**
+ *	sk_filter_release - release a socket filter
+ *	@fp: filter to remove
+ *
+ *	Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+	if (refcount_dec_and_test(&fp->refcnt))
+		call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+	u32 filter_size = bpf_prog_size(fp->prog->len);
+
+	atomic_sub(filter_size, &sk->sk_omem_alloc);
+	sk_filter_release(fp);
+}
+
+/* try to charge the socket memory if there is space available
+ * return true on success
+ */
+static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+	u32 filter_size = bpf_prog_size(fp->prog->len);
+
+	/* same check as in sock_kmalloc() */
+	if (filter_size <= sysctl_optmem_max &&
+	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+		atomic_add(filter_size, &sk->sk_omem_alloc);
+		return true;
+	}
+	return false;
+}
+
+bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+	if (!refcount_inc_not_zero(&fp->refcnt))
+		return false;
+
+	if (!__sk_filter_charge(sk, fp)) {
+		sk_filter_release(fp);
+		return false;
+	}
+	return true;
+}
+
+/**
+ *	bpf_prog_create - create an unattached filter
  *	@pfp: the unattached filter that is created
  *	@fprog: the filter program
- *	@trans: post-classic verifier transformation handler
- *	@save_orig: save classic BPF program
  *
- * This function effectively does the same as bpf_prog_create(), only
- * that it builds up its insns buffer from user space provided buffer.
- * It also allows for passing a bpf_aux_classic_check_t handler.
+ * Create a filter independent of any socket. We first run some
+ * sanity checks on it to make sure it does not explode on us later.
+ * If an error occurs or there is insufficient memory for the filter
+ * a negative errno code is returned. On success the return is zero.
  */
-int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
-			      bpf_aux_classic_check_t trans, bool save_orig)
+int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
 {
 	unsigned int fsize = bpf_classic_proglen(fprog);
 	struct bpf_prog *fp;
-	int err;
 
 	/* Make sure new filter is there and in the right amounts. */
 	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
@@ -1196,39 +1209,26 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
 	if (!fp)
 		return -ENOMEM;
 
-	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
-		__bpf_prog_free(fp);
-		return -EFAULT;
-	}
+	memcpy(fp->insns, fprog->filter, fsize);
 
 	fp->len = fprog->len;
+	/* Since unattached filters are not copied back to user
+	 * space through sk_get_filter(), we do not need to hold
+	 * a copy here, and can spare us the work.
+	 */
 	fp->orig_prog = NULL;
 
-	if (save_orig) {
-		err = bpf_prog_store_orig_filter(fp, fprog);
-		if (err) {
-			__bpf_prog_free(fp);
-			return -ENOMEM;
-		}
-	}
-
 	/* bpf_prepare_filter() already takes care of freeing
 	 * memory in case something goes wrong.
 	 */
-	fp = bpf_prepare_filter(fp, trans);
+	fp = bpf_prepare_filter(fp, NULL);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
 	*pfp = fp;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
-
-void bpf_prog_destroy(struct bpf_prog *fp)
-{
-	__bpf_prog_release(fp);
-}
-EXPORT_SYMBOL_GPL(bpf_prog_destroy);
+EXPORT_SYMBOL_GPL(bpf_prog_create);
 
 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
 {
-- 
2.7.4

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

^ permalink raw reply related

* [less-CONFIG_NET 2/7] net: reorder flow_dissector
From: Norbert Manthey @ 2018-06-06 13:53 UTC (permalink / raw)
  Cc: Norbert Manthey, David S. Miller, Simon Horman, Andrew Lunn,
	Jakub Kicinski, Tom Herbert, John Crispin, Eric Dumazet,
	Sven Eckelmann, WANG Cong, David Ahern, Jon Maloy, netdev,
	linux-kernel
In-Reply-To: <1528293206-24298-1-git-send-email-nmanthey@amazon.de>

This commit reorders the definitions, such that in the next step we
can easily cut the file into a commonly used part, as well as a part
that is only required in case CONFIG_NET is used.

This is part of the effort to split CONFIG_SECCOMP_FILTER and
CONFIG_NET.

Signed-off-by: Norbert Manthey <nmanthey@amazon.de>
---
 net/core/flow_dissector.c | 206 +++++++++++++++++++++++-----------------------
 1 file changed, 103 insertions(+), 103 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09b..70e0679 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1085,36 +1085,6 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 	return (sizeof(*flow) - diff) / sizeof(u32);
 }
 
-__be32 flow_get_u32_src(const struct flow_keys *flow)
-{
-	switch (flow->control.addr_type) {
-	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
-		return flow->addrs.v4addrs.src;
-	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
-		return (__force __be32)ipv6_addr_hash(
-			&flow->addrs.v6addrs.src);
-	case FLOW_DISSECTOR_KEY_TIPC:
-		return flow->addrs.tipckey.key;
-	default:
-		return 0;
-	}
-}
-EXPORT_SYMBOL(flow_get_u32_src);
-
-__be32 flow_get_u32_dst(const struct flow_keys *flow)
-{
-	switch (flow->control.addr_type) {
-	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
-		return flow->addrs.v4addrs.dst;
-	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
-		return (__force __be32)ipv6_addr_hash(
-			&flow->addrs.v6addrs.dst);
-	default:
-		return 0;
-	}
-}
-EXPORT_SYMBOL(flow_get_u32_dst);
-
 static inline void __flow_hash_consistentify(struct flow_keys *keys)
 {
 	int addr_diff, i;
@@ -1162,49 +1132,6 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 	return hash;
 }
 
-u32 flow_hash_from_keys(struct flow_keys *keys)
-{
-	__flow_hash_secret_init();
-	return __flow_hash_from_keys(keys, hashrnd);
-}
-EXPORT_SYMBOL(flow_hash_from_keys);
-
-static inline u32 ___skb_get_hash(const struct sk_buff *skb,
-				  struct flow_keys *keys, u32 keyval)
-{
-	skb_flow_dissect_flow_keys(skb, keys,
-				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
-
-	return __flow_hash_from_keys(keys, keyval);
-}
-
-struct _flow_keys_digest_data {
-	__be16	n_proto;
-	u8	ip_proto;
-	u8	padding;
-	__be32	ports;
-	__be32	src;
-	__be32	dst;
-};
-
-void make_flow_keys_digest(struct flow_keys_digest *digest,
-			   const struct flow_keys *flow)
-{
-	struct _flow_keys_digest_data *data =
-	    (struct _flow_keys_digest_data *)digest;
-
-	BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
-
-	memset(digest, 0, sizeof(*digest));
-
-	data->n_proto = flow->basic.n_proto;
-	data->ip_proto = flow->basic.ip_proto;
-	data->ports = flow->ports.ports;
-	data->src = flow->addrs.v4addrs.src;
-	data->dst = flow->addrs.v4addrs.dst;
-}
-EXPORT_SYMBOL(make_flow_keys_digest);
-
 static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
 
 u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
@@ -1222,36 +1149,6 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
 
-/**
- * __skb_get_hash: calculate a flow hash
- * @skb: sk_buff to calculate flow hash from
- *
- * This function calculates a flow hash based on src/dst addresses
- * and src/dst port numbers.  Sets hash in skb to non-zero hash value
- * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
- * if hash is a canonical 4-tuple hash over transport ports.
- */
-void __skb_get_hash(struct sk_buff *skb)
-{
-	struct flow_keys keys;
-	u32 hash;
-
-	__flow_hash_secret_init();
-
-	hash = ___skb_get_hash(skb, &keys, hashrnd);
-
-	__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
-}
-EXPORT_SYMBOL(__skb_get_hash);
-
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
-{
-	struct flow_keys keys;
-
-	return ___skb_get_hash(skb, &keys, perturb);
-}
-EXPORT_SYMBOL(skb_get_hash_perturb);
-
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
@@ -1322,6 +1219,109 @@ u32 skb_get_poff(const struct sk_buff *skb)
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
 
+__be32 flow_get_u32_src(const struct flow_keys *flow)
+{
+	switch (flow->control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		return flow->addrs.v4addrs.src;
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		return (__force __be32)ipv6_addr_hash(
+			&flow->addrs.v6addrs.src);
+	case FLOW_DISSECTOR_KEY_TIPC:
+		return flow->addrs.tipckey.key;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(flow_get_u32_src);
+
+__be32 flow_get_u32_dst(const struct flow_keys *flow)
+{
+	switch (flow->control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		return flow->addrs.v4addrs.dst;
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		return (__force __be32)ipv6_addr_hash(
+			&flow->addrs.v6addrs.dst);
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(flow_get_u32_dst);
+
+u32 flow_hash_from_keys(struct flow_keys *keys)
+{
+	__flow_hash_secret_init();
+	return __flow_hash_from_keys(keys, hashrnd);
+}
+EXPORT_SYMBOL(flow_hash_from_keys);
+
+static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+				  struct flow_keys *keys, u32 keyval)
+{
+	skb_flow_dissect_flow_keys(skb, keys,
+				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+	return __flow_hash_from_keys(keys, keyval);
+}
+
+struct _flow_keys_digest_data {
+	__be16	n_proto;
+	u8	ip_proto;
+	u8	padding;
+	__be32	ports;
+	__be32	src;
+	__be32	dst;
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+			   const struct flow_keys *flow)
+{
+	struct _flow_keys_digest_data *data =
+	    (struct _flow_keys_digest_data *)digest;
+
+	BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+	memset(digest, 0, sizeof(*digest));
+
+	data->n_proto = flow->basic.n_proto;
+	data->ip_proto = flow->basic.ip_proto;
+	data->ports = flow->ports.ports;
+	data->src = flow->addrs.v4addrs.src;
+	data->dst = flow->addrs.v4addrs.dst;
+}
+EXPORT_SYMBOL(make_flow_keys_digest);
+
+/**
+ * __skb_get_hash: calculate a flow hash
+ * @skb: sk_buff to calculate flow hash from
+ *
+ * This function calculates a flow hash based on src/dst addresses
+ * and src/dst port numbers.  Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
+ */
+void __skb_get_hash(struct sk_buff *skb)
+{
+	struct flow_keys keys;
+	u32 hash;
+
+	__flow_hash_secret_init();
+
+	hash = ___skb_get_hash(skb, &keys, hashrnd);
+
+	__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
+}
+EXPORT_SYMBOL(__skb_get_hash);
+
+__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+{
+	struct flow_keys keys;
+
+	return ___skb_get_hash(skb, &keys, perturb);
+}
+EXPORT_SYMBOL(skb_get_hash_perturb);
+
 __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
-- 
2.7.4

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

^ permalink raw reply related

* [less-CONFIG_NET 3/7] seccomp: include net and bpf files
From: Norbert Manthey @ 2018-06-06 13:53 UTC (permalink / raw)
  Cc: Norbert Manthey, Alexei Starovoitov, Daniel Borkmann,
	David S. Miller, netdev, linux-kernel
In-Reply-To: <1528293206-24298-1-git-send-email-nmanthey@amazon.de>

When we want to use CONFIG_SECCOMP_FILTER without CONFIG_NET, we have
to ensure that the required files that would be pulled in via
CONFIG_NET are compiled when dropping CONFIG_NET.

Signed-off-by: Norbert Manthey <nmanthey@amazon.de>
---
 kernel/bpf/Makefile | 3 ++-
 net/Makefile        | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd2..5d13269 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,7 +4,8 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
-ifeq ($(CONFIG_NET),y)
+
+ifneq ($(filter y,$(CONFIG_NET) $(CONFIG_SECCOMP_FILTER)),)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
diff --git a/net/Makefile b/net/Makefile
index a6147c6..08f1875 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -11,6 +11,11 @@ obj-$(CONFIG_NET)		:= socket.o core/
 tmp-$(CONFIG_COMPAT) 		:= compat.o
 obj-$(CONFIG_NET)		+= $(tmp-y)
 
+ifneq ($(CONFIG_NET),y)
+obj-$(CONFIG_SECCOMP_FILTER)    += core/filter.o
+obj-$(CONFIG_SECCOMP_FILTER)    += core/flow_dissector.o
+endif
+
 # LLC has to be linked before the files in net/802/
 obj-$(CONFIG_LLC)		+= llc/
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/ bpf/
-- 
2.7.4

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

^ permalink raw reply related

* [less-CONFIG_NET 5/7] seccomp: cut off functions not required
From: Norbert Manthey @ 2018-06-06 13:53 UTC (permalink / raw)
  Cc: Norbert Manthey, Alexei Starovoitov, Daniel Borkmann,
	David S. Miller, John Crispin, Simon Horman, Jakub Kicinski,
	Tom Herbert, Eric Dumazet, Sven Eckelmann, WANG Cong, David Ahern,
	Jon Maloy, netdev, linux-kernel
In-Reply-To: <1528293206-24298-1-git-send-email-nmanthey@amazon.de>

When using CONFIG_SECCOMP_FILTER, not all functions of filter.c and
flow_dissector.c are required. To not pull in more dependencies,
guard the functions that are not required with CONFIG_NET defines.
This way, these functions are enabled in case the file is compiled
because of CONFIG_NET, but they are not present when the file is
compiled because of other configurations.

Signed-off-by: Norbert Manthey <nmanthey@amazon.de>
---
 net/core/filter.c         | 2 ++
 net/core/flow_dissector.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 0d980e9..4ddacb7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1063,6 +1063,7 @@ void bpf_prog_destroy(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
 
+#if defined(CONFIG_NET)
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
  *	@sk: sock associated with &sk_buff
@@ -5657,3 +5658,4 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 	release_sock(sk);
 	return ret;
 }
+#endif  // CONFIG_NET
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 70e0679..0903444 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1219,6 +1219,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
 
+#if defined(CONFIG_NET)
 __be32 flow_get_u32_src(const struct flow_keys *flow)
 {
 	switch (flow->control.addr_type) {
@@ -1340,6 +1341,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	return flow_hash_from_keys(keys);
 }
 EXPORT_SYMBOL(__get_hash_from_flowi6);
+#endif  // CONFIG_NET
 
 static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 	{
-- 
2.7.4

Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

^ permalink raw reply related

* [PATCH net] rxrpc: Fix terminal retransmission connection ID to include the channel
From: David Howells @ 2018-06-06 13:59 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel

When retransmitting the final ACK or ABORT packet for a call, the cid field
in the packet header is set to the connection's cid, but this is incorrect
as it also needs to include the channel number on that connection that the
call was made on.

Fix this by OR'ing in the channel number.

Note that this fixes the bug that:

	commit 1a025028d400b23477341aa7ec2ce55f8b39b554
	rxrpc: Fix handling of call quietly cancelled out on server

works around.  I'm not intending to revert that as it will help protect
against problems that might occur on the server.

Fixes: 3136ef49a14c ("rxrpc: Delay terminal ACK transmission on a client call")
Signed-off-by: David Howells <dhowells@redhat.com>
---

 net/rxrpc/conn_event.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 1350f1be8037..8229a52c2acd 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -70,7 +70,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	iov[2].iov_len	= sizeof(ack_info);
 
 	pkt.whdr.epoch		= htonl(conn->proto.epoch);
-	pkt.whdr.cid		= htonl(conn->proto.cid);
+	pkt.whdr.cid		= htonl(conn->proto.cid | channel);
 	pkt.whdr.callNumber	= htonl(call_id);
 	pkt.whdr.seq		= 0;
 	pkt.whdr.type		= chan->last_type;

^ permalink raw reply related

* Re: [PATCH net] ipv4: igmp: hold wakelock to prevent delayed reports
From: Tejaswi Tanikella @ 2018-06-06 14:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, f.fainelli, andrew, edumazet
In-Reply-To: <20180604.110640.184022152967598302.davem@davemloft.net>

On Mon, Jun 04, 2018 at 11:06:40AM -0400, David Miller wrote:
> From: Tejaswi Tanikella <tejaswit@codeaurora.org>
> Date: Fri, 1 Jun 2018 19:35:41 +0530
> 
> > On receiving a IGMPv2/v3 query, based on max_delay set in the header a
> > timer is started to send out a response after a random time within
> > max_delay. If the system then moves into suspend state, Report is
> > delayed until system wakes up.
> > 
> > In one reported scenario, on arm64 devices, max_delay was set to 10s,
> > Reports were consistantly delayed if the timer is scheduled after 5 plus
> > seconds.
> > 
> > Hold wakelock while starting the timer to prevent moving into suspend
> > state.
> > 
> > Signed-off-by: Tejaswi Tanikella <tejaswit@codeaurora.org>
> 
> As Florian stated, this won't be the only networking facility to hit
> a problem like this.  So, if we go down this route, we probably want
> to generically solve this.
> 
> But I have a deeper concern.
> 
> Do we really want every timer based querying mechanism to prevent a
> system from being able to suspend?
> 
> We get to the point where external entities can generate traffic which
> can prevent a remote system from entering suspend state.

Like you suggested maybe aquiring a wakelock will unnecessarily stop the
system from suspending.
Would using alarmtimer be better? It seems similar to a hrtimer but can
wake the system up from suspend.

This should allow the system to suspend till the timer expires. But might
cause repeated wake-ups and suspends.

^ permalink raw reply

* Re: [PATCH net-next 1/6] net: hippi: use pci_zalloc_consistent
From: kbuild test robot @ 2018-06-06 15:04 UTC (permalink / raw)
  To: YueHaibing
  Cc: kbuild-all, davem, netdev, linux-kernel, jcliburn, chris.snook,
	benve, jdmason, chessman, jes, rahul.verma, YueHaibing
In-Reply-To: <20180605122851.23912-2-yuehaibing@huawei.com>

[-- Attachment #1: Type: text/plain, Size: 5410 bytes --]

Hi YueHaibing,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/YueHaibing/use-pci_zalloc_consistent/20180606-205531
config: x86_64-allyesdebian (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/net/hippi/rrunner.c: In function 'rr_init_one':
>> drivers/net/hippi/rrunner.c:176:7: error: 'trrpriv' undeclared (first use in this function); did you mean 'rrpriv'?
     if (!trrpriv->evt_ring) {
          ^~~~~~~
          rrpriv
   drivers/net/hippi/rrunner.c:176:7: note: each undeclared identifier is reported only once for each function it appears in

vim +176 drivers/net/hippi/rrunner.c

    74	
    75	/*
    76	 * Implementation notes:
    77	 *
    78	 * The DMA engine only allows for DMA within physical 64KB chunks of
    79	 * memory. The current approach of the driver (and stack) is to use
    80	 * linear blocks of memory for the skbuffs. However, as the data block
    81	 * is always the first part of the skb and skbs are 2^n aligned so we
    82	 * are guarantted to get the whole block within one 64KB align 64KB
    83	 * chunk.
    84	 *
    85	 * On the long term, relying on being able to allocate 64KB linear
    86	 * chunks of memory is not feasible and the skb handling code and the
    87	 * stack will need to know about I/O vectors or something similar.
    88	 */
    89	
    90	static int rr_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
    91	{
    92		struct net_device *dev;
    93		static int version_disp;
    94		u8 pci_latency;
    95		struct rr_private *rrpriv;
    96		dma_addr_t ring_dma;
    97		int ret = -ENOMEM;
    98	
    99		dev = alloc_hippi_dev(sizeof(struct rr_private));
   100		if (!dev)
   101			goto out3;
   102	
   103		ret = pci_enable_device(pdev);
   104		if (ret) {
   105			ret = -ENODEV;
   106			goto out2;
   107		}
   108	
   109		rrpriv = netdev_priv(dev);
   110	
   111		SET_NETDEV_DEV(dev, &pdev->dev);
   112	
   113		ret = pci_request_regions(pdev, "rrunner");
   114		if (ret < 0)
   115			goto out;
   116	
   117		pci_set_drvdata(pdev, dev);
   118	
   119		rrpriv->pci_dev = pdev;
   120	
   121		spin_lock_init(&rrpriv->lock);
   122	
   123		dev->netdev_ops = &rr_netdev_ops;
   124	
   125		/* display version info if adapter is found */
   126		if (!version_disp) {
   127			/* set display flag to TRUE so that */
   128			/* we only display this string ONCE */
   129			version_disp = 1;
   130			printk(version);
   131		}
   132	
   133		pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &pci_latency);
   134		if (pci_latency <= 0x58){
   135			pci_latency = 0x58;
   136			pci_write_config_byte(pdev, PCI_LATENCY_TIMER, pci_latency);
   137		}
   138	
   139		pci_set_master(pdev);
   140	
   141		printk(KERN_INFO "%s: Essential RoadRunner serial HIPPI "
   142		       "at 0x%llx, irq %i, PCI latency %i\n", dev->name,
   143		       (unsigned long long)pci_resource_start(pdev, 0),
   144		       pdev->irq, pci_latency);
   145	
   146		/*
   147		 * Remap the MMIO regs into kernel space.
   148		 */
   149		rrpriv->regs = pci_iomap(pdev, 0, 0x1000);
   150		if (!rrpriv->regs) {
   151			printk(KERN_ERR "%s:  Unable to map I/O register, "
   152				"RoadRunner will be disabled.\n", dev->name);
   153			ret = -EIO;
   154			goto out;
   155		}
   156	
   157		rrpriv->tx_ring = pci_alloc_consistent(pdev, TX_TOTAL_SIZE, &ring_dma);
   158		rrpriv->tx_ring_dma = ring_dma;
   159	
   160		if (!rrpriv->tx_ring) {
   161			ret = -ENOMEM;
   162			goto out;
   163		}
   164	
   165		rrpriv->rx_ring = pci_alloc_consistent(pdev, RX_TOTAL_SIZE, &ring_dma);
   166		rrpriv->rx_ring_dma = ring_dma;
   167	
   168		if (!rrpriv->rx_ring) {
   169			ret = -ENOMEM;
   170			goto out;
   171		}
   172	
   173		rrpriv->evt_ring = pci_alloc_consistent(pdev, EVT_RING_SIZE, &ring_dma);
   174		rrpriv->evt_ring_dma = ring_dma;
   175	
 > 176		if (!trrpriv->evt_ring) {
   177			ret = -ENOMEM;
   178			goto out;
   179		}
   180	
   181		/*
   182		 * Don't access any register before this point!
   183		 */
   184	#ifdef __BIG_ENDIAN
   185		writel(readl(&rrpriv->regs->HostCtrl) | NO_SWAP,
   186			&rrpriv->regs->HostCtrl);
   187	#endif
   188		/*
   189		 * Need to add a case for little-endian 64-bit hosts here.
   190		 */
   191	
   192		rr_init(dev);
   193	
   194		ret = register_netdev(dev);
   195		if (ret)
   196			goto out;
   197		return 0;
   198	
   199	 out:
   200		if (rrpriv->evt_ring)
   201			pci_free_consistent(pdev, EVT_RING_SIZE, rrpriv->evt_ring,
   202					    rrpriv->evt_ring_dma);
   203		if (rrpriv->rx_ring)
   204			pci_free_consistent(pdev, RX_TOTAL_SIZE, rrpriv->rx_ring,
   205					    rrpriv->rx_ring_dma);
   206		if (rrpriv->tx_ring)
   207			pci_free_consistent(pdev, TX_TOTAL_SIZE, rrpriv->tx_ring,
   208					    rrpriv->tx_ring_dma);
   209		if (rrpriv->regs)
   210			pci_iounmap(pdev, rrpriv->regs);
   211		if (pdev)
   212			pci_release_regions(pdev);
   213	 out2:
   214		free_netdev(dev);
   215	 out3:
   216		return ret;
   217	}
   218	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 39429 bytes --]

^ permalink raw reply

* [PATCH net] net: in virtio_net_hdr only add VLAN_HLEN to csum_start if payload holds vlan
From: Willem de Bruijn @ 2018-06-06 15:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, rppt, herbert, anton.ivanov, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Tun, tap, virtio, packet and uml vector all use struct virtio_net_hdr
to communicate packet metadata to userspace.

For skbuffs with vlan, the first two return the packet as it may have
existed on the wire, inserting the VLAN tag in the user buffer.  Then
virtio_net_hdr.csum_start needs to be adjusted by VLAN_HLEN bytes.

Commit f09e2249c4f5 ("macvtap: restore vlan header on user read")
added this feature to macvtap. Commit 3ce9b20f1971 ("macvtap: Fix
csum_start when VLAN tags are present") then fixed up csum_start.

Virtio, packet and uml do not insert the vlan header in the user
buffer.

When introducing virtio_net_hdr_from_skb to deduplicate filling in
the virtio_net_hdr, the variant from macvtap which adds VLAN_HLEN was
applied uniformly, breaking csum offset for packets with vlan on
virtio and packet.

Make insertion of VLAN_HLEN optional. Convert the callers to pass it
when needed.

Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion")
Fixes: 1276f24eeef2 ("packet: use common code for virtio_net_hdr and skb GSO conversion")
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 arch/um/drivers/vector_transports.c |  3 ++-
 drivers/net/tap.c                   |  5 ++++-
 drivers/net/tun.c                   |  3 ++-
 drivers/net/virtio_net.c            |  3 ++-
 include/linux/virtio_net.h          | 11 ++++-------
 net/packet/af_packet.c              |  4 ++--
 6 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/um/drivers/vector_transports.c b/arch/um/drivers/vector_transports.c
index 9065047f844b..77e4ebc206ae 100644
--- a/arch/um/drivers/vector_transports.c
+++ b/arch/um/drivers/vector_transports.c
@@ -120,7 +120,8 @@ static int raw_form_header(uint8_t *header,
 		skb,
 		vheader,
 		virtio_legacy_is_little_endian(),
-		false
+		false,
+		0
 	);
 
 	return 0;
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 9b6cb780affe..f0f7cd977667 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -774,13 +774,16 @@ static ssize_t tap_put_user(struct tap_queue *q,
 	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
+		int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
 		struct virtio_net_hdr vnet_hdr;
+
 		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
 		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					    tap_is_little_endian(q), true))
+					    tap_is_little_endian(q), true,
+					    vlan_hlen))
 			BUG();
 
 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 23e9eb66197f..409eb8b74740 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2078,7 +2078,8 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &gso,
-					    tun_is_little_endian(tun), true)) {
+					    tun_is_little_endian(tun), true,
+					    vlan_hlen)) {
 			struct skb_shared_info *sinfo = skb_shinfo(skb);
 			pr_err("unexpected GSO type: "
 			       "0x%x, gso_size %d, hdr_len %d\n",
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 032e1ac10a30..8c7207535179 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1358,7 +1358,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		hdr = skb_vnet_hdr(skb);
 
 	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
-				    virtio_is_little_endian(vi->vdev), false))
+				    virtio_is_little_endian(vi->vdev), false,
+				    0))
 		BUG();
 
 	if (vi->mergeable_rx_bufs)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index f144216febc6..9397628a1967 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -58,7 +58,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
 					  bool little_endian,
-					  bool has_data_valid)
+					  bool has_data_valid,
+					  int vlan_hlen)
 {
 	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
@@ -83,12 +84,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		if (skb_vlan_tag_present(skb))
-			hdr->csum_start = __cpu_to_virtio16(little_endian,
-				skb_checksum_start_offset(skb) + VLAN_HLEN);
-		else
-			hdr->csum_start = __cpu_to_virtio16(little_endian,
-				skb_checksum_start_offset(skb));
+		hdr->csum_start = __cpu_to_virtio16(little_endian,
+			skb_checksum_start_offset(skb) + vlan_hlen);
 		hdr->csum_offset = __cpu_to_virtio16(little_endian,
 				skb->csum_offset);
 	} else if (has_data_valid &&
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index acb7b86574cd..9198997b39d1 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2037,7 +2037,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
 		return -EINVAL;
 	*len -= sizeof(vnet_hdr);
 
-	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
+	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
 		return -EINVAL;
 
 	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2304,7 +2304,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (do_vnet) {
 		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
 					    sizeof(struct virtio_net_hdr),
-					    vio_le(), true)) {
+					    vio_le(), true, 0)) {
 			spin_lock(&sk->sk_receive_queue.lock);
 			goto drop_n_account;
 		}
-- 
2.17.1.1185.g55be947832-goog

^ permalink raw reply related

* [PATCH v2 net-next] net/sched: add skbprio scheduler
From: Nishanth Devarajan @ 2018-06-06 15:40 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, jiri, davem; +Cc: netdev, doucette, michel

net/sched: add skbprio scheduler

Skbprio (SKB Priority Queue) is a queuing discipline that prioritizes IPv4
and IPv6 packets according to their skb->priority field. Although Skbprio can
be employed in any scenario in which a higher skb->priority field means a
higher priority packet, Skbprio was concieved as a solution for
denial-of-service defenses that need to route packets with different priorities.

v2:
*Use skb->priority field rather than DS field. Rename queueing discipline as
SKB Priority Queue (previously Gatekeeper Priority Queue).

*Queueing discipline is made classful to expose Skbprio's internal priority
queues.

Signed-off-by: Nishanth Devarajan <ndev2021@gmail.com>
Reviewed-by: Sachin Paryani <sachin.paryani@gmail.com>
Reviewed-by: Cody Doucette <doucette@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
---
 include/uapi/linux/pkt_sched.h |  15 ++
 net/sched/Kconfig              |  13 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_skbprio.c        | 347 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 376 insertions(+)
 create mode 100644 net/sched/sch_skbprio.c

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096..6fd07e8 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -124,6 +124,21 @@ struct tc_fifo_qopt {
 	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
 };
 
+/* SKBPRIO section */
+
+/*
+ * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
+ * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
+ * to map one to one the DS field of IPV4 and IPV6 headers.
+ * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
+ */
+
+#define SKBPRIO_MAX_PRIORITY 64
+
+struct tc_skbprio_qopt {
+	__u32	limit; 	    	/* Queue length in packets. */
+};
+
 /* PRIO section */
 
 #define TCQ_PRIO_BANDS	16
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169f..9ac4b53 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -240,6 +240,19 @@ config NET_SCH_MQPRIO
 
 	  If unsure, say N.
 
+config NET_SCH_SKBPRIO
+	tristate "SKB priority queue scheduler (SKBPRIO)"
+	help
+	  Say Y here if you want to use the SKB priority queue
+	  scheduler. This schedules packets according to skb->priority,
+	  which is useful for request packets in DoS mitigation systems such
+	  as Gatekeeper.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_skbprio.
+
+	  If unsure, say N.
+
 config NET_SCH_CHOKE
 	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d38..a4d8893 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
+obj-$(CONFIG_NET_SCH_SKBPRIO)	+= sch_skbprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 0000000..f73ad62
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,347 @@
+/*
+ * net/sched/sch_skbprio.c  SKB Priority Queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nishanth Devarajan, <ndev2021@gmail.com>
+ *		Cody Doucette, <doucette@bu.edu>
+ *	        original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+
+/*	  SKB Priority Queue
+ *	=================================
+ *
+ * This qdisc schedules a packet according to skb->priority, where a higher
+ * value places the packet closer to the exit of the queue. When the queue is
+ * full, the lowest priority packet in the queue is dropped to make room for
+ * the packet to be added if it has higher priority. If the packet to be added
+ * has lower priority than all packets in the queue, it is dropped.
+ *
+ * Without the SKB priority queue, queue length limits must be imposed
+ * for individual queues, and there is no easy way to enforce a global queue
+ * length limit across all priorities. With the SKBprio queue, a global
+ * queue length limit can be enforced while not restricting the queue lengths
+ * of individual priorities.
+ *
+ * This is especially useful for a denial-of-service defense system like
+ * Gatekeeper, which prioritizes packets in flows that demonstrate expected
+ * behavior of legitimate users. The queue is flexible to allow any number
+ * of packets of any priority up to the global limit of the scheduler
+ * without risking resource overconsumption by a flood of low priority packets.
+ *
+ * The Gatekeeper codebase is found here:
+ *
+ *		https://github.com/AltraMayor/gatekeeper
+ */
+
+struct skbprio_sched_data {
+	/* Parameters. */
+	u32 max_limit;
+
+	/* Queue state. */
+	struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
+	struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
+	u16 highest_prio;
+	u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* SKB queue is empty, return 0 (default highest priority setting). */
+	return 0;
+}
+
+static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
+	 * (default lowest priority setting).
+	 */
+	return SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *qdisc;
+	struct sk_buff_head *lp_qdisc;
+	struct sk_buff *to_drop;
+	const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
+	u16 prio, lp;
+
+	/* Obtain the priority of @skb. */
+	prio = min(skb->priority, max_priority);
+
+	qdisc = &q->qdiscs[prio];
+	if (sch->q.qlen < q->max_limit) {
+		__skb_queue_tail(qdisc, skb);
+		qdisc_qstats_backlog_inc(sch, skb);
+		q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+		/* Check to update highest and lowest priorities. */
+		if (prio > q->highest_prio)
+			q->highest_prio = prio;
+
+		if (prio < q->lowest_prio)
+			q->lowest_prio = prio;
+
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* If this packet has the lowest priority, drop it. */
+	lp = q->lowest_prio;
+	if (prio <= lp) {
+		q->qstats[prio].drops++;
+		return qdisc_drop(skb, sch, to_free);
+	}
+
+	/* Drop the packet at the tail of the lowest priority qdisc. */
+	lp_qdisc = &q->qdiscs[lp];
+	to_drop = __skb_dequeue_tail(lp_qdisc);
+	BUG_ON(!to_drop);
+	qdisc_qstats_backlog_dec(sch, to_drop);
+	qdisc_drop(to_drop, sch, to_free);
+
+	q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
+	q->qstats[lp].drops++;
+
+	__skb_queue_tail(qdisc, skb);
+	qdisc_qstats_backlog_inc(sch, skb);
+	q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+	/* Check to update highest and lowest priorities. */
+	if (skb_queue_empty(lp_qdisc)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->lowest_prio = prio;
+			q->highest_prio = prio;
+		} else {
+			q->lowest_prio = calc_new_low_prio(q);
+		}
+	}
+
+	if (prio > q->highest_prio)
+		q->highest_prio = prio;
+
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+	struct sk_buff *skb = __skb_dequeue(hpq);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	sch->q.qlen--;
+	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_bstats_update(sch, skb);
+
+	q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
+
+	/* Update highest priority field. */
+	if (skb_queue_empty(hpq)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->highest_prio = 0;
+			q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+		} else {
+			q->highest_prio = calc_new_high_prio(q);
+		}
+	}
+	return skb;
+}
+
+static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	struct tc_skbprio_qopt *ctl = nla_data(opt);
+	const unsigned int min_limit = 1;
+
+	if (ctl->limit == (typeof(ctl->limit))-1)
+		q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
+	else if (ctl->limit < min_limit ||
+		ctl->limit > qdisc_dev(sch)->tx_queue_len)
+		return -EINVAL;
+	else
+		q->max_limit = ctl->limit;
+
+	return 0;
+}
+
+static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+	const unsigned int min_limit = 1;
+
+	/* Initialise all queues, one for each possible priority. */
+	for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_head_init(&q->qdiscs[prio]);
+
+	memset(&q->qstats, 0, sizeof(q->qstats));
+	q->highest_prio = 0;
+	q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+	if (!opt) {
+		q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
+		return 0;
+	}
+	return skbprio_change(sch, opt, extack);
+}
+
+static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	struct tc_skbprio_qopt opt;
+
+	opt.limit = q->max_limit;
+
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		return -1;
+
+	return skb->len;
+}
+
+static void skbprio_reset(struct Qdisc *sch)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+
+	memset(&q->qstats, 0, sizeof(q->qstats));
+	q->highest_prio = 0;
+	q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static void skbprio_destroy(struct Qdisc *sch)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+}
+
+static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
+		q->qstats[cl - 1].qlen) < 0)
+		return -1;
+	return 0;
+}
+
+static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
+		if (arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, i + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops skbprio_class_ops = {
+	.leaf		=	skbprio_leaf,
+	.find		=	skbprio_find,
+	.dump		=	skbprio_dump_class,
+	.dump_stats	=	skbprio_dump_class_stats,
+	.walk		=	skbprio_walk,
+};
+
+static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
+	.cl_ops		=	&skbprio_class_ops,
+	.id		=	"skbprio",
+	.priv_size	=	sizeof(struct skbprio_sched_data),
+	.enqueue	=	skbprio_enqueue,
+	.dequeue	=	skbprio_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	skbprio_init,
+	.reset		=	skbprio_reset,
+	.change		=	skbprio_change,
+	.dump		=	skbprio_dump,
+	.destroy	=	skbprio_destroy,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init skbprio_module_init(void)
+{
+	return register_qdisc(&skbprio_qdisc_ops);
+}
+
+static void __exit skbprio_module_exit(void)
+{
+	unregister_qdisc(&skbprio_qdisc_ops);
+}
+
+module_init(skbprio_module_init)
+module_exit(skbprio_module_exit)
+
+MODULE_LICENSE("GPL");
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net] net: sched: cls: Fix offloading when ingress dev is vxlan
From: Jakub Kicinski @ 2018-06-06 15:46 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, Paul Blakey, Jiri Pirko, Cong Wang,
	Jamal Hadi Salim, Linux Netdev List, Yevgeny Kliteynik, Roi Dayan,
	Shahar Klein, Mark Bloch, Or Gerlitz
In-Reply-To: <CAJ3xEMh5TffWyD1V1w_EWGgnQuhA26O1BX8kZrdkHu3bB9+7-g@mail.gmail.com>

On Wed, 6 Jun 2018 08:15:27 +0300, Or Gerlitz wrote:
> On Wed, Jun 6, 2018 at 12:27 AM, Jakub Kicinski <kubakici@wp.pl> wrote:
> > On Tue, 05 Jun 2018 15:06:40 -0400 (EDT), David Miller wrote:  
> >> From: Jakub Kicinski <kubakici@wp.pl>
> >> Date: Tue, 5 Jun 2018 11:57:47 -0700
> >>  
> >> > Do we still care about correctness and not breaking backward
> >> > compatibility?  
> >>
> >> Jakub let me know if you want me to revert this change.  
> >
> > Yes, I think this patch introduces a regression when block is shared
> > between offload capable and in-capable device, therefore it should be
> > reverted.  Shared blocks went through a number of review cycles to
> > ensure such cases are handled correctly.
> >
> >
> > Longer story about the actual issue which is never explained in the
> > commit message is this: in kernels 4.10 - 4.14 skip_sw flag was
> > supported on tunnels in cls_flower only:
> >
> > static int fl_hw_replace_filter(struct tcf_proto *tp,
> > [...]
> >         if (!tc_can_offload(dev)) {
> >                 if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) ||
> >                     (f->hw_dev && !tc_can_offload(f->hw_dev))) {
> >                         f->hw_dev = dev;
> >                         return tc_skip_sw(f->flags) ? -EINVAL : 0;
> >                 }
> >                 dev = f->hw_dev;
> >                 cls_flower.egress_dev = true;
> >         } else {
> >                 f->hw_dev = dev;
> >         }
> >
> >
> > In 4.15 - 4.17 with addition of shared blocks egdev mechanism was
> > promoted to a generic TC thing supported on all filters but it no
> > longer works with skip_sw.
> >
> > I'd argue skip_sw is incorrect for tunnels, because the rule will only
> > apply to traffic ingressing on ASIC ports, not all traffic which hits
> > the tunnel netdev.  
> 
> This argument makes sense, however, skip_sw for tunnel decap rules
>  **is** allowed since 4.10 and we have some sort of regression here (turns
> out that before and after the patch..)

As I said it was allowed in 4 releases, which was a mistake, in last 3
it wasn't.  I understand your use case, but the semantics of skip_sw
are not preserved here so we should find a different solution.

> > Therefore we should keep the 4.15 - 4.17 behaviour.
> > But that's a side note, I don't think we should be breaking offload on
> > shared blocks whether we decide to support skip_sw on tunnels or not.  
> 
> skip_sw on tunnels was there before shared-block, newer features should
> take care not to break existing ones.

Oh, I agree we shouldn't break existing use cases so please don't break
the use case I mentioned above.  I want to set up shared block between
a LAG and its members.  Now since the bond will not be offload-capable
TC will not even make an attempt to offload to members.

I'm gonna test that my reading of the code is correct and send a revert
later today, sorry.

^ permalink raw reply

* Re: [PATCH net] net: sched: cls: Fix offloading when ingress dev is vxlan
From: Jakub Kicinski @ 2018-06-06 15:50 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Paul Blakey, Jiri Pirko, Cong Wang, Jamal Hadi Salim,
	David Miller, Linux Netdev List, Yevgeny Kliteynik, Roi Dayan,
	Shahar Klein, Mark Bloch, Or Gerlitz
In-Reply-To: <CAJ3xEMg0QJ2MdcXgWbe1J+7KssN9b36X_2+nBFaGAKjkcy2xPg@mail.gmail.com>

On Wed, 6 Jun 2018 08:12:20 +0300, Or Gerlitz wrote:
> On Tue, Jun 5, 2018 at 9:59 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> > On Tue, 5 Jun 2018 11:57:47 -0700, Jakub Kicinski wrote:  
> >> On Tue,  5 Jun 2018 11:04:03 +0300, Paul Blakey wrote:  
> >> > When using a vxlan device as the ingress dev, we count it as a
> >> > "no offload dev", so when such a rule comes and err stop is true,
> >> > we fail early and don't try the egdev route which can offload it
> >> > through the egress device.
> >> >
> >> > Fix that by not calling the block offload if one of the devices
> >> > attached to it is not offload capable, but make sure egress on such case
> >> > is capable instead.
> >> >
> >> > Fixes: caa7260156eb ("net: sched: keep track of offloaded filters [..]")
> >> > Reviewed-by: Roi Dayan <roid@mellanox.com>
> >> > Acked-by: Jiri Pirko <jiri@mellanox.com>
> >> > Signed-off-by: Paul Blakey <paulb@mellanox.com>  
> >>
> >> Very poor commit message.  What you're doing is re-enabling skip_sw
> >> filters on tunnel devices which semantically make no sense and
> >> shouldn't have been allowed in the first place.
> >>
> >> This will breaks block sharing between tunnels and HW netdevs (because
> >> you skip the tcf_block_cb_call() completely).  The entire egdev idea
> >> remains badly broken accepting filters like this:
> >>
> >> # tc filter add dev vxlan0 ingress \
> >>       matchall action skip_sw \
> >>               mirred egress redirect dev lo \
> >>               mirred egress redirect dev sw1np0  
> >
> > For above we probably need something like this (untested):
> >
> > diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> > index 3f4cf930f809..71e5eebec31a 100644
> > --- a/net/sched/act_api.c
> > +++ b/net/sched/act_api.c
> > @@ -1511,7 +1511,7 @@ int tc_setup_cb_egdev_call(const struct net_device *dev,
> >         struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
> >
> >         if (!egdev)
> > -               return 0;
> > +               return err_stop ? -EOPNOTSUPP : 0;
> >         return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop);
> >  }
> >  EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call);  
> 
> Jakub,
> 
> We will test it out and let you know

That's probably insufficient, because the second action should be
skipped completely.  But an improvement nonetheless :)

> > But the correct fix is to remove egdev crutch completely IMO.  
> 
> Not against it, sometimes designs should change and be replaced
> with better ones, happens

Cool, I'm not sure when we'll get to it, but perhaps we can go over 
the ideas I presented at netconf on switchdev call next week (if it's
still active)?

^ permalink raw reply

* Re: [PATCH net] net: sched: cls: Fix offloading when ingress dev is vxlan
From: Jakub Kicinski @ 2018-06-06 15:57 UTC (permalink / raw)
  To: Paul Blakey
  Cc: David Miller, jiri, xiyou.wangcong, jhs, netdev, kliteyn, roid,
	shahark, markb, ogerlitz
In-Reply-To: <fae451a1-a925-b4c9-ef26-29e6bf58c88a@mellanox.com>

On Wed, 6 Jun 2018 10:59:30 +0300, Paul Blakey wrote:
> Maybe we can apply my patch logic of still trying the egress dev if the 
> block has a single device, and not shared. Is that ok with you?

I don't remember that patch but sounds pretty bad.

> You're patch seems good as an add on, but the egress hw device (sw1np0) 
> would go over the tc actions and see if it can offload such rule (output 
> to software lo device) and would fail anyway.

Ack, I'm not trying to solve the skip_sw problem.  It's a hard one.

^ permalink raw reply

* Re: BUG: 4.14.11 unable to handle kernel NULL pointer dereference in xfrm_lookup
From: Tobias Hommel @ 2018-06-06 16:03 UTC (permalink / raw)
  To: Kristian Evensen; +Cc: Steffen Klassert, Markus Berner, Network Development
In-Reply-To: <CAKfDRXhHMg5HKX=T1BKs5mx+BL-L8VyzqApS+8E52mDipZzU6Q@mail.gmail.com>

Hi,

On Wed, Jun 06, 2018 at 12:41:53PM +0200, Kristian Evensen wrote:
> Hi,
> 
> I am experiencing the same issue on a PC Engines APU2 running kernel
> 4.14.34, both with and without hardware encryption. With hw.
> encryption, the crash occurs within 2-4 hours. Without hw. encryption,
> it takes 7-8 hours. My setup is nothing crazy, between 7 and 20
> tunnels with heavy RX/TX.
> 
> On Fri, Feb 2, 2018 at 9:09 AM, Steffen Klassert
> <steffen.klassert@secunet.com> wrote:
> > Thanks for offering help, but I fear we have to wait until
> > Tobias has bisected it.
> 
> Has any progress been made here? I would like to try a bisect myself,

Sorry no progress until now, I currently do not get time to have a deeper look
into that. We're back to 4.1.6 right now.

> but my board is running OpenWRT, which makes bisecting hard. The only
> observation I have so far, is that I did not see the issue with kernel
> 4.9.
> 
> BR,
> Kristian

Tobi

^ permalink raw reply

* [PATCH bpf] tools/bpf: fix selftest get_cgroup_id_user
From: Yonghong Song @ 2018-06-06 16:12 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

Commit f269099a7e7a ("tools/bpf: add a selftest for
bpf_get_current_cgroup_id() helper") added a test
for bpf_get_current_cgroup_id() helper. The bpf program
is attached to tracepoint syscalls/sys_enter_nanosleep
and will record the cgroup id if the tracepoint is hit.
The test program creates a cgroup and attachs itself to
this cgroup and expects that the test program process
cgroup id is the same as the cgroup_id retrieved
by the bpf program.

In a light system where no other processes called
nanosleep syscall, the test case can pass.
In a busy system where many different processes can hit
syscalls/sys_enter_nanosleep tracepoint, the cgroup id
recorded by bpf program may not match the test program
process cgroup_id.

This patch fixed an issue by communicating the test program
pid to bpf program. The bpf program only records
cgroup id if the current task pid is the same as
passed-in pid. This ensures that the recorded cgroup_id
is for the cgroup within which the test program resides.

Fixes: f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper")
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/get_cgroup_id_kern.c | 14 +++++++++++++-
 tools/testing/selftests/bpf/get_cgroup_id_user.c | 12 ++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
index 2cf8cb2..014dba1 100644
--- a/tools/testing/selftests/bpf/get_cgroup_id_kern.c
+++ b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
@@ -11,12 +11,24 @@ struct bpf_map_def SEC("maps") cg_ids = {
 	.max_entries = 1,
 };
 
+struct bpf_map_def SEC("maps") pidmap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
 SEC("tracepoint/syscalls/sys_enter_nanosleep")
 int trace(void *ctx)
 {
-	__u32 key = 0;
+	__u32 pid = bpf_get_current_pid_tgid();
+	__u32 key = 0, *expected_pid;
 	__u64 *val;
 
+	expected_pid = bpf_map_lookup_elem(&pidmap, &key);
+	if (!expected_pid || *expected_pid != pid)
+		return 0;
+
 	val = bpf_map_lookup_elem(&cg_ids, &key);
 	if (val)
 		*val = bpf_get_current_cgroup_id();
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
index ea19a42..e8da7b3 100644
--- a/tools/testing/selftests/bpf/get_cgroup_id_user.c
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -50,13 +50,13 @@ int main(int argc, char **argv)
 	const char *probe_name = "syscalls/sys_enter_nanosleep";
 	const char *file = "get_cgroup_id_kern.o";
 	int err, bytes, efd, prog_fd, pmu_fd;
+	int cgroup_fd, cgidmap_fd, pidmap_fd;
 	struct perf_event_attr attr = {};
-	int cgroup_fd, cgidmap_fd;
 	struct bpf_object *obj;
 	__u64 kcgid = 0, ucgid;
+	__u32 key = 0, pid;
 	int exit_code = 1;
 	char buf[256];
-	__u32 key = 0;
 
 	err = setup_cgroup_environment();
 	if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err,
@@ -81,6 +81,14 @@ int main(int argc, char **argv)
 		  cgidmap_fd, errno))
 		goto close_prog;
 
+	pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
+	if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  pidmap_fd, errno))
+		goto close_prog;
+
+	pid = getpid();
+	bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
+
 	snprintf(buf, sizeof(buf),
 		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
 	efd = open(buf, O_RDONLY, 0);
-- 
2.9.5

^ permalink raw reply related

* BUG: jumbo frames broken after commit xen-netfront: Fix race between device setup and open
From: Andrew Jeddeloh @ 2018-06-06 16:29 UTC (permalink / raw)
  To: netdev

Hi all,

The patch "xen-netfront: Fix race between device setup and open" seems
to have introduced a regression preventing setting MTU's larger than
1500. We experienced this downstream with Container Linux and
confirmed with Fedora 28 as well.

It's commit f599c64fdf7d9c108e8717fb04bc41c680120da4 in the linux-stable tree.
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?id=f599c64fdf7d9c108e8717fb04bc41c680120da4

Downstream bugs:
https://github.com/coreos/bugs/issues/2443
https://bugzilla.redhat.com/show_bug.cgi?id=1584216

We've confirmed that reverting that commit fixes the bug. It be
reliably can be reproduced on AWS with t2.micro instances (and
presumably other systems using the same driver). Both using
systemd-networkd to set the mtu and manual ip link commands cause the
link to repsond with "Invalid argument" when trying to set the MTU >
1500.

I'm not sure why that commit introduced the regression.

Please let me know if there's any more information that would be helpful.

- Andrew

^ permalink raw reply

* Re: [less-CONFIG_NET 1/7] net: reorder filter code
From: Willem de Bruijn @ 2018-06-06 16:33 UTC (permalink / raw)
  To: Norbert Manthey
  Cc: Alexei Starovoitov, Daniel Borkmann, David S. Miller,
	Network Development, LKML
In-Reply-To: <1528293206-24298-1-git-send-email-nmanthey@amazon.de>

On Wed, Jun 6, 2018 at 9:53 AM, Norbert Manthey <nmanthey@amazon.de> wrote:
> This commit reorders the definition of functions and struct in the
> file filter.c, such that in the next step we can easily cut the file
> into a commonly used part, as well as a part that is only required in
> case CONFIG_NET is actually set.
>
> This is part of the effort to split CONFIG_SECCOMP_FILTER and
> CONFIG_NET.
>
> Signed-off-by: Norbert Manthey <nmanthey@amazon.de>

Reordering patches like this and the flow-dissector patch in this
series make cherry-picking fixes back to stable branches and
and following code history with git blame harder.

^ permalink raw reply

* [PATCH net-next] strparser: Add __strp_unpause and use it in ktls.
From: Doron Roberts-Kedes @ 2018-06-06 16:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: Dave Watson, Tom Herbert, Vakul Garg, netdev, Doron Roberts-Kedes

strp_unpause queues strp_work in order to parse any messages that
arrived while the strparser was paused. However, the process invoking
strp_unpause could eagerly parse a buffered message itself if it held
the sock lock.

__strp_unpause is an alternative to strp_pause that avoids the scheduling
overhead that results when a receiving thread unpauses the strparser
and waits for the next message to be delivered by the workqueue thread.

This patch more than doubled the IOPS achieved in a benchmark of NBD
traffic encrypted using ktls.

Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
---
 include/net/strparser.h   |  2 ++
 net/strparser/strparser.c | 13 +++++++++++++
 net/tls/tls_sw.c          |  2 +-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/net/strparser.h b/include/net/strparser.h
index d96b59f..f177c87 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -90,6 +90,8 @@ static inline void strp_pause(struct strparser *strp)
 
 /* May be called without holding lock for attached socket */
 void strp_unpause(struct strparser *strp);
+/* Must be called with process lock held (lock_sock) */
+void __strp_unpause(struct strparser *strp);
 
 static inline void save_strp_stats(struct strparser *strp,
 				   struct strp_aggr_stats *agg_stats)
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 092bebc..1a96951 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -512,6 +512,19 @@ int strp_init(struct strparser *strp, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(strp_init);
 
+/* Sock process lock held (lock_sock) */
+void __strp_unpause(struct strparser *strp)
+{
+	strp->paused = 0;
+
+	if (strp->need_bytes) {
+		if (strp_peek_len(strp) < strp->need_bytes)
+			return;
+	}
+	strp_read_sock(strp);
+}
+EXPORT_SYMBOL_GPL(__strp_unpause);
+
 void strp_unpause(struct strparser *strp)
 {
 	strp->paused = 0;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 839e1e1..8ca57d0 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -735,7 +735,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
 	/* Finished with message */
 	ctx->recv_pkt = NULL;
 	kfree_skb(skb);
-	strp_unpause(&ctx->strp);
+	__strp_unpause(&ctx->strp);
 
 	return true;
 }
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH bpf-next v5 2/2] samples/bpf: Add xdp_sample_pkts example
From: Jakub Kicinski @ 2018-06-06 16:54 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: netdev, Jesper Dangaard Brouer
In-Reply-To: <152828901941.9599.5391551130846011457.stgit@alrua-kau>

On Wed, 06 Jun 2018 14:43:39 +0200, Toke Høiland-Jørgensen wrote:
> Add an example program showing how to sample packets from XDP using the
> perf event buffer. The example userspace program just prints the ethernet
> header for every packet sampled.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

Looks like some of the includes in samples/bpf/xdp_sample_pkts_user.c
are unnecessary, but otherwise LGTM.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

^ permalink raw reply

* Re: [PATCH bpf-next v5 1/2] trace_helpers.c: Add helpers to poll multiple perf FDs for events
From: Jakub Kicinski @ 2018-06-06 16:55 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: netdev, Jesper Dangaard Brouer
In-Reply-To: <152828901936.9599.10143519677183418086.stgit@alrua-kau>

On Wed, 06 Jun 2018 14:43:39 +0200, Toke Høiland-Jørgensen wrote:
> Add two new helper functions to trace_helpers that supports polling
> multiple perf file descriptors for events. These are used to the XDP
> perf_event_output example, which needs to work with one perf fd per CPU.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

malloc() + memset(0) could have been replaced with calloc, but
otherwise looks good.

^ permalink raw reply

* Re: pull request Cavium liquidio vswitch firmware v1.7.2
From: Felix Manlunas @ 2018-06-06 16:57 UTC (permalink / raw)
  To: linux-firmware
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	manish.awasthi, manojkumar.panicker
In-Reply-To: <20180521173920.GA22607@felix-thinkpad.cavium.com>

On Mon, May 21, 2018 at 10:39:20AM -0700, Felix Manlunas wrote:
> The following changes since commit 2a9b2cf50fb32e36e4fc1586c2f6f1421913b553:
> 
>   Merge branch 'for-upstreaming-v1.7.2' of https://github.com/felix-cavium/linux-firmware (2018-05-18 08:35:22 -0400)
> 
> are available in the git repository at:
> 
>   https://github.com/felix-cavium/linux-firmware.git for-upstreaming-v1.7.2-vsw
> 
> for you to fetch changes up to 0e193ca65d8b064502d61163597bf14eef81710f:
> 
>   linux-firmware: liquidio: update vswitch firmware to v1.7.2 (2018-05-19 23:29:03 -0700)
> 
> Signed-off-by: Manish Awasthi <manish.awasthi@cavium.com>
> Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
> ----------------------------------------------------------------
> Felix Manlunas (1):
>       linux-firmware: liquidio: update vswitch firmware to v1.7.2
> 
>  WHENCE                    |   2 +-
>  liquidio/lio_23xx_vsw.bin | Bin 19922416 -> 20434408 bytes
>  2 files changed, 1 insertion(+), 1 deletion(-)

Hello Maintainers of linux-firmware.git,

Any feedback about this submission?  We sent it two weeks ago, but we
haven't heard anything.

Thanks,
Felix

^ permalink raw reply

* [RFC PATCH ghak90 (was ghak32) V3 00/10] audit: implement container identifier
From: Richard Guy Briggs @ 2018-06-06 16:58 UTC (permalink / raw)
  To: cgroups, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev
  Cc: luto, jlayton, carlos, viro, dhowells, simo, eparis, serge,
	ebiederm, Richard Guy Briggs

Implement kernel audit container identifier.

This patchset is a third based on the proposal document (V3)
posted:
	https://www.redhat.com/archives/linux-audit/2018-January/msg00014.html

The first patch implements the proc fs write to set the audit container
identifier of a process, emitting an AUDIT_CONTAINER_ID record to announce the
registration of that audit container identifier on that process.  This patch
requires userspace support for record acceptance and proper type
display.

The second implements the auxiliary record AUDIT_CONTAINER if an
audit container identifier is identifiable with an event.  This patch
requires userspace support for proper type display.

The third adds signal and ptrace support.

The 4th creates a local audit context to be able to bind a standalone
record with a locally created auxiliary record.

The 5th patch adds audit container identifier records to the tty
standalone record.

The 6th adds audit container identifier filtering to the exit,
exclude and user lists.  This patch adds the AUDIT_CONTID field and
requires auditctl userspace support for the --contid option.

The 7th adds network namespace audit container identifier labelling
based on member tasks' audit container identifier labels.

The 8th adds audit container identifier support to standalone netfilter
records that don't have a task context and lists each container to which
that net namespace belongs.

The 9th implements reading the audit container identifier from the proc
filesystem for debugging.  This patch isn't planned for upstream
inclusion.

The 10th fixes an rfkill spelling mistake in a comment that is otherwise
the only match for "contid" in the kernel.


Example: Set an audit container identifier of 123456 to the "sleep" task:

  sleep 2&  
  child=$!
  echo 123456 > /proc/$child/audit_containerid; echo $?
  ausearch -ts recent -m container
  echo child:$child contid:$( cat /proc/$child/audit_containerid)

This should produce a record such as:

  type=CONTAINER_ID msg=audit(2018-06-06 12:39:29.636:26949) : op=set opid=2209 old-contid=18446744073709551615 contid=123456 pid=628 auid=root uid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=bash exe=/usr/bin/bash res=yes 


Example: Set a filter on an audit container identifier 123459 on /tmp/tmpcontainerid:

  contid=123459
  key=tmpcontainerid
  auditctl -a exit,always -F dir=/tmp -F perm=wa -F contid=$contid -F key=$key
  perl -e "sleep 1; open(my \$tmpfile, '>', \"/tmp/$key\"); close(\$tmpfile);" &
  child=$!
  echo $contid > /proc/$child/audit_containerid
  sleep 2
  ausearch -i -ts recent -k $key
  auditctl -d exit,always -F dir=/tmp -F perm=wa -F contid=$contid -F key=$key
  rm -f /tmp/$key

This should produce an event such as:

  type=CONTAINER msg=audit(2018-06-06 12:46:31.707:26953) : op=task contid=123459 
  type=PROCTITLE msg=audit(2018-06-06 12:46:31.707:26953) : proctitle=perl -e sleep 1; open(my $tmpfile, '>', "/tmp/tmpcontainerid"); close($tmpfile); 
  type=PATH msg=audit(2018-06-06 12:46:31.707:26953) : item=1 name=/tmp/tmpcontainerid inode=25656 dev=00:26 mode=file,644 ouid=root ogid=root rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE cap_fp=none cap_fi=none cap_fe=0 cap_fver=0 
  type=PATH msg=audit(2018-06-06 12:46:31.707:26953) : item=0 name=/tmp/ inode=8985 dev=00:26 mode=dir,sticky,777 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:tmp_t:s0 nametype=PARENT cap_fp=none cap_fi=none cap_fe=0 cap_fver=0 
  type=CWD msg=audit(2018-06-06 12:46:31.707:26953) : cwd=/root 
  type=SYSCALL msg=audit(2018-06-06 12:46:31.707:26953) : arch=x86_64 syscall=openat success=yes exit=3 a0=0xffffffffffffff9c a1=0x5621f2b81900 a2=O_WRONLY|O_CREAT|O_TRUNC a3=0x1b6 items=2 ppid=628 pid=2232 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=tmpcontainerid 


Requires: https://github.com/linux-audit/audit-kernel/issues/81
See: https://github.com/linux-audit/audit-kernel/issues/90
See: https://github.com/linux-audit/audit-userspace/issues/40
See: https://github.com/linux-audit/audit-testsuite/issues/64
See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID

Changelog:

v3
- switched from containerid in task_struct to audit_task_info (depends on ghak81)
- drop INVALID_CID in favour of only AUDIT_CID_UNSET
- check for !audit_task_info, throw -ENOPROTOOPT on set
- changed -EPERM to -EEXIST for parent check
- return AUDIT_CID_UNSET if !audit_enabled
- squash child/thread check patch into AUDIT_CONTAINER_ID patch
- changed -EPERM to -EBUSY for child check
- separate child and thread checks, use -EALREADY for latter
- move addition of op= from ptrace/signal patch to AUDIT_CONTAINER patch
- fix && to || bashism in ptrace/signal patch
- uninline and export function for audit_free_context()
- drop CONFIG_CHANGE, FEATURE_CHANGE, ANOM_ABEND, ANOM_SECCOMP patches
- move audit_enabled check (xt_AUDIT)
- switched from containerid list in struct net to net_generic's struct audit_net
- move containerid list iteration into audit (xt_AUDIT)
- create function to move namespace switch into audit
- switched /proc/PID/ entry from containerid to audit_containerid
- call kzalloc with GFP_ATOMIC on in_atomic() in audit_alloc_context()
- call kzalloc with GFP_ATOMIC on in_atomic() in audit_log_container_info()
- use xt_net(par) instead of sock_net(skb->sk) to get net
- switched record and field names: initial CONTAINER_ID, aux CONTAINER, field CONTID
- allow to set own contid
- open code audit_set_containerid
- add contid inherited flag
- ccontainerid and pcontainerid eliminated due to inherited flag
- change name of container list funcitons
- rename containerid to contid
- convert initial container record to syscall aux
- fix spelling mistake of contidion in net/rfkill/core.c to avoid contid name collision

v2
- add check for children and threads
- add network namespace container identifier list
- add NETFILTER_PKT audit container identifier logging
- patch description and documentation clean-up and example
- reap unused ppid

Richard Guy Briggs (10):
  audit: add container id
  audit: log container info of syscalls
  audit: add containerid support for ptrace and signals
  audit: add support for non-syscall auxiliary records
  audit: add containerid support for tty_audit
  audit: add containerid filtering
  audit: add support for containerid to network namespaces
  audit: NETFILTER_PKT: record each container ID associated with a netNS
  debug audit: read container ID of a process
  rfkill: fix spelling mistake contidion to condition

 drivers/tty/tty_audit.c    |   5 +-
 fs/proc/base.c             |  53 +++++++++++++++++++
 include/linux/audit.h      |  68 ++++++++++++++++++++++++
 include/uapi/linux/audit.h |   8 ++-
 kernel/audit.c             | 114 ++++++++++++++++++++++++++++++++++++++++
 kernel/audit.h             |   3 ++
 kernel/auditfilter.c       |  47 +++++++++++++++++
 kernel/auditsc.c           | 128 ++++++++++++++++++++++++++++++++++++++++++---
 kernel/nsproxy.c           |   4 ++
 net/netfilter/xt_AUDIT.c   |  12 ++++-
 net/rfkill/core.c          |   2 +-
 11 files changed, 433 insertions(+), 11 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [RFC PATCH ghak90 (was ghak32) V3 01/10] audit: add container id
From: Richard Guy Briggs @ 2018-06-06 16:58 UTC (permalink / raw)
  To: cgroups, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev
  Cc: ebiederm, luto, jlayton, carlos, dhowells, Richard Guy Briggs,
	viro, simo, eparis, serge
In-Reply-To: <cover.1528304203.git.rgb@redhat.com>

Implement the proc fs write to set the audit container identifier of a
process, emitting an AUDIT_CONTAINER_ID record to document the event.

This is a write from the container orchestrator task to a proc entry of
the form /proc/PID/audit_containerid where PID is the process ID of the
newly created task that is to become the first task in a container, or
an additional task added to a container.

The write expects up to a u64 value (unset: 18446744073709551615).

The writer must have capability CAP_AUDIT_CONTROL.

This will produce a record such as this:
  type=CONTAINER_ID msg=audit(2018-06-06 12:39:29.636:26949) : op=set opid=2209 old-contid=18446744073709551615 contid=123456 pid=628 auid=root uid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=bash exe=/usr/bin/bash res=yes

The "op" field indicates an initial set.  The "pid" to "ses" fields are
the orchestrator while the "opid" field is the object's PID, the process
being "contained".  Old and new audit container identifier values are
given in the "contid" fields, while res indicates its success.

It is not permitted to unset or re-set the audit container identifier.
A child inherits its parent's audit container identifier, but then can
be set only once after.

See: https://github.com/linux-audit/audit-kernel/issues/90
See: https://github.com/linux-audit/audit-userspace/issues/51
See: https://github.com/linux-audit/audit-testsuite/issues/64
See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 fs/proc/base.c             | 37 ++++++++++++++++++++++++
 include/linux/audit.h      | 25 ++++++++++++++++
 include/uapi/linux/audit.h |  2 ++
 kernel/auditsc.c           | 71 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 135 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index eafa39a..318dff4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1302,6 +1302,41 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 	.read		= proc_sessionid_read,
 	.llseek		= generic_file_llseek,
 };
+
+static ssize_t proc_contid_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	u64 contid;
+	int rv;
+	struct task_struct *task = get_proc_task(inode);
+
+	if (!task)
+		return -ESRCH;
+	if (*ppos != 0) {
+		/* No partial writes. */
+		put_task_struct(task);
+		return -EINVAL;
+	}
+
+	rv = kstrtou64_from_user(buf, count, 10, &contid);
+	if (rv < 0) {
+		put_task_struct(task);
+		return rv;
+	}
+
+	rv = audit_set_contid(task, contid);
+	put_task_struct(task);
+	if (rv < 0)
+		return rv;
+	return count;
+}
+
+static const struct file_operations proc_contid_operations = {
+	.write		= proc_contid_write,
+	.llseek		= generic_file_llseek,
+};
+
 #endif
 
 #ifdef CONFIG_FAULT_INJECTION
@@ -2995,6 +3030,7 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+	REG("audit_containerid", S_IWUSR, proc_contid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
@@ -3386,6 +3422,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+	REG("audit_containerid", S_IWUSR, proc_contid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4f824c4..497cd81 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -219,6 +219,8 @@ static inline void audit_log_task_info(struct audit_buffer *ab,
 struct audit_task_info {
 	kuid_t			loginuid;
 	unsigned int		sessionid;
+	u64			contid;
+	bool			inherited; /* containerid inheritance */
 	struct audit_context	*ctx;
 };
 extern struct audit_task_info init_struct_audit;
@@ -331,6 +333,7 @@ static inline void audit_ptrace(struct task_struct *t)
 extern int auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec64 *t, unsigned int *serial);
 extern int audit_set_loginuid(kuid_t loginuid);
+extern int audit_set_contid(struct task_struct *tsk, u64 contid);
 
 static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
 {
@@ -348,6 +351,14 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 		return AUDIT_SID_UNSET;
 }
 
+static inline u64 audit_get_contid(struct task_struct *tsk)
+{
+	if (!tsk->audit)
+		return AUDIT_CID_UNSET;
+	else
+		return tsk->audit->contid;
+}
+
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -542,6 +553,10 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 {
 	return AUDIT_SID_UNSET;
 }
+static inline kuid_t audit_get_contid(struct task_struct *tsk)
+{
+	return AUDIT_CID_UNSET;
+}
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 { }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
@@ -606,6 +621,16 @@ static inline bool audit_loginuid_set(struct task_struct *tsk)
 	return uid_valid(audit_get_loginuid(tsk));
 }
 
+static inline bool cid_valid(u64 contid)
+{
+	return contid != AUDIT_CID_UNSET;
+}
+
+static inline bool audit_contid_set(struct task_struct *tsk)
+{
+	return cid_valid(audit_get_contid(tsk));
+}
+
 static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
 {
 	audit_log_n_string(ab, buf, strlen(buf));
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 04f9bd2..c3b1aca 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -71,6 +71,7 @@
 #define AUDIT_TTY_SET		1017	/* Set TTY auditing status */
 #define AUDIT_SET_FEATURE	1018	/* Turn an audit feature on or off */
 #define AUDIT_GET_FEATURE	1019	/* Get which features are enabled */
+#define AUDIT_CONTAINER_ID	1020	/* Define the container id and information */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
@@ -466,6 +467,7 @@ struct audit_tty_status {
 
 #define AUDIT_UID_UNSET (unsigned int)-1
 #define AUDIT_SID_UNSET ((unsigned int)-1)
+#define AUDIT_CID_UNSET ((u64)-1)
 
 /* audit_rule_data supports filter rules with both integer and string
  * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 59ef7a81..611e926 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -956,6 +956,8 @@ int audit_alloc(struct task_struct *tsk)
 		return -ENOMEM;
 	info->loginuid = audit_get_loginuid(current);
 	info->sessionid = audit_get_sessionid(current);
+	info->contid = audit_get_contid(current);
+	info->inherited = true;
 	tsk->audit = info;
 
 	if (likely(!audit_ever_enabled))
@@ -985,6 +987,8 @@ int audit_alloc(struct task_struct *tsk)
 struct audit_task_info init_struct_audit = {
 	.loginuid = INVALID_UID,
 	.sessionid = AUDIT_SID_UNSET,
+	.contid = AUDIT_CID_UNSET,
+	.inherited = true,
 	.ctx = NULL,
 };
 
@@ -2112,6 +2116,73 @@ int audit_set_loginuid(kuid_t loginuid)
 }
 
 /**
+ * audit_set_contid - set current task's audit_context contid
+ * @contid: contid value
+ *
+ * Returns 0 on success, -EPERM on permission failure.
+ *
+ * Called (set) from fs/proc/base.c::proc_contid_write().
+ */
+int audit_set_contid(struct task_struct *task, u64 contid)
+{
+	u64 oldcontid;
+	int rc = 0;
+	struct audit_buffer *ab;
+	uid_t uid;
+	struct tty_struct *tty;
+	char comm[sizeof(current->comm)];
+
+	/* Can't set if audit disabled */
+	if (!task->audit)
+		return -ENOPROTOOPT;
+	oldcontid = audit_get_contid(task);
+	/* Don't allow the audit containerid to be unset */
+	if (!cid_valid(contid))
+		rc = -EINVAL;
+	/* if we don't have caps, reject */
+	else if (!capable(CAP_AUDIT_CONTROL))
+		rc = -EPERM;
+	/* if task has children or is not single-threaded, deny */
+	else if (!list_empty(&task->children))
+		rc = -EBUSY;
+	else if (!(thread_group_leader(task) && thread_group_empty(task)))
+		rc = -EALREADY;
+	/* it is already set, and not inherited from the parent, reject */
+	else if (cid_valid(oldcontid) && !task->audit->inherited)
+		rc = -EEXIST;
+	if (!rc) {
+		task_lock(task);
+		task->audit->contid = contid;
+		task->audit->inherited = false;
+		task_unlock(task);
+	}
+
+	if (!audit_enabled)
+		return rc;
+
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONTAINER_ID);
+	if (!ab)
+		return rc;
+
+	uid = from_kuid(&init_user_ns, task_uid(current));
+	tty = audit_get_tty(current);
+	audit_log_format(ab, "op=set opid=%d old-contid=%llu contid=%llu pid=%d uid=%u auid=%u tty=%s ses=%u",
+			 task_tgid_nr(task), oldcontid, contid,
+			 task_tgid_nr(current), uid
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(current));
+	audit_put_tty(tty);
+	audit_log_task_context(ab);
+	audit_log_format(ab, " comm=");
+	audit_log_untrustedstring(ab, get_task_comm(comm, current));
+	audit_log_d_path_exe(ab, current->mm);
+	audit_log_format(ab, " res=%d", !rc);
+	audit_log_end(ab);
+	return rc;
+}
+
+/**
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
  * @mode: mode bits
-- 
1.8.3.1

^ permalink raw reply related

* [RFC PATCH ghak90 (was ghak32) V3 02/10] audit: log container info of syscalls
From: Richard Guy Briggs @ 2018-06-06 16:58 UTC (permalink / raw)
  To: cgroups, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev
  Cc: luto, jlayton, carlos, viro, dhowells, simo, eparis, serge,
	ebiederm, Richard Guy Briggs
In-Reply-To: <cover.1528304203.git.rgb@redhat.com>

Create a new audit record AUDIT_CONTAINER to document the audit
container identifier of a process if it is present.

Called from audit_log_exit(), syscalls are covered.

A sample raw event:
type=SYSCALL msg=audit(1519924845.499:257): arch=c000003e syscall=257 success=yes exit=3 a0=ffffff9c a1=56374e1cef30 a2=241 a3=1b6 items=2 ppid=606 pid=635 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="bash" exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key="tmpcontainerid"
type=CWD msg=audit(1519924845.499:257): cwd="/root"
type=PATH msg=audit(1519924845.499:257): item=0 name="/tmp/" inode=13863 dev=00:27 mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmp_t:s0 nametype= PARENT cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
type=PATH msg=audit(1519924845.499:257): item=1 name="/tmp/tmpcontainerid" inode=17729 dev=00:27 mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
type=PROCTITLE msg=audit(1519924845.499:257): proctitle=62617368002D6300736C65657020313B206563686F2074657374203E202F746D702F746D70636F6E7461696E65726964
type=CONTAINER msg=audit(1519924845.499:257): op=task contid=123458

See: https://github.com/linux-audit/audit-kernel/issues/90
See: https://github.com/linux-audit/audit-userspace/issues/51
See: https://github.com/linux-audit/audit-testsuite/issues/64
See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h      |  7 +++++++
 include/uapi/linux/audit.h |  1 +
 kernel/audit.c             | 23 +++++++++++++++++++++++
 kernel/auditsc.c           |  3 +++
 4 files changed, 34 insertions(+)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 497cd81..4e1e34e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -152,6 +152,9 @@ extern void		    audit_log_key(struct audit_buffer *ab,
 extern int audit_log_task_context(struct audit_buffer *ab);
 extern void audit_log_task_info(struct audit_buffer *ab,
 				struct task_struct *tsk);
+extern int audit_log_contid(struct task_struct *tsk,
+				    struct audit_context *context,
+				    char *op);
 
 extern int		    audit_update_lsm_rules(void);
 
@@ -202,6 +205,10 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
 static inline void audit_log_task_info(struct audit_buffer *ab,
 				       struct task_struct *tsk)
 { }
+static inline int audit_log_contid(struct task_struct *tsk,
+					   struct audit_context *context,
+					   char *op)
+{ }
 #define audit_enabled 0
 #endif /* CONFIG_AUDIT */
 
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index c3b1aca..469ab25 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -115,6 +115,7 @@
 #define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd */
 #define AUDIT_KERN_MODULE	1330	/* Kernel Module events */
 #define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
+#define AUDIT_CONTAINER		1332	/* Container ID */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/kernel/audit.c b/kernel/audit.c
index e7478cb..5e150c6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2048,6 +2048,29 @@ void audit_log_session_info(struct audit_buffer *ab)
 	audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
 }
 
+/*
+ * audit_log_contid - report container info
+ * @tsk: task to be recorded
+ * @context: task or local context for record
+ * @op: contid string description
+ */
+int audit_log_contid(struct task_struct *tsk,
+			     struct audit_context *context, char *op)
+{
+	struct audit_buffer *ab;
+
+	if (!audit_contid_set(tsk))
+		return 0;
+	/* Generate AUDIT_CONTAINER record with container ID */
+	ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER);
+	if (!ab)
+		return -ENOMEM;
+	audit_log_format(ab, "op=%s contid=%llu",
+			 op, audit_get_contid(tsk));
+	audit_log_end(ab);
+	return 0;
+}
+
 void audit_log_key(struct audit_buffer *ab, char *key)
 {
 	audit_log_format(ab, " key=");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 611e926..a3c946c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1490,10 +1490,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 	audit_log_proctitle(tsk, context);
 
+	audit_log_contid(tsk, context, "task");
+
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
 	if (ab)
 		audit_log_end(ab);
+
 	if (call_panic)
 		audit_panic("error converting sid to string");
 }
-- 
1.8.3.1

^ permalink raw reply related

* [RFC PATCH ghak90 (was ghak32) V3 03/10] audit: add containerid support for ptrace and signals
From: Richard Guy Briggs @ 2018-06-06 16:58 UTC (permalink / raw)
  To: cgroups, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev
  Cc: luto, jlayton, carlos, viro, dhowells, simo, eparis, serge,
	ebiederm, Richard Guy Briggs
In-Reply-To: <cover.1528304203.git.rgb@redhat.com>

Add audit container identifier support to ptrace and signals.  In
particular, the "op" field provides a way to label the auxiliary record
to which it is associated.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h | 11 +++++------
 kernel/audit.c        | 13 +++++++------
 kernel/audit.h        |  2 ++
 kernel/auditsc.c      | 21 ++++++++++++++++-----
 4 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4e1e34e..ab50985 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -34,6 +34,7 @@ struct audit_sig_info {
 	uid_t		uid;
 	pid_t		pid;
 	char		ctx[0];
+	u64		cid;
 };
 
 struct audit_buffer;
@@ -152,9 +153,8 @@ extern void		    audit_log_key(struct audit_buffer *ab,
 extern int audit_log_task_context(struct audit_buffer *ab);
 extern void audit_log_task_info(struct audit_buffer *ab,
 				struct task_struct *tsk);
-extern int audit_log_contid(struct task_struct *tsk,
-				    struct audit_context *context,
-				    char *op);
+extern int audit_log_contid(struct audit_context *context,
+				     char *op, u64 contid);
 
 extern int		    audit_update_lsm_rules(void);
 
@@ -205,9 +205,8 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
 static inline void audit_log_task_info(struct audit_buffer *ab,
 				       struct task_struct *tsk)
 { }
-static inline int audit_log_contid(struct task_struct *tsk,
-					   struct audit_context *context,
-					   char *op)
+static inline int audit_log_contid(struct audit_context *context,
+					    char *op, u64 contid)
 { }
 #define audit_enabled 0
 #endif /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index 5e150c6..ba304a8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -142,6 +142,7 @@ struct audit_net {
 kuid_t		audit_sig_uid = INVALID_UID;
 pid_t		audit_sig_pid = -1;
 u32		audit_sig_sid = 0;
+u64		audit_sig_cid = AUDIT_CID_UNSET;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -1437,6 +1438,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			memcpy(sig_data->ctx, ctx, len);
 			security_release_secctx(ctx, len);
 		}
+		sig_data->cid = audit_sig_cid;
 		audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
 				 sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
@@ -2050,23 +2052,22 @@ void audit_log_session_info(struct audit_buffer *ab)
 
 /*
  * audit_log_contid - report container info
- * @tsk: task to be recorded
  * @context: task or local context for record
  * @op: contid string description
+ * @contid: container ID to report
  */
-int audit_log_contid(struct task_struct *tsk,
-			     struct audit_context *context, char *op)
+int audit_log_contid(struct audit_context *context,
+			      char *op, u64 contid)
 {
 	struct audit_buffer *ab;
 
-	if (!audit_contid_set(tsk))
+	if (!cid_valid(contid))
 		return 0;
 	/* Generate AUDIT_CONTAINER record with container ID */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER);
 	if (!ab)
 		return -ENOMEM;
-	audit_log_format(ab, "op=%s contid=%llu",
-			 op, audit_get_contid(tsk));
+	audit_log_format(ab, "op=%s contid=%llu", op, contid);
 	audit_log_end(ab);
 	return 0;
 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 214e149..1cf1c35 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -147,6 +147,7 @@ struct audit_context {
 	kuid_t		    target_uid;
 	unsigned int	    target_sessionid;
 	u32		    target_sid;
+	u64		    target_cid;
 	char		    target_comm[TASK_COMM_LEN];
 
 	struct audit_tree_refs *trees, *first_trees;
@@ -329,6 +330,7 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab,
 extern pid_t audit_sig_pid;
 extern kuid_t audit_sig_uid;
 extern u32 audit_sig_sid;
+extern u64 audit_sig_cid;
 
 extern int audit_filter(int msgtype, unsigned int listtype);
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a3c946c..cface9d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -113,6 +113,7 @@ struct audit_aux_data_pids {
 	kuid_t			target_uid[AUDIT_AUX_PIDS];
 	unsigned int		target_sessionid[AUDIT_AUX_PIDS];
 	u32			target_sid[AUDIT_AUX_PIDS];
+	u64			target_cid[AUDIT_AUX_PIDS];
 	char 			target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
 	int			pid_count;
 };
@@ -1456,21 +1457,27 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	for (aux = context->aux_pids; aux; aux = aux->next) {
 		struct audit_aux_data_pids *axs = (void *)aux;
 
-		for (i = 0; i < axs->pid_count; i++)
+		for (i = 0; i < axs->pid_count; i++) {
+			char axsn[sizeof("aux0xN ")];
+
+			sprintf(axsn, "aux0x%x", i);
 			if (audit_log_pid_context(context, axs->target_pid[i],
 						  axs->target_auid[i],
 						  axs->target_uid[i],
 						  axs->target_sessionid[i],
 						  axs->target_sid[i],
-						  axs->target_comm[i]))
+						  axs->target_comm[i])
+			    || audit_log_contid(context, axsn, axs->target_cid[i]))
 				call_panic = 1;
+		}
 	}
 
 	if (context->target_pid &&
-	    audit_log_pid_context(context, context->target_pid,
+	    (audit_log_pid_context(context, context->target_pid,
 				  context->target_auid, context->target_uid,
 				  context->target_sessionid,
-				  context->target_sid, context->target_comm))
+				  context->target_sid, context->target_comm)
+	    || audit_log_contid(context, "target", context->target_cid)))
 			call_panic = 1;
 
 	if (context->pwd.dentry && context->pwd.mnt) {
@@ -1490,7 +1497,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 	audit_log_proctitle(tsk, context);
 
-	audit_log_contid(tsk, context, "task");
+	audit_log_contid(context, "task", audit_get_contid(tsk));
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -2375,6 +2382,7 @@ void __audit_ptrace(struct task_struct *t)
 	context->target_uid = task_uid(t);
 	context->target_sessionid = audit_get_sessionid(t);
 	security_task_getsecid(t, &context->target_sid);
+	context->target_cid = audit_get_contid(t);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
 }
 
@@ -2402,6 +2410,7 @@ int audit_signal_info(int sig, struct task_struct *t)
 		else
 			audit_sig_uid = uid;
 		security_task_getsecid(current, &audit_sig_sid);
+		audit_sig_cid = audit_get_contid(current);
 	}
 
 	if (!audit_signals || audit_dummy_context())
@@ -2415,6 +2424,7 @@ int audit_signal_info(int sig, struct task_struct *t)
 		ctx->target_uid = t_uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
 		security_task_getsecid(t, &ctx->target_sid);
+		ctx->target_cid = audit_get_contid(t);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
 		return 0;
 	}
@@ -2436,6 +2446,7 @@ int audit_signal_info(int sig, struct task_struct *t)
 	axp->target_uid[axp->pid_count] = t_uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
 	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+	axp->target_cid[axp->pid_count] = audit_get_contid(t);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
 	axp->pid_count++;
 
-- 
1.8.3.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox