Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 07/15] scm: allow AF_BUS sockets to send ancillary data
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

Similar to UNIX domain sockets AF_BUS sockets support passing file
descriptors and process credentials which requires supporting passing
control messages.

The core socket level control messages processing requires extending
to allow sockets other than PF_UNIX to send SCM_RIGHTS type messages.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 net/core/scm.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/scm.c b/net/core/scm.c
index 611c5ef..87e3152 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -158,7 +158,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 		switch (cmsg->cmsg_type)
 		{
 		case SCM_RIGHTS:
-			if (!sock->ops || sock->ops->family != PF_UNIX)
+			if (!sock->ops || (sock->ops->family != PF_UNIX &&
+					   sock->ops->family != PF_BUS))
 				goto error;
 			err=scm_fp_copy(cmsg, &p->fp);
 			if (err<0)
-- 
1.7.10

^ permalink raw reply related

* [PATCH net-next 06/15] netfilter: Add NFPROTO_BUS hook constant for AF_BUS socket family
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

AF_BUS sockets add a netfilter NF_HOOK() on the packet sending path.
This allows packet to be mangled by registered netfilter hooks.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 include/linux/netfilter.h |    1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index c613cf0..0698924 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -67,6 +67,7 @@ enum {
 	NFPROTO_BRIDGE =  7,
 	NFPROTO_IPV6   = 10,
 	NFPROTO_DECNET = 12,
+	NFPROTO_BUS,
 	NFPROTO_NUMPROTO,
 };
 
-- 
1.7.10

^ permalink raw reply related

* [PATCH net-next 05/15] security: selinux: Add AF_BUS socket SELinux hooks
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

Add Security-Enhanced Linux (SELinux) hook for AF_BUS socket address family.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 security/selinux/hooks.c |   35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4ee6f23..5bacbe2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -67,6 +67,7 @@
 #include <linux/quota.h>
 #include <linux/un.h>		/* for Unix socket types */
 #include <net/af_unix.h>	/* for Unix socket types */
+#include <net/af_bus.h>	/* for Bus socket types */
 #include <linux/parser.h>
 #include <linux/nfs_mount.h>
 #include <net/ipv6.h>
@@ -4101,6 +4102,39 @@ static int selinux_socket_unix_may_send(struct socket *sock,
 			    &ad);
 }
 
+static int selinux_socket_bus_connect(struct sock *sock, struct sock *other,
+				      struct sock *newsk)
+{
+	struct sk_security_struct *sksec_sock = sock->sk_security;
+	struct sk_security_struct *sksec_other = other->sk_security;
+	struct sk_security_struct *sksec_new = newsk->sk_security;
+	struct common_audit_data ad;
+	struct lsm_network_audit net = {0,};
+	int err;
+
+	ad.type = LSM_AUDIT_DATA_NET;
+	ad.u.net = &net;
+	ad.u.net->sk = other;
+
+	err = avc_has_perm(sksec_sock->sid, sksec_other->sid,
+			   sksec_other->sclass,
+			   UNIX_STREAM_SOCKET__CONNECTTO, &ad);
+	if (err)
+		return err;
+
+	/* server child socket */
+	sksec_new->peer_sid = sksec_sock->sid;
+	err = security_sid_mls_copy(sksec_other->sid, sksec_sock->sid,
+				    &sksec_new->sid);
+	if (err)
+		return err;
+
+	/* connecting socket */
+	sksec_sock->peer_sid = sksec_new->sid;
+
+	return 0;
+}
+
 static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family,
 				    u32 peer_sid,
 				    struct common_audit_data *ad)
@@ -5643,6 +5677,7 @@ static struct security_operations selinux_ops = {
 
 	.unix_stream_connect =		selinux_socket_unix_stream_connect,
 	.unix_may_send =		selinux_socket_unix_may_send,
+	.bus_connect =		        selinux_socket_bus_connect,
 
 	.socket_create =		selinux_socket_create,
 	.socket_post_create =		selinux_socket_post_create,
-- 
1.7.10

^ permalink raw reply related

* [PATCH net-next 04/15] security: Add Linux Security Modules hook for AF_BUS sockets
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

AF_BUS implements a security hook bus_connect() to be used by LSM to
enforce connectivity security policies.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 include/linux/security.h |   11 +++++++++++
 security/capability.c    |    7 +++++++
 security/security.c      |    7 +++++++
 3 files changed, 25 insertions(+)

diff --git a/include/linux/security.h b/include/linux/security.h
index 4e5a73c..d30dc4a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1578,6 +1578,8 @@ struct security_operations {
 
 #ifdef CONFIG_SECURITY_NETWORK
 	int (*unix_stream_connect) (struct sock *sock, struct sock *other, struct sock *newsk);
+	int (*bus_connect) (struct sock *sock, struct sock *other,
+			    struct sock *newsk);
 	int (*unix_may_send) (struct socket *sock, struct socket *other);
 
 	int (*socket_create) (int family, int type, int protocol, int kern);
@@ -2519,6 +2521,8 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32
 #ifdef CONFIG_SECURITY_NETWORK
 
 int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk);
+int security_bus_connect(struct sock *sock, struct sock *other,
+			 struct sock *newsk);
 int security_unix_may_send(struct socket *sock,  struct socket *other);
 int security_socket_create(int family, int type, int protocol, int kern);
 int security_socket_post_create(struct socket *sock, int family,
@@ -2566,6 +2570,13 @@ static inline int security_unix_stream_connect(struct sock *sock,
 	return 0;
 }
 
+static inline int security_bus_connect(struct socket *sock,
+				       struct sock *other,
+				       struct sock *newsk)
+{
+	return 0;
+}
+
 static inline int security_unix_may_send(struct socket *sock,
 					 struct socket *other)
 {
diff --git a/security/capability.c b/security/capability.c
index 61095df..ea57f2b 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -563,6 +563,12 @@ static int cap_unix_may_send(struct socket *sock, struct socket *other)
 	return 0;
 }
 
+static int cap_bus_connect(struct sock *sock, struct sock *other,
+			   struct sock *newsk)
+{
+	return 0;
+}
+
 static int cap_socket_create(int family, int type, int protocol, int kern)
 {
 	return 0;
@@ -1016,6 +1022,7 @@ void __init security_fixup_ops(struct security_operations *ops)
 #ifdef CONFIG_SECURITY_NETWORK
 	set_to_cap_if_null(ops, unix_stream_connect);
 	set_to_cap_if_null(ops, unix_may_send);
+	set_to_cap_if_null(ops, bus_connect);
 	set_to_cap_if_null(ops, socket_create);
 	set_to_cap_if_null(ops, socket_post_create);
 	set_to_cap_if_null(ops, socket_bind);
diff --git a/security/security.c b/security/security.c
index 3efc9b1..00ab7df 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1059,6 +1059,13 @@ int security_unix_may_send(struct socket *sock,  struct socket *other)
 }
 EXPORT_SYMBOL(security_unix_may_send);
 
+int security_bus_connect(struct sock *sock, struct sock *other,
+				struct sock *newsk)
+{
+	return security_ops->bus_connect(sock, other, newsk);
+}
+EXPORT_SYMBOL(security_bus_connect);
+
 int security_socket_create(int family, int type, int protocol, int kern)
 {
 	return security_ops->socket_create(family, type, protocol, kern);
-- 
1.7.10

^ permalink raw reply related

* [PATCH net-next 03/15] net: bus: Add AF_BUS socket and address definitions
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

An AF_BUS socket address is made up of a path component and a numeric
component. The path component is either a pathname or an abstract
socket similar to a unix socket. The numeric component is used to
uniquely identify each connection to the bus. Thus the path identifies
a specific bus and the numeric component the attachment to that bus.

The numeric component of the address is a 64-bit unsigned integer,
interpreted by splitting the into two parts: the most significant 16
bits are a prefix identifying the type of address, and the remaining
48 bits are the actual client address within that prefix, as shown in
this figure:

Bit:  0             15 16                                            63
     +----------------+------------------------------------------------+
     |  Type prefix   |                Client address                  |
     +----------------+------------------------------------------------+

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 include/linux/bus.h  |   34 +++++++
 include/net/af_bus.h |  272 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 include/linux/bus.h
 create mode 100644 include/net/af_bus.h

diff --git a/include/linux/bus.h b/include/linux/bus.h
new file mode 100644
index 0000000..19cac36
--- /dev/null
+++ b/include/linux/bus.h
@@ -0,0 +1,34 @@
+#ifndef _LINUX_BUS_H
+#define _LINUX_BUS_H
+
+#include <linux/socket.h>
+
+/* 'protocol' to use in socket(AF_BUS, SOCK_SEQPACKET, protocol) */
+#define BUS_PROTO_NONE	0
+#define BUS_PROTO_DBUS	1
+#define BUS_PROTO_MAX	1
+
+#define BUS_PATH_MAX	108
+
+/**
+ * struct bus_addr - af_bus address
+ * @s_addr: an af_bus address (16-bit prefix + 48-bit client address)
+ */
+struct bus_addr {
+	u64 s_addr;
+};
+
+
+/**
+ * struct sockaddr_bus - af_bus socket address
+ * @sbus_family: the socket address family
+ * @sbus_addr: an af_bus address
+ * @sbus_path: a path name
+ */
+struct sockaddr_bus {
+	__kernel_sa_family_t sbus_family;
+	struct bus_addr      sbus_addr;
+	char sbus_path[BUS_PATH_MAX];
+};
+
+#endif /* _LINUX_BUS_H */
diff --git a/include/net/af_bus.h b/include/net/af_bus.h
new file mode 100644
index 0000000..19bd7ac
--- /dev/null
+++ b/include/net/af_bus.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2012, GENIVI Alliance
+ *
+ * Authors:	Javier Martinez Canillas, <javier.martinez@collabora.co.uk>
+ *              Alban Crequy, <alban.crequy@collabora.co.uk>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Based on BSD Unix domain sockets (net/unix).
+ */
+
+#ifndef __LINUX_NET_AFBUS_H
+#define __LINUX_NET_AFBUS_H
+
+#include <linux/socket.h>
+#include <linux/bus.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+extern void bus_inflight(struct file *fp);
+extern void bus_notinflight(struct file *fp);
+extern void bus_gc(void);
+extern void wait_for_bus_gc(void);
+extern struct sock *bus_get_socket(struct file *filp);
+extern struct sock *bus_peer_get(struct sock *);
+
+#define BUS_HASH_SIZE	256
+
+extern spinlock_t bus_address_lock;
+extern struct hlist_head bus_address_table[BUS_HASH_SIZE];
+
+#define BUS_MAX_QLEN    10
+#define BUS_MASTER_ADDR 0x0
+#define BUS_PREFIX_BITS 16
+#define BUS_CLIENT_BITS 48
+#define BUS_PREFIX_MASK 0xffff000000000000
+#define BUS_CLIENT_MASK 0x0000ffffffffffff
+
+/* AF_BUS socket options */
+#define BUS_ADD_ADDR 1
+#define BUS_JOIN_BUS 2
+#define BUS_DEL_ADDR 3
+#define BUS_SET_EAVESDROP 4
+#define BUS_UNSET_EAVESDROP 5
+#define BUS_SET_SENDBUF 6
+#define BUS_SET_MAXQLEN 7
+
+/* Connection and socket states */
+enum {
+	BUS_ESTABLISHED = TCP_ESTABLISHED,
+	BUS_CLOSE = TCP_CLOSE,
+	BUS_LISTEN = TCP_LISTEN,
+	BUS_MAX_STATES
+};
+
+#define NF_BUS_SENDING 1
+
+extern unsigned int bus_tot_inflight;
+extern spinlock_t bus_table_lock;
+extern struct hlist_head bus_socket_table[BUS_HASH_SIZE + 1];
+
+/**
+ * struct bus_address - an af_bus address associated with an af_bus sock
+ * @refcnt: address reference counter
+ * @len: address length
+ * @hash: address hash value
+ * @addr_node: member of struct bus_sock.addr_list
+ * @table_node: member of struct hlist_head bus_address_table[hash]
+ * @sock: the af_bus sock that owns this address
+ * @name: the socket address for this address
+ */
+struct bus_address {
+	atomic_t	refcnt;
+	int		len;
+	unsigned	hash;
+	struct hlist_node addr_node;
+	struct hlist_node table_node;
+	struct sock  *sock;
+	struct sockaddr_bus name[0];
+};
+
+/**
+ * struct bus_send_context - sending context for an socket buffer
+ * @sender_socket: the sender socket associated with this sk_buff
+ * @siocb: used to send ancillary data
+ * @timeo: sending timeout
+ * @max_level: file descriptor passing maximum recursion level
+ * @namelen: length of socket address name
+ * @hash: socket name hash value
+ * @other: destination sock
+ * @sender: sender socket address name
+ * @recipient: recipient socket address name
+ * @authenticated: flag whether the sock already joined the bus
+ * @bus_master_side: flag whether the sock is an accepted socket
+ * @to_master: flag whether the destination is the bus master
+ * @multicast: flag whether the destination is a multicast address
+ * @deliver: flag whether the skb has to be delivered
+ * @eavesdropper: flag whether the sock is allowed to eavesdrop
+ * @main_recipient: flag whether the sock is the main recipient
+ */
+struct bus_send_context {
+	struct socket *sender_socket;
+	struct sock_iocb *siocb;
+	long timeo;
+	int max_level;
+	int namelen;
+	unsigned hash;
+	struct sock *other;
+	struct sockaddr_bus	*sender;
+	struct sockaddr_bus	*recipient;
+	unsigned int		authenticated:1;
+	unsigned int		bus_master_side:1;
+	unsigned int		to_master:1;
+	unsigned int		multicast:1;
+	unsigned int            deliver:1;
+	unsigned int            eavesdropper:1;
+	unsigned int            main_recipient:1;
+};
+
+/**
+ * struct bus_skb_parms - socket buffer parameters
+ * @pid: process id
+ * @cred: skb credentials
+ * @fp: passed file descriptors
+ * @secid: security id
+ * @sendctx: skb sending context
+ */
+struct bus_skb_parms {
+	struct pid		*pid;
+	const struct cred	*cred;
+	struct scm_fp_list	*fp;
+#ifdef CONFIG_SECURITY_NETWORK
+	u32			secid;
+#endif
+	struct bus_send_context	*sendctx;
+};
+
+#define BUSCB(skb)      (*(struct bus_skb_parms *)&((skb)->cb))
+#define BUSSID(skb)     (&BUSCB((skb)).secid)
+
+#define bus_state_lock(s)	spin_lock(&bus_sk(s)->lock)
+#define bus_state_unlock(s)	spin_unlock(&bus_sk(s)->lock)
+#define bus_state_lock_nested(s) \
+				spin_lock_nested(&bus_sk(s)->lock, \
+				SINGLE_DEPTH_NESTING)
+
+/**
+ * struct bus - a communication bus
+ * @master: the bus master sock
+ * @peers: list of struct bus_sock.bus_node allowed to join the bus
+ * @lock: protect peers concurrent access
+ * @send_lock: enforce atomic multicast delivery
+ * @kref: bus reference counter
+ * @addr_cnt: address number counter to assign prefix 0x0000 addresses
+ * @eavesdropper_cnt: eavesdroppers counter
+ */
+struct bus {
+	struct sock		*master;
+	struct hlist_head       peers;
+	spinlock_t		lock;
+	spinlock_t		send_lock;
+	struct kref             kref;
+	atomic64_t              addr_cnt;
+	atomic64_t              eavesdropper_cnt;
+};
+
+/**
+ * struct bus_sock - an af_bus socket
+ * @sk: associated sock
+ * @addr: sock principal address
+ * @addr_list: list of struct bus_address.addr_node
+ * @path: sock path name
+ * @readlock: protect from concurrent reading
+ * @peer: peer sock
+ * @other: the listening sock
+ * @link: list of candidates for garbage collection
+ * @inflight: number of times the file descriptor is in flight
+ * @lock: protect the sock from concurrent access
+ * @gc_candidate: flag whether the is a candidate for gc
+ * @gc_maybe_cycle: flag whether could be a cyclic reference
+ * @recursion_level: file passing current recursion level
+ * @peer_wq: peer sock wait queue
+ * @bus: bus that this sock belongs to
+ * @bus_master: flag whether the sock is the bus master
+ * @bus_master_side: flag whether is an accepted socket
+ * @authenticated: flag whether the sock joined the bus
+ * @eavesdropper: flag whether the sock is allowed to eavesdrop
+ * @bus_node: member of struct bus.peers list of joined socks
+ */
+struct bus_sock {
+	/* WARNING: sk has to be the first member */
+	struct sock		sk;
+	struct bus_address     *addr;
+	struct hlist_head       addr_list;
+	struct path		path;
+	struct mutex		readlock;
+	struct sock		*peer;
+	struct sock		*other;
+	struct list_head	link;
+	atomic_long_t		inflight;
+	spinlock_t		lock;
+	unsigned int		gc_candidate:1;
+	unsigned int		gc_maybe_cycle:1;
+	unsigned char		recursion_level;
+	struct socket_wq	peer_wq;
+	struct bus              *bus;
+	bool                    bus_master;
+	bool                    bus_master_side;
+	bool                    authenticated;
+	bool                    eavesdropper;
+	struct hlist_node	bus_node;
+};
+#define bus_sk(__sk) ((struct bus_sock *)__sk)
+
+#define peer_wait peer_wq.wait
+
+/**
+ * bus_same_bus - Test if two socket address belongs to the same bus
+ * @sbusaddr1: socket address name
+ * @sbusaddr2: socket address name
+ */
+static inline bool bus_same_bus(struct sockaddr_bus *sbusaddr1,
+				struct sockaddr_bus *sbusaddr2)
+{
+	int offset;
+
+	if (sbusaddr1->sbus_path[0] != sbusaddr2->sbus_path[0])
+		return false;
+
+	/*
+	 * abstract path names start with a null byte character,
+	 * so they have to be compared starting at the second char.
+	 */
+	offset = (sbusaddr1->sbus_path[0] == '\0');
+
+	return !strncmp(sbusaddr1->sbus_path + offset,
+		       sbusaddr2->sbus_path + offset,
+		       BUS_PATH_MAX);
+}
+
+static inline unsigned int bus_hash_fold(__wsum n)
+{
+	unsigned int hash = (__force unsigned int)n;
+	hash ^= hash>>16;
+	hash ^= hash>>8;
+	return hash&(BUS_HASH_SIZE-1);
+}
+
+static inline unsigned int bus_compute_hash(struct bus_addr addr)
+{
+	return bus_hash_fold(csum_partial((void *)&addr, sizeof(addr), 0));
+}
+
+long bus_inq_len(struct sock *sk);
+long bus_outq_len(struct sock *sk);
+
+#ifdef CONFIG_SYSCTL
+extern int bus_sysctl_register(struct net *net);
+extern void bus_sysctl_unregister(struct net *net);
+#else
+static inline int bus_sysctl_register(struct net *net) { return 0; }
+static inline void bus_sysctl_unregister(struct net *net) {}
+#endif
+
+bool bus_can_write(struct net *net, struct sockaddr_bus *addr, int len,
+		   int protocol);
+
+#endif /* __LINUX_NET_AFBUS_H */
-- 
1.7.10

^ permalink raw reply related

* [PATCH net-next 02/15] net: bus: Add documentation for AF_BUS
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

Docuemnt the AF_BUS design, API and usage semantics.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 Documentation/networking/af_bus.txt |  558 +++++++++++++++++++++++++++++++++++
 1 file changed, 558 insertions(+)
 create mode 100644 Documentation/networking/af_bus.txt

diff --git a/Documentation/networking/af_bus.txt b/Documentation/networking/af_bus.txt
new file mode 100644
index 0000000..a0b078f
--- /dev/null
+++ b/Documentation/networking/af_bus.txt
@@ -0,0 +1,558 @@
+			The AF_BUS socket address family
+			================================
+
+Introduction
+------------
+
+AF_BUS is a message oriented inter process communication system.
+
+The principle features are:
+
+ - Reliable datagram based communication (all sockets are of type
+   SOCK_SEQPACKET)
+
+ - Multicast message delivery (one to many, unicast as a subset)
+
+ - Strict ordering (messages are delivered to every client in the same order)
+
+ - Ability to pass file descriptors
+
+ - Ability to pass credentials
+
+The basic concept is to provide a virtual bus on which multiple
+processes can communicate and policy is imposed by a "bus master".
+
+A process can create buses to which other processes can connect and
+communicate with each other by sending messages. Processes' addresses
+are automatically assigned by the bus on connect and are
+unique. Messages can be sent either to a process' unique address or to
+a bus multicast addresses.
+
+Netfilter rules or Berkeley Packet Filter can be used to restrict the
+messages that each peer is allowed to receive. This is especially
+important when sending to multicast addresses.
+
+Besides messages, process can send and receive ancillary data (i.e.,
+SCM_RIGHTS for passing file descriptors or SCM_CREDENTIALS for passing
+Unix credentials). In the case of a multicast message all recipients
+of a message may obtain a copy a file descriptor or credentials.
+
+A bus is created by processes connecting on an AF_BUS socket. The
+"bus master" binds itself instead of connecting to the NULL address.
+
+The socket address is made up of a path component and a numeric
+component. The path component is either a pathname or an abstract
+socket similar to a unix socket. The numeric component is used to
+uniquely identify each connection to the bus. Thus the path identifies
+a specific bus and the numeric component the attachment to that bus.
+
+The process that calls bind(2) on the socket is the owner of the bus
+and is called the bus master. The master is a special client of the
+bus and has some responsibility for the bus' operation. The master is
+assigned a fixed address with all the bits zero (0x0000000000000000).
+
+Each process connected to an AF_BUS socket has one or more addresses
+within that bus. These addresses are 64-bit unsigned integers,
+interpreted by splitting the address into two parts: the most
+significant 16 bits are a prefix identifying the type of address, and
+the remaining 48 bits are the actual client address within that
+prefix, as shown in this figure:
+
+Bit:  0             15 16                                            63
+     +----------------+------------------------------------------------+
+     |  Type prefix   |                Client address                  |
+     +----------------+------------------------------------------------+
+
+The prefix with all bits zero is reserved for use by the kernel, which
+automatically assigns one address from this prefix to each client on
+connection.  The address in this prefix with all bits zero is always
+assigned to the bus master. Addresses on the prefix 0x0000 are unique
+and will never repeat for the lifetime of the bus master.
+
+A client may have multiple addresses. When data is sent to other
+clients, those clients will always see the sender address that is in
+the prefix 0x0000 address space when calling recvmsg(2) or
+recvfrom(2). Similarly, the prefix 0x0000 address is returned by calls
+to getsockname(2) and getpeername(2).
+
+For each prefix, the address where the least significant 48 bits are
+all 1 (i.e., 0xffffffffffff) is also reserved, and can be used to send
+multicast messages to all the peers on a prefix.
+
+The non-reserved addresses in each of the remaining prefixes are
+managed by the bus master, which may assign additional addresses to
+any other connected socket.
+
+Having different name-spaces has two advantages:
+
+  - Clients can have addresses on different mutually-exclusive
+    scopes. This permits sending multicast packets to only clients
+    that have addresses on a given prefix.
+
+  - The addressing scheme can be more flexible. The kernel will only
+    assign unique addresses on the all-bits-zero prefix (0x0000) and
+    allows the bus master process to assign additional addresses to
+    clients on other prefixes.  By having different prefixes, the
+    kernel and bus master assignments will not collide.
+
+AF_BUS transport can support two network topologies. When a process
+first connects to the bus master, it can only communicate with the bus
+master. The process can't send and receive packets from other peers on
+the bus. So, from the client process point of view the network
+topology is point-to-point.
+
+The bus master can allow the connected peer to be part of the bus and
+start to communicate with other peers by setting a socket option with
+the setsockopt(2) system call using the accepted socket descriptor. At
+this point, the topology becomes a bus to the client process.
+
+Packets whose destination address is not assigned to any client are
+routed by default to the bus master (the client accepted socket
+descriptor).
+
+
+Semantics
+---------
+
+Bus features:
+
+ - Unicast and multicast addressing scheme.
+ - Ability to assign addresses from user-space with different prefixes.
+ - Automatic address assignment.
+ - Ordered packets delivery (FIFO, total ordering).
+ - File descriptor and credentials passing.
+ - Support for both point-to-point and bus network topologies.
+ - Bus control access managed from user-space.
+ - Netfilter hooks for packet sending, routing and receiving.
+
+A process (the "bus master") can create an AF_BUS bus with socket(2)
+and use bind(2) to assign an address to the bus. Then it can listen(2)
+on the created socket to start accepting incoming connections with
+accept(2).
+
+Processes can connect to the bus by creating a socket with socket(2)
+and using connect(2). The kernel will assign a unique address to each
+connection and messages can be sent and received by using BSD socket
+primitives.
+
+This uses the connect(2) semantic in a non-traditional way, with
+AF_BUS sockets, it's not possible to connect "my" socket to a specific
+peer socket whereas the traditional BSD sockets API usage, connect(2)
+either connects to stream sockets, or assigns a peer address to a
+datagram socket (so that send(2) can be used instead of sendto()).
+
+An AF_BUS socket address is represented as a combination of a bus
+address and a bus path name. Address are unique within a path. The
+unique bus address is further subdivided into a prefix and a client
+address. Thus the path identifies a specific bus and the numeric
+component the attachment to that bus.
+
+#define BUS_PATH_MAX    108
+
+/* Bus address */
+struct bus_addr {
+	uint64_t    s_addr; 	/* 16-bit prefix + 48-bit client address */
+};
+
+/* Structure describing an AF_BUS socket address. */
+struct sockaddr_bus {
+	sa_family_t     sbus_family; 	   	  /* AF_BUS */
+	struct bus_addr sbus_addr;                /* bus address */
+	char 		sbus_path[BUS_PATH_MAX];  /* pathname */
+};
+
+A process becomes a bus master for a given struct sockaddr_bus by
+calling bind(2) on an AF_BUS addresses. The argument must be { AF_BUS,
+0, path }. 
+
+AF_BUS supports both abstract and non-abstract path names. Abstract
+names are distinguished by the fact that sbus_path[0] == '\0' and they
+don't represent file system paths while non-abstract paths are bound
+to a file system path name. (See the unix(7) man page for a discussion
+of abstract socket addresses in the AF_UNIX address family.)
+
+Then the process calls listen(2) to accept incoming connections. If
+that process calls getsockname(2), the returned address will be {
+AF_BUS, 0, path }.
+
+The conventional string form of the full address is path + ":" +
+prefix + "/" + client address. Prefix and client address are
+represented in hex.
+
+For example the address:
+
+struct sockaddr_bus addr;
+addr.sbus_family = AF_BUS;
+strcpy(addr.sbus_path, "/tmp/test");
+addr.sbus_addr.s_addr   = 0x0002f00ddeadbeef;
+
+would be represented using the string /tmp/test:0002/f00ddeadbeef.
+
+If the bus_addr is 0, then both the prefix and client address may be
+omitted from the string form.  To connect to a bus as a client it is
+sufficient to specify the path, since the listening address always has
+bus_addr == 0. it is not meanigful to specify 'bus_addr' as other than
+0 on connect()
+
+The AF_BUS implementation will automatically assign a unique address
+to each client but the bus master can assign additional addresses on a
+different prefix by means of the setsockopt(2) system call. For
+example:
+
+struct bus_addr addr;
+addr.s_addr = 0x0001deadfee1dead;
+ret = setsockopt(afd, SOL_BUS, BUS_ADD_ADDR, &addr, sizeof(addr));
+
+where afd is the accepted socket descriptor in the daemon. To show graphically:
+
+	  L          The AF_BUS listening socket  }
+       /  |  \                                    }-- listener process
+     A1  A2  A3      The AF_BUS accepted sockets  }
+      |   |   |
+     C1  C2  C3      The AF_BUS connected sockets }-- client processes
+
+So if setsockopt(A1, SOL_BUS, BUS_ADD_ADDR, &addr, sizeof(addr)) is
+called, C1 will get the new address.
+
+The inverse operation is BUS_DEL_ADDR, which the bus master can use to
+remove a client socket AF_BUS address:
+
+ret = setsockopt(afd, SOL_BUS, BUS_DEL_ADDR, &addr, sizeof(addr));
+
+Besides assigning additional addresses, the bus master has to allow a
+client process to communicate with other peers on the bus using a
+setsockopt(2):
+
+ret = setsockopt(afd, SOL_BUS, BUS_JOIN_BUS, NULL, 0);
+
+Clients are not meant to send messages to each other until the master
+tells them (in a protocol-specific way) that the BUS_JOIN_BUS
+setsockopt(2) call was made.
+
+If a client sends a message to a destination other than the bus
+master's all-zero address before joining the bus, a EHOSTUNREACH (No
+route to host) error is returned since the only host that exists in
+the point-to-point network before the client joins the bus are the
+client and the bus master.  
+
+A EHOSTUNREACH is returned if a client that joined a bus tries to send
+a packet to a client from another bus. Cross-bus communication is not
+permited.
+
+When a process wants to send a unicast message to a peer, it fills a
+sockaddr structure and performs a socket operation (i.e., sendto(2))
+
+struct sockaddr_bus addr;
+char *msg = "Hello world";
+
+addr.sbus_family 	   = AF_BUS;
+strcpy(addr.sbus_path, "/tmp/test");
+addr.sbus_addr.s_addr   = 0x0001f00ddeadbeef;
+
+ret = sendto(sockfd, "Hello world", strlen("Hello world"), 0,
+	    (struct sockaddr*)&addr, sizeof(addr));
+
+The current implementation requires that the addr.sbus_path component
+match the one used to conenct() to the bus but in future this
+requirement will be removed.
+
+The kernel will first check that the socket is connected and that the
+bus path of the socket correspond with the destination, then it will
+extract the prefix and client address from the bus address using a
+fixed 16 -bit bitmask.
+
+prefix 		= bus address >> 48 & 0xffff
+client address 	= bus address & 0xffff
+
+If the client address is not all bits one, then the message is unicast
+and is delivered to the socket with that assigned address
+(0x0001f00ddeadbeef).  Otherwise the message is multicast and is
+delivered to all the peers with this address prefix (0x0001 in this
+case).
+
+So, when a process wants to send a multicast message, it just has to
+fill the address structure with the address prefix + 0xffffffffffff:
+
+struct sockaddr_bus addr;
+char *msg = "Hello world";
+
+addr.bus_family = AF_BUS;
+strcpy(addr.sbus_path, "/tmp/test");
+addr.bus_addr   = 0x0001ffffffffffff;
+
+ret = sendto(sockfd, "Hello world", strlen("Hello world"), 0,
+	    (struct sockaddr*)&addr, sizeof(addr));
+
+The kernel, will apply the binary and operation, learn that the
+address is 0xffffffffffff and send the message to all the peers on
+this prefix (0x0001).
+
+Socket transmit queued bytes are limited by a maximum send buffer size
+(sysctl_wmem_max) defined in the kernel and can be modified at runtime
+using the sysctl interface on /proc/sys/net/core/wmem_default. This
+parameter is global for all the sockets families in a Linux system.
+
+AF_BUS permits the definition of a per-bus maximum send buffer size
+using the BUS_SET_SENDBUF socket option. The bus master can call the
+setsockopt(2) system call using as a parameter the listening socket.
+The command sets a maximum write buffer that will be imposed on each
+new socket that connects to the bus:
+
+ret = setsockopt(serverfd, SOL_BUS, BUS_SET_SENDBUF, &sndbuf,
+sizeof(int));
+
+In the transmission path both Berkeley Packet Filters and Netfilter
+hooks are available, so they can be used to filter sending packets.
+
+
+Using this addressing scheme with D-Bus
+---------------------------------------
+
+As an example of a use case for AF_BUS, let's analyze how the D-Bus
+IPC system can be implemented on top of it.
+
+We define a new D-Bus address type "afbus".
+
+A D-Bus client may connect to an address of the form "afbus:path=X"
+where X is a string. This means that it connect()s to { AF_BUS, 0, X }.
+
+For example: afbus:path=/tmp/test connects to { AF_BUS, 0, /tmp/test }.
+
+A D-Bus daemon may listen on the address "afbus:", which means that it
+binds to { AF_BUS, 0, /tmp/test }. It will advertise an address of the
+form "afbus:path=/tmp/test" to clients, for instance via the
+--print-address option, or via dbus-launch setting the
+DBUS_SESSION_BUS_ADDRESS environment variable.  For instance, "afbus:"
+is an appropriate default listening address for the session bus,
+resulting in dbus-launch setting the DBUS_SESSION_BUS_ADDRESS
+environment variable to something like
+"afbus:path=/tmp/test,guid=...".
+
+A D-Bus daemon may listen on the address "afbus:file=/some/file",
+which means that it will do as above, then write its path into the
+given well-known file.  For instance,
+"afbus:file=/run/dbus/system_bus.afbus" is an appropriate listening
+address for the system bus. Only processes with suitable privileges to
+write to that file can impersonate the system bus.
+
+D-Bus clients wishing to connect to the well-known system bus should
+attempt to connect to afbus:file=/run/dbus/system_bus.afbus, falling
+back to unix:path=/var/run/dbus/system_bus_socket if that fails. On
+Linux systems, the well-known system bus daemon should attempt to
+listen on both of those addresses.
+
+The D-Bus daemon will serve as bus master as well since it will be the
+process that creates and listens on the AF_BUS socket.
+
+D-Bus clients will use the fixed bus master address (all zero bits) to
+send messages to the D-Bus daemon and the client's unique address to
+send messages to other D-Bus clients using the bus.
+
+When initially connected, D-Bus clients will only be able to
+communicate with the D-Bus daemon and will send authentication
+information (AUTH message and SCM_CREDENTIALS ancillary
+messages). Since the D-Bus daemon is also the bus master, it can allow
+D-Bus clients to join the bus and be able to send and receive D-Bus
+messages from other peers.
+
+On connection, the kernel will assign to each client an address in the
+prefix 0x0000. If a client attempts to send messages to clients other
+than the bus master, this is considered to be an error, and is
+prevented by the kernel.
+
+When the D-Bus daemon has authenticated a client and determined that
+it is authorized to be on this bus, it uses a setsockopt(2) call to
+tell the kernel that this client has permission to send messages. The
+D-Bus daemon then tells the client by sending the Hello() reply that
+it has made the setsockopt(2) call and that now is able to send
+messages to other peers on the bus.
+
+Well-known names are represented by addresses in the 0x0001, ... prefixes.
+
+Addresses in prefix 0x0000 must be mapped to D-Bus unique names in a
+way that can't collide with unique names allocated by the dbus-daemon
+for legacy clients.
+
+In order to be consistent with current D-Bus unique naming, the AF_BUS
+addresses can be mapped directly to D-Bus unique names, for example
+(0000/0000deadbeef to ":0.deadbeef"). Leading zeroes can be suppressed
+since the common case should be relatively small numbers (the kernel
+allocates client addresses sequentially, and machines could be
+rebooted occasionally).
+
+By having both AF_BUS and legacy D-Bus clients use the same address
+space, the D-Bus daemon can act as a proxy between clients and can be
+sure that D-Bus unique names will be unique for both AF_BUS and legacy
+clients.
+
+To act as a proxy between AF_BUS and legacy clients, each time the
+D-Bus daemon accepts a legacy connection (i.e., AF_UNIX), it will
+create an AF_BUS socket and establish a connection with itself. It
+will then associate this newly created connection with the legacy one.
+
+To explain it graphically:
+
+	  L          The AF_BUS listening socket  }
+       /  |  \                                    }-- listener process
+     A1  A2  A3      The AF_BUS accepted sockets  }
+      |   |   |
+     C1  C2  C3      The AF_BUS connected sockets, where:
+      |                    * C1 belongs to the listener process
+      |                    * C2 and C3 belongs to the client processes
+      |
+ L2--A4       The AF_UNIX listening and accepted sockets \
+      |                            in the listener process
+     C4       The AF_UNIX connected socket in the legacy client process
+
+
+where C2 and C3 are normal AF_BUS clients and C4 is a legacy
+client. The D-Bus daemon after accepting the connection using the
+legacy transport (A4), will create an AF_BUS socket pair (C1, A1)
+associated with the legacy client.
+
+Legacy clients will send messages to the D-Bus daemon using their
+legacy socket and the D-Bus daemon will extract the destination
+address, resolve to the corresponding AF_BUS address and use this to
+send the message to the right peer.  
+
+Conversely, when an AF_BUS client sends a D-Bus message to a legacy
+client, it will use the AF_BUS address of the connection associated
+with that client. The D-Bus daemon will receive the message, modify
+the message's content to set SENDER headers based on the AF_BUS source
+address and use the legacy transport to send the D-Bus message to the
+legacy client.
+
+As a special case, the bus daemon's all-zeroes address maps to
+"org.freedesktop.DBus" and vice versa.
+
+When a D-Bus client receives an AF_BUS message from the bus master
+(0/0), it must use the SENDER header field in the D-Bus message, as
+for any other D-Bus transport, to determine whether the message is
+actually from the D-Bus daemon (the SENDER is "org.freedesktop.DBus"
+or missing), or from another client (the SENDER starts with ":"). It
+is valid for messages from another AF_BUS client to be received via
+the D-Bus daemon; if they are, the SENDER header field will always be
+set.
+
+Besides its unique name, D-Bus services can have well-known names such
+as org.gnome.Keyring or org.freedesktop.Telepathy. These well-known
+names can also be used as a D-Bus message destination
+address. Well-known names are not numeric and AF_BUS is not able to
+parse D-Bus messages.
+
+To solve this, the D-Bus daemon will assign an additional AF_BUS
+address to each D-Bus client that owns a well-known name. The mapping
+between well-known names and AF_BUS address is maintained by the D-Bus
+daemon on a persistent data structure.
+
+D-Bus client libraries will maintain a cache of these mappings so they
+can send messages to services with well-known names using their mapped
+AF_BUS address.
+
+If a client intending to send a D-Bus message to a given well-known
+name does not have that well-known name in its cache, it must send the
+AF_BUS message to the listener (0000/000000000000) instead. 
+
+The listener must forward the D-Bus message to the owner of that
+well-known name, setting the SENDER header field if necessary. It may
+also send this AF_BUS-specific D-Bus signal to the sender, so that the
+sender can update its cache:
+
+     org.freedesktop.DBus.AF_BUS.Forwarded (STRING well_known_name,
+	 UINT64 af_bus_client)
+
+	 Emitted by the D-Bus daemon with sender "org.freedesktop.DBus"
+	 and object path "/org/freedesktop/DBus" to indicate that
+	 the well-known name well_known_name is represented by the
+	 AF_BUS address { AF_BUS, af_bus_client, path } where
+	 path is the path name used by this bus.
+
+	 For instance, if the well-known name "org.gnome.Keyring"
+	 is represented by AF_BUS address 0001/0000deadbeef,
+	 the signal would have arguments ("org.gnome.Keyring",
+	 0x00010000deadbeef), corresponding to the AF_BUS
+	 address { AF_BUS, 0x00010000deadbeef, /tmp/test }.
+
+If the D-Bus service for that well-known name is not active, then the
+D-Bus daemon will first do the service activation, assign an
+additional address to the recently activated service, store the
+well-known service to numeric address mapping on its persistent cache,
+and then send the AF_BUS.Forwarded signal back to the client.
+
+Once the mapping has been made, the AF_BUS address associated with a
+well-known name cannot be reused for the lifetime of the D-Bus daemon
+(which is the same as the lifetime of the socket). 
+
+Nevertheless the AF_BUS address associated with a well-known name can
+change, for example if a service goes away and a new instance gets
+activated. This new instance can have a different AF_BUS address.  The
+D-Bus daemon will maintain a list of the mappings that are currently
+valid so it can send the AF_BUS.
+
+Forwarded signal with the mapping information to the clients. Client
+libraries will maintain a fixed-size Last Recently Used (LRU) cache
+with previous mappings sent by the D-Bus daemon.
+
+If the clients overwrite a mapping due to the LRU replace policy and
+later want to send a D-Bus message to the overwritten well-known name,
+they will send the D-Bus message back to the D-Bus daemon and this
+will send the signal with the mapping information. 
+
+If a service goes away or if the service AF_BUS address changed and
+the client still has the old AF_BUS address in its cache, it will send
+the D-Bus message to the old destination. 
+
+Since packets whose destination AF_BUS addresses are not assigned to
+any process are routed by default to the bus master, the D-Bus daemon
+will receive these D-bus messages and send an AF_BUS.
+
+Forwarded signal back to the client with the new AF_BUS address so it
+can update its cache with the new mapping.
+
+For well-known names, the D-Bus daemon will use a different address
+prefix (0x0001) so it doesn't conflict with the D-Bus unique names
+address prefix (0x0000).
+
+Besides D-Bus method call messages which are unicast, D-Bus allows
+clients to send multicast messages (D-Bus signals). Clients can send
+signals messages using the bus unique name prefix multicast address
+(0x0001ffffffffffff).
+
+A netfilter hook is used to filter these multicast messages and only
+deliver to the correct peers based on match rules.
+
+
+D-Bus aware netfilter module
+----------------------------
+
+AF_BUS is designed to be a generic bus transport supporting both
+unicast and multicast communications.
+
+In order for D-Bus to operate efficiently, the transport method has to
+know the D-Bus message wire-protocol and D-Bus message structure. But
+adding this D-Bus specific knowledge to AF_BUS will break one of the
+fundamental design principles of any network protocol stack, namely
+layer-independence: layer n must not make any assumptions about the
+payload in layer n + 1.
+
+So, in order to have a clean protocol design but be able to allow the
+transport to analyze the D-Bus messages, netfilter hooks are used to
+do the filtering based on match rules.
+
+The kernel module has to maintain the match rules and the D-Bus daemon
+is responsible for managing this information. Every time an add match
+rule message is processed by the D-Bus daemon, this will update the
+netfilter module match rules set so the netfilter hook function can
+use that information to do the match rules based filtering.
+
+The D-Bus daemon and the netfilter module will use the generic netlink
+subsystem to do the kernel-to-user-space communication. Netlink is
+already used by most of the networking subsystem in Linux
+(iptables/netfilter, ip/routing, etc).
+
+We enforce a security scheme so only the bus master's user ID can
+update the netfilter module match rules set.
+
+The advantage of using the netfilter subsystem is that we decouple the
+mechanism from the policy. AF_BUS will only add a set of hook points
+and external modules will be used to enforce a given policy.
-- 
1.7.10

^ permalink raw reply related

* [PATCH net-next 01/15] net: bus: Add AF_BUS socket address family
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller
  Cc: Javier Martinez Canillas, Vincent Sanders
In-Reply-To: <1340988354-26981-1-git-send-email-vincent.sanders@collabora.co.uk>

From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

This adds AF_BUS to the socket headers and net core.

AF_BUS is a message oriented inter process communication system
implemented as asocket address family. The principle features are:

 - Reliable datagram based communication (all sockets are of type
   SOCK_SEQPACKET)
 - Multicast message delivery (one to many, unicast as a subset)
 - Strict ordering (messages are delivered to every client in the same order)
 - Ability to pass file descriptors
 - Ability to pass credentials

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 include/linux/socket.h |    5 ++++-
 net/core/sock.c        |    6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322..d244e69 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -195,7 +195,8 @@ struct ucred {
 #define AF_CAIF		37	/* CAIF sockets			*/
 #define AF_ALG		38	/* Algorithm sockets		*/
 #define AF_NFC		39	/* NFC sockets			*/
-#define AF_MAX		40	/* For now.. */
+#define AF_BUS		40	/* BUS sockets			*/
+#define AF_MAX		41	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -238,6 +239,7 @@ struct ucred {
 #define PF_CAIF		AF_CAIF
 #define PF_ALG		AF_ALG
 #define PF_NFC		AF_NFC
+#define PF_BUS		AF_BUS
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -312,6 +314,7 @@ struct ucred {
 #define SOL_IUCV	277
 #define SOL_CAIF	278
 #define SOL_ALG		279
+#define SOL_BUS		280
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/net/core/sock.c b/net/core/sock.c
index 929bdcc..b9c5fc8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -208,7 +208,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
-  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
+  "sk_lock-AF_NFC"   , "sk_lock-AF_BUS"     , "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -224,7 +224,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
-  "slock-AF_NFC"   , "slock-AF_MAX"
+  "slock-AF_NFC"   , "slock-AF_BUS"     , "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -240,7 +240,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
-  "clock-AF_NFC"   , "clock-AF_MAX"
+  "clock-AF_NFC"   , "clock-AF_BUS"     , "clock-AF_MAX"
 };
 
 /*
-- 
1.7.10

^ permalink raw reply related

* AF_BUS socket address family
From: Vincent Sanders @ 2012-06-29 16:45 UTC (permalink / raw)
  To: netdev, linux-kernel, David S. Miller


This series adds the bus address family (AF_BUS) it is against
net-next as of yesterday.
 
AF_BUS is a message oriented inter process communication system. 

The principle features are:

 - Reliable datagram based communication (all sockets are of type
   SOCK_SEQPACKET)

 - Multicast message delivery (one to many, unicast as a subset)

 - Strict ordering (messages are delivered to every client in the same order)

 - Ability to pass file descriptors

 - Ability to pass credentials

The basic concept is to provide a virtual bus on which multiple
processes can communicate and policy is imposed by a "bus master".

Introduction
------------

AF_BUS is based upon AF_UNIX but extended for multicast operation and
removes stream operation, responding to extensive feedback on previous
approaches we have made the implementation as isolated as
possible. There are opportunities in the future to integrate the
socket garbage collector with that of the unix socket implementation.

The impetus for creating this IPC mechanism is to replace the
underlying transport for D-Bus. The D-Bus system currently emulates this
IPC mechanism using AF_UNIX sockets in userspace and has numerous
undesirable behaviours. D-Bus is now widely deployed in many areas and
has become a de-facto IPC standard. Using this IPC mechanism as a
transport gives a significant (100% or more) improvement to throughput
with comparable improvement to latency.

This work was undertaken by Collabora for the GENIVI Alliance and we
are committed to responding to feedback promptly and intend to continue
to support this feature into the future.

Operation
---------

A bus is created by processes connecting on an AF_BUS socket. The
"bus master" binds itself instead of connecting to the NULL address.

The socket address is made up of a path component and a numeric
component. The path component is either a pathname or an abstract
socket similar to a unix socket. The numeric component is used to
uniquely identify each connection to the bus. Thus the path identifies
a specific bus and the numeric component the attachment to that bus.

The numeric component of the address is divided into two fixed parts a
prefix to identify multicast groups and a suffix which identifies the
attachment. The kernel allocates a single address in prefix 0 to each
socket upon connection.

Connections are initially limited to communicating with address the
bus master (address 0) . The bus master is responsible for making all
policy decisions around manipulating other attachments including
building multicast groups. 

It is expected that connecting clients use protocol specific messages
to communicate with the bus master to negotiate differing
configurations although a bus master might implement a fixed
behaviour.

AF_BUS itself is protocol agnostic and implements the configured
policy between attachments which allows for a bus master to leave a
bus and communication between clients to continue.

Some test code has been written [1] which demonstrates the usage of
AF_BUS.

Use with BUS_PROTO_DBUS
-----------------------

The initial aim of AF_BUS is to provide a IPC mechanism suitable for
use to provide the underlying transport for D-Bus. 

A socket created using BUS_PROTO_DBUS indicates that the messages
passed will be in the D-Bus format. The userspace libraries have been
updated to use this transport with an updated D-Bus daemon [2] as a bus
master.

The D-Bus protocol allows for multicast groups to be filtered depending
on message contents. These filters are configured by the bus master
but need to be enforced on message delivery. 

We have simply used the standard kernel netfilter mechanism to achieve
this. This is used to filter delivery to clients that may be part of a
multicast group where they are not receiving all messages according to
policy. If a client wishes to further filter its input provision has
been made to allow them to use BPF.

The kernel based IPC has several benefits for D-Bus over the userspace
emulation:

 - Context switching between userspace processes is reduced.
 - Message data copying is reduced.
 - System call overheads are reduced.
 - The userspace D-Bus daemon was subject to resource starvation,
   client contention and priority inversion.
 - Latency is reduced
 - Throughput is increased.

The tools for testing these assertions are available [3] and
consistently show a doubling in throughput and better than halving of
latency.

[1] http://cgit.collabora.com/git/user/javier/check-unix-multicast.git/log/?h=af-bus
[2] http://cgit.collabora.com/git/user/rodrigo/dbus.git/

[3] git://github.com/kanchev/dbus-ping.git
    https://github.com/kanchev/dbus-ping/blob/master/dbus-genivi-benchmarking.sh

^ permalink raw reply

* [PATCH v3] sctp: be more restrictive in transport selection on bundled sacks
From: Neil Horman @ 2012-06-29 16:34 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Vlad Yaseivch, David S. Miller
In-Reply-To: <1340742704-2192-1-git-send-email-nhorman@tuxdriver.com>

It was noticed recently that when we send data on a transport, its possible that
we might bundle a sack that arrived on a different transport.  While this isn't
a major problem, it does go against the SHOULD requirement in section 6.4 of RFC
2960:

 An endpoint SHOULD transmit reply chunks (e.g., SACK, HEARTBEAT ACK,
   etc.) to the same destination transport address from which it
   received the DATA or control chunk to which it is replying.  This
   rule should also be followed if the endpoint is bundling DATA chunks
   together with the reply chunk.

This patch seeks to correct that.  It restricts the bundling of sack operations
to only those transports which have moved the ctsn of the association forward
since the last sack.  By doing this we guarantee that we only bundle outbound
saks on a transport that has received a chunk since the last sack.  This brings
us into stricter compliance with the RFC.

Vlad had initially suggested that we strictly allow only sack bundling on the
transport that last moved the ctsn forward.  While this makes sense, I was
concerned that doing so prevented us from bundling in the case where we had
received chunks that moved the ctsn on multiple transports.  In those cases, the
RFC allows us to select any of the transports having received chunks to bundle
the sack on.  so I've modified the approach to allow for that, by adding a state
variable to each transport that tracks weather it has moved the ctsn since the
last sack.  This I think keeps our behavior (and performance), close enough to
our current profile that I think we can do this without a sysctl knob to
enable/disable it.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yaseivch <vyasevich@gmail.com>
CC: David S. Miller <davem@davemloft.net>
Reported-by: Michele Baldessari <michele@redhat.com>
Reported-by: sorin serban <sserban@redhat.com>

---
Change Notes:
V2)
	* Removed unused variable as per Dave M. Request
	* Delayed rwnd adjustment until we are sure we will sack (Vlad Y.)
V3)
	* Switched test to use pkt->transport rather than chunk->transport
	* Modified detection of sacka-able transport.  Instead of just setting
	  and clearning a flag, we now mark each transport and association with
	  a sack generation tag.  We increment the associations generation on
	  every sack, and assign that generation tag to every transport that
	  updates the ctsn.  This prevents us from having to iterate over a for
	  loop on every sack, which is much more scalable.
---
 include/net/sctp/structs.h |    4 ++++
 include/net/sctp/tsnmap.h  |    3 ++-
 net/sctp/associola.c       |    1 +
 net/sctp/output.c          |    9 +++++++--
 net/sctp/sm_make_chunk.c   |   10 ++++++++++
 net/sctp/sm_sideeffect.c   |    2 +-
 net/sctp/transport.c       |    2 ++
 net/sctp/tsnmap.c          |    6 +++++-
 net/sctp/ulpevent.c        |    3 ++-
 net/sctp/ulpqueue.c        |    2 +-
 10 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..fecdf31 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -912,6 +912,9 @@ struct sctp_transport {
 		/* Is this structure kfree()able? */
 		malloced:1;
 
+	/* Has this transport moved the ctsn since we last sacked */
+	__u32 sack_generation;
+
 	struct flowi fl;
 
 	/* This is the peer's IP address and port. */
@@ -1584,6 +1587,7 @@ struct sctp_association {
 		 */
 		__u8    sack_needed;     /* Do we need to sack the peer? */
 		__u32	sack_cnt;
+		__u32	sack_generation;
 
 		/* These are capabilities which our peer advertised.  */
 		__u8	ecn_capable:1,	    /* Can peer do ECN? */
diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h
index e7728bc..2c5d2b4 100644
--- a/include/net/sctp/tsnmap.h
+++ b/include/net/sctp/tsnmap.h
@@ -117,7 +117,8 @@ void sctp_tsnmap_free(struct sctp_tsnmap *map);
 int sctp_tsnmap_check(const struct sctp_tsnmap *, __u32 tsn);
 
 /* Mark this TSN as seen.  */
-int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn);
+int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn,
+		     struct sctp_transport *trans);
 
 /* Mark this TSN and all lower as seen. */
 void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..6c66adb 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -271,6 +271,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 */
 	asoc->peer.sack_needed = 1;
 	asoc->peer.sack_cnt = 0;
+	asoc->peer.sack_generation=0;
 
 	/* Assume that the peer will tell us if he recognizes ASCONF
 	 * as part of INIT exchange.
diff --git a/net/sctp/output.c b/net/sctp/output.c
index f1b7d4b..0de6cd5 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -240,14 +240,19 @@ static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt,
 	 */
 	if (sctp_chunk_is_data(chunk) && !pkt->has_sack &&
 	    !pkt->has_cookie_echo) {
-		struct sctp_association *asoc;
 		struct timer_list *timer;
-		asoc = pkt->transport->asoc;
+		struct sctp_association *asoc = pkt->transport->asoc;
+
 		timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
 
 		/* If the SACK timer is running, we have a pending SACK */
 		if (timer_pending(timer)) {
 			struct sctp_chunk *sack;
+
+			if (pkt->transport->sack_generation !=
+			    pkt->transport->asoc->peer.sack_generation)
+				return retval;
+
 			asoc->a_rwnd = asoc->rwnd;
 			sack = sctp_make_sack(asoc);
 			if (sack) {
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index a85eeeb..ffa2a8e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -736,6 +736,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 	__u16 num_gabs, num_dup_tsns;
 	struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
 	struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
+	struct sctp_transport *trans;
 
 	memset(gabs, 0, sizeof(gabs));
 	ctsn = sctp_tsnmap_get_ctsn(map);
@@ -805,6 +806,15 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 		sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
 				 sctp_tsnmap_get_dups(map));
 
+	/*
+	 * Once we have a sack generated, clear the moved_tsn information
+	 * from all the transports
+	 */
+	if (!asoc->peer.sack_generation)
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports)
+			trans->sack_generation = UINT_MAX;
+	((struct sctp_association *)asoc)->peer.sack_generation++;
 nodata:
 	return retval;
 }
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..8716da1 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1268,7 +1268,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_REPORT_TSN:
 			/* Record the arrival of a TSN.  */
 			error = sctp_tsnmap_mark(&asoc->peer.tsn_map,
-						 cmd->obj.u32);
+						 cmd->obj.u32, NULL);
 			break;
 
 		case SCTP_CMD_REPORT_FWDTSN:
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..1dcceb6 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -68,6 +68,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 	peer->af_specific = sctp_get_af_specific(addr->sa.sa_family);
 	memset(&peer->saddr, 0, sizeof(union sctp_addr));
 
+	peer->sack_generation = 0;
+
 	/* From 6.3.1 RTO Calculation:
 	 *
 	 * C1) Until an RTT measurement has been made for a packet sent to the
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index f1e40ceb..b5fb7c4 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -114,7 +114,8 @@ int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn)
 
 
 /* Mark this TSN as seen.  */
-int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn)
+int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn,
+		     struct sctp_transport *trans)
 {
 	u16 gap;
 
@@ -133,6 +134,9 @@ int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn)
 		 */
 		map->max_tsn_seen++;
 		map->cumulative_tsn_ack_point++;
+		if (trans)
+			trans->sack_generation =
+				trans->asoc->peer.sack_generation;
 		map->base_tsn++;
 	} else {
 		/* Either we already have a gap, or about to record a gap, so
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 8a84017..33d8947 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -715,7 +715,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	 * can mark it as received so the tsn_map is updated correctly.
 	 */
 	if (sctp_tsnmap_mark(&asoc->peer.tsn_map,
-			     ntohl(chunk->subh.data_hdr->tsn)))
+			     ntohl(chunk->subh.data_hdr->tsn),
+			     chunk->transport))
 		goto fail_mark;
 
 	/* First calculate the padding, so we don't inadvertently
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index f2d1de7..f5a6a4f 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -1051,7 +1051,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 	if (chunk && (freed >= needed)) {
 		__u32 tsn;
 		tsn = ntohl(chunk->subh.data_hdr->tsn);
-		sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn);
+		sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn, chunk->transport);
 		sctp_ulpq_tail_data(ulpq, chunk, gfp);
 
 		sctp_ulpq_partial_delivery(ulpq, chunk, gfp);
-- 
1.7.7.6

^ permalink raw reply related

* Re: BUG: NULL pointer in ctnetlink_conntrack_event
From: Pablo Neira Ayuso @ 2012-06-29 16:29 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: netdev, netfilter-devel
In-Reply-To: <mdaatbj.9730f14767bb2419ed11e14ac50382b8@obelix.schillstrom.com>

[-- Attachment #1: Type: text/plain, Size: 6403 bytes --]

On Fri, Jun 29, 2012 at 02:29:37PM +0200, Hans Schillstrom wrote:
> Hello,
> 
> There is a "hard to find" problem in ctnetlink_conntrack_event() when calling
> netlink_has_listeners() net->nfnl is NULL.
> 
> The rcu stuff seems to be right at a first look but who knows...
> 
> The line below fix the problem, but that is not the root cause.
> 
>  int nfnetlink_has_listeners(struct net *net, unsigned int group)
>  {
> -       return netlink_has_listeners(net->nfnl, group);
> +       return net->nfnl ? netlink_has_listeners(net->nfnl, group) : 0 ;
>  }
> 
> Yes it is a 3.0.26 kernel but this patch is applied
> netfilter: nf_conntrack: make event callback registration per-netns

I think this patch above is missing some rcu_access_pointer usage.

Please, see patch attached.

> It happens when adding a number of containers with does a "nfct_query(h, NFCT_Q_CREATE, ct);"
> and most likely one namespace shuts down.
> 
> Any idea why the timer is running at this point ?
> 
> 
> BUG: unable to handle kernel NULL pointer dereference at 000000000000027c
> IP: [<ffffffff813615db>] netlink_has_listeners+0xb/0x60
> PGD 0
> Oops: 0000 [#3] PREEMPT SMP
> CPU 0
> Modules linked in: ip6table_raw(N) xt_NOTRACK(N) iptable_raw(N) ipt_REJECT(N) xt_sctp(N) xt_multiport(N) xt_connmark(N) xt_mark(N) xt_conntrack(N) ip6table_mangle(N) ip_vs(N) nf_conntrack_netlink(N) nfnetlink(N) ip6_tunnel(N) tunnel6(N) macvlan(N) xt_HMARK(N) ipv6_find_hdr(N) iptable_mangle(N) nf_conntrack_ipv6(N) nf_defrag_ipv6(N) ip6t_LOG(N) ip6table_filter(N) ip6_tables(N) nf_conntrack_ipv4(N) nf_defrag_ipv4(N) xt_state(N) xt_tcpudp(N) xt_u32(N) xt_comment(N) xt_length(N) xt_hashlimit(N) ipt_LOG(N) xt_limit(N) iptable_filter(N) ip_tables(N) x_tables(N) nf_conntrack_ftp(N) nf_conntrack_tftp(N) nf_conntrack(N) mptsas(N) mptscsih(N) mptbase(N) sg(N) scsi_transport_sas(N) i2c_i801(N) i2c_core(N) button(N) pcspkr(N) ahci(N) libahci(N) processor(N) serio_raw(N) thermal_sys(N) hwmon(N) iTCO_wdt(N) iTCO_vendor_support(N) libata(N) ioatdma(N) ixgbe(N) mdio(N) nfs(N) lockd(N) fscache(N) auth_rpcgss(N) nfs_acl(N) sunrpc(N) af_packet(N) ipv6(N) ipv6_lib(N) bonding(N) e1000e(N) igb(N) dca(N) mii(N) 8021q(N) garp(N) st
> p(N) llc(N) softdog(N) xfs(N) exportfs(N) sd_mod(N) crc_t10dif(N) usb_storage(N) scsi_mod(N) ehci_hcd(N) uhci_hcd(N) usbcore(N) usb_common(N)
> Supported: Yes
> 
> Pid: 0, comm: swapper Tainted: G      D    N  3.0.26-0.2-default
> RIP: 0010:[<ffffffff813615db>]  [<ffffffff813615db>] netlink_has_listeners+0xb/0x60
> RSP: 0018:ffff88063f203da0  EFLAGS: 00010286
> RAX: ffff88063f203e30 RBX: 0000000000000000 RCX: ffffffffa04c60f0
> RDX: 0000000000000004 RSI: 0000000000000003 RDI: 0000000000000000
> RBP: 0000000000000003 R08: 0000000000000000 R09: ffff88063f2114a0
> R10: 0000000000000000 R11: ffffffff8101e760 R12: ffff8805e2a45788
> R13: 0000000000000000 R14: 0000000000000002 R15: 0000000000000004
> FS:  0000000000000000(0000) GS:ffff88063f200000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> CR2: 000000000000027c CR3: 0000000001a03000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a0b020)
> Stack:
>  0000000000000000 0000000000000000 ffff8805e2a45800 ffffffffa04c453e
>  ffff88063f203e30 0000000400000001 ffff8805e24e6c80 0000000300000000
>  0000000000000000 ffff880610044000 ffff880610044800 ffff8805e2a45788
> Call Trace:
>  [<ffffffffa04c453e>] ctnetlink_conntrack_event+0x51e/0x570 [nf_conntrack_netlink]
>  [<ffffffffa042a27b>] death_by_timeout+0x12b/0x190 [nf_conntrack]
>  [<ffffffff810608ec>] run_timer_softirq+0x14c/0x270
>  [<ffffffff81059d25>] __do_softirq+0xa5/0x180
>  [<ffffffff813ff43c>] call_softirq+0x1c/0x30
>  [<ffffffff810043f5>] do_softirq+0x65/0xa0
>  [<ffffffff81059b15>] irq_exit+0xc5/0x100
>  [<ffffffff8101f5a9>] smp_apic_timer_interrupt+0x69/0xa0
>  [<ffffffff813febf3>] apic_timer_interrupt+0x13/0x20
>  [<ffffffffa0230806>] acpi_idle_enter_bm+0x255/0x28f [processor]
>  [<ffffffff813179e2>] cpuidle_idle_call+0xd2/0x120
>  [<ffffffff810019f3>] cpu_idle+0x63/0xd0
>  [<ffffffff81bf0f65>] start_kernel+0x3e4/0x4bf
>  [<ffffffff81bf03c3>] x86_64_start_kernel+0x114/0x12f
> Code: ff 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 e9 cb c5 fc ff 66 66 2e 0f 1f 84 00 00 00 00 00 55 89 f5 53 48 89 fb 48 83 ec 08 <f6> 87 7c 02 00 00 01 74 41 e8 47 50 d5 ff 0f b6 83 21 01 00 00
> RIP  [<ffffffff813615db>] netlink_has_listeners+0xb/0x60
>  RSP <ffff88063f203da0>
> CR2: 000000000000027c
> ---[ end trace a057af0b3004c67a ]---
> Kernel panic - not syncing: Fatal exception in interrupt
> Pid: 0, comm: swapper Tainted: G      D    N  3.0.26-0.2-default #1
> Call Trace:
>  [<ffffffff81004672>] dump_trace+0x82/0x380
>  [<ffffffff813f4fa2>] dump_stack+0x69/0x6f
>  [<ffffffff813f5050>] panic+0xa8/0x20c
>  [<ffffffff813f9b21>] oops_end+0xe1/0xf0
>  [<ffffffff81030e50>] no_context+0x100/0x270
>  [<ffffffff81031135>] __bad_area_nosemaphore+0x175/0x220
>  [<ffffffff813fbb36>] do_page_fault+0x3a6/0x590
>  [<ffffffff813f8d15>] page_fault+0x25/0x30
>  [<ffffffff813615db>] netlink_has_listeners+0xb/0x60
>  [<ffffffffa04c453e>] ctnetlink_conntrack_event+0x51e/0x570 [nf_conntrack_netlink]
>  [<ffffffffa042a27b>] death_by_timeout+0x12b/0x190 [nf_conntrack]
>  [<ffffffff810608ec>] run_timer_softirq+0x14c/0x270
>  [<ffffffff81059d25>] __do_softirq+0xa5/0x180
>  [<ffffffff813ff43c>] call_softirq+0x1c/0x30
>  [<ffffffff810043f5>] do_softirq+0x65/0xa0
>  [<ffffffff81059b15>] irq_exit+0xc5/0x100
>  [<ffffffff8101f5a9>] smp_apic_timer_interrupt+0x69/0xa0
>  [<ffffffff813febf3>] apic_timer_interrupt+0x13/0x20
>  [<ffffffffa0230806>] acpi_idle_enter_bm+0x255/0x28f [processor]
>  [<ffffffff813179e2>] cpuidle_idle_call+0xd2/0x120
>  [<ffffffff810019f3>] cpu_idle+0x63/0xd0
>  [<ffffffff81bf0f65>] start_kernel+0x3e4/0x4bf
>  [<ffffffff81bf03c3>] x86_64_start_kernel+0x114/0x12f
> Rebooting in 1 seconds..
> --
> Regards 
> Hans Schillstrom
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: missing-rcu_access_pointer.patch --]
[-- Type: text/x-diff, Size: 543 bytes --]

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index a88fb69..e1ce104 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -78,7 +78,7 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
 	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *e;
 
-	if (net->ct.nf_conntrack_event_cb == NULL)
+	if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
 		return;
 
 	e = nf_ct_ecache_find(ct);

^ permalink raw reply related

* [PATCH 2/2] netlink: add nlk->netlink_bind hook for module auto-loading
From: pablo @ 2012-06-29 16:15 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340986522-3442-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch adds a hook in the binding path of netlink.

This is used by ctnetlink to allow module autoloading for the case
in which one user executes:

 conntrack -E

So far, this resulted in nfnetlink loaded, but not
nf_conntrack_netlink.

I have received in the past many complains on this behaviour.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netlink.h   |    1 +
 net/netfilter/nfnetlink.c |   29 +++++++++++++++++++++++++++++
 net/netlink/af_netlink.c  |   19 +++++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 6085e49..f74dd13 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -179,6 +179,7 @@ struct netlink_kernel_cfg {
 	unsigned int	groups;
 	void		(*input)(struct sk_buff *skb);
 	struct mutex	*cb_mutex;
+	void		(*bind)(int group);
 };
 
 extern struct sock *netlink_kernel_create(struct net *net, int unit,
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 700e461..5a2132b 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -39,6 +39,15 @@ static char __initdata nfversion[] = "0.30";
 static const struct nfnetlink_subsystem __rcu *subsys_table[NFNL_SUBSYS_COUNT];
 static DEFINE_MUTEX(nfnl_mutex);
 
+static const int nfnl_group2type[NFNLGRP_MAX+1] = {
+	[NFNLGRP_CONNTRACK_NEW]		= NFNL_SUBSYS_CTNETLINK,
+	[NFNLGRP_CONNTRACK_UPDATE]	= NFNL_SUBSYS_CTNETLINK,
+	[NFNLGRP_CONNTRACK_DESTROY]	= NFNL_SUBSYS_CTNETLINK,
+	[NFNLGRP_CONNTRACK_EXP_NEW]	= NFNL_SUBSYS_CTNETLINK_EXP,
+	[NFNLGRP_CONNTRACK_EXP_UPDATE]	= NFNL_SUBSYS_CTNETLINK_EXP,
+	[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+};
+
 void nfnl_lock(void)
 {
 	mutex_lock(&nfnl_mutex);
@@ -200,12 +209,32 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 	netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
 }
 
+#ifdef CONFIG_MODULES
+static void nfnetlink_bind(int group)
+{
+	const struct nfnetlink_subsystem *ss;
+	int type = nfnl_group2type[group];
+
+	rcu_read_lock();
+	ss = nfnetlink_get_subsys(type);
+	if (!ss) {
+		rcu_read_unlock();
+		request_module("nfnetlink-subsys-%d", type);
+		return;
+	}
+	rcu_read_unlock();
+}
+#endif
+
 static int __net_init nfnetlink_net_init(struct net *net)
 {
 	struct sock *nfnl;
 	struct netlink_kernel_cfg cfg = {
 		.groups	= NFNLGRP_MAX,
 		.input	= nfnetlink_rcv,
+#ifdef CONFIG_MODULES
+		.bind	= nfnetlink_bind,
+#endif
 	};
 
 	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 43a124f..5463969 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -80,6 +80,7 @@ struct netlink_sock {
 	struct mutex		*cb_mutex;
 	struct mutex		cb_def_mutex;
 	void			(*netlink_rcv)(struct sk_buff *skb);
+	void			(*netlink_bind)(int group);
 	struct module		*module;
 };
 
@@ -124,6 +125,7 @@ struct netlink_table {
 	unsigned int		groups;
 	struct mutex		*cb_mutex;
 	struct module		*module;
+	void			(*bind)(int group);
 	int			registered;
 };
 
@@ -444,6 +446,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	struct module *module = NULL;
 	struct mutex *cb_mutex;
 	struct netlink_sock *nlk;
+	void (*bind)(int group);
 	int err = 0;
 
 	sock->state = SS_UNCONNECTED;
@@ -468,6 +471,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	else
 		err = -EPROTONOSUPPORT;
 	cb_mutex = nl_table[protocol].cb_mutex;
+	bind = nl_table[protocol].bind;
 	netlink_unlock_table();
 
 	if (err < 0)
@@ -483,6 +487,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 
 	nlk = nlk_sk(sock->sk);
 	nlk->module = module;
+	nlk->netlink_bind = bind;
 out:
 	return err;
 
@@ -683,6 +688,15 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
 	netlink_update_listeners(sk);
 	netlink_table_ungrab();
 
+	if (nlk->netlink_bind && nlk->groups[0]) {
+		int i;
+
+		for (i=0; i<nlk->ngroups; i++) {
+			if (test_bit(i, nlk->groups))
+				nlk->netlink_bind(i);
+		}
+	}
+
 	return 0;
 }
 
@@ -1239,6 +1253,10 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		netlink_update_socket_mc(nlk, val,
 					 optname == NETLINK_ADD_MEMBERSHIP);
 		netlink_table_ungrab();
+
+		if (nlk->netlink_bind)
+			nlk->netlink_bind(val);
+
 		err = 0;
 		break;
 	}
@@ -1559,6 +1577,7 @@ netlink_kernel_create(struct net *net, int unit,
 		rcu_assign_pointer(nl_table[unit].listeners, listeners);
 		nl_table[unit].cb_mutex = cb_mutex;
 		nl_table[unit].module = module;
+		nl_table[unit].bind = cfg ? cfg->bind : NULL;
 		nl_table[unit].registered = 1;
 	} else {
 		kfree(listeners);
-- 
1.7.10

^ permalink raw reply related

* [PATCH 1/2] netlink: add netlink_kernel_cfg parameter to netlink_kernel_create
From: pablo @ 2012-06-29 16:15 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340986522-3442-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch adds the following structure:

struct netlink_kernel_cfg {
        unsigned int    groups;
        void            (*input)(struct sk_buff *skb);
        struct mutex    *cb_mutex;
};

That can be passed to netlink_kernel_create to set optional configurations
for netlink kernel sockets.

I've populated this structure by looking for NULL and zero parameters at the
existing code. The remaining parameters that always need to be set are still
left in the original interface.

That includes optional parameters for the netlink socket creation. This allows
easy extensibility of this interface in the future.

This patch also adapts all callers to use this new interface.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 crypto/crypto_user.c                |    7 +++++--
 drivers/connector/connector.c       |   13 +++++++++----
 drivers/infiniband/core/netlink.c   |    7 +++++--
 drivers/scsi/scsi_netlink.c         |    7 +++++--
 drivers/scsi/scsi_transport_iscsi.c |    9 ++++++---
 drivers/staging/gdm72xx/netlink_k.c |    6 ++++--
 include/linux/netlink.h             |   15 ++++++++++-----
 kernel/audit.c                      |    7 +++++--
 lib/kobject_uevent.c                |    5 ++++-
 net/bridge/netfilter/ebt_ulog.c     |    6 ++++--
 net/core/rtnetlink.c                |    9 +++++++--
 net/core/sock_diag.c                |    8 ++++++--
 net/decnet/netfilter/dn_rtmsg.c     |    8 +++++---
 net/ipv4/fib_frontend.c             |    7 +++++--
 net/ipv4/netfilter/ipt_ULOG.c       |    8 +++++---
 net/netfilter/nfnetlink.c           |    7 +++++--
 net/netlink/af_netlink.c            |   16 ++++++++++------
 net/netlink/genetlink.c             |   10 +++++++---
 net/xfrm/xfrm_user.c                |    7 +++++--
 security/selinux/netlink.c          |    6 +++++-
 20 files changed, 117 insertions(+), 51 deletions(-)

diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
index 5a37ead..ba2c611 100644
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -496,9 +496,12 @@ static void crypto_netlink_rcv(struct sk_buff *skb)
 
 static int __init crypto_user_init(void)
 {
+	struct netlink_kernel_cfg cfg = {
+		.input	= crypto_netlink_rcv,
+	};
+
 	crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO,
-					    0, crypto_netlink_rcv,
-					    NULL, THIS_MODULE);
+					    THIS_MODULE, &cfg);
 	if (!crypto_nlsk)
 		return -ENOMEM;
 
diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 34e0e9e..116cf8d 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -251,15 +251,20 @@ static const struct file_operations cn_file_ops = {
 	.release = single_release
 };
 
+static struct cn_dev cdev = {
+	.input   = cn_rx_skb,
+};
+
 static int __devinit cn_init(void)
 {
 	struct cn_dev *dev = &cdev;
-
-	dev->input = cn_rx_skb;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= CN_NETLINK_USERS + 0xf,
+		.input	= dev->input,
+	};
 
 	dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR,
-					 CN_NETLINK_USERS + 0xf,
-					 dev->input, NULL, THIS_MODULE);
+					 THIS_MODULE, &cfg);
 	if (!dev->nls)
 		return -EIO;
 
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 1e691dc..3ae2bfd 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -173,8 +173,11 @@ static void ibnl_rcv(struct sk_buff *skb)
 
 int __init ibnl_init(void)
 {
-	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv,
-				    NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= ibnl_rcv,
+	};
+
+	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg);
 	if (!nls) {
 		pr_warn("Failed to create netlink socket\n");
 		return -ENOMEM;
diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c
index c77628a..8818dd6 100644
--- a/drivers/scsi/scsi_netlink.c
+++ b/drivers/scsi/scsi_netlink.c
@@ -486,6 +486,10 @@ void
 scsi_netlink_init(void)
 {
 	int error;
+	struct netlink_kernel_cfg cfg = {
+		.input	= scsi_nl_rcv_msg,
+		.groups	= SCSI_NL_GRP_CNT,
+	};
 
 	INIT_LIST_HEAD(&scsi_nl_drivers);
 
@@ -497,8 +501,7 @@ scsi_netlink_init(void)
 	}
 
 	scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT,
-				SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL,
-				THIS_MODULE);
+					     THIS_MODULE, &cfg);
 	if (!scsi_nl_sock) {
 		printk(KERN_ERR "%s: register of receive handler failed\n",
 				__func__);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 1cf640e..6042954 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2936,7 +2936,10 @@ EXPORT_SYMBOL_GPL(iscsi_unregister_transport);
 static __init int iscsi_transport_init(void)
 {
 	int err;
-
+	struct netlink_kernel_cfg cfg = {
+		.groups	= 1,
+		.input	= iscsi_if_rx,
+	};
 	printk(KERN_INFO "Loading iSCSI transport class v%s.\n",
 		ISCSI_TRANSPORT_VERSION);
 
@@ -2966,8 +2969,8 @@ static __init int iscsi_transport_init(void)
 	if (err)
 		goto unregister_conn_class;
 
-	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx,
-				    NULL, THIS_MODULE);
+	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI,
+				    THIS_MODULE, &cfg);
 	if (!nls) {
 		err = -ENOBUFS;
 		goto unregister_session_class;
diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c
index d0cb48a..d1eed1e 100644
--- a/drivers/staging/gdm72xx/netlink_k.c
+++ b/drivers/staging/gdm72xx/netlink_k.c
@@ -88,13 +88,15 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type,
 						void *msg, int len))
 {
 	struct sock *sock;
+	struct netlink_kernel_cfg cfg = {
+		.input  = netlink_rcv,
+	};
 
 #if !defined(DEFINE_MUTEX)
 	init_MUTEX(&netlink_mutex);
 #endif
 
-	sock = netlink_kernel_create(&init_net, unit, 0, netlink_rcv, NULL,
-					THIS_MODULE);
+	sock = netlink_kernel_create(&init_net, unit, 0, THIS_MODULE, &cfg);
 
 	if (sock)
 		rcv_cb = cb;
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index ed33f09..6085e49 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -174,11 +174,16 @@ struct netlink_skb_parms {
 extern void netlink_table_grab(void);
 extern void netlink_table_ungrab(void);
 
-extern struct sock *netlink_kernel_create(struct net *net,
-					  int unit,unsigned int groups,
-					  void (*input)(struct sk_buff *skb),
-					  struct mutex *cb_mutex,
-					  struct module *module);
+/* optional Netlink kernel configuration parameters */
+struct netlink_kernel_cfg {
+	unsigned int	groups;
+	void		(*input)(struct sk_buff *skb);
+	struct mutex	*cb_mutex;
+};
+
+extern struct sock *netlink_kernel_create(struct net *net, int unit,
+					  struct module *module,
+					  struct netlink_kernel_cfg *cfg);
 extern void netlink_kernel_release(struct sock *sk);
 extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
diff --git a/kernel/audit.c b/kernel/audit.c
index 30b252a..4a3f28d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff  *skb)
 static int __init audit_init(void)
 {
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.input	= audit_receive,
+	};
 
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
-					   audit_receive, NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
+					   THIS_MODULE, &cfg);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 1a91efa..0401d29 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -373,13 +373,16 @@ EXPORT_SYMBOL_GPL(add_uevent_var);
 static int uevent_net_init(struct net *net)
 {
 	struct uevent_sock *ue_sk;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= 1,
+	};
 
 	ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL);
 	if (!ue_sk)
 		return -ENOMEM;
 
 	ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT,
-					  1, NULL, NULL, THIS_MODULE);
+					  THIS_MODULE, &cfg);
 	if (!ue_sk->sk) {
 		printk(KERN_ERR
 		       "kobject_uevent: unable to create netlink socket!\n");
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 1bd1732..374bdcd 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -282,6 +282,9 @@ static int __init ebt_ulog_init(void)
 {
 	int ret;
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= EBT_ULOG_MAXNLGROUPS,
+	};
 
 	if (nlbufsiz >= 128*1024) {
 		pr_warning("Netlink buffer has to be <= 128kB,"
@@ -296,8 +299,7 @@ static int __init ebt_ulog_init(void)
 	}
 
 	ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
-					  EBT_ULOG_MAXNLGROUPS, NULL, NULL,
-					  THIS_MODULE);
+					  THIS_MODULE, &cfg);
 	if (!ebtulognl)
 		ret = -ENOMEM;
 	else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 21318d1..2db8557 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2366,8 +2366,13 @@ static struct notifier_block rtnetlink_dev_notifier = {
 static int __net_init rtnetlink_net_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
-				   rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.groups		= RTNLGRP_MAX,
+		.input		= rtnetlink_rcv,
+		.cb_mutex	= &rtnl_mutex,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg);
 	if (!sk)
 		return -ENOMEM;
 	net->rtnl = sk;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 0d934ce..0929821 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -177,8 +177,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk);
 
 static int __init sock_diag_init(void)
 {
-	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
-					sock_diag_rcv, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= sock_diag_rcv,
+	};
+
+	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG,
+					       THIS_MODULE, &cfg);
 	return sock_diag_nlsk == NULL ? -ENOMEM : 0;
 }
 
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index b8f7f5b..11db0ec 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -125,11 +125,13 @@ static struct nf_hook_ops dnrmg_ops __read_mostly = {
 static int __init dn_rtmsg_init(void)
 {
 	int rv = 0;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= DNRNG_NLGRP_MAX,
+		.input	= dnrmg_receive_user_skb,
+	};
 
 	dnrmg = netlink_kernel_create(&init_net,
-				      NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
-				      dnrmg_receive_user_skb,
-				      NULL, THIS_MODULE);
+				      NETLINK_DNRTMSG, THIS_MODULE, &cfg);
 	if (dnrmg == NULL) {
 		printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
 		return -ENOMEM;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411..0cd820e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -935,8 +935,11 @@ static void nl_fib_input(struct sk_buff *skb)
 static int __net_init nl_fib_lookup_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
-				   nl_fib_input, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= nl_fib_input,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 99b3f53..1109f7f 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -381,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
 static int __init ulog_tg_init(void)
 {
 	int ret, i;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= ULOG_MAXNLGROUPS,
+	};
 
 	pr_debug("init module\n");
 
@@ -393,9 +396,8 @@ static int __init ulog_tg_init(void)
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
 		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
 
-	nflognl = netlink_kernel_create(&init_net,
-					NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
-					NULL, THIS_MODULE);
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
+					THIS_MODULE, &cfg);
 	if (!nflognl)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 3e797d1..700e461 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -203,9 +203,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int __net_init nfnetlink_net_init(struct net *net)
 {
 	struct sock *nfnl;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= NFNLGRP_MAX,
+		.input	= nfnetlink_rcv,
+	};
 
-	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX,
-				     nfnetlink_rcv, NULL, THIS_MODULE);
+	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg);
 	if (!nfnl)
 		return -ENOMEM;
 	net->nfnl_stash = nfnl;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b3025a6..43a124f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1503,14 +1503,16 @@ static void netlink_data_ready(struct sock *sk, int len)
  */
 
 struct sock *
-netlink_kernel_create(struct net *net, int unit, unsigned int groups,
-		      void (*input)(struct sk_buff *skb),
-		      struct mutex *cb_mutex, struct module *module)
+netlink_kernel_create(struct net *net, int unit,
+		      struct module *module,
+		      struct netlink_kernel_cfg *cfg)
 {
 	struct socket *sock;
 	struct sock *sk;
 	struct netlink_sock *nlk;
 	struct listeners *listeners = NULL;
+	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
+	unsigned int groups;
 
 	BUG_ON(!nl_table);
 
@@ -1532,16 +1534,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
 	sk = sock->sk;
 	sk_change_net(sk, net);
 
-	if (groups < 32)
+	if (!cfg || cfg->groups < 32)
 		groups = 32;
+	else
+		groups = cfg->groups;
 
 	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
 	if (!listeners)
 		goto out_sock_release;
 
 	sk->sk_data_ready = netlink_data_ready;
-	if (input)
-		nlk_sk(sk)->netlink_rcv = input;
+	if (cfg && cfg->input)
+		nlk_sk(sk)->netlink_rcv = cfg->input;
 
 	if (netlink_insert(sk, net, 0))
 		goto out_sock_release;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 2cc7c1e..32761b5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -915,10 +915,14 @@ static struct genl_multicast_group notify_grp = {
 
 static int __net_init genl_pernet_init(struct net *net)
 {
+	struct netlink_kernel_cfg cfg = {
+		.input		= genl_rcv,
+		.cb_mutex	= &genl_mutex,
+	};
+
 	/* we'll bump the group number right afterwards */
-	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0,
-					       genl_rcv, &genl_mutex,
-					       THIS_MODULE);
+	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC,
+					       THIS_MODULE, &cfg);
 
 	if (!net->genl_sock && net_eq(net, &init_net))
 		panic("GENL: Cannot initialize generic netlink\n");
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 44293b3..622d049 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2945,9 +2945,12 @@ static struct xfrm_mgr netlink_mgr = {
 static int __net_init xfrm_user_net_init(struct net *net)
 {
 	struct sock *nlsk;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= XFRMNLGRP_MAX,
+		.input	= xfrm_netlink_rcv,
+	};
 
-	nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX,
-				     xfrm_netlink_rcv, NULL, THIS_MODULE);
+	nlsk = netlink_kernel_create(net, NETLINK_XFRM, THIS_MODULE, &cfg);
 	if (nlsk == NULL)
 		return -ENOMEM;
 	net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */
diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c
index 8a23a35..8a77725 100644
--- a/security/selinux/netlink.c
+++ b/security/selinux/netlink.c
@@ -111,8 +111,12 @@ void selnl_notify_policyload(u32 seqno)
 
 static int __init selnl_init(void)
 {
+	struct netlink_kernel_cfg cfg = {
+		.groups	= SELNLGRP_MAX,
+	};
+
 	selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX,
-				      SELNLGRP_MAX, NULL, NULL, THIS_MODULE);
+				      THIS_MODULE, &cfg);
 	if (selnl == NULL)
 		panic("SELinux:  Cannot create netlink socket.");
 	netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV);
-- 
1.7.10


^ permalink raw reply related

* [PATCH 0/2] [net-next] Netlink updates
From: pablo @ 2012-06-29 16:15 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following two patches provides a couple of Netlink updates:

* One netlink update to introduce struct netlink_kernel_cfg:

struct netlink_kernel_cfg {
        unsigned int    groups;
        void            (*input)(struct sk_buff *skb);
        struct mutex    *cb_mutex;
};

This structure contains optional paramters to configure one netlink
kernel socket.

eg.

       struct netlink_kernel_cfg cfg = {
               .input  = crypto_netlink_rcv,
       };

       crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO,
                                           THIS_MODULE, &cfg);

This should make easier in the future to add new optional
configuration parameters without touching the netlink_kernel_create
interface.

I converted all netlink_kernel_create invocations in your tree to
use the new one.

* One patch that adds the bind hook. This hook is used by nfnetlink
to auto-load the appropriate subsystem

The bind hook is called in the netlink_setsockopt and netlink_bind
paths. These are called when registering a user-space netlink event
listener.

Let me provide one example, to further clarify this. If you run:

 conntrack -E

And nf_conntrack_netlink is not loaded, the existing netlink autoload
code add nfnetlink, but we have no way to autoload nf_conntrack_netlink.

With this new chunk of code, we can run some code in nfnetlink to check
for the group that the user-space listener wants to subscribe to. Then,
it can check what module needs to be auto-loaded, if required.

We got users complaining on this behaviour in the past.

If you like them, please manually apply. I wanted to know if you are
happy with these before pushing them into my tree, as they include
netlink changes.

Thanks!

Pablo Neira Ayuso (2):
  netlink: add netlink_kernel_cfg parameter to netlink_kernel_create
  netlink: add nlk->netlink_bind hook for module auto-loading

 crypto/crypto_user.c                |    7 +++++--
 drivers/connector/connector.c       |   13 +++++++++----
 drivers/infiniband/core/netlink.c   |    7 +++++--
 drivers/scsi/scsi_netlink.c         |    7 +++++--
 drivers/scsi/scsi_transport_iscsi.c |    9 ++++++---
 drivers/staging/gdm72xx/netlink_k.c |    6 ++++--
 include/linux/netlink.h             |   16 +++++++++++-----
 kernel/audit.c                      |    7 +++++--
 lib/kobject_uevent.c                |    5 ++++-
 net/bridge/netfilter/ebt_ulog.c     |    6 ++++--
 net/core/rtnetlink.c                |    9 +++++++--
 net/core/sock_diag.c                |    8 ++++++--
 net/decnet/netfilter/dn_rtmsg.c     |    8 +++++---
 net/ipv4/fib_frontend.c             |    7 +++++--
 net/ipv4/netfilter/ipt_ULOG.c       |    8 +++++---
 net/netfilter/nfnetlink.c           |   36 +++++++++++++++++++++++++++++++++--
 net/netlink/af_netlink.c            |   35 ++++++++++++++++++++++++++++------
 net/netlink/genetlink.c             |   10 +++++++---
 net/xfrm/xfrm_user.c                |    7 +++++--
 security/selinux/netlink.c          |    6 +++++-
 20 files changed, 166 insertions(+), 51 deletions(-)

-- 
1.7.10


^ permalink raw reply

* Re: [PATCH net-next] em_canid: Ematch rule to match CAN frames according to their identifiers
From: Oliver Hartkopp @ 2012-06-29 15:44 UTC (permalink / raw)
  To: Rostislav Lisovy; +Cc: netdev, linux-can, lartc, pisa, sojkam1
In-Reply-To: <1340903231-9561-1-git-send-email-lisovy@gmail.com>

Hello Rostislav,

looks really good now.

1. Your Signed-off-by: is missing.

2. One remark to a removed length check:

(..)

> +static int em_canid_change(struct tcf_proto *tp, void *data, int len,
> +			  struct tcf_ematch *m)
> +{
> +	struct can_filter *conf = data; /* Array with rules,
> +					 * fixed size EM_CAN_RULES_SIZE
> +					 */
> +	struct canid_match *cm;
> +	struct canid_match *cm_old = (struct canid_match *) m->data;
> +	int i;
> +	int rulescnt;
> +


What about a zero length check here?

	if (!len)
		return -EINVAL;

???

> +	if (len % sizeof(struct can_filter))
> +		return -EINVAL;
> +
> +	if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
> +		return -EINVAL;
> +
> +	rulescnt = len / sizeof(struct can_filter);
> +
> +	cm = kzalloc(sizeof(struct canid_match) + sizeof(struct can_filter) *
> +		rulescnt, GFP_KERNEL);
> +	if (!cm)
> +		return -ENOMEM;


The length could alternatively be checked here too

http://lxr.linux.no/#linux+v3.4.4/net/sched/ematch.c#L235

if em->ops->datalen is set.

But here's no

	.datalen = sizeof(struct can_filter),

defined, right?

> +static struct tcf_ematch_ops em_canid_ops = {
> +	.kind	  = TCF_EM_CANID,
> +	.change	  = em_canid_change,
> +	.match	  = em_canid_match,
> +	.destroy  = em_canid_destroy,
> +	.dump	  = em_canid_dump,
> +	.owner	  = THIS_MODULE,
> +	.link	  = LIST_HEAD_INIT(em_canid_ops.link)
> +};


Regards,
Oliver

^ permalink raw reply

* RE: [PATCH 1/5] netfilter: ipset: fix interface comparision in hash-netiface sets
From: David Laight @ 2012-06-29 15:41 UTC (permalink / raw)
  To: pablo, netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-2-git-send-email-pablo@netfilter.org>

> From: Florian Westphal <fw@strlen.de>
> 
> ifname_compare() assumes that skb->dev is zero-padded,
> e.g 'eth1\0\0\0\0\0...'. This isn't always the case. e1000 driver does
> 
> strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
> 
> in e1000_probe(), so once device is registered dev->name memory
contains
> 'eth1\0:0:3\0\0\0' (or something like that), which makes eth1 compare
fail.

strncpy() would normally zero-fill the destination buffer
(at least the libc version does).

So something else must be wrong.

	David



^ permalink raw reply

* Re: [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Andreas Gruenbacher @ 2012-06-29 15:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340982666.21162.3.camel@edumazet-glaptop>

On Fri, 2012-06-29 at 17:11 +0200, Eric Dumazet wrote:
> On Fri, 2012-06-29 at 16:54 +0200, Andreas Gruenbacher wrote:
> > The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
> > page should be put into a new packet even when there is still room left in the
> > previous packet.
> > 
> > In the tcp protocol, messages which are not sent immediately are queued.  When
> > more data is sent, it will be added to the last segment in that queue until
> > that segment is "full" whenever possible; only then is a new segment added.
> > Right now, there is no way to indicate when tcp should start a new segment.
> > The new flag allows to control that.
> > 
> > Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
> > ---
> 
> I don't understand how maintaining any message boundaries at sender can
> prevent any middlebox or the receiver to coalesce frames to any
> boundaries it prefers ?

The primary use case is fast Gigabit (10 or more) Ethernet connections
with jumbo frames and switches that support them.  There, frames will go
through unchanged and you can zero-copy receive all the time.

Not sure how well the approach scales to other kinds of connections; it
may work often enough to be worth it.  When things get distorted between
the sender and the receiver and tcp_recvbio() fails, the data can still
be copied out of the socket as before.

Andreas

^ permalink raw reply

* [PATCH 5/5] netfilter: nfnetlink: fix missing rcu_read_unlock in nfnetlink_rcv_msg
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>

Bug added in commit 6b75e3e8d664a9a (netfilter: nfnetlink: add RCU in
nfnetlink_rcv_msg())

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 3e797d1..791d56b 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -169,8 +169,10 @@ replay:
 
 		err = nla_parse(cda, ss->cb[cb_id].attr_count,
 				attr, attrlen, ss->cb[cb_id].policy);
-		if (err < 0)
+		if (err < 0) {
+			rcu_read_unlock();
 			return err;
+		}
 
 		if (nc->call_rcu) {
 			err = nc->call_rcu(net->nfnl, skb, nlh,
-- 
1.7.10

^ permalink raw reply related

* [PATCH 1/5] netfilter: ipset: fix interface comparision in hash-netiface sets
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

ifname_compare() assumes that skb->dev is zero-padded,
e.g 'eth1\0\0\0\0\0...'. This isn't always the case. e1000 driver does

strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);

in e1000_probe(), so once device is registered dev->name memory contains
'eth1\0:0:3\0\0\0' (or something like that), which makes eth1 compare
fail.

Use plain strcmp() instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_hash_netiface.c |   32 ++++------------------------
 1 file changed, 4 insertions(+), 28 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index ee86394..d5d3607 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -38,30 +38,6 @@ struct iface_node {
 
 #define iface_data(n)	(rb_entry(n, struct iface_node, node)->iface)
 
-static inline long
-ifname_compare(const char *_a, const char *_b)
-{
-	const long *a = (const long *)_a;
-	const long *b = (const long *)_b;
-
-	BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));
-	if (a[0] != b[0])
-		return a[0] - b[0];
-	if (IFNAMSIZ > sizeof(long)) {
-		if (a[1] != b[1])
-			return a[1] - b[1];
-	}
-	if (IFNAMSIZ > 2 * sizeof(long)) {
-		if (a[2] != b[2])
-			return a[2] - b[2];
-	}
-	if (IFNAMSIZ > 3 * sizeof(long)) {
-		if (a[3] != b[3])
-			return a[3] - b[3];
-	}
-	return 0;
-}
-
 static void
 rbtree_destroy(struct rb_root *root)
 {
@@ -99,7 +75,7 @@ iface_test(struct rb_root *root, const char **iface)
 
 	while (n) {
 		const char *d = iface_data(n);
-		long res = ifname_compare(*iface, d);
+		int res = strcmp(*iface, d);
 
 		if (res < 0)
 			n = n->rb_left;
@@ -121,7 +97,7 @@ iface_add(struct rb_root *root, const char **iface)
 
 	while (*n) {
 		char *ifname = iface_data(*n);
-		long res = ifname_compare(*iface, ifname);
+		int res = strcmp(*iface, ifname);
 
 		p = *n;
 		if (res < 0)
@@ -366,7 +342,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	struct hash_netiface4_elem data = { .cidr = HOST_MASK };
 	u32 ip = 0, ip_to, last;
 	u32 timeout = h->timeout;
-	char iface[IFNAMSIZ] = {};
+	char iface[IFNAMSIZ];
 	int ret;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
@@ -663,7 +639,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface6_elem data = { .cidr = HOST_MASK };
 	u32 timeout = h->timeout;
-	char iface[IFNAMSIZ] = {};
+	char iface[IFNAMSIZ];
 	int ret;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
-- 
1.7.10

^ permalink raw reply related

* [PATCH 3/5] netfilter: update location of my trees
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 MAINTAINERS |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index f6e62de..302aa00 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4654,8 +4654,8 @@ L:	netfilter@vger.kernel.org
 L:	coreteam@netfilter.org
 W:	http://www.netfilter.org/
 W:	http://www.iptables.org/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-2.6.git
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next-2.6.git
+T:	git git://1984.lsi.us.es/nf
+T:	git git://1984.lsi.us.es/nf-next
 S:	Supported
 F:	include/linux/netfilter*
 F:	include/linux/netfilter/
-- 
1.7.10

^ permalink raw reply related

* [PATCH 4/5] netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>

This patch fixes a crash if that ipset command is sent over nfnetlink.

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c |   12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 819c342..9730882 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -640,6 +640,14 @@ find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
 }
 
 static int
+ip_set_none(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	return -EOPNOTSUPP;
+}
+
+static int
 ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
@@ -1539,6 +1547,10 @@ nlmsg_failure:
 }
 
 static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+	[IPSET_CMD_NONE]	= {
+		.call		= ip_set_none,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+	},
 	[IPSET_CMD_CREATE]	= {
 		.call		= ip_set_create,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
-- 
1.7.10


^ permalink raw reply related

* [PATCH 2/5] netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Eric Dumazet <edumazet@google.com>

After call to ip6_route_output() we must release dst or we leak it.

Also should test dst->error, as ip6_route_output() never returns NULL.

Use boolean while we are at it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_ctl.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index dd811b8..d43e3c1 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -76,19 +76,19 @@ static void __ip_vs_del_service(struct ip_vs_service *svc);
 
 #ifdef CONFIG_IP_VS_IPV6
 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(struct net *net,
-				    const struct in6_addr *addr)
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+				     const struct in6_addr *addr)
 {
-	struct rt6_info *rt;
 	struct flowi6 fl6 = {
 		.daddr = *addr,
 	};
+	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+	bool is_local;
 
-	rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
-	if (rt && rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
-		return 1;
+	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
 
-	return 0;
+	dst_release(dst);
+	return is_local;
 }
 #endif
 
-- 
1.7.10


^ permalink raw reply related

* [PATCH 0/5] netfilter fixes for 3.5-rc4
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following are 4 fixes and the update of the MAINTAINERS file
to point to my Netfilter trees.

They are:

* One refcount leak fix in IPVS IPv6 support from Eric Dumazet.

* One fix for interface comparison in ipset hash-netiface sets
  from Florian Westphal.

* One fix for a missing rcu_read_unlock in nfnetlink from
  Tomasz Bursztyka.

* One fix for a kernel crash if IPSET_CMD_NONE is set to ipset via
  nfnetlink, again from Tomasz Bursztyka.

You can pull these changes from:

git://1984.lsi.us.es/nf master

Thanks!

Eric Dumazet (1):
  netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6

Florian Westphal (1):
  netfilter: ipset: fix interface comparision in hash-netiface sets

Pablo Neira Ayuso (1):
  netfilter: update location of my trees

Tomasz Bursztyka (2):
  netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent
  netfilter: nfnetlink: fix missing rcu_read_unlock in nfnetlink_rcv_msg

 MAINTAINERS                                |    4 ++--
 net/netfilter/ipset/ip_set_core.c          |   12 +++++++++++
 net/netfilter/ipset/ip_set_hash_netiface.c |   32 ++++------------------------
 net/netfilter/ipvs/ip_vs_ctl.c             |   14 ++++++------
 net/netfilter/nfnetlink.c                  |    4 +++-
 5 files changed, 28 insertions(+), 38 deletions(-)

-- 
1.7.10


^ permalink raw reply

* Re: "ADDRCONF(NETDEV_UP): eth0: link is not ready" with IPv6
From: Ben Hutchings @ 2012-06-29 15:24 UTC (permalink / raw)
  To: Arvid Brodin; +Cc: netdev@vger.kernel.org, Alexey Kuznetsov, Stephen Hemminger
In-Reply-To: <4FED14C2.9020200@xdin.com>

On Fri, 2012-06-29 at 02:36 +0000, Arvid Brodin wrote:
> Hi,
> 
> After 'ip link set eth0 up' on an avr32 board (network driver macb), the device ends up in
> operational mode "UNKNOWN":
> 
> # ip link
> 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN qlen 1000
>     link/ether 00:24:74:00:17:9d brd ff:ff:ff:ff:ff:ff
> 
> Unplugging and plugging in the network cable gets the device to mode "UP".
> 
> This is a problem for me because I'm trying to use this device as a "slave" device (for a
> virtual HSR device*) and I need to be able to decide if the slave device is operational or
> not.
> 
> Following Stephen's advice here:
> http://kerneltrap.org/mailarchive/linux-netdev/2008/9/24/3398834 I checked the macb.c code
> and noticed they do not call netif_carrier_off() neither before register_netdev() nor in
> dev_open().

It should be called after register_netdev() and before the driver's
ndo_open implementation returns.

> I added the call before register_netdev(), which fixed the problem. However, if I then
> enable IPv6:
>
> # ip link set eth0 up
> ADDRCONF(NETDEV_UP): eth0: link is not ready
> eth0: link up (100/Full)
> ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready

This looks normal.

> Any idea what is happening / what I'm doing wrong? (This is not just cosmetic; is some
> situations this seems to kill the interface - e.g. ping does not work, down/up does not
> help...) Things work fine without IPv6 configured.

Perhaps some packets sent automatically by IPv6 are triggering a driver
bug?  Or there is a bug in multicast support, which IPv6 always uses.

Ben.

> *N.B. I'm writing a driver for a network protocol called "High-availability Seamless
> Redundancy".

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Eric Dumazet @ 2012-06-29 15:11 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340981690.25226.3.camel@gurkel.linbit>

On Fri, 2012-06-29 at 16:54 +0200, Andreas Gruenbacher wrote:
> The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
> page should be put into a new packet even when there is still room left in the
> previous packet.
> 
> In the tcp protocol, messages which are not sent immediately are queued.  When
> more data is sent, it will be added to the last segment in that queue until
> that segment is "full" whenever possible; only then is a new segment added.
> Right now, there is no way to indicate when tcp should start a new segment.
> The new flag allows to control that.
> 
> Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
> ---

I don't understand how maintaining any message boundaries at sender can
prevent any middlebox or the receiver to coalesce frames to any
boundaries it prefers ?

^ permalink raw reply

* [patch net-next v2 4/4] dummy: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev
  Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
	shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/dummy.c |   15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index bab0158..9d6a067 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -40,18 +40,6 @@
 
 static int numdummies = 1;
 
-static int dummy_set_address(struct net_device *dev, void *p)
-{
-	struct sockaddr *sa = p;
-
-	if (!is_valid_ether_addr(sa->sa_data))
-		return -EADDRNOTAVAIL;
-
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
-	memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
-	return 0;
-}
-
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
 {
@@ -118,7 +106,7 @@ static const struct net_device_ops dummy_netdev_ops = {
 	.ndo_start_xmit		= dummy_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= set_multicast_list,
-	.ndo_set_mac_address	= dummy_set_address,
+	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_get_stats64	= dummy_get_stats64,
 };
 
@@ -134,6 +122,7 @@ static void dummy_setup(struct net_device *dev)
 	dev->tx_queue_len = 0;
 	dev->flags |= IFF_NOARP;
 	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 	dev->features	|= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO;
 	dev->features	|= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
 	eth_hw_addr_random(dev);
-- 
1.7.10.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox