Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/2] netlink: add netlink_kernel_cfg parameter to netlink_kernel_create
From: pablo @ 2012-06-29 16:15 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340986522-3442-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch adds the following structure:

struct netlink_kernel_cfg {
        unsigned int    groups;
        void            (*input)(struct sk_buff *skb);
        struct mutex    *cb_mutex;
};

That can be passed to netlink_kernel_create to set optional configurations
for netlink kernel sockets.

I've populated this structure by looking for NULL and zero parameters in the
existing code. The remaining parameters that always need to be set are still
left in the original interface.

That includes optional parameters for the netlink socket creation. This allows
easy extensibility of this interface in the future.

This patch also adapts all callers to use this new interface.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 crypto/crypto_user.c                |    7 +++++--
 drivers/connector/connector.c       |   13 +++++++++----
 drivers/infiniband/core/netlink.c   |    7 +++++--
 drivers/scsi/scsi_netlink.c         |    7 +++++--
 drivers/scsi/scsi_transport_iscsi.c |    9 ++++++---
 drivers/staging/gdm72xx/netlink_k.c |    6 ++++--
 include/linux/netlink.h             |   15 ++++++++++-----
 kernel/audit.c                      |    7 +++++--
 lib/kobject_uevent.c                |    5 ++++-
 net/bridge/netfilter/ebt_ulog.c     |    6 ++++--
 net/core/rtnetlink.c                |    9 +++++++--
 net/core/sock_diag.c                |    8 ++++++--
 net/decnet/netfilter/dn_rtmsg.c     |    8 +++++---
 net/ipv4/fib_frontend.c             |    7 +++++--
 net/ipv4/netfilter/ipt_ULOG.c       |    8 +++++---
 net/netfilter/nfnetlink.c           |    7 +++++--
 net/netlink/af_netlink.c            |   16 ++++++++++------
 net/netlink/genetlink.c             |   10 +++++++---
 net/xfrm/xfrm_user.c                |    7 +++++--
 security/selinux/netlink.c          |    6 +++++-
 20 files changed, 117 insertions(+), 51 deletions(-)

diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
index 5a37ead..ba2c611 100644
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -496,9 +496,12 @@ static void crypto_netlink_rcv(struct sk_buff *skb)
 
 static int __init crypto_user_init(void)
 {
+	struct netlink_kernel_cfg cfg = {
+		.input	= crypto_netlink_rcv,
+	};
+
 	crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO,
-					    0, crypto_netlink_rcv,
-					    NULL, THIS_MODULE);
+					    THIS_MODULE, &cfg);
 	if (!crypto_nlsk)
 		return -ENOMEM;
 
diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 34e0e9e..116cf8d 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -251,15 +251,20 @@ static const struct file_operations cn_file_ops = {
 	.release = single_release
 };
 
+static struct cn_dev cdev = {
+	.input   = cn_rx_skb,
+};
+
 static int __devinit cn_init(void)
 {
 	struct cn_dev *dev = &cdev;
-
-	dev->input = cn_rx_skb;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= CN_NETLINK_USERS + 0xf,
+		.input	= dev->input,
+	};
 
 	dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR,
-					 CN_NETLINK_USERS + 0xf,
-					 dev->input, NULL, THIS_MODULE);
+					 THIS_MODULE, &cfg);
 	if (!dev->nls)
 		return -EIO;
 
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 1e691dc..3ae2bfd 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -173,8 +173,11 @@ static void ibnl_rcv(struct sk_buff *skb)
 
 int __init ibnl_init(void)
 {
-	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv,
-				    NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= ibnl_rcv,
+	};
+
+	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg);
 	if (!nls) {
 		pr_warn("Failed to create netlink socket\n");
 		return -ENOMEM;
diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c
index c77628a..8818dd6 100644
--- a/drivers/scsi/scsi_netlink.c
+++ b/drivers/scsi/scsi_netlink.c
@@ -486,6 +486,10 @@ void
 scsi_netlink_init(void)
 {
 	int error;
+	struct netlink_kernel_cfg cfg = {
+		.input	= scsi_nl_rcv_msg,
+		.groups	= SCSI_NL_GRP_CNT,
+	};
 
 	INIT_LIST_HEAD(&scsi_nl_drivers);
 
@@ -497,8 +501,7 @@ scsi_netlink_init(void)
 	}
 
 	scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT,
-				SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL,
-				THIS_MODULE);
+					     THIS_MODULE, &cfg);
 	if (!scsi_nl_sock) {
 		printk(KERN_ERR "%s: register of receive handler failed\n",
 				__func__);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 1cf640e..6042954 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2936,7 +2936,10 @@ EXPORT_SYMBOL_GPL(iscsi_unregister_transport);
 static __init int iscsi_transport_init(void)
 {
 	int err;
-
+	struct netlink_kernel_cfg cfg = {
+		.groups	= 1,
+		.input	= iscsi_if_rx,
+	};
 	printk(KERN_INFO "Loading iSCSI transport class v%s.\n",
 		ISCSI_TRANSPORT_VERSION);
 
@@ -2966,8 +2969,8 @@ static __init int iscsi_transport_init(void)
 	if (err)
 		goto unregister_conn_class;
 
-	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx,
-				    NULL, THIS_MODULE);
+	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI,
+				    THIS_MODULE, &cfg);
 	if (!nls) {
 		err = -ENOBUFS;
 		goto unregister_session_class;
diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c
index d0cb48a..d1eed1e 100644
--- a/drivers/staging/gdm72xx/netlink_k.c
+++ b/drivers/staging/gdm72xx/netlink_k.c
@@ -88,13 +88,15 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type,
 						void *msg, int len))
 {
 	struct sock *sock;
+	struct netlink_kernel_cfg cfg = {
+		.input  = netlink_rcv,
+	};
 
 #if !defined(DEFINE_MUTEX)
 	init_MUTEX(&netlink_mutex);
 #endif
 
-	sock = netlink_kernel_create(&init_net, unit, 0, netlink_rcv, NULL,
-					THIS_MODULE);
+	sock = netlink_kernel_create(&init_net, unit, 0, THIS_MODULE, &cfg);
 
 	if (sock)
 		rcv_cb = cb;
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index ed33f09..6085e49 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -174,11 +174,16 @@ struct netlink_skb_parms {
 extern void netlink_table_grab(void);
 extern void netlink_table_ungrab(void);
 
-extern struct sock *netlink_kernel_create(struct net *net,
-					  int unit,unsigned int groups,
-					  void (*input)(struct sk_buff *skb),
-					  struct mutex *cb_mutex,
-					  struct module *module);
+/* optional Netlink kernel configuration parameters */
+struct netlink_kernel_cfg {
+	unsigned int	groups;
+	void		(*input)(struct sk_buff *skb);
+	struct mutex	*cb_mutex;
+};
+
+extern struct sock *netlink_kernel_create(struct net *net, int unit,
+					  struct module *module,
+					  struct netlink_kernel_cfg *cfg);
 extern void netlink_kernel_release(struct sock *sk);
 extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
diff --git a/kernel/audit.c b/kernel/audit.c
index 30b252a..4a3f28d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff  *skb)
 static int __init audit_init(void)
 {
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.input	= audit_receive,
+	};
 
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
-					   audit_receive, NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
+					   THIS_MODULE, &cfg);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 1a91efa..0401d29 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -373,13 +373,16 @@ EXPORT_SYMBOL_GPL(add_uevent_var);
 static int uevent_net_init(struct net *net)
 {
 	struct uevent_sock *ue_sk;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= 1,
+	};
 
 	ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL);
 	if (!ue_sk)
 		return -ENOMEM;
 
 	ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT,
-					  1, NULL, NULL, THIS_MODULE);
+					  THIS_MODULE, &cfg);
 	if (!ue_sk->sk) {
 		printk(KERN_ERR
 		       "kobject_uevent: unable to create netlink socket!\n");
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 1bd1732..374bdcd 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -282,6 +282,9 @@ static int __init ebt_ulog_init(void)
 {
 	int ret;
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= EBT_ULOG_MAXNLGROUPS,
+	};
 
 	if (nlbufsiz >= 128*1024) {
 		pr_warning("Netlink buffer has to be <= 128kB,"
@@ -296,8 +299,7 @@ static int __init ebt_ulog_init(void)
 	}
 
 	ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
-					  EBT_ULOG_MAXNLGROUPS, NULL, NULL,
-					  THIS_MODULE);
+					  THIS_MODULE, &cfg);
 	if (!ebtulognl)
 		ret = -ENOMEM;
 	else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 21318d1..2db8557 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2366,8 +2366,13 @@ static struct notifier_block rtnetlink_dev_notifier = {
 static int __net_init rtnetlink_net_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
-				   rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.groups		= RTNLGRP_MAX,
+		.input		= rtnetlink_rcv,
+		.cb_mutex	= &rtnl_mutex,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg);
 	if (!sk)
 		return -ENOMEM;
 	net->rtnl = sk;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 0d934ce..0929821 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -177,8 +177,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk);
 
 static int __init sock_diag_init(void)
 {
-	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
-					sock_diag_rcv, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= sock_diag_rcv,
+	};
+
+	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG,
+					       THIS_MODULE, &cfg);
 	return sock_diag_nlsk == NULL ? -ENOMEM : 0;
 }
 
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index b8f7f5b..11db0ec 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -125,11 +125,13 @@ static struct nf_hook_ops dnrmg_ops __read_mostly = {
 static int __init dn_rtmsg_init(void)
 {
 	int rv = 0;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= DNRNG_NLGRP_MAX,
+		.input	= dnrmg_receive_user_skb,
+	};
 
 	dnrmg = netlink_kernel_create(&init_net,
-				      NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
-				      dnrmg_receive_user_skb,
-				      NULL, THIS_MODULE);
+				      NETLINK_DNRTMSG, THIS_MODULE, &cfg);
 	if (dnrmg == NULL) {
 		printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
 		return -ENOMEM;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411..0cd820e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -935,8 +935,11 @@ static void nl_fib_input(struct sk_buff *skb)
 static int __net_init nl_fib_lookup_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
-				   nl_fib_input, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= nl_fib_input,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 99b3f53..1109f7f 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -381,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
 static int __init ulog_tg_init(void)
 {
 	int ret, i;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= ULOG_MAXNLGROUPS,
+	};
 
 	pr_debug("init module\n");
 
@@ -393,9 +396,8 @@ static int __init ulog_tg_init(void)
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
 		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
 
-	nflognl = netlink_kernel_create(&init_net,
-					NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
-					NULL, THIS_MODULE);
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
+					THIS_MODULE, &cfg);
 	if (!nflognl)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 3e797d1..700e461 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -203,9 +203,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int __net_init nfnetlink_net_init(struct net *net)
 {
 	struct sock *nfnl;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= NFNLGRP_MAX,
+		.input	= nfnetlink_rcv,
+	};
 
-	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX,
-				     nfnetlink_rcv, NULL, THIS_MODULE);
+	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg);
 	if (!nfnl)
 		return -ENOMEM;
 	net->nfnl_stash = nfnl;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b3025a6..43a124f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1503,14 +1503,16 @@ static void netlink_data_ready(struct sock *sk, int len)
  */
 
 struct sock *
-netlink_kernel_create(struct net *net, int unit, unsigned int groups,
-		      void (*input)(struct sk_buff *skb),
-		      struct mutex *cb_mutex, struct module *module)
+netlink_kernel_create(struct net *net, int unit,
+		      struct module *module,
+		      struct netlink_kernel_cfg *cfg)
 {
 	struct socket *sock;
 	struct sock *sk;
 	struct netlink_sock *nlk;
 	struct listeners *listeners = NULL;
+	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
+	unsigned int groups;
 
 	BUG_ON(!nl_table);
 
@@ -1532,16 +1534,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
 	sk = sock->sk;
 	sk_change_net(sk, net);
 
-	if (groups < 32)
+	if (!cfg || cfg->groups < 32)
 		groups = 32;
+	else
+		groups = cfg->groups;
 
 	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
 	if (!listeners)
 		goto out_sock_release;
 
 	sk->sk_data_ready = netlink_data_ready;
-	if (input)
-		nlk_sk(sk)->netlink_rcv = input;
+	if (cfg && cfg->input)
+		nlk_sk(sk)->netlink_rcv = cfg->input;
 
 	if (netlink_insert(sk, net, 0))
 		goto out_sock_release;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 2cc7c1e..32761b5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -915,10 +915,14 @@ static struct genl_multicast_group notify_grp = {
 
 static int __net_init genl_pernet_init(struct net *net)
 {
+	struct netlink_kernel_cfg cfg = {
+		.input		= genl_rcv,
+		.cb_mutex	= &genl_mutex,
+	};
+
 	/* we'll bump the group number right afterwards */
-	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0,
-					       genl_rcv, &genl_mutex,
-					       THIS_MODULE);
+	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC,
+					       THIS_MODULE, &cfg);
 
 	if (!net->genl_sock && net_eq(net, &init_net))
 		panic("GENL: Cannot initialize generic netlink\n");
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 44293b3..622d049 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2945,9 +2945,12 @@ static struct xfrm_mgr netlink_mgr = {
 static int __net_init xfrm_user_net_init(struct net *net)
 {
 	struct sock *nlsk;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= XFRMNLGRP_MAX,
+		.input	= xfrm_netlink_rcv,
+	};
 
-	nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX,
-				     xfrm_netlink_rcv, NULL, THIS_MODULE);
+	nlsk = netlink_kernel_create(net, NETLINK_XFRM, THIS_MODULE, &cfg);
 	if (nlsk == NULL)
 		return -ENOMEM;
 	net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */
diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c
index 8a23a35..8a77725 100644
--- a/security/selinux/netlink.c
+++ b/security/selinux/netlink.c
@@ -111,8 +111,12 @@ void selnl_notify_policyload(u32 seqno)
 
 static int __init selnl_init(void)
 {
+	struct netlink_kernel_cfg cfg = {
+		.groups	= SELNLGRP_MAX,
+	};
+
 	selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX,
-				      SELNLGRP_MAX, NULL, NULL, THIS_MODULE);
+				      THIS_MODULE, &cfg);
 	if (selnl == NULL)
 		panic("SELinux:  Cannot create netlink socket.");
 	netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV);
-- 
1.7.10


^ permalink raw reply related

* [PATCH 0/2] [net-next] Netlink updates
From: pablo @ 2012-06-29 16:15 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following two patches provide a couple of Netlink updates:

* One netlink update to introduce struct netlink_kernel_cfg:

struct netlink_kernel_cfg {
        unsigned int    groups;
        void            (*input)(struct sk_buff *skb);
        struct mutex    *cb_mutex;
};

This structure contains optional parameters to configure one netlink
kernel socket.

eg.

       struct netlink_kernel_cfg cfg = {
               .input  = crypto_netlink_rcv,
       };

       crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO,
                                           THIS_MODULE, &cfg);

This should make it easier in the future to add new optional
configuration parameters without touching the netlink_kernel_create
interface.

I converted all netlink_kernel_create invocations in your tree to
use the new one.

* One patch that adds the bind hook. This hook is used by nfnetlink
to auto-load the appropriate subsystem

The bind hook is called in the netlink_setsockopt and netlink_bind
paths. These are called when registering a user-space netlink event
listener.

Let me provide one example, to further clarify this. If you run:

 conntrack -E

And nf_conntrack_netlink is not loaded, the existing netlink autoload
code loads nfnetlink, but we have no way to autoload nf_conntrack_netlink.

With this new chunk of code, we can run some code in nfnetlink to check
for the group that the user-space listener wants to subscribe to. Then,
it can check what module needs to be auto-loaded, if required.

We got users complaining about this behaviour in the past.

If you like them, please manually apply. I wanted to know if you are
happy with these before pushing them into my tree, as they include
netlink changes.

Thanks!

Pablo Neira Ayuso (2):
  netlink: add netlink_kernel_cfg parameter to netlink_kernel_create
  netlink: add nlk->netlink_bind hook for module auto-loading

 crypto/crypto_user.c                |    7 +++++--
 drivers/connector/connector.c       |   13 +++++++++----
 drivers/infiniband/core/netlink.c   |    7 +++++--
 drivers/scsi/scsi_netlink.c         |    7 +++++--
 drivers/scsi/scsi_transport_iscsi.c |    9 ++++++---
 drivers/staging/gdm72xx/netlink_k.c |    6 ++++--
 include/linux/netlink.h             |   16 +++++++++++-----
 kernel/audit.c                      |    7 +++++--
 lib/kobject_uevent.c                |    5 ++++-
 net/bridge/netfilter/ebt_ulog.c     |    6 ++++--
 net/core/rtnetlink.c                |    9 +++++++--
 net/core/sock_diag.c                |    8 ++++++--
 net/decnet/netfilter/dn_rtmsg.c     |    8 +++++---
 net/ipv4/fib_frontend.c             |    7 +++++--
 net/ipv4/netfilter/ipt_ULOG.c       |    8 +++++---
 net/netfilter/nfnetlink.c           |   36 +++++++++++++++++++++++++++++++++--
 net/netlink/af_netlink.c            |   35 ++++++++++++++++++++++++++++------
 net/netlink/genetlink.c             |   10 +++++++---
 net/xfrm/xfrm_user.c                |    7 +++++--
 security/selinux/netlink.c          |    6 +++++-
 20 files changed, 166 insertions(+), 51 deletions(-)

-- 
1.7.10

^ permalink raw reply

* Re: [PATCH net-next] em_canid: Ematch rule to match CAN frames according to their identifiers
From: Oliver Hartkopp @ 2012-06-29 15:44 UTC (permalink / raw)
  To: Rostislav Lisovy; +Cc: netdev, linux-can, lartc, pisa, sojkam1
In-Reply-To: <1340903231-9561-1-git-send-email-lisovy@gmail.com>

Hello Rostislav,

looks really good now.

1. Your Signed-off-by: is missing.

2. One remark on a removed length check:

(..)

> +static int em_canid_change(struct tcf_proto *tp, void *data, int len,
> +			  struct tcf_ematch *m)
> +{
> +	struct can_filter *conf = data; /* Array with rules,
> +					 * fixed size EM_CAN_RULES_SIZE
> +					 */
> +	struct canid_match *cm;
> +	struct canid_match *cm_old = (struct canid_match *) m->data;
> +	int i;
> +	int rulescnt;
> +


What about a zero length check here?

	if (!len)
		return -EINVAL;

???

> +	if (len % sizeof(struct can_filter))
> +		return -EINVAL;
> +
> +	if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
> +		return -EINVAL;
> +
> +	rulescnt = len / sizeof(struct can_filter);
> +
> +	cm = kzalloc(sizeof(struct canid_match) + sizeof(struct can_filter) *
> +		rulescnt, GFP_KERNEL);
> +	if (!cm)
> +		return -ENOMEM;


The length could alternatively be checked here too

http://lxr.linux.no/#linux+v3.4.4/net/sched/ematch.c#L235

if em->ops->datalen is set.

But there's no

	.datalen = sizeof(struct can_filter),

defined, right?

> +static struct tcf_ematch_ops em_canid_ops = {
> +	.kind	  = TCF_EM_CANID,
> +	.change	  = em_canid_change,
> +	.match	  = em_canid_match,
> +	.destroy  = em_canid_destroy,
> +	.dump	  = em_canid_dump,
> +	.owner	  = THIS_MODULE,
> +	.link	  = LIST_HEAD_INIT(em_canid_ops.link)
> +};


Regards,
Oliver

^ permalink raw reply

* RE: [PATCH 1/5] netfilter: ipset: fix interface comparision in hash-netiface sets
From: David Laight @ 2012-06-29 15:41 UTC (permalink / raw)
  To: pablo, netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-2-git-send-email-pablo@netfilter.org>

> From: Florian Westphal <fw@strlen.de>
> 
> ifname_compare() assumes that skb->dev is zero-padded,
> e.g 'eth1\0\0\0\0\0...'. This isn't always the case. e1000 driver does
> 
> strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
> 
> in e1000_probe(), so once device is registered dev->name memory
contains
> 'eth1\0:0:3\0\0\0' (or something like that), which makes eth1 compare
fail.

strncpy() would normally zero-fill the destination buffer
(at least the libc version does).

So something else must be wrong.

	David



^ permalink raw reply

* Re: [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Andreas Gruenbacher @ 2012-06-29 15:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340982666.21162.3.camel@edumazet-glaptop>

On Fri, 2012-06-29 at 17:11 +0200, Eric Dumazet wrote:
> On Fri, 2012-06-29 at 16:54 +0200, Andreas Gruenbacher wrote:
> > The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
> > page should be put into a new packet even when there is still room left in the
> > previous packet.
> > 
> > In the tcp protocol, messages which are not sent immediately are queued.  When
> > more data is sent, it will be added to the last segment in that queue until
> > that segment is "full" whenever possible; only then is a new segment added.
> > Right now, there is no way to indicate when tcp should start a new segment.
> > The new flag allows to control that.
> > 
> > Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
> > ---
> 
> I don't understand how maintaining any message boundaries at sender can
> prevent any middlebox or the receiver to coalesce frames to any
> boundaries it prefers ?

The primary use case is fast Gigabit (10 or more) Ethernet connections
with jumbo frames and switches that support them.  There, frames will go
through unchanged and you can zero-copy receive all the time.

Not sure how well the approach scales to other kinds of connections; it
may work often enough to be worth it.  When things get distorted between
the sender and the receiver and tcp_recvbio() fails, the data can still
be copied out of the socket as before.

Andreas

^ permalink raw reply

* [PATCH 5/5] netfilter: nfnetlink: fix missing rcu_read_unlock in nfnetlink_rcv_msg
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>

Bug added in commit 6b75e3e8d664a9a (netfilter: nfnetlink: add RCU in
nfnetlink_rcv_msg())

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 3e797d1..791d56b 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -169,8 +169,10 @@ replay:
 
 		err = nla_parse(cda, ss->cb[cb_id].attr_count,
 				attr, attrlen, ss->cb[cb_id].policy);
-		if (err < 0)
+		if (err < 0) {
+			rcu_read_unlock();
 			return err;
+		}
 
 		if (nc->call_rcu) {
 			err = nc->call_rcu(net->nfnl, skb, nlh,
-- 
1.7.10

^ permalink raw reply related

* [PATCH 1/5] netfilter: ipset: fix interface comparision in hash-netiface sets
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

ifname_compare() assumes that skb->dev is zero-padded,
e.g 'eth1\0\0\0\0\0...'. This isn't always the case. e1000 driver does

strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);

in e1000_probe(), so once device is registered dev->name memory contains
'eth1\0:0:3\0\0\0' (or something like that), which makes eth1 compare
fail.

Use plain strcmp() instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_hash_netiface.c |   32 ++++------------------------
 1 file changed, 4 insertions(+), 28 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index ee86394..d5d3607 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -38,30 +38,6 @@ struct iface_node {
 
 #define iface_data(n)	(rb_entry(n, struct iface_node, node)->iface)
 
-static inline long
-ifname_compare(const char *_a, const char *_b)
-{
-	const long *a = (const long *)_a;
-	const long *b = (const long *)_b;
-
-	BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));
-	if (a[0] != b[0])
-		return a[0] - b[0];
-	if (IFNAMSIZ > sizeof(long)) {
-		if (a[1] != b[1])
-			return a[1] - b[1];
-	}
-	if (IFNAMSIZ > 2 * sizeof(long)) {
-		if (a[2] != b[2])
-			return a[2] - b[2];
-	}
-	if (IFNAMSIZ > 3 * sizeof(long)) {
-		if (a[3] != b[3])
-			return a[3] - b[3];
-	}
-	return 0;
-}
-
 static void
 rbtree_destroy(struct rb_root *root)
 {
@@ -99,7 +75,7 @@ iface_test(struct rb_root *root, const char **iface)
 
 	while (n) {
 		const char *d = iface_data(n);
-		long res = ifname_compare(*iface, d);
+		int res = strcmp(*iface, d);
 
 		if (res < 0)
 			n = n->rb_left;
@@ -121,7 +97,7 @@ iface_add(struct rb_root *root, const char **iface)
 
 	while (*n) {
 		char *ifname = iface_data(*n);
-		long res = ifname_compare(*iface, ifname);
+		int res = strcmp(*iface, ifname);
 
 		p = *n;
 		if (res < 0)
@@ -366,7 +342,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	struct hash_netiface4_elem data = { .cidr = HOST_MASK };
 	u32 ip = 0, ip_to, last;
 	u32 timeout = h->timeout;
-	char iface[IFNAMSIZ] = {};
+	char iface[IFNAMSIZ];
 	int ret;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
@@ -663,7 +639,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface6_elem data = { .cidr = HOST_MASK };
 	u32 timeout = h->timeout;
-	char iface[IFNAMSIZ] = {};
+	char iface[IFNAMSIZ];
 	int ret;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
-- 
1.7.10

^ permalink raw reply related

* [PATCH 3/5] netfilter: update location of my trees
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 MAINTAINERS |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index f6e62de..302aa00 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4654,8 +4654,8 @@ L:	netfilter@vger.kernel.org
 L:	coreteam@netfilter.org
 W:	http://www.netfilter.org/
 W:	http://www.iptables.org/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-2.6.git
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next-2.6.git
+T:	git git://1984.lsi.us.es/nf
+T:	git git://1984.lsi.us.es/nf-next
 S:	Supported
 F:	include/linux/netfilter*
 F:	include/linux/netfilter/
-- 
1.7.10

^ permalink raw reply related

* [PATCH 4/5] netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>

This patch fixes a crash if that ipset command is sent over nfnetlink.

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c |   12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 819c342..9730882 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -640,6 +640,14 @@ find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
 }
 
 static int
+ip_set_none(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	return -EOPNOTSUPP;
+}
+
+static int
 ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
@@ -1539,6 +1547,10 @@ nlmsg_failure:
 }
 
 static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+	[IPSET_CMD_NONE]	= {
+		.call		= ip_set_none,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+	},
 	[IPSET_CMD_CREATE]	= {
 		.call		= ip_set_create,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
-- 
1.7.10


^ permalink raw reply related

* [PATCH 2/5] netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>

From: Eric Dumazet <edumazet@google.com>

After call to ip6_route_output() we must release dst or we leak it.

Also should test dst->error, as ip6_route_output() never returns NULL.

Use boolean while we are at it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_ctl.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index dd811b8..d43e3c1 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -76,19 +76,19 @@ static void __ip_vs_del_service(struct ip_vs_service *svc);
 
 #ifdef CONFIG_IP_VS_IPV6
 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(struct net *net,
-				    const struct in6_addr *addr)
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+				     const struct in6_addr *addr)
 {
-	struct rt6_info *rt;
 	struct flowi6 fl6 = {
 		.daddr = *addr,
 	};
+	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+	bool is_local;
 
-	rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
-	if (rt && rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
-		return 1;
+	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
 
-	return 0;
+	dst_release(dst);
+	return is_local;
 }
 #endif
 
-- 
1.7.10


^ permalink raw reply related

* [PATCH 0/5] netfilter fixes for 3.5-rc4
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following are 4 fixes and the update of the MAINTAINERS file
to point to my Netfilter trees.

They are:

* One refcount leak fix in IPVS IPv6 support from Eric Dumazet.

* One fix for interface comparison in ipset hash-netiface sets
  from Florian Westphal.

* One fix for a missing rcu_read_unlock in nfnetlink from
  Tomasz Bursztyka.

* One fix for a kernel crash if IPSET_CMD_NONE is sent to ipset via
  nfnetlink, again from Tomasz Bursztyka.

You can pull these changes from:

git://1984.lsi.us.es/nf master

Thanks!

Eric Dumazet (1):
  netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6

Florian Westphal (1):
  netfilter: ipset: fix interface comparision in hash-netiface sets

Pablo Neira Ayuso (1):
  netfilter: update location of my trees

Tomasz Bursztyka (2):
  netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent
  netfilter: nfnetlink: fix missing rcu_read_unlock in nfnetlink_rcv_msg

 MAINTAINERS                                |    4 ++--
 net/netfilter/ipset/ip_set_core.c          |   12 +++++++++++
 net/netfilter/ipset/ip_set_hash_netiface.c |   32 ++++------------------------
 net/netfilter/ipvs/ip_vs_ctl.c             |   14 ++++++------
 net/netfilter/nfnetlink.c                  |    4 +++-
 5 files changed, 28 insertions(+), 38 deletions(-)

-- 
1.7.10

^ permalink raw reply

* Re: "ADDRCONF(NETDEV_UP): eth0: link is not ready" with IPv6
From: Ben Hutchings @ 2012-06-29 15:24 UTC (permalink / raw)
  To: Arvid Brodin; +Cc: netdev@vger.kernel.org, Alexey Kuznetsov, Stephen Hemminger
In-Reply-To: <4FED14C2.9020200@xdin.com>

On Fri, 2012-06-29 at 02:36 +0000, Arvid Brodin wrote:
> Hi,
> 
> After 'ip link set eth0 up' on an avr32 board (network driver macb), the device ends up in
> operational mode "UNKNOWN":
> 
> # ip link
> 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN qlen 1000
>     link/ether 00:24:74:00:17:9d brd ff:ff:ff:ff:ff:ff
> 
> Unplugging and plugging in the network cable gets the device to mode "UP".
> 
> This is a problem for me because I'm trying to use this device as a "slave" device (for a
> virtual HSR device*) and I need to be able to decide if the slave device is operational or
> not.
> 
> Following Stephen's advice here:
> http://kerneltrap.org/mailarchive/linux-netdev/2008/9/24/3398834 I checked the macb.c code
> and noticed they do not call netif_carrier_off() neither before register_netdev() nor in
> dev_open().

It should be called after register_netdev() and before the driver's
ndo_open implementation returns.

> I added the call before register_netdev(), which fixed the problem. However, if I then
> enable IPv6:
>
> # ip link set eth0 up
> ADDRCONF(NETDEV_UP): eth0: link is not ready
> eth0: link up (100/Full)
> ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready

This looks normal.

> Any idea what is happening / what I'm doing wrong? (This is not just cosmetic; is some
> situations this seems to kill the interface - e.g. ping does not work, down/up does not
> help...) Things work fine without IPv6 configured.

Perhaps some packets sent automatically by IPv6 are triggering a driver
bug?  Or there is a bug in multicast support, which IPv6 always uses.

Ben.

> *N.B. I'm writing a driver for a network protocol called "High-availability Seamless
> Redundancy".

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Eric Dumazet @ 2012-06-29 15:11 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340981690.25226.3.camel@gurkel.linbit>

On Fri, 2012-06-29 at 16:54 +0200, Andreas Gruenbacher wrote:
> The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
> page should be put into a new packet even when there is still room left in the
> previous packet.
> 
> In the tcp protocol, messages which are not sent immediately are queued.  When
> more data is sent, it will be added to the last segment in that queue until
> that segment is "full" whenever possible; only then is a new segment added.
> Right now, there is no way to indicate when tcp should start a new segment.
> The new flag allows to control that.
> 
> Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
> ---

I don't understand how maintaining message boundaries at the sender can
prevent a middlebox or the receiver from coalescing frames to whatever
boundaries it prefers?

^ permalink raw reply

* [patch net-next v2 4/4] dummy: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev
  Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
	shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/dummy.c |   15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index bab0158..9d6a067 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -40,18 +40,6 @@
 
 static int numdummies = 1;
 
-static int dummy_set_address(struct net_device *dev, void *p)
-{
-	struct sockaddr *sa = p;
-
-	if (!is_valid_ether_addr(sa->sa_data))
-		return -EADDRNOTAVAIL;
-
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
-	memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
-	return 0;
-}
-
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
 {
@@ -118,7 +106,7 @@ static const struct net_device_ops dummy_netdev_ops = {
 	.ndo_start_xmit		= dummy_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= set_multicast_list,
-	.ndo_set_mac_address	= dummy_set_address,
+	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_get_stats64	= dummy_get_stats64,
 };
 
@@ -134,6 +122,7 @@ static void dummy_setup(struct net_device *dev)
 	dev->tx_queue_len = 0;
 	dev->flags |= IFF_NOARP;
 	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 	dev->features	|= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO;
 	dev->features	|= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
 	eth_hw_addr_random(dev);
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 3/4] team: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev
  Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
	shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/team/team.c |    9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 89853c3..9b94f53 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1188,10 +1188,11 @@ static int team_set_mac_address(struct net_device *dev, void *p)
 {
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
-	struct sockaddr *addr = p;
+	int err;
 
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
-	memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+	err = eth_mac_addr(dev, p);
+	if (err)
+		return err;
 	rcu_read_lock();
 	list_for_each_entry_rcu(port, &team->port_list, list)
 		if (team->ops.port_change_mac)
@@ -1393,7 +1394,7 @@ static void team_setup(struct net_device *dev)
 	 * bring us to promisc mode in case a unicast addr is added.
 	 * Let this up to underlay drivers.
 	 */
-	dev->priv_flags |= IFF_UNICAST_FLT;
+	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
 
 	dev->features |= NETIF_F_LLTX;
 	dev->features |= NETIF_F_GRO;
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 2/4] virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev
  Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
	shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/virtio_net.c |   11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 36a16d5..1db445b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -679,12 +679,11 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct virtio_device *vdev = vi->vdev;
-	struct sockaddr *addr = p;
+	int ret;
 
-	if (!is_valid_ether_addr(addr->sa_data))
-		return -EADDRNOTAVAIL;
-	memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
-	dev->addr_assign_type &= ~NET_ADDR_RANDOM;
+	ret = eth_mac_addr(dev, p);
+	if (ret)
+		return ret;
 
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
 		vdev->config->set(vdev, offsetof(struct virtio_net_config, mac),
@@ -1063,7 +1062,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 		return -ENOMEM;
 
 	/* Set up network device as normal. */
-	dev->priv_flags |= IFF_UNICAST_FLT;
+	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
 	dev->netdev_ops = &virtnet_netdev;
 	dev->features = NETIF_F_HIGHDMA;
 
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 1/4] net: introduce new priv_flag indicating iface capable of change mac when running
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>

Introduce IFF_LIVE_ADDR_CHANGE priv_flag and use it to disable
netif_running() check in eth_mac_addr()

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/if.h |    2 ++
 net/ethernet/eth.c |    2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/if.h b/include/linux/if.h
index f995c66..1ec407b 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -81,6 +81,8 @@
 #define IFF_UNICAST_FLT	0x20000		/* Supports unicast filtering	*/
 #define IFF_TEAM_PORT	0x40000		/* device used as team port */
 #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
+#define IFF_LIVE_ADDR_CHANGE 0x100000	/* device supports hardware address
+					 * change when it's running */
 
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 36e5880..db6a6c1 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -283,7 +283,7 @@ int eth_mac_addr(struct net_device *dev, void *p)
 {
 	struct sockaddr *addr = p;
 
-	if (netif_running(dev))
+	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
 		return -EBUSY;
 	if (!is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next v2 0/4] net: introduce and use IFF_LIVE_ADDR_CHANGE
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
  To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
	davem

three drivers updated, but this can be used in many others.

v1->v2:
%s/LIFE/LIVE

Jiri Pirko (4):
  net: introduce new priv_flag indicating iface capable of change mac
    when running
  virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
  team: use IFF_LIVE_ADDR_CHANGE priv_flag
  dummy: use IFF_LIVE_ADDR_CHANGE priv_flag

 drivers/net/dummy.c      |   15 ++-------------
 drivers/net/team/team.c  |    9 +++++----
 drivers/net/virtio_net.c |   11 +++++------
 include/linux/if.h       |    2 ++
 net/ethernet/eth.c       |    2 +-
 5 files changed, 15 insertions(+), 24 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Eric Dumazet @ 2012-06-29 15:08 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340981632.25226.2.camel@gurkel.linbit>

On Fri, 2012-06-29 at 16:53 +0200, Andreas Gruenbacher wrote:
> Hello,
> 
> I'm (still) trying to pass data from the network to the block layer without
> copying. The block layer needs blocks to be contiguous in memory, and may have
> some alignment restrictions as well.  A lot of modern network hardware will
> receive large packets into separate buffers, so individual large packets will
> end up in contiguous, aligned buffers.  I would like to make use of that, but
> tcp currently doesn't allow me to control what ends up in which packets.
> 
> This patch series introduces a new flag for indicating to tcp when it should
> start a new segment. Using that on the sender side, I can get data over the
> network with no cpu copying at all.
> 
> [My last posting on this topic from May 8 is archived here:
>  http://www.spinics.net/lists/netdev/msg197788.html ]
> 
> Thanks,
> Andreas
> 
> Andreas Gruenbacher (3):
>   tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
>   tcp: Zero-copy receive from a socket into a bio
>   fs: Export bio_release_pages()

This looks like yet another zero-copy mechanism, needing another couple of
hundred lines.

Why doesn't the splice infrastructure fit your needs?

^ permalink raw reply

* [RFC] [TCP 3/3] fs: Export bio_release_pages()
From: Andreas Gruenbacher @ 2012-06-29 14:56 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller

Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
---
 fs/bio.c            |    3 ++-
 include/linux/bio.h |    1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/bio.c b/fs/bio.c
index 73922ab..90501a5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1335,7 +1335,7 @@ void bio_set_pages_dirty(struct bio *bio)
 	}
 }
 
-static void bio_release_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio)
 {
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int i;
@@ -1347,6 +1347,7 @@ static void bio_release_pages(struct bio *bio)
 			put_page(page);
 	}
 }
+EXPORT_SYMBOL(bio_release_pages);
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2643589..268ec49 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -246,6 +246,7 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 				 gfp_t, int);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
+extern void bio_release_pages(struct bio *bio);
 
 #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 # error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-- 
1.7.10.2

^ permalink raw reply related

* [RFC] [TCP 2/3] tcp: Zero-copy receive from a socket into a bio
From: Andreas Gruenbacher @ 2012-06-29 14:55 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller

"Receive" data from a tcp socket by directly mapping sectors in the socket receive
buffers into a bio without copying.  This requires that the receive buffer
contains contiguous sectors which are well-enough aligned for the block device
associated with the bio.

Any data that cannot be mapped into the bio is left in the socket receive
buffers and can be received conventionally, by copying it out of the buffers.

Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
---
 include/net/tcp.h      |    3 +
 net/ipv4/Makefile      |    3 +-
 net/ipv4/tcp_recvbio.c |  168 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/tcp_recvbio.c

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e79aa48..c4d924b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -538,6 +538,9 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			 sk_read_actor_t recv_actor);
 
+/* tcp_recvbio.c */
+extern int tcp_recvbio(struct sock *sk, struct bio *bio, size_t size);
+
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
 extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3b..7ee9f92 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -11,7 +11,8 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
-	     inet_fragment.o ping.o
+	     inet_fragment.o ping.o \
+	     tcp_recvbio.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/tcp_recvbio.c b/net/ipv4/tcp_recvbio.c
new file mode 100644
index 0000000..4d6f833
--- /dev/null
+++ b/net/ipv4/tcp_recvbio.c
@@ -0,0 +1,168 @@
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+static int tcp_recvbio_add(struct bio *bio, struct sk_buff *skb,
+			   struct bio_vec *last)
+{
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+	unsigned short vcnt = bio->bi_vcnt;
+	int ret;
+
+	if (vcnt == queue_max_segments(q))
+		return 0;
+	if (!blk_rq_aligned(q, last->bv_offset, last->bv_len))
+		return -EOPNOTSUPP;
+	ret = bio_add_page(bio, last->bv_page, last->bv_len, last->bv_offset);
+	if (vcnt != bio->bi_vcnt)
+		get_page(last->bv_page);
+	return ret;
+}
+
+static int tcp_recvbio_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+			    unsigned int offset, size_t len)
+{
+	struct bio *bio = rd_desc->arg.data;
+	int start = skb_headlen(skb), consumed = 0, frag_len, i;
+	struct sk_buff *frag_iter;
+	struct bio_vec last = { };
+	int ret = 0;
+
+	if (offset > (int)skb->len - len)
+		return -EFAULT;
+
+	/* Do not consume more data than we need.  */
+	if (len > rd_desc->count)
+		len = rd_desc->count;
+
+	/* Head of the skb */
+	frag_len = start - offset;
+	if (frag_len > 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		frag_len = end - offset;
+		if (frag_len > 0) {
+			if (frag_len > len)
+				frag_len = len;
+
+			last.bv_page = skb_frag_page(frag);
+			last.bv_offset = frag->page_offset + offset - start;
+			last.bv_len = frag_len;
+			ret = tcp_recvbio_add(bio, skb, &last);
+			if (ret <= 0)
+				goto out;
+			consumed += frag_len;
+			len -= frag_len;
+			if (!len)
+				break;
+			offset += frag_len;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		frag_len = end - offset;
+		if (frag_len > 0) {
+			if (frag_len > len)
+				frag_len = len;
+
+			ret = tcp_recvbio_data(rd_desc, frag_iter, offset -
+					       start, frag_len);
+			if (ret <= 0)
+				goto out;
+			consumed += frag_len;
+			len -= frag_len;
+			if (!len)
+				break;
+			offset += frag_len;
+		}
+		start = end;
+	}
+
+out:
+	rd_desc->written += consumed;
+	rd_desc->count -= consumed;
+	return consumed ? consumed : ret;
+}
+
+/**
+ * tcp_recvbio  -  zero-copy receive from a socket into a bio
+ * @sk: socket to receive from
+ * @bio: empty bio to receive into
+ * @size: number of bytes to receive
+ *
+ * Directly add page fragments from @sk's receive buffer to @bio.  The page
+ * fragments are held referenced with get_page().  Release those references
+ * with bio_release_pages() when done.
+ *
+ * Returns the number of bytes received into @bio.
+ */
+int tcp_recvbio(struct sock *sk, struct bio *bio, size_t size)
+{
+	long timeo = sock_rcvtimeo(sk, 0);
+	read_descriptor_t rd_desc = {
+		.count = size,
+		.arg = { .data = bio },
+	};
+	int ret = 0;
+
+	BUG_ON(bio->bi_idx != 0);
+
+	lock_sock(sk);
+	while (rd_desc.count) {
+		read_lock(&sk->sk_callback_lock);
+		ret = tcp_read_sock(sk, &rd_desc, tcp_recvbio_data);
+		read_unlock(&sk->sk_callback_lock);
+		if (ret < 0)
+			break;
+		else if (ret > 0)
+			timeo = sock_rcvtimeo(sk, 0);
+		else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				/*
+				 * This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				if (!sock_flag(sk, SOCK_DONE))
+					ret = -ENOTCONN;
+				break;
+                        }
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			sk_wait_data(sk, &timeo);
+			if (signal_pending(current)) {
+				ret = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				break;
+			}
+			timeo = 0;
+		}
+	}
+	release_sock(sk);
+	return rd_desc.written ? rd_desc.written : ret;
+}
+EXPORT_SYMBOL(tcp_recvbio);
-- 
1.7.10.2

^ permalink raw reply related

* [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Andreas Gruenbacher @ 2012-06-29 14:54 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller

The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
page should be put into a new packet even when there is still room left in the
previous packet.

In the tcp protocol, messages which are not sent immediately are queued.  When
more data is sent, it will be added to the last segment in that queue until
that segment is "full" whenever possible; only then is a new segment added.
Right now, there is no way to indicate when tcp should start a new segment.
The new flag allows controlling that.

Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
---
 include/linux/socket.h |    1 +
 net/ipv4/tcp.c         |    5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322..be166de 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -266,6 +266,7 @@ struct ucred {
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
+#define MSG_NEW_PACKET	0x40000	/* tcp: try to put message into a new packet */
 #define MSG_EOF         MSG_FIN

 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exit for file
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f..148aebe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -854,7 +854,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
 		bool can_coalesce;

-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		    (flags & MSG_NEW_PACKET)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -1044,7 +1045,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				copy = max - skb->len;
 			}

-			if (copy <= 0) {
+			if (copy <= 0 || (flags & MSG_NEW_PACKET)) {
 new_segment:
 				/* Allocate new segment. If the interface is SG,
 				 * allocate skb fitting to single page.
-- 
1.7.10.2

^ permalink raw reply related

* [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Andreas Gruenbacher @ 2012-06-29 14:53 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller

Hello,

I'm (still) trying to pass data from the network to the block layer without
copying. The block layer needs blocks to be contiguous in memory, and may have
some alignment restrictions as well.  A lot of modern network hardware will
receive large packets into separate buffers, so individual large packets will
end up in contiguous, aligned buffers.  I would like to make use of that, but
tcp currently doesn't allow me to control what ends up in which packets.

This patch series introduces a new flag for indicating to tcp when it should
start a new segment. Using that on the sender side, I can get data over the
network with no cpu copying at all.

[My last posting on this topic from May 8 is archived here:
 http://www.spinics.net/lists/netdev/msg197788.html ]

Thanks,
Andreas

Andreas Gruenbacher (3):
  tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
  tcp: Zero-copy receive from a socket into a bio
  fs: Export bio_release_pages()

 fs/bio.c               |    3 +-
 include/linux/bio.h    |    1 +
 include/linux/socket.h |    1 +
 include/net/tcp.h      |    3 +
 net/ipv4/Makefile      |    3 +-
 net/ipv4/tcp.c         |    5 +-
 net/ipv4/tcp_recvbio.c |  168 ++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 180 insertions(+), 4 deletions(-)
 create mode 100644 net/ipv4/tcp_recvbio.c

-- 
1.7.10.2

^ permalink raw reply

* Re: [PATCH net-next 2/2] r8169: support RTL8168G
From: Francois Romieu @ 2012-06-29 13:51 UTC (permalink / raw)
  To: Hayes Wang; +Cc: netdev, linux-kernel
In-Reply-To: <1340966060-2749-2-git-send-email-hayeswang@realtek.com>

Hayes Wang <hayeswang@realtek.com> :
[...]
> @@ -264,6 +267,11 @@ static const struct {
>  	[RTL_GIGA_MAC_VER_39] =
>  		_R("RTL8106e",		RTL_TD_1, FIRMWARE_8106E_1,
>  							JUMBO_1K, true),
> +	[RTL_GIGA_MAC_VER_40] =
> +		_R("RTL8168g/8111g",	RTL_TD_1, FIRMWARE_8168G_1,
> +							JUMBO_9K, false),
> +	[RTL_GIGA_MAC_VER_41] =
> +		_R("RTL8168g/8111g",	RTL_TD_1, NULL, JUMBO_9K, false),

You may explicitly state that jumbo operation requires no special action
by completing rtl_init_jumbo_ops.

(no checksumming with jumbo, sigh)

[...]
>  static void rtl_lock_work(struct rtl8169_private *tp)
>  {
> @@ -919,6 +936,99 @@ static int r8168dp_check_dash(struct rtl8169_private *tp)
>  	return (ocp_read(tp, 0x0f, reg) & 0x00008000) ? 1 : 0;
>  }
>  
> +static void r8168_phy_ocp_write(void __iomem *ioaddr, u32 reg, u32 data)
> +{
> +	int i;
> +
> +	if (reg & 0xffff0001)
> +		BUG();

The patch adds a lot of BUG(). BUG is terrible from a system or end user
viewpoint.

Were they only a devel helper or are they still supposed to be of use
in the future ? If the latter applies, why ?

[...]
> +static u16 r8168_phy_ocp_read(void __iomem *ioaddr, u32 reg)
> +{
> +	int i;
> +	u32 data;
> +
> +	if (reg & 0xffff0001)
> +		BUG();
> +
> +	RTL_W32(GPHY_OCP, (reg << 15));

You can save on parenthesis here.

[...]
> +static void r8168g_mdio_write(void __iomem *ioaddr, int reg_addr, int value)
> +{
> +	if (reg_addr == 0x1f)
> +		return;
> +
> +	r8168_phy_ocp_write(ioaddr, 0xa400 + reg_addr * 2, value);
> +}
> +
> +static int r8168g_mdio_read(void __iomem *ioaddr, int reg_addr)
> +{
> +	return r8168_phy_ocp_read(ioaddr, 0xa400 + reg_addr * 2);
> +}

#define XYZ_{BASE/OFFSET}	0xa400 ?

[...]
> @@ -2241,6 +2355,92 @@ static void rtl_phy_write_fw(struct rtl8169_private *tp, struct rtl_fw *rtl_fw)
>  	}
>  }
>  
> +static void rtl_ocp_write_fw(struct rtl8169_private *tp, struct rtl_fw *rtl_fw)
> +{
> +	struct rtl_fw_phy_action *pa = &rtl_fw->phy_action;
> +	void __iomem *ioaddr = tp->mmio_addr;
> +	u32 predata, count;
> +	u32 base_addr;
> +	size_t index;
> +
> +	predata = count = 0;
> +	base_addr = 0xa400;
> +
> +	for (index = 0; index < pa->size; ) {
> +		u32 action = le32_to_cpu(pa->code[index]);
> +		u32 data = action & 0x0000ffff;
> +		u32 regno = (action & 0x0fff0000) >> 16;
> +
> +		if (!action)
> +			break;
> +
> +		switch(action & 0xf0000000) {
> +		case PHY_READ:
> +			predata = r8168_phy_ocp_read(ioaddr,
> +					base_addr + (regno -16) * 2);
> +			count++;
> +			index++;
> +			break;
[duplicated code removed]
> +		case PHY_WRITE:
> +			if (regno == 0x1f)
> +				base_addr = data << 4;
> +			else
> +				r8168_phy_ocp_write(ioaddr,
> +						base_addr + (regno - 0x10) * 2,
> +						data);
> +			index++;
> +			break;
[duplicated code removed]
> +		case PHY_WRITE_PREVIOUS:
> +			r8168_phy_ocp_write(ioaddr, base_addr + (regno -16) * 2,
> +					    predata);
> +			index++;
> +			break;

I can't believe that the hardware people have designed something which
needs a different firmware write method, especially as it copies a lot
of code.

How did you come to the conclusion that it was not possible to hide this
stuff behind r8168g_mdio_{read / write} ?

I would not mind replacing the PHY_{READ/WRITE/WRITE_PREVIOUS} case with
chipset specific {READ/WRITE/WRITE_PREVIOUS} methods as long as the
semantics look the same, but going through a different (*write_fw) does not
trivially seem to be the best abstraction.

[...]
> @@ -3221,6 +3421,56 @@ static void rtl8411_hw_phy_config(struct rtl8169_private *tp)
>  	rtl_writephy(tp, 0x1f, 0x0000);
>  }
>  
> +static void rtl8168g_1_hw_phy_config(struct rtl8169_private *tp)
> +{
> +	void __iomem *ioaddr = tp->mmio_addr;
> +	u32 mac_ocp_addr, i;
> +	static const u16 mac_ocp_patch[] = {
> +		0xE008, 0xE01B, 0xE01D, 0xE01F,
> +		0xE021, 0xE023, 0xE025, 0xE027,
> +		0x49D2 ,0xF10D, 0x766C, 0x49E2,
> +		0xF00A, 0x1EC0, 0x8EE1, 0xC60A,
> +		0x77C0, 0x4870, 0x9FC0, 0x1EA0,
> +		0xC707, 0x8EE1, 0x9D6C, 0xC603,
> +		0xBE00, 0xB416, 0x0076, 0xE86C,
> +		0xC602, 0xBE00, 0x0000, 0xC602,
> +		0xBE00, 0x0000, 0xC602, 0xBE00,
> +		0x0000, 0xC602, 0xBE00, 0x0000,
> +		0xC602, 0xBE00, 0x0000, 0xC602,
> +		0xBE00, 0x0000, 0xC602, 0xBE00,
> +		0x0000, 0x0000, 0x0000, 0x0000

Please s/\(.*\)/\L\1/

> +	};
> +
> +	/* patch code for GPHY reset */
> +	mac_ocp_addr = 0xf800;
> +	for (i = 0; mac_ocp_addr < 0xf868; i++) {
> +		r8168_mac_ocp_write(ioaddr, mac_ocp_addr, mac_ocp_patch[i]);
> +		mac_ocp_addr += 2;
> +	}

	for (i = 0; i < ARRAY_SIZE(mac_ocp_patch); i++)
		r8168_mac_ocp_write(ioaddr, 0xf800 + 2*i, mac_ocp_patch[i]);

The array must be correctly sized anyway. :o)

You may save a bit on the 'mac_ocp_patch' identifier and replace 0xf800 with
a #define.

> +	r8168_mac_ocp_write(ioaddr, 0xfc26, 0x8000);
> +	r8168_mac_ocp_write(ioaddr, 0xfc28, 0x0075);
> +
> +	rtl_apply_firmware(tp);
> +
> +	if (r8168_phy_ocp_read(ioaddr, 0xa460) & 0x0100)
> +		rtl_w1w0_phy_ocp(ioaddr, 0xbcc4, 0x0000, 0x8000);
> +	else
> +		rtl_w1w0_phy_ocp(ioaddr, 0xbcc4, 0x8000, 0x0000);
> +
> +	if (r8168_phy_ocp_read(ioaddr, 0xa466) & 0x0100)
> +		rtl_w1w0_phy_ocp(ioaddr, 0xc41a, 0x0002, 0x0000);
> +	else
> +		rtl_w1w0_phy_ocp(ioaddr, 0xbcc4, 0x0000, 0x0002);
> +
> +	rtl_w1w0_phy_ocp(ioaddr, 0xa442, 0x000c, 0x0000);
> +	rtl_w1w0_phy_ocp(ioaddr, 0xa4b2, 0x0004, 0x0000);
> +
> +	r8168_phy_ocp_write(ioaddr, 0xa436, 0x8012);
> +	rtl_w1w0_phy_ocp(ioaddr, 0xa438, 0x8000, 0x0000);
> +
> +	rtl_w1w0_phy_ocp(ioaddr, 0xc422, 0x4000, 0x2000);
> +}

Is there any chance for this part to be a bit more literate ?

[...]
> @@ -4921,6 +5193,28 @@ static void rtl_hw_start_8411(struct rtl8169_private *tp)
>  		     ERIAR_EXGMAC);
>  }
>  
> +static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
> +{
> +	void __iomem *ioaddr = tp->mmio_addr;
> +	struct pci_dev *pdev = tp->pci_dev;
> +
> +	rtl_eri_write(ioaddr, 0xc8, ERIAR_MASK_0101, 0x080002, ERIAR_EXGMAC);
> +	rtl_eri_write(ioaddr, 0xcc, ERIAR_MASK_0001, 0x38, ERIAR_EXGMAC);
> +	rtl_eri_write(ioaddr, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC);
> +	rtl_eri_write(ioaddr, 0xe8, ERIAR_MASK_1111, 0x00100006, ERIAR_EXGMAC);

> +	rtl_csi_access_enable_1(tp);

> +	rtl_tx_performance_tweak(pdev, 0x5 << MAX_READ_REQUEST_SHIFT);

> +	rtl_w1w0_eri(ioaddr, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
> +	rtl_w1w0_eri(ioaddr, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);

> +	RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
> +	RTL_W32(MISC, RTL_R32(MISC) & ~RXDV_GATED_EN);
> +	RTL_W8(MaxTxPacketSize, EarlySize);

> +	rtl_eri_write(ioaddr, 0xc0, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
> +	rtl_eri_write(ioaddr, 0xb8, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);

> +	RTL_W8(EEE_LED, RTL_R8(EEE_LED) & ~0x07);

> +	rtl_w1w0_eri(ioaddr, 0x2fc, ERIAR_MASK_0001, 0x01, 0x02, ERIAR_EXGMAC);
> +}

(ok, now it can be compared with similar functions)

[...]
> @@ -6491,6 +6790,47 @@ static unsigned rtl_try_msi(struct rtl8169_private *tp,
>  	return msi;
>  }
>  
> +static void __devinit rtl_hw_init_8168g(struct rtl8169_private *tp)
> +{
> +	void __iomem *ioaddr = tp->mmio_addr;
> +	u32 tmp_data;
> +
> +	RTL_W32(MISC, RTL_R32(MISC) | RXDV_GATED_EN);
> +	while (!(RTL_R32(TxConfig) & TXCFG_EMPTY))
> +		udelay(100);
> +
> +	while ((RTL_R8(MCU) & (TX_EMPTY | RX_EMPTY)) != (TX_EMPTY | RX_EMPTY))
> +		udelay(100);

#define RXTX_EMPTY	(TX_EMPTY | RX_EMPTY) ?

> +
> +	RTL_W8(ChipCmd, RTL_R8(ChipCmd) & ~(CmdTxEnb | CmdRxEnb));
> +	msleep(1);
> +	RTL_W8(MCU, RTL_R8(MCU) & ~NOW_IS_OOB);
> +
> +	tmp_data = r8168_mac_ocp_read(ioaddr, 0xe8de);
> +	tmp_data &= ~(1 << 14);
> +	r8168_mac_ocp_write(ioaddr, 0xe8de, tmp_data);
> +	while (!(RTL_R8(MCU) & LINK_LIST_RDY))
> +		udelay(100);
> +
> +	tmp_data = r8168_mac_ocp_read(ioaddr, 0xe8de);

Same 0xe8de offset used twice. #define ?

> +	tmp_data |= (1 << 15);
> +	r8168_mac_ocp_write(ioaddr, 0xe8de, tmp_data);
> +	while (!(RTL_R8(MCU) & LINK_LIST_RDY))
> +		udelay(100);
> +}
> +
> +static void __devinit rtl_hw_initialize(struct rtl8169_private *tp)
> +{
> +	switch (tp->mac_version) {
> +	case RTL_GIGA_MAC_VER_40:
> +	case RTL_GIGA_MAC_VER_41:
> +		rtl_hw_init_8168g(tp);
> +		break;
> +	default:
> +		break;
> +	}
> +}

Why doesn't it belong to hw_start ?

Is it completely unneeded if the device requires a rtl8169_hw_reset,
resumes or such ?

Thanks.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH net-next 1/2] r8169: support RTL8106E
From: Francois Romieu @ 2012-06-29 13:50 UTC (permalink / raw)
  To: Hayes Wang; +Cc: netdev, linux-kernel
In-Reply-To: <1340966060-2749-1-git-send-email-hayeswang@realtek.com>

Hayes Wang <hayeswang@realtek.com> :
[...]
> Support the new chip RTL8106E.

I'll give it a try this week end.

Thanks.

-- 
Ueimor

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox