* [PATCH net-next 1/3] net: if_arp: add ARPHRD_NETLINK type
2013-06-21 17:38 [PATCH net-next 0/3] virtual netlink device for packet sockets Daniel Borkmann
@ 2013-06-21 17:38 ` Daniel Borkmann
2013-06-21 17:38 ` [PATCH net-next 2/3] net: netlink: virtual tap device management Daniel Borkmann
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Daniel Borkmann @ 2013-06-21 17:38 UTC (permalink / raw)
To: davem; +Cc: netdev
This small patch adds the definition of ARPHRD_NETLINK which can for
example be used by netlink monitoring devices as device type. So that
sockaddr_ll can pick it up and based on that choose the correct packet
dissector.
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
include/uapi/linux/if_arp.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index 82c7d1b..d7fea34 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -93,6 +93,7 @@
#define ARPHRD_PHONET_PIPE 821 /* PhoNet pipe header */
#define ARPHRD_CAIF 822 /* CAIF media type */
#define ARPHRD_IP6GRE 823 /* GRE over IPv6 */
+#define ARPHRD_NETLINK 824 /* Netlink header */
#define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */
#define ARPHRD_NONE 0xFFFE /* zero header length */
--
1.7.11.7
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH net-next 2/3] net: netlink: virtual tap device management
2013-06-21 17:38 [PATCH net-next 0/3] virtual netlink device for packet sockets Daniel Borkmann
2013-06-21 17:38 ` [PATCH net-next 1/3] net: if_arp: add ARPHRD_NETLINK type Daniel Borkmann
@ 2013-06-21 17:38 ` Daniel Borkmann
2013-06-21 17:38 ` [PATCH net-next 3/3] packet: nlmon: virtual netlink monitoring device for packet sockets Daniel Borkmann
2013-06-24 23:39 ` [PATCH net-next 0/3] virtual netlink " David Miller
3 siblings, 0 replies; 5+ messages in thread
From: Daniel Borkmann @ 2013-06-21 17:38 UTC (permalink / raw)
To: davem; +Cc: netdev
Similarly to the networking receive path with ptype_all taps, we add
the possibility to register netdevices that are for ARPHRD_NETLINK to
the netlink subsystem, so that those can be used for netlink analyzers
resp. debuggers. We do not offer a direct callback function as out-of-tree
modules could do crap with it. Instead, a netdevice must be registered
properly and only receives a clone, managed by the netlink layer. Symbols
are exported as GPL-only.
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
include/linux/netlink.h | 10 +++++
net/netlink/af_netlink.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 117 insertions(+)
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f78b430..86fde81a 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -145,4 +145,14 @@ static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
return __netlink_dump_start(ssk, skb, nlh, control);
}
+struct netlink_tap {
+ struct net_device *dev;
+ struct module *module;
+ struct list_head list;
+};
+
+extern int netlink_add_tap(struct netlink_tap *nt);
+extern int __netlink_remove_tap(struct netlink_tap *nt);
+extern int netlink_remove_tap(struct netlink_tap *nt);
+
#endif /* __LINUX_NETLINK_H */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 8978755..796f1a5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -57,6 +57,7 @@
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
+#include <linux/if_arp.h>
#include <asm/cacheflush.h>
#include <net/net_namespace.h>
@@ -101,6 +102,9 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
static ATOMIC_NOTIFIER_HEAD(netlink_chain);
+static DEFINE_SPINLOCK(netlink_tap_lock);
+static struct list_head netlink_tap_all __read_mostly;
+
static inline u32 netlink_group_mask(u32 group)
{
return group ? 1 << (group - 1) : 0;
@@ -111,6 +115,100 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
}
+int netlink_add_tap(struct netlink_tap *nt)
+{
+ if (unlikely(nt->dev->type != ARPHRD_NETLINK))
+ return -EINVAL;
+
+ spin_lock(&netlink_tap_lock);
+ list_add_rcu(&nt->list, &netlink_tap_all);
+ spin_unlock(&netlink_tap_lock);
+
+ if (nt->module)
+ __module_get(nt->module);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netlink_add_tap);
+
+int __netlink_remove_tap(struct netlink_tap *nt)
+{
+ bool found = false;
+ struct netlink_tap *tmp;
+
+ spin_lock(&netlink_tap_lock);
+
+ list_for_each_entry(tmp, &netlink_tap_all, list) {
+ if (nt == tmp) {
+ list_del_rcu(&nt->list);
+ found = true;
+ goto out;
+ }
+ }
+
+ pr_warn("__netlink_remove_tap: %p not found\n", nt);
+out:
+ spin_unlock(&netlink_tap_lock);
+
+ if (found && nt->module)
+ module_put(nt->module);
+
+ return found ? 0 : -ENODEV;
+}
+EXPORT_SYMBOL_GPL(__netlink_remove_tap);
+
+int netlink_remove_tap(struct netlink_tap *nt)
+{
+ int ret;
+
+ ret = __netlink_remove_tap(nt);
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(netlink_remove_tap);
+
+static int __netlink_deliver_tap_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct sk_buff *nskb;
+ int ret = -ENOMEM;
+
+ dev_hold(dev);
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (nskb) {
+ nskb->dev = dev;
+ ret = dev_queue_xmit(nskb);
+ if (unlikely(ret > 0))
+ ret = net_xmit_errno(ret);
+ }
+
+ dev_put(dev);
+ return ret;
+}
+
+static void __netlink_deliver_tap(struct sk_buff *skb)
+{
+ int ret;
+ struct netlink_tap *tmp;
+
+ list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
+ ret = __netlink_deliver_tap_skb(skb, tmp->dev);
+ if (unlikely(ret))
+ break;
+ }
+}
+
+static void netlink_deliver_tap(struct sk_buff *skb)
+{
+ rcu_read_lock();
+
+ if (unlikely(!list_empty(&netlink_tap_all)))
+ __netlink_deliver_tap(skb);
+
+ rcu_read_unlock();
+}
+
static void netlink_overrun(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
@@ -1518,6 +1616,8 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
int len = skb->len;
+ netlink_deliver_tap(skb);
+
#ifdef CONFIG_NETLINK_MMAP
if (netlink_skb_is_mmaped(skb))
netlink_queue_mmaped_skb(sk, skb);
@@ -1578,6 +1678,11 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
ret = -ECONNREFUSED;
if (nlk->netlink_rcv != NULL) {
+ /* We could do a netlink_deliver_tap(skb) here as well
+ * but since this is intended for the kernel only, we
+ * should rather let it stay under the hood.
+ */
+
ret = skb->len;
netlink_skb_set_owner_r(skb, sk);
NETLINK_CB(skb).sk = ssk;
@@ -2975,6 +3080,8 @@ static int __init netlink_proto_init(void)
nl_table[i].compare = netlink_compare;
}
+ INIT_LIST_HEAD(&netlink_tap_all);
+
netlink_add_usersock_entry();
sock_register(&netlink_family_ops);
--
1.7.11.7
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH net-next 3/3] packet: nlmon: virtual netlink monitoring device for packet sockets
2013-06-21 17:38 [PATCH net-next 0/3] virtual netlink device for packet sockets Daniel Borkmann
2013-06-21 17:38 ` [PATCH net-next 1/3] net: if_arp: add ARPHRD_NETLINK type Daniel Borkmann
2013-06-21 17:38 ` [PATCH net-next 2/3] net: netlink: virtual tap device management Daniel Borkmann
@ 2013-06-21 17:38 ` Daniel Borkmann
2013-06-24 23:39 ` [PATCH net-next 0/3] virtual netlink " David Miller
3 siblings, 0 replies; 5+ messages in thread
From: Daniel Borkmann @ 2013-06-21 17:38 UTC (permalink / raw)
To: davem; +Cc: netdev
Currently, there is no good possibility to debug netlink traffic that
is being exchanged between kernel and user space. Therefore, this patch
implements a netlink virtual device, so that netlink messages will be
made visible to PF_PACKET sockets. Once there was an approach with a
similar idea [1], but it got forgotten somehow.
I think it makes most sense to accept the "overhead" of an extra netlink
net device over implementing the same functionality from PF_PACKET
sockets once again into netlink sockets. We have BPF filters that can
already be easily applied which even have netlink extensions, we have
RX_RING zero-copy between kernel- and user space that can be reused,
and much more features. So instead of re-implementing all of this, we
simply pass the skb to a given PF_PACKET socket for further analysis.
Another nice benefit that comes from that is that no code needs to be
changed in user space packet analyzers (maybe adding a dissector, but
not more), thus out of the box, we can already capture pcap files of
netlink traffic to debug/troubleshoot netlink problems.
Also thanks goes to Thomas Graf, Flavio Leitner, Jesper Dangaard Brouer.
[1] http://marc.info/?l=linux-netdev&m=113813401516110
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
drivers/net/Kconfig | 10 +++
drivers/net/Makefile | 1 +
drivers/net/nlmon.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 181 insertions(+)
create mode 100644 drivers/net/nlmon.c
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 3835321..bdcbaf4 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -244,6 +244,16 @@ config VIRTIO_NET
This is the virtual network driver for virtio. It can be used with
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+config NLMON
+ tristate "Virtual netlink monitoring device"
+ ---help---
+ This option enables a monitoring net device for netlink skbs. The
+ purpose of this is to analyze netlink messages with packet sockets.
+ Thus applications like tcpdump will be able to see local netlink
+ messages if they tap into the netlink device, record pcaps for further
+ diagnostics, etc. This is mostly intended for developers or support
+ to debug netlink issues. If unsure, say N.
+
endif # NET_CORE
config SUNGEM_PHY
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index ef3d090..3fef8a8 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_TUN) += tun.o
obj-$(CONFIG_VETH) += veth.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
obj-$(CONFIG_VXLAN) += vxlan.o
+obj-$(CONFIG_NLMON) += nlmon.o
#
# Networking Drivers
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
new file mode 100644
index 0000000..dc364be
--- /dev/null
+++ b/drivers/net/nlmon.c
@@ -0,0 +1,170 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/if_arp.h>
+
+struct pcpu_lstats {
+ u64 packets;
+ u64 bytes;
+ struct u64_stats_sync syncp;
+};
+
+static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ int len = skb->len;
+ struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->bytes += len;
+ stats->packets++;
+ u64_stats_update_end(&stats->syncp);
+
+ dev_kfree_skb(skb);
+
+ return NETDEV_TX_OK;
+}
+
+static int nlmon_is_valid_mtu(int new_mtu)
+{
+ return new_mtu >= sizeof(struct nlmsghdr) && new_mtu <= INT_MAX;
+}
+
+static int nlmon_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (!nlmon_is_valid_mtu(new_mtu))
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static int nlmon_dev_init(struct net_device *dev)
+{
+ dev->lstats = alloc_percpu(struct pcpu_lstats);
+
+ return dev->lstats == NULL ? -ENOMEM : 0;
+}
+
+static void nlmon_dev_uninit(struct net_device *dev)
+{
+ free_percpu(dev->lstats);
+}
+
+static struct netlink_tap nlmon_tap;
+
+static int nlmon_open(struct net_device *dev)
+{
+ return netlink_add_tap(&nlmon_tap);
+}
+
+static int nlmon_close(struct net_device *dev)
+{
+ return netlink_remove_tap(&nlmon_tap);
+}
+
+static struct rtnl_link_stats64 *
+nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ int i;
+ u64 bytes = 0, packets = 0;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_lstats *nl_stats;
+ u64 tbytes, tpackets;
+ unsigned int start;
+
+ nl_stats = per_cpu_ptr(dev->lstats, i);
+
+ do {
+ start = u64_stats_fetch_begin_bh(&nl_stats->syncp);
+ tbytes = nl_stats->bytes;
+ tpackets = nl_stats->packets;
+ } while (u64_stats_fetch_retry_bh(&nl_stats->syncp, start));
+
+ packets += tpackets;
+ bytes += tbytes;
+ }
+
+ stats->rx_packets = packets;
+ stats->tx_packets = 0;
+
+ stats->rx_bytes = bytes;
+ stats->tx_bytes = 0;
+
+ return stats;
+}
+
+static u32 always_on(struct net_device *dev)
+{
+ return 1;
+}
+
+static const struct ethtool_ops nlmon_ethtool_ops = {
+ .get_link = always_on,
+};
+
+static const struct net_device_ops nlmon_ops = {
+ .ndo_init = nlmon_dev_init,
+ .ndo_uninit = nlmon_dev_uninit,
+ .ndo_open = nlmon_open,
+ .ndo_stop = nlmon_close,
+ .ndo_start_xmit = nlmon_xmit,
+ .ndo_get_stats64 = nlmon_get_stats64,
+ .ndo_change_mtu = nlmon_change_mtu,
+};
+
+static struct netlink_tap nlmon_tap __read_mostly = {
+ .module = THIS_MODULE,
+};
+
+static void nlmon_setup(struct net_device *dev)
+{
+ dev->type = ARPHRD_NETLINK;
+ dev->tx_queue_len = 0;
+
+ dev->netdev_ops = &nlmon_ops;
+ dev->ethtool_ops = &nlmon_ethtool_ops;
+ dev->destructor = free_netdev;
+
+ dev->features = NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
+ dev->flags = IFF_NOARP;
+
+ /* That's rather a softlimit here, which, of course,
+ * can be altered. Not a real MTU, but what is to be
+ * expected in most cases.
+ */
+ dev->mtu = NLMSG_GOODSIZE;
+}
+
+static __init int nlmon_register(void)
+{
+ int err;
+ struct net_device *nldev;
+
+ nldev = nlmon_tap.dev = alloc_netdev(0, "netlink", nlmon_setup);
+ if (unlikely(nldev == NULL))
+ return -ENOMEM;
+
+ err = register_netdev(nldev);
+ if (unlikely(err))
+ free_netdev(nldev);
+
+ return err;
+}
+
+static __exit void nlmon_unregister(void)
+{
+ struct net_device *nldev = nlmon_tap.dev;
+
+ unregister_netdev(nldev);
+}
+
+module_init(nlmon_register);
+module_exit(nlmon_unregister);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>");
+MODULE_DESCRIPTION("Netlink monitoring device");
--
1.7.11.7
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH net-next 0/3] virtual netlink device for packet sockets
2013-06-21 17:38 [PATCH net-next 0/3] virtual netlink device for packet sockets Daniel Borkmann
` (2 preceding siblings ...)
2013-06-21 17:38 ` [PATCH net-next 3/3] packet: nlmon: virtual netlink monitoring device for packet sockets Daniel Borkmann
@ 2013-06-24 23:39 ` David Miller
3 siblings, 0 replies; 5+ messages in thread
From: David Miller @ 2013-06-24 23:39 UTC (permalink / raw)
To: dborkman; +Cc: netdev
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 21 Jun 2013 19:38:05 +0200
> This set allows for a virtual netlink device that can be easily used
> without modification by tools like tcpdump, Wireshark et al. to debug
> and troubleshoot netlink traffic that is exchanged between user and
> kernel space. We could even record pcap files for a later analysis.
> No code change would be needed on the side of such analyzers, except
> adding a simple protocol dissector, for example.
>
> Please have a look at the main description in patch 3. Patch 1 and 2
> are just prerequisits for the actual 3rd patch.
>
> I think the device idea is the cleanest solution. We have packet sockets
> and they do exactly what we want and expect from them, they have all the
> features etc, and user space would not even need to implement code. Thus
> adding more and more functionality into af_netlink would be a bigger
> surgery and further bloat it up with duplicate code, imho. By taking the
> approach with what I'm proposing, we have a clean segregation of
> functionality (as: packet sockets vs. netlink sockets), thus keeping it
> simple and stupid, and not too complex.
Yep, this seems like a good tradeoff. Series applied, thanks!
^ permalink raw reply [flat|nested] 5+ messages in thread