Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next V6 5/9] liquidio CN23XX: VF related operations
From: Raghu Vatsavayi @ 2016-11-14 23:54 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1479167687-9904-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for VF related operations like mac address vlan
and link changes.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c    |  22 +++
 .../ethernet/cavium/liquidio/cn23xx_pf_device.h    |   5 +
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 211 +++++++++++++++++++++
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   5 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   8 +
 5 files changed, 251 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index ffc94ac..d01b00b 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -23,6 +23,7 @@
 #include <linux/pci.h>
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
+#include <linux/etherdevice.h>
 #include "liquidio_common.h"
 #include "octeon_droq.h"
 #include "octeon_iq.h"
@@ -1416,3 +1417,24 @@ int cn23xx_fw_loaded(struct octeon_device *oct)
 	val = octeon_read_csr64(oct, CN23XX_SLI_SCRATCH1);
 	return (val >> 1) & 1ULL;
 }
+
+void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
+					u8 *mac)
+{
+	if (oct->sriov_info.vf_drv_loaded_mask & BIT_ULL(vfidx)) {
+		struct octeon_mbox_cmd mbox_cmd;
+
+		mbox_cmd.msg.u64 = 0;
+		mbox_cmd.msg.s.type = OCTEON_MBOX_REQUEST;
+		mbox_cmd.msg.s.resp_needed = 0;
+		mbox_cmd.msg.s.cmd = OCTEON_PF_CHANGED_VF_MACADDR;
+		mbox_cmd.msg.s.len = 1;
+		mbox_cmd.recv_len = 0;
+		mbox_cmd.recv_status = 0;
+		mbox_cmd.fn = NULL;
+		mbox_cmd.fn_arg = 0;
+		ether_addr_copy(mbox_cmd.msg.s.params, mac);
+		mbox_cmd.q_no = vfidx * oct->sriov_info.rings_per_vf;
+		octeon_mbox_write(oct, &mbox_cmd);
+	}
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
index 21b5c90..cee346a 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
@@ -29,6 +29,8 @@
 
 #include "cn23xx_pf_regs.h"
 
+#define LIO_CMD_WAIT_TM 100
+
 /* Register address and configuration for a CN23XX devices.
  * If device specific changes need to be made then add a struct to include
  * device specific fields as shown in the commented section
@@ -56,4 +58,7 @@ int validate_cn23xx_pf_config_info(struct octeon_device *oct,
 void cn23xx_dump_pf_initialized_regs(struct octeon_device *oct);
 
 int cn23xx_fw_loaded(struct octeon_device *oct);
+
+void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
+					u8 *mac);
 #endif
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index aebf342..8c82cd3 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3573,6 +3573,151 @@ static void liquidio_del_vxlan_port(struct net_device *netdev,
 				    OCTNET_CMD_VXLAN_PORT_DEL);
 }
 
+static int __liquidio_set_vf_mac(struct net_device *netdev, int vfidx,
+				 u8 *mac, bool is_admin_assigned)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+
+	if (!is_valid_ether_addr(mac))
+		return -EINVAL;
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.max_vfs)
+		return -EINVAL;
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+
+	nctrl.ncmd.u64 = 0;
+	nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MACADDR;
+	/* vfidx is 0 based, but vf_num (param1) is 1 based */
+	nctrl.ncmd.s.param1 = vfidx + 1;
+	nctrl.ncmd.s.param2 = (is_admin_assigned ? 1 : 0);
+	nctrl.ncmd.s.more = 1;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.cb_fn = 0;
+	nctrl.wait_time = LIO_CMD_WAIT_TM;
+
+	nctrl.udd[0] = 0;
+	/* The MAC Address is presented in network byte order. */
+	ether_addr_copy((u8 *)&nctrl.udd[0] + 2, mac);
+
+	oct->sriov_info.vf_macaddr[vfidx] = nctrl.udd[0];
+
+	octnet_send_nic_ctrl_pkt(oct, &nctrl);
+
+	return 0;
+}
+
+static int liquidio_set_vf_mac(struct net_device *netdev, int vfidx, u8 *mac)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	int retval;
+
+	retval = __liquidio_set_vf_mac(netdev, vfidx, mac, true);
+	if (!retval)
+		cn23xx_tell_vf_its_macaddr_changed(oct, vfidx, mac);
+
+	return retval;
+}
+
+static int liquidio_set_vf_vlan(struct net_device *netdev, int vfidx,
+				u16 vlan, u8 qos, __be16 vlan_proto)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	u16 vlantci;
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced)
+		return -EINVAL;
+
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
+	if (vlan >= VLAN_N_VID || qos > 7)
+		return -EINVAL;
+
+	if (vlan)
+		vlantci = vlan | (u16)qos << VLAN_PRIO_SHIFT;
+	else
+		vlantci = 0;
+
+	if (oct->sriov_info.vf_vlantci[vfidx] == vlantci)
+		return 0;
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+
+	if (vlan)
+		nctrl.ncmd.s.cmd = OCTNET_CMD_ADD_VLAN_FILTER;
+	else
+		nctrl.ncmd.s.cmd = OCTNET_CMD_DEL_VLAN_FILTER;
+
+	nctrl.ncmd.s.param1 = vlantci;
+	nctrl.ncmd.s.param2 =
+	    vfidx + 1; /* vfidx is 0 based, but vf_num (param2) is 1 based */
+	nctrl.ncmd.s.more = 0;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.cb_fn = 0;
+	nctrl.wait_time = LIO_CMD_WAIT_TM;
+
+	octnet_send_nic_ctrl_pkt(oct, &nctrl);
+
+	oct->sriov_info.vf_vlantci[vfidx] = vlantci;
+
+	return 0;
+}
+
+static int liquidio_get_vf_config(struct net_device *netdev, int vfidx,
+				  struct ifla_vf_info *ivi)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	u8 *macaddr;
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced)
+		return -EINVAL;
+
+	ivi->vf = vfidx;
+	macaddr = 2 + (u8 *)&oct->sriov_info.vf_macaddr[vfidx];
+	ether_addr_copy(&ivi->mac[0], macaddr);
+	ivi->vlan = oct->sriov_info.vf_vlantci[vfidx] & VLAN_VID_MASK;
+	ivi->qos = oct->sriov_info.vf_vlantci[vfidx] >> VLAN_PRIO_SHIFT;
+	ivi->linkstate = oct->sriov_info.vf_linkstate[vfidx];
+	return 0;
+}
+
+static int liquidio_set_vf_link_state(struct net_device *netdev, int vfidx,
+				      int linkstate)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced)
+		return -EINVAL;
+
+	if (oct->sriov_info.vf_linkstate[vfidx] == linkstate)
+		return 0;
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+	nctrl.ncmd.s.cmd = OCTNET_CMD_SET_VF_LINKSTATE;
+	nctrl.ncmd.s.param1 =
+	    vfidx + 1; /* vfidx is 0 based, but vf_num (param1) is 1 based */
+	nctrl.ncmd.s.param2 = linkstate;
+	nctrl.ncmd.s.more = 0;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.cb_fn = 0;
+	nctrl.wait_time = LIO_CMD_WAIT_TM;
+
+	octnet_send_nic_ctrl_pkt(oct, &nctrl);
+
+	oct->sriov_info.vf_linkstate[vfidx] = linkstate;
+
+	return 0;
+}
+
 static struct net_device_ops lionetdevops = {
 	.ndo_open		= liquidio_open,
 	.ndo_stop		= liquidio_stop,
@@ -3590,6 +3735,10 @@ static void liquidio_del_vxlan_port(struct net_device *netdev,
 	.ndo_set_features	= liquidio_set_features,
 	.ndo_udp_tunnel_add	= liquidio_add_vxlan_port,
 	.ndo_udp_tunnel_del	= liquidio_del_vxlan_port,
+	.ndo_set_vf_mac		= liquidio_set_vf_mac,
+	.ndo_set_vf_vlan	= liquidio_set_vf_vlan,
+	.ndo_get_vf_config	= liquidio_get_vf_config,
+	.ndo_set_vf_link_state  = liquidio_set_vf_link_state,
 };
 
 /** \brief Entry point for the liquidio module
@@ -3912,6 +4061,19 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 			"if%d gmx: %d hw_addr: 0x%llx\n", i,
 			lio->linfo.gmxport, CVM_CAST64(lio->linfo.hw_addr));
 
+		for (j = 0; j < octeon_dev->sriov_info.max_vfs; j++) {
+			u8 vfmac[ETH_ALEN];
+
+			random_ether_addr(&vfmac[0]);
+			if (__liquidio_set_vf_mac(netdev, j,
+						  &vfmac[0], false)) {
+				dev_err(&octeon_dev->pci_dev->dev,
+					"Error setting VF%d MAC address\n",
+					j);
+				goto setup_nic_dev_fail;
+			}
+		}
+
 		/* 64-bit swap required on LE machines */
 		octeon_swap_8B_data(&lio->linfo.hw_addr, 1);
 		for (j = 0; j < 6; j++)
@@ -4207,6 +4369,52 @@ static void nic_starter(struct work_struct *work)
 	complete(&handshake[oct->octeon_id].started);
 }
 
+static int
+octeon_recv_vf_drv_notice(struct octeon_recv_info *recv_info, void *buf)
+{
+	struct octeon_device *oct = (struct octeon_device *)buf;
+	struct octeon_recv_pkt *recv_pkt = recv_info->recv_pkt;
+	int i, notice, vf_idx;
+	u64 *data, vf_num;
+
+	notice = recv_pkt->rh.r.ossp;
+	data = (u64 *)get_rbd(recv_pkt->buffer_ptr[0]);
+
+	/* the first 64-bit word of data is the vf_num */
+	vf_num = data[0];
+	octeon_swap_8B_data(&vf_num, 1);
+	vf_idx = (int)vf_num - 1;
+
+	if (notice == VF_DRV_LOADED) {
+		if (!(oct->sriov_info.vf_drv_loaded_mask & BIT_ULL(vf_idx))) {
+			oct->sriov_info.vf_drv_loaded_mask |= BIT_ULL(vf_idx);
+			dev_info(&oct->pci_dev->dev,
+				 "driver for VF%d was loaded\n", vf_idx);
+			try_module_get(THIS_MODULE);
+		}
+	} else if (notice == VF_DRV_REMOVED) {
+		if (oct->sriov_info.vf_drv_loaded_mask & BIT_ULL(vf_idx)) {
+			oct->sriov_info.vf_drv_loaded_mask &= ~BIT_ULL(vf_idx);
+			dev_info(&oct->pci_dev->dev,
+				 "driver for VF%d was removed\n", vf_idx);
+			module_put(THIS_MODULE);
+		}
+	} else if (notice == VF_DRV_MACADDR_CHANGED) {
+		u8 *b = (u8 *)&data[1];
+
+		oct->sriov_info.vf_macaddr[vf_idx] = data[1];
+		dev_info(&oct->pci_dev->dev,
+			 "VF driver changed VF%d's MAC address to %pM\n",
+			 vf_idx, b + 2);
+	}
+
+	for (i = 0; i < recv_pkt->buffer_count; i++)
+		recv_buffer_free(recv_pkt->buffer_ptr[i]);
+	octeon_free_recv_info(recv_info);
+
+	return 0;
+}
+
 /**
  * \brief Device initialization for each Octeon device that is probed
  * @param octeon_dev  octeon device
@@ -4265,6 +4473,9 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 				    octeon_core_drv_init,
 				    octeon_dev);
 
+	octeon_register_dispatch_fn(octeon_dev, OPCODE_NIC,
+				    OPCODE_NIC_VF_DRV_NOTICE,
+				    octeon_recv_vf_drv_notice, octeon_dev);
 	INIT_DELAYED_WORK(&octeon_dev->nic_poll_work.work, nic_starter);
 	octeon_dev->nic_poll_work.ctxptr = (void *)octeon_dev;
 	schedule_delayed_work(&octeon_dev->nic_poll_work.work,
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index caeff9a..edf1282 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -89,6 +89,10 @@ enum octeon_tag_type {
 #define OPCODE_NIC_TIMESTAMP           0x07
 #define OPCODE_NIC_INTRMOD_CFG         0x08
 #define OPCODE_NIC_IF_CFG              0x09
+#define OPCODE_NIC_VF_DRV_NOTICE       0x0A
+#define VF_DRV_LOADED                  1
+#define VF_DRV_REMOVED                -1
+#define VF_DRV_MACADDR_CHANGED         2
 
 #define CORE_DRV_TEST_SCATTER_OP    0xFFF5
 
@@ -235,6 +239,7 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry,
 
 #define   OCTNET_CMD_ID_ACTIVE         0x1a
 
+#define   OCTNET_CMD_SET_VF_LINKSTATE  0x1c
 #define   OCTNET_CMD_VXLAN_PORT_ADD    0x0
 #define   OCTNET_CMD_VXLAN_PORT_DEL    0x1
 #define   OCTNET_CMD_RXCSUM_ENABLE     0x0
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index bdb10a0..70a0d98 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -354,6 +354,14 @@ struct octeon_sriov_info {
 
 	/*lookup table that maps DPI ring number to VF pci_dev struct pointer*/
 	struct pci_dev *dpiring_to_vfpcidev_lut[MAX_POSSIBLE_VFS];
+
+	u64	vf_macaddr[MAX_POSSIBLE_VFS];
+
+	u16	vf_vlantci[MAX_POSSIBLE_VFS];
+
+	int	vf_linkstate[MAX_POSSIBLE_VFS];
+
+	u64	vf_drv_loaded_mask;
 };
 
 struct octeon_ioq_vector {
-- 
1.8.3.1

^ permalink raw reply related

* Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in __sk_mem_raise_allocated()
From: Andrei Vagin @ 2016-11-15  0:02 UTC (permalink / raw)
  To: Linux Kernel Network Developers, Paolo Abeni
In-Reply-To: <CANaxB-yCk8hhP68L4Q2nFOJht8sqgXGGQO2AftpHs0u1xyGG5A@mail.gmail.com>

On Mon, Nov 14, 2016 at 3:24 PM, Andrei Vagin <avagin@gmail.com> wrote:
> Hi Paolo,
>
> Our test system detected a kernel oops. Looks like a problem in the
> "udp: refactor memory accounting" series.
>
> # good: [f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb] udp: implement
> memory accounting helpers
> git bisect good f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb
> # bad: [2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e] bpf: add helper for
> retrieving current numa node id
> git bisect bad 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e
> # bad: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge branch 'udpmem'
> git bisect bad a10b91b8b81c29b87ff5a6d58c1402898337b956
> # good: [850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9] udp: use it's own
> memory accounting schema
> git bisect good 850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9

I did a mistake on this step and
850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9 is the first bad commit.

> # first bad commit: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge
> branch 'udpmem'
>
>
> [  112.472363] BUG: unable to handle kernel NULL pointer dereference
> at           (null)
> [  112.473360] IP: [<ffffffffb76f8031>] __sk_mem_raise_allocated+0x31/0x3f0
> [  112.474156] PGD 62a08067 [  112.474455] PUD 2b8bf067
> PMD 0 [  112.474856]
> [  112.475054] Oops: 0002 [#1] SMP
> [  112.475431] Modules linked in: nf_conntrack_netlink udp_diag
> tcp_diag inet_diag netlink_diag af_packet_diag unix_diag binfmt_misc
> nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4
> xt_conntrack nf_conntrack nfnetlink ip6table_filter ip6_tables ppdev
> sunrpc crc32c_intel joydev virtio_balloon virtio_net i2c_piix4
> parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm virtio_blk
> serio_raw virtio_pci virtio_ring virtio ata_generic pata_acpi
> [  112.480594] CPU: 1 PID: 7405 Comm: socket_udplite Not tainted 4.8.0+ #84
> [  112.481377] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.9.1-1.fc24 04/01/2014
> [  112.482375] task: ffff928a5b5fa540 task.stack: ffffb3b484a0c000
> [  112.483059] RIP: 0010:[<ffffffffb76f8031>]  [<ffffffffb76f8031>]
> __sk_mem_raise_allocated+0x31/0x3f0
> [  112.484135] RSP: 0018:ffff928abfd03b18  EFLAGS: 00010296
> [  112.484758] RAX: 0000000000000001 RBX: ffff928aa293cfc0 RCX: 0000000000000001
> [  112.485585] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff928aa293cfc0
> [  112.486414] RBP: ffff928abfd03b48 R08: 0de4c53600000000 R09: 0000000000000000
> [  112.487241] R10: 000000006226b971 R11: 0000000000000000 R12: ffff928aa293cfc0
> [  112.488064] R13: 0000000000000001 R14: ffffffffb7f0d5a0 R15: 0000000000001000
> [  112.488893] FS:  00007f058067a700(0000) GS:ffff928abfd00000(0000)
> knlGS:0000000000000000
> [  112.489807] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  112.490447] CR2: 0000000000000000 CR3: 000000002b8f5000 CR4: 00000000000006e0
> [  112.491248] DR0: 00000000000100a0 DR1: 0000000000000000 DR2: 0000000000000000
> [  112.492025] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
> [  112.492808] Stack:
> [  112.493038]  0000000100000300 ffff928aa293cfc0 ffff928a651b9c00
> 0000000000000300
> [  112.493912]  ffff928aa293d108 0000000000001000 ffff928abfd03b88
> ffffffffb779e094
> [  112.494782]  ffff928abfd03b70 ffff928a651b9c00 ffff928aa293cfc0
> 0000000000000000
> [  112.495653] Call Trace:
> [  112.495930]  <IRQ> [  112.496154]  [<ffffffffb779e094>]
> __udp_enqueue_schedule_skb+0xc4/0x170
> [  112.496896]  [<ffffffffb77a15b4>] udp_queue_rcv_skb+0x1a4/0x5b0
> [  112.497551]  [<ffffffffb77a1f3e>] __udp4_lib_rcv+0x57e/0xe30
> [  112.498173]  [<ffffffffb77a2cfa>] udplite_rcv+0x1a/0x20
> [  112.498761]  [<ffffffffb776799f>] ip_local_deliver_finish+0xdf/0x370
> [  112.499466]  [<ffffffffb77678ef>] ? ip_local_deliver_finish+0x2f/0x370
> [  112.500184]  [<ffffffffb77683c4>] ip_local_deliver+0x74/0x210
> [  112.500825]  [<ffffffffb77683ec>] ? ip_local_deliver+0x9c/0x210
> [  112.501482]  [<ffffffffb77678c0>] ? inet_del_offload+0x40/0x40
> [  112.502122]  [<ffffffffb7767daa>] ip_rcv_finish+0x17a/0x540
> [  112.502749]  [<ffffffffb77687f3>] ip_rcv+0x293/0x4d0
> [  112.503305]  [<ffffffffb776882f>] ? ip_rcv+0x2cf/0x4d0
> [  112.503873]  [<ffffffffb7767c30>] ? ip_local_deliver_finish+0x370/0x370
> [  112.504607]  [<ffffffffb771683b>] __netif_receive_skb_core+0x34b/0xca0
> [  112.505327]  [<ffffffffb77180e4>] ? process_backlog+0xd4/0x240
> [  112.505967]  [<ffffffffb77180e4>] ? process_backlog+0xd4/0x240
> [  112.506617]  [<ffffffffb77171a8>] __netif_receive_skb+0x18/0x60
> [  112.507277]  [<ffffffffb7718088>] process_backlog+0x78/0x240
> [  112.507904]  [<ffffffffb77180e4>] ? process_backlog+0xd4/0x240
> [  112.508552]  [<ffffffffb7717e01>] net_rx_action+0x1d1/0x3e0
> [  112.509165]  [<ffffffffb7873b3d>] __do_softirq+0xcd/0x471
> [  112.509765]  [<ffffffffb776d312>] ? ip_finish_output2+0x242/0x640
> [  112.510446]  [<ffffffffb7871ecc>] do_softirq_own_stack+0x1c/0x30
> [  112.511106]  <EOI> [  112.511336]  [<ffffffffb709c956>]
> do_softirq.part.14+0x46/0x70
> [  112.511990]  [<ffffffffb709ca39>] __local_bh_enable_ip+0xb9/0xc0
> [  112.512661]  [<ffffffffb776d33b>] ip_finish_output2+0x26b/0x640
> [  112.513319]  [<ffffffffb776d177>] ? ip_finish_output2+0xa7/0x640
> [  112.513979]  [<ffffffffb776e27f>] ip_finish_output+0x19f/0x330
> [  112.514627]  [<ffffffffb776f533>] ip_output+0x83/0x270
> [  112.515204]  [<ffffffffb776f55b>] ? ip_output+0xab/0x270
> [  112.515794]  [<ffffffffb776e0e0>] ? ip_fragment.constprop.51+0x80/0x80
> [  112.516521]  [<ffffffffb776e699>] ip_local_out+0x39/0x70
> [  112.517107]  [<ffffffffb7770069>] ip_send_skb+0x19/0x40
> [  112.517689]  [<ffffffffb779dd22>] udp_send_skb+0x172/0x260
> [  112.518299]  [<ffffffffb779f4b0>] udp_sendmsg+0x340/0xb30
> [  112.518893]  [<ffffffffb779de70>] ? udp_push_pending_frames+0x60/0x60
> [  112.519605]  [<ffffffffb77aeff8>] inet_sendmsg+0xf8/0x1c0
> [  112.520197]  [<ffffffffb77aef05>] ? inet_sendmsg+0x5/0x1c0
> [  112.520807]  [<ffffffffb76f4b98>] sock_sendmsg+0x38/0x50
> [  112.521397]  [<ffffffffb76f51b1>] SYSC_sendto+0x101/0x190
> [  112.521993]  [<ffffffffb70efc0f>] ? up_read+0x1f/0x40
> [  112.522563]  [<ffffffffb7054dfd>] ? __do_page_fault+0x26d/0x4f0
> [  112.523221]  [<ffffffffb70f3f55>] ? trace_hardirqs_on_caller+0xf5/0x1b0
> [  112.523950]  [<ffffffffb700201a>] ? trace_hardirqs_on_thunk+0x1a/0x1c
> [  112.524664]  [<ffffffffb76f60fe>] SyS_sendto+0xe/0x10
> [  112.525230]  [<ffffffffb7870e01>] entry_SYSCALL_64_fastpath+0x1f/0xc2
> [  112.525939] Code: 48 89 e5 41 57 41 56 41 55 41 54 4c 63 ea 53 49
> 89 fc 41 89 f7 4c 89 e8 48 83 ec 08 4c 8b 77 28 44 89 6d d4 49 8b 96
> d0 00 00 00 <f0> 48 0f c1 02 49 8d 5c 05 00 e9 41 00 00 00 49 8b 44 24
> 28 48
> [  112.528965] RIP  [<ffffffffb76f8031>] __sk_mem_raise_allocated+0x31/0x3f0
> [  112.529732]  RSP <ffff928abfd03b18>
> [  112.530122] CR2: 0000000000000000
> [  112.530504] ---[ end trace ed0c680ae4317de5 ]---
> [  112.531019] Kernel panic - not syncing: Fatal exception in interrupt
> [  112.550850] Kernel Offset: 0x36000000 from 0xffffffff81000000
> (relocation rang

^ permalink raw reply

* Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in __sk_mem_raise_allocated()
From: Andrei Vagin @ 2016-11-14 23:58 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Linux Kernel Network Developers, Paolo Abeni
In-Reply-To: <1479166501.8455.91.camel@edumazet-glaptop3.roam.corp.google.com>

On Mon, Nov 14, 2016 at 3:35 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2016-11-14 at 15:24 -0800, Andrei Vagin wrote:
>> Hi Paolo,
>>
>> Our test system detected a kernel oops. Looks like a problem in the
>> "udp: refactor memory accounting" series.
>>
>> # good: [f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb] udp: implement
>> memory accounting helpers
>> git bisect good f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb
>> # bad: [2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e] bpf: add helper for
>> retrieving current numa node id
>> git bisect bad 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e
>> # bad: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge branch 'udpmem'
>> git bisect bad a10b91b8b81c29b87ff5a6d58c1402898337b956
>> # good: [850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9] udp: use it's own
>> memory accounting schema
>> git bisect good 850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9
>> # first bad commit: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge
>> branch 'udpmem'
>>
>>
>> [  112.472363] BUG: unable to handle kernel NULL pointer dereference
>> at           (null)
>> [  112.473360] IP: [<ffffffffb76f8031>] __sk_mem_raise_allocated+0x31/0x3f0
>> [  112.474156] PGD 62a08067 [  112.474455] PUD 2b8bf067
>> PMD 0 [  112.474856]
>> [  112.475054] Oops: 0002 [#1] SMP
>> [  112.475431] Modules linked in: nf_conntrack_netlink udp_diag
>> tcp_diag inet_diag netlink_diag af_packet_diag unix_diag binfmt_misc
>> nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4
>> xt_conntrack nf_conntrack nfnetlink ip6table_filter ip6_tables ppdev
>> sunrpc crc32c_intel joydev virtio_balloon virtio_net i2c_piix4
>> parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm virtio_blk
>> serio_raw virtio_pci virtio_ring virtio ata_generic pata_acpi
>> [  112.480594] CPU: 1 PID: 7405 Comm: socket_udplite Not tainted 4.8.0+ #84
>> [  112.481377] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>> BIOS 1.9.1-1.fc24 04/01/2014
>> [  112.482375] task: ffff928a5b5fa540 task.stack: ffffb3b484a0c000
>> [  112.483059] RIP: 0010:[<ffffffffb76f8031>]  [<ffffffffb76f8031>]
>> __sk_mem_raise_allocated+0x31/0x3f0
>> [  112.484135] RSP: 0018:ffff928abfd03b18  EFLAGS: 00010296
>> [  112.484758] RAX: 0000000000000001 RBX: ffff928aa293cfc0 RCX: 0000000000000001
>> [  112.485585] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff928aa293cfc0
>> [  112.486414] RBP: ffff928abfd03b48 R08: 0de4c53600000000 R09: 0000000000000000
>> [  112.487241] R10: 000000006226b971 R11: 0000000000000000 R12: ffff928aa293cfc0
>> [  112.488064] R13: 0000000000000001 R14: ffffffffb7f0d5a0 R15: 0000000000001000
>> [  112.488893] FS:  00007f058067a700(0000) GS:ffff928abfd00000(0000)
>> knlGS:0000000000000000
>> [  112.489807] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  112.490447] CR2: 0000000000000000 CR3: 000000002b8f5000 CR4: 00000000000006e0
>> [  112.491248] DR0: 00000000000100a0 DR1: 0000000000000000 DR2: 0000000000000000
>> [  112.492025] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
>> [  112.492808] Stack:
>> [  112.493038]  0000000100000300 ffff928aa293cfc0 ffff928a651b9c00
>> 0000000000000300
>> [  112.493912]  ffff928aa293d108 0000000000001000 ffff928abfd03b88
>> ffffffffb779e094
>> [  112.494782]  ffff928abfd03b70 ffff928a651b9c00 ffff928aa293cfc0
>> 0000000000000000
>
> Thanks for the report.
>
> I guess following patch would be needed ?

Yes, you are right. It works if we set .memory_allocated and .sysctl_mem.

Thanks,
Andrei

>
> diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
> index af817158d830c0da080935ba29e012dffbb89112..12604c0371c451efcc9aad278bb86be9ac4bb813 100644
> --- a/net/ipv4/udplite.c
> +++ b/net/ipv4/udplite.c
> @@ -54,6 +54,7 @@ struct proto  udplite_prot = {
>         .hash              = udp_lib_hash,
>         .unhash            = udp_lib_unhash,
>         .get_port          = udp_v4_get_port,
> +       .memory_allocated  = &udp_memory_allocated,
>         .obj_size          = sizeof(struct udp_sock),
>         .h.udp_table       = &udplite_table,
>  #ifdef CONFIG_COMPAT
> diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
> index 47d0d2b87106558fece3496479198005c55b99e7..946025c888cc9519fb3523edbbe8afbb18273326 100644
> --- a/net/ipv6/udplite.c
> +++ b/net/ipv6/udplite.c
> @@ -49,6 +49,7 @@ struct proto udplitev6_prot = {
>         .hash              = udp_lib_hash,
>         .unhash            = udp_lib_unhash,
>         .get_port          = udp_v6_get_port,
> +       .memory_allocated  = &udp_memory_allocated,
>         .obj_size          = sizeof(struct udp6_sock),
>         .h.udp_table       = &udplite_table,
>  #ifdef CONFIG_COMPAT
>
>

^ permalink raw reply

* Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in __sk_mem_raise_allocated()
From: Eric Dumazet @ 2016-11-14 23:49 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Linux Kernel Network Developers, Paolo Abeni
In-Reply-To: <1479166501.8455.91.camel@edumazet-glaptop3.roam.corp.google.com>

On Mon, 2016-11-14 at 15:35 -0800, Eric Dumazet wrote:

> 
> Thanks for the report.
> 
> I guess following patch would be needed ?
> 
> diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
> index af817158d830c0da080935ba29e012dffbb89112..12604c0371c451efcc9aad278bb86be9ac4bb813 100644
> --- a/net/ipv4/udplite.c
> +++ b/net/ipv4/udplite.c
> @@ -54,6 +54,7 @@ struct proto 	udplite_prot = {
>  	.hash		   = udp_lib_hash,
>  	.unhash		   = udp_lib_unhash,
>  	.get_port	   = udp_v4_get_port,
> +	.memory_allocated  = &udp_memory_allocated,
>  	.obj_size	   = sizeof(struct udp_sock),
>  	.h.udp_table	   = &udplite_table,
>  #ifdef CONFIG_COMPAT
> diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
> index 47d0d2b87106558fece3496479198005c55b99e7..946025c888cc9519fb3523edbbe8afbb18273326 100644
> --- a/net/ipv6/udplite.c
> +++ b/net/ipv6/udplite.c
> @@ -49,6 +49,7 @@ struct proto udplitev6_prot = {
>  	.hash		   = udp_lib_hash,
>  	.unhash		   = udp_lib_unhash,
>  	.get_port	   = udp_v6_get_port,
> +	.memory_allocated  = &udp_memory_allocated,
>  	.obj_size	   = sizeof(struct udp6_sock),
>  	.h.udp_table	   = &udplite_table,
>  #ifdef CONFIG_COMPAT
> 

.sysctl_mem also needs to be set.

^ permalink raw reply

* Re: qed, qedi patchset submission
From: Martin K. Petersen @ 2016-11-14 23:47 UTC (permalink / raw)
  To: Arun Easi; +Cc: Martin K. Petersen, David Miller, linux-scsi, netdev
In-Reply-To: <alpine.LRH.2.00.1611141256340.28058@mvluser05.qlc.com>

>>>>> "Arun" == Arun Easi <arun.easi@cavium.com> writes:

Arun,

Arun> Do you have any preference or thoughts on how the "qed" patches be
Arun> approached? Just as a reference, our rdma driver "qedr" went
Arun> through something similar[1], and eventually "qed" patches were
Arun> taken by David in the net tree and "qedr", in the rdma tree
Arun> (obviously) by Doug L.

DaveM can queue the whole series or I can hold the SCSI pieces for
rc1. Either way works for me.

However, I would like to do a review of the driver first. I will try to
get it done this week.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in __sk_mem_raise_allocated()
From: Eric Dumazet @ 2016-11-14 23:35 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Linux Kernel Network Developers, Paolo Abeni
In-Reply-To: <CANaxB-yCk8hhP68L4Q2nFOJht8sqgXGGQO2AftpHs0u1xyGG5A@mail.gmail.com>

On Mon, 2016-11-14 at 15:24 -0800, Andrei Vagin wrote:
> Hi Paolo,
> 
> Our test system detected a kernel oops. Looks like a problem in the
> "udp: refactor memory accounting" series.
> 
> # good: [f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb] udp: implement
> memory accounting helpers
> git bisect good f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb
> # bad: [2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e] bpf: add helper for
> retrieving current numa node id
> git bisect bad 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e
> # bad: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge branch 'udpmem'
> git bisect bad a10b91b8b81c29b87ff5a6d58c1402898337b956
> # good: [850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9] udp: use it's own
> memory accounting schema
> git bisect good 850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9
> # first bad commit: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge
> branch 'udpmem'
> 
> 
> [  112.472363] BUG: unable to handle kernel NULL pointer dereference
> at           (null)
> [  112.473360] IP: [<ffffffffb76f8031>] __sk_mem_raise_allocated+0x31/0x3f0
> [  112.474156] PGD 62a08067 [  112.474455] PUD 2b8bf067
> PMD 0 [  112.474856]
> [  112.475054] Oops: 0002 [#1] SMP
> [  112.475431] Modules linked in: nf_conntrack_netlink udp_diag
> tcp_diag inet_diag netlink_diag af_packet_diag unix_diag binfmt_misc
> nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4
> xt_conntrack nf_conntrack nfnetlink ip6table_filter ip6_tables ppdev
> sunrpc crc32c_intel joydev virtio_balloon virtio_net i2c_piix4
> parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm virtio_blk
> serio_raw virtio_pci virtio_ring virtio ata_generic pata_acpi
> [  112.480594] CPU: 1 PID: 7405 Comm: socket_udplite Not tainted 4.8.0+ #84
> [  112.481377] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.9.1-1.fc24 04/01/2014
> [  112.482375] task: ffff928a5b5fa540 task.stack: ffffb3b484a0c000
> [  112.483059] RIP: 0010:[<ffffffffb76f8031>]  [<ffffffffb76f8031>]
> __sk_mem_raise_allocated+0x31/0x3f0
> [  112.484135] RSP: 0018:ffff928abfd03b18  EFLAGS: 00010296
> [  112.484758] RAX: 0000000000000001 RBX: ffff928aa293cfc0 RCX: 0000000000000001
> [  112.485585] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff928aa293cfc0
> [  112.486414] RBP: ffff928abfd03b48 R08: 0de4c53600000000 R09: 0000000000000000
> [  112.487241] R10: 000000006226b971 R11: 0000000000000000 R12: ffff928aa293cfc0
> [  112.488064] R13: 0000000000000001 R14: ffffffffb7f0d5a0 R15: 0000000000001000
> [  112.488893] FS:  00007f058067a700(0000) GS:ffff928abfd00000(0000)
> knlGS:0000000000000000
> [  112.489807] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  112.490447] CR2: 0000000000000000 CR3: 000000002b8f5000 CR4: 00000000000006e0
> [  112.491248] DR0: 00000000000100a0 DR1: 0000000000000000 DR2: 0000000000000000
> [  112.492025] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
> [  112.492808] Stack:
> [  112.493038]  0000000100000300 ffff928aa293cfc0 ffff928a651b9c00
> 0000000000000300
> [  112.493912]  ffff928aa293d108 0000000000001000 ffff928abfd03b88
> ffffffffb779e094
> [  112.494782]  ffff928abfd03b70 ffff928a651b9c00 ffff928aa293cfc0
> 0000000000000000

Thanks for the report.

I guess following patch would be needed ?

diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index af817158d830c0da080935ba29e012dffbb89112..12604c0371c451efcc9aad278bb86be9ac4bb813 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -54,6 +54,7 @@ struct proto 	udplite_prot = {
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
 	.obj_size	   = sizeof(struct udp_sock),
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 47d0d2b87106558fece3496479198005c55b99e7..946025c888cc9519fb3523edbbe8afbb18273326 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -49,6 +49,7 @@ struct proto udplitev6_prot = {
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
+	.memory_allocated  = &udp_memory_allocated,
 	.obj_size	   = sizeof(struct udp6_sock),
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT

^ permalink raw reply related

* linux-next: BUG: unable to handle kernel NULL pointer dereference in __sk_mem_raise_allocated()
From: Andrei Vagin @ 2016-11-14 23:24 UTC (permalink / raw)
  To: Linux Kernel Network Developers, Paolo Abeni

Hi Paolo,

Our test system detected a kernel oops. Looks like a problem in the
"udp: refactor memory accounting" series.

# good: [f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb] udp: implement
memory accounting helpers
git bisect good f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb
# bad: [2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e] bpf: add helper for
retrieving current numa node id
git bisect bad 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e
# bad: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge branch 'udpmem'
git bisect bad a10b91b8b81c29b87ff5a6d58c1402898337b956
# good: [850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9] udp: use it's own
memory accounting schema
git bisect good 850cbaddb52dfd4e0c7cabe2c168dd34b44ae0b9
# first bad commit: [a10b91b8b81c29b87ff5a6d58c1402898337b956] Merge
branch 'udpmem'


[  112.472363] BUG: unable to handle kernel NULL pointer dereference
at           (null)
[  112.473360] IP: [<ffffffffb76f8031>] __sk_mem_raise_allocated+0x31/0x3f0
[  112.474156] PGD 62a08067 [  112.474455] PUD 2b8bf067
PMD 0 [  112.474856]
[  112.475054] Oops: 0002 [#1] SMP
[  112.475431] Modules linked in: nf_conntrack_netlink udp_diag
tcp_diag inet_diag netlink_diag af_packet_diag unix_diag binfmt_misc
nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4
xt_conntrack nf_conntrack nfnetlink ip6table_filter ip6_tables ppdev
sunrpc crc32c_intel joydev virtio_balloon virtio_net i2c_piix4
parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm virtio_blk
serio_raw virtio_pci virtio_ring virtio ata_generic pata_acpi
[  112.480594] CPU: 1 PID: 7405 Comm: socket_udplite Not tainted 4.8.0+ #84
[  112.481377] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.9.1-1.fc24 04/01/2014
[  112.482375] task: ffff928a5b5fa540 task.stack: ffffb3b484a0c000
[  112.483059] RIP: 0010:[<ffffffffb76f8031>]  [<ffffffffb76f8031>]
__sk_mem_raise_allocated+0x31/0x3f0
[  112.484135] RSP: 0018:ffff928abfd03b18  EFLAGS: 00010296
[  112.484758] RAX: 0000000000000001 RBX: ffff928aa293cfc0 RCX: 0000000000000001
[  112.485585] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff928aa293cfc0
[  112.486414] RBP: ffff928abfd03b48 R08: 0de4c53600000000 R09: 0000000000000000
[  112.487241] R10: 000000006226b971 R11: 0000000000000000 R12: ffff928aa293cfc0
[  112.488064] R13: 0000000000000001 R14: ffffffffb7f0d5a0 R15: 0000000000001000
[  112.488893] FS:  00007f058067a700(0000) GS:ffff928abfd00000(0000)
knlGS:0000000000000000
[  112.489807] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  112.490447] CR2: 0000000000000000 CR3: 000000002b8f5000 CR4: 00000000000006e0
[  112.491248] DR0: 00000000000100a0 DR1: 0000000000000000 DR2: 0000000000000000
[  112.492025] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[  112.492808] Stack:
[  112.493038]  0000000100000300 ffff928aa293cfc0 ffff928a651b9c00
0000000000000300
[  112.493912]  ffff928aa293d108 0000000000001000 ffff928abfd03b88
ffffffffb779e094
[  112.494782]  ffff928abfd03b70 ffff928a651b9c00 ffff928aa293cfc0
0000000000000000
[  112.495653] Call Trace:
[  112.495930]  <IRQ> [  112.496154]  [<ffffffffb779e094>]
__udp_enqueue_schedule_skb+0xc4/0x170
[  112.496896]  [<ffffffffb77a15b4>] udp_queue_rcv_skb+0x1a4/0x5b0
[  112.497551]  [<ffffffffb77a1f3e>] __udp4_lib_rcv+0x57e/0xe30
[  112.498173]  [<ffffffffb77a2cfa>] udplite_rcv+0x1a/0x20
[  112.498761]  [<ffffffffb776799f>] ip_local_deliver_finish+0xdf/0x370
[  112.499466]  [<ffffffffb77678ef>] ? ip_local_deliver_finish+0x2f/0x370
[  112.500184]  [<ffffffffb77683c4>] ip_local_deliver+0x74/0x210
[  112.500825]  [<ffffffffb77683ec>] ? ip_local_deliver+0x9c/0x210
[  112.501482]  [<ffffffffb77678c0>] ? inet_del_offload+0x40/0x40
[  112.502122]  [<ffffffffb7767daa>] ip_rcv_finish+0x17a/0x540
[  112.502749]  [<ffffffffb77687f3>] ip_rcv+0x293/0x4d0
[  112.503305]  [<ffffffffb776882f>] ? ip_rcv+0x2cf/0x4d0
[  112.503873]  [<ffffffffb7767c30>] ? ip_local_deliver_finish+0x370/0x370
[  112.504607]  [<ffffffffb771683b>] __netif_receive_skb_core+0x34b/0xca0
[  112.505327]  [<ffffffffb77180e4>] ? process_backlog+0xd4/0x240
[  112.505967]  [<ffffffffb77180e4>] ? process_backlog+0xd4/0x240
[  112.506617]  [<ffffffffb77171a8>] __netif_receive_skb+0x18/0x60
[  112.507277]  [<ffffffffb7718088>] process_backlog+0x78/0x240
[  112.507904]  [<ffffffffb77180e4>] ? process_backlog+0xd4/0x240
[  112.508552]  [<ffffffffb7717e01>] net_rx_action+0x1d1/0x3e0
[  112.509165]  [<ffffffffb7873b3d>] __do_softirq+0xcd/0x471
[  112.509765]  [<ffffffffb776d312>] ? ip_finish_output2+0x242/0x640
[  112.510446]  [<ffffffffb7871ecc>] do_softirq_own_stack+0x1c/0x30
[  112.511106]  <EOI> [  112.511336]  [<ffffffffb709c956>]
do_softirq.part.14+0x46/0x70
[  112.511990]  [<ffffffffb709ca39>] __local_bh_enable_ip+0xb9/0xc0
[  112.512661]  [<ffffffffb776d33b>] ip_finish_output2+0x26b/0x640
[  112.513319]  [<ffffffffb776d177>] ? ip_finish_output2+0xa7/0x640
[  112.513979]  [<ffffffffb776e27f>] ip_finish_output+0x19f/0x330
[  112.514627]  [<ffffffffb776f533>] ip_output+0x83/0x270
[  112.515204]  [<ffffffffb776f55b>] ? ip_output+0xab/0x270
[  112.515794]  [<ffffffffb776e0e0>] ? ip_fragment.constprop.51+0x80/0x80
[  112.516521]  [<ffffffffb776e699>] ip_local_out+0x39/0x70
[  112.517107]  [<ffffffffb7770069>] ip_send_skb+0x19/0x40
[  112.517689]  [<ffffffffb779dd22>] udp_send_skb+0x172/0x260
[  112.518299]  [<ffffffffb779f4b0>] udp_sendmsg+0x340/0xb30
[  112.518893]  [<ffffffffb779de70>] ? udp_push_pending_frames+0x60/0x60
[  112.519605]  [<ffffffffb77aeff8>] inet_sendmsg+0xf8/0x1c0
[  112.520197]  [<ffffffffb77aef05>] ? inet_sendmsg+0x5/0x1c0
[  112.520807]  [<ffffffffb76f4b98>] sock_sendmsg+0x38/0x50
[  112.521397]  [<ffffffffb76f51b1>] SYSC_sendto+0x101/0x190
[  112.521993]  [<ffffffffb70efc0f>] ? up_read+0x1f/0x40
[  112.522563]  [<ffffffffb7054dfd>] ? __do_page_fault+0x26d/0x4f0
[  112.523221]  [<ffffffffb70f3f55>] ? trace_hardirqs_on_caller+0xf5/0x1b0
[  112.523950]  [<ffffffffb700201a>] ? trace_hardirqs_on_thunk+0x1a/0x1c
[  112.524664]  [<ffffffffb76f60fe>] SyS_sendto+0xe/0x10
[  112.525230]  [<ffffffffb7870e01>] entry_SYSCALL_64_fastpath+0x1f/0xc2
[  112.525939] Code: 48 89 e5 41 57 41 56 41 55 41 54 4c 63 ea 53 49
89 fc 41 89 f7 4c 89 e8 48 83 ec 08 4c 8b 77 28 44 89 6d d4 49 8b 96
d0 00 00 00 <f0> 48 0f c1 02 49 8d 5c 05 00 e9 41 00 00 00 49 8b 44 24
28 48
[  112.528965] RIP  [<ffffffffb76f8031>] __sk_mem_raise_allocated+0x31/0x3f0
[  112.529732]  RSP <ffff928abfd03b18>
[  112.530122] CR2: 0000000000000000
[  112.530504] ---[ end trace ed0c680ae4317de5 ]---
[  112.531019] Kernel panic - not syncing: Fatal exception in interrupt
[  112.550850] Kernel Offset: 0x36000000 from 0xffffffff81000000
(relocation rang

^ permalink raw reply

* Re: Long delays creating a netns after deleting one (possibly RCU related)
From: Eric Dumazet @ 2016-11-14 23:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Paul E. McKenney, Cong Wang, Rolf Neugebauer, LKML,
	Linux Kernel Network Developers, Justin Cormack, Ian Campbell,
	Eric Dumazet
In-Reply-To: <1479163571.8455.83.camel@edumazet-glaptop3.roam.corp.google.com>

On Mon, 2016-11-14 at 14:46 -0800, Eric Dumazet wrote:
> On Mon, 2016-11-14 at 16:12 -0600, Eric W. Biederman wrote:
> 
> > synchronize_rcu_expidited is not enough if you have multiple network
> > devices in play.
> > 
> > Looking at the code it comes down to this commit, and it appears there
> > is a promise add rcu grace period combining by Eric Dumazet.
> > 
> > Eric since people are hitting noticable stalls because of the rcu grace
> > period taking a long time do you think you could look at this code path
> > a bit more?
> > 
> > commit 93d05d4a320cb16712bb3d57a9658f395d8cecb9
> > Author: Eric Dumazet <edumazet@google.com>
> > Date:   Wed Nov 18 06:31:03 2015 -0800
> 
> Absolutely, I will take a loop asap.

The worst offender should be fixed by the following patch.

busy poll needs to poll the physical device, not a virtual one...

diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
index d15214d673b2e8e08fd6437b572278fb1359f10d..2a1abbf8da74368cd01adc40cef6c0644e059ef2 100644
--- a/include/net/gro_cells.h
+++ b/include/net/gro_cells.h
@@ -68,6 +68,9 @@ static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *de
 		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
 
 		__skb_queue_head_init(&cell->napi_skbs);
+
+		set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
 		netif_napi_add(dev, &cell->napi, gro_cell_poll, 64);
 		napi_enable(&cell->napi);
 	}

^ permalink raw reply related

* Re: [RFC PATCH 2/2] ptr_ring_ll: pop/push multiple objects at once
From: Michael S. Tsirkin @ 2016-11-14 23:06 UTC (permalink / raw)
  To: John Fastabend; +Cc: jasowang, netdev, linux-kernel
In-Reply-To: <20161111044432.1547.65342.stgit@john-Precision-Tower-5810>

On Thu, Nov 10, 2016 at 08:44:32PM -0800, John Fastabend wrote:
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

This will naturally reduce the cache line bounce
costs, but so will a _many API for ptr-ring,
doing lock-add many-unlock.

the number of atomics also scales better with the lock:
one per push instead of one per queue.

Also, when can qdisc use a _many operation?


> ---
>  include/linux/ptr_ring_ll.h |   22 ++++++++++++++++------
>  include/linux/skb_array.h   |   11 +++++++++--
>  net/sched/sch_generic.c     |    2 +-
>  3 files changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/ptr_ring_ll.h b/include/linux/ptr_ring_ll.h
> index bcb11f3..5dc25f7 100644
> --- a/include/linux/ptr_ring_ll.h
> +++ b/include/linux/ptr_ring_ll.h
> @@ -45,9 +45,10 @@ struct ptr_ring_ll {
>  /* Note: callers invoking this in a loop must use a compiler barrier,
>   * for example cpu_relax(). Callers must hold producer_lock.
>   */
> -static inline int __ptr_ring_ll_produce(struct ptr_ring_ll *r, void *ptr)
> +static inline int __ptr_ring_ll_produce_many(struct ptr_ring_ll *r,
> +					     void **ptr, int num)
>  {
> -	u32 ret, head, tail, next, slots, mask;
> +	u32 ret, head, tail, next, slots, mask, i;
>  
>  	do {
>  		head = READ_ONCE(r->prod_head);
> @@ -55,21 +56,30 @@ static inline int __ptr_ring_ll_produce(struct ptr_ring_ll *r, void *ptr)
>  		tail = READ_ONCE(r->cons_tail);
>  
>  		slots = mask + tail - head;
> -		if (slots < 1)
> +		if (slots < num)
> +			num = slots;
> +
> +		if (unlikely(!num))
>  			return -ENOMEM;
>  
> -		next = head + 1;
> +		next = head + num;
>  		ret = cmpxchg(&r->prod_head, head, next);
>  	} while (ret != head);
>  
> -	r->queue[head & mask] = ptr;
> +	for (i = 0; i < num; i++)
> +		r->queue[(head + i) & mask] = ptr[i];
>  	smp_wmb();
>  
>  	while (r->prod_tail != head)
>  		cpu_relax();
>  
>  	r->prod_tail = next;
> -	return 0;
> +	return num;
> +}
> +
> +static inline int __ptr_ring_ll_produce(struct ptr_ring_ll *r, void **ptr)
> +{
> +	return __ptr_ring_ll_produce_many(r, ptr, 1);
>  }
>  
>  static inline void *__ptr_ring_ll_consume(struct ptr_ring_ll *r)
> diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
> index 9b43dfd..de3c700 100644
> --- a/include/linux/skb_array.h
> +++ b/include/linux/skb_array.h
> @@ -48,9 +48,16 @@ static inline bool skb_array_full(struct skb_array *a)
>  	return ptr_ring_full(&a->ring);
>  }
>  
> -static inline int skb_array_ll_produce(struct skb_array_ll *a, struct sk_buff *skb)
> +static inline int skb_array_ll_produce_many(struct skb_array_ll *a,
> +					    struct sk_buff **skb, int num)
>  {
> -	return __ptr_ring_ll_produce(&a->ring, skb);
> +	return __ptr_ring_ll_produce_many(&a->ring, (void **)skb, num);
> +}
> +
> +static inline int skb_array_ll_produce(struct skb_array_ll *a,
> +				       struct sk_buff **skb)
> +{
> +	return __ptr_ring_ll_produce(&a->ring, (void **)skb);
>  }
>  
>  static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb)
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 4648ec8..58f2011 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -571,7 +571,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>  	struct skb_array_ll *q = band2list(priv, band);
>  	int err;
>  
> -	err = skb_array_ll_produce(q, skb);
> +	err = skb_array_ll_produce(q, &skb);
>  
>  	if (unlikely(err)) {
>  		net_warn_ratelimited("drop a packet from fast enqueue\n");

I don't see a pop many operation here.

-- 
MST

^ permalink raw reply

* mvneta bug with  mvneta_set_rx_mode()
From: Andrew Lunn @ 2016-11-14 23:03 UTC (permalink / raw)
  To: Thomas Petazzoni; +Cc: netdev

Hi Thomas

I think i have found a bug in the mvneta driver, when coupled with
DSA. I'm using an Armada 370 RD board, but i expect you can reproduce
this in any board using mvneta and a switch.

I set the MAC address on a switch interface:

root@370-rd:~# ip link set address f2:42:42:42:42:42 dev lan2
root@370-rd:~# ip addr add 10.42.42.42/24 dev lan2
root@370-rd:~# ip link set lan2 up
root@370-rd:~# ip link set eth1 up
root@370-rd:~# ping 10.42.42.41

This does not work:

PING 10.42.42.41 (10.42.42.1) 56(84) bytes of data.
>From 10.42.42.42 icmp_seq=1 Destination Host Unreachable
>From 10.42.42.42 icmp_seq=2 Destination Host Unreachable
>From 10.42.42.42 icmp_seq=3 Destination Host Unreachable
^C
--- 10.42.42.1 ping statistics ---
10 packets transmitted, 0 received, +3 errors, 100% packet loss, time 9338ms
pipe 8

However, if i run tcpdump on the mvneta interface, so putting it into
promiscuous mode:

root@370-rd:~# tcpdump -i eth1 &
[1] 3757
root@370-rd:~# ping 10.42.42.1
PING 10.42.42.1 (10.42.42.1) 56(84) bytes of data.
64 bytes from 10.42.42.1: icmp_seq=1 ttl=64 time=1.43 ms
16:48:28.185095 MEDSA 0.2:0: ARP, Request who-has 10.42.42.1 tell 10.42.42.42, length 28
16:48:28.185767 MEDSA 0.2:0: ARP, Reply 10.42.42.1 is-at d8:eb:97:bf:44:f3 (oui Unknown), length 46
16:48:28.185961 MEDSA 0.2:0: IP 10.42.42.42 > 10.42.42.1: ICMP echo request, id 3760, seq 1, length 64
16:48:28.186429 MEDSA 0.2:0: IP 10.42.42.1 > 10.42.42.42: ICMP echo reply, id 3760, seq 1, length 64
16:48:29.186857 MEDSA 0.2:0: IP 10.42.42.42 > 10.42.42.1: ICMP echo request, id 3760, seq 2, length 64
16:48:29.187620 MEDSA 0.2:0: IP 10.42.42.1 > 10.42.42.42: ICMP echo reply, id 3760, seq 2, length 64
64 bytes from 10.42.42.1: icmp_seq=2 ttl=64 time=0.834 ms
16:48:30.189156 MEDSA 0.2:0: IP 10.42.42.42 > 10.42.42.1: ICMP echo request, id 3760, seq 3, length 64
16:48:30.189867 MEDSA 0.2:0: IP 10.42.42.1 > 10.42.42.42: ICMP echo reply, id 3760, seq 3, length 64
64 bytes from 10.42.42.1: icmp_seq=3 ttl=64 time=0.782 ms
^C
--- 10.42.42.1 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2004ms
rtt min/avg/max/mdev = 0.782/1.015/1.431/0.296 ms

What should happen is that when the MAC address is set on lan2, it
calls dsa_slave_set_mac_address(). It sees that the requested address
is different to the masters MAC address, so it calls dev_uc_add() to
add the MAC address to the master device, i.e. the mvneta. That causes
the MAC address to be added to the interfaces dev->uc list.  Since the
driver has dev->priv_flags & IFF_UNICAST_FLT true, it is indicating it
supports unicast filtering. So __dev_set_rx_mode() calls the drivers
set_rx_mode() to setup the filtering.

I don't see anywhere in mvneta where it walks the dev->uc list adding
these MAC addresses to its filter. Either it needs to do this, or it
should not have IFF_UNICAST_FLT, so that the core code will put the
interface into promisc mode when multiple unicast addresses are added.

Do you have time to look into this?

Thanks
	Andrew

^ permalink raw reply

* Re: [RFC PATCH 1/2] net: use cmpxchg instead of spinlock in ptr rings
From: Michael S. Tsirkin @ 2016-11-14 23:01 UTC (permalink / raw)
  To: John Fastabend; +Cc: jasowang, netdev, linux-kernel
In-Reply-To: <20161111044408.1547.92737.stgit@john-Precision-Tower-5810>

On Thu, Nov 10, 2016 at 08:44:08PM -0800, John Fastabend wrote:
> 
> ---
>  include/linux/ptr_ring_ll.h |  136 +++++++++++++++++++++++++++++++++++++++++++
>  include/linux/skb_array.h   |   25 ++++++++
>  2 files changed, 161 insertions(+)
>  create mode 100644 include/linux/ptr_ring_ll.h
> 
> diff --git a/include/linux/ptr_ring_ll.h b/include/linux/ptr_ring_ll.h
> new file mode 100644
> index 0000000..bcb11f3
> --- /dev/null
> +++ b/include/linux/ptr_ring_ll.h
> @@ -0,0 +1,136 @@
> +/*
> + *	Definitions for the 'struct ptr_ring_ll' datastructure.
> + *
> + *	Author:
> + *		John Fastabend <john.r.fastabend@intel.com>
> + *
> + *	Copyright (C) 2016 Intel Corp.
> + *
> + *	This program is free software; you can redistribute it and/or modify it
> + *	under the terms of the GNU General Public License as published by the
> + *	Free Software Foundation; either version 2 of the License, or (at your
> + *	option) any later version.
> + *
> + *	This is a limited-size FIFO maintaining pointers in FIFO order, with
> + *	one CPU producing entries and another consuming entries from a FIFO.
> + *	extended from ptr_ring_ll to use cmpxchg over spin lock.

So when is each one (ptr-ring/ptr-ring-ll) a win? _ll suffix seems to
imply this gives a better latency, OTOH for a ping/pong I suspect
ptr-ring would be better as it avoids index cache line bounces.

> + */
> +
> +#ifndef _LINUX_PTR_RING_LL_H
> +#define _LINUX_PTR_RING_LL_H 1
> +
> +#ifdef __KERNEL__
> +#include <linux/spinlock.h>
> +#include <linux/cache.h>
> +#include <linux/types.h>
> +#include <linux/compiler.h>
> +#include <linux/cache.h>
> +#include <linux/slab.h>
> +#include <asm/errno.h>
> +#endif
> +
> +struct ptr_ring_ll {
> +	u32 prod_size;
> +	u32 prod_mask;
> +	u32 prod_head;
> +	u32 prod_tail;
> +	u32 cons_size;
> +	u32 cons_mask;
> +	u32 cons_head;
> +	u32 cons_tail;
> +
> +	void **queue;
> +};
> +
> +/* Note: callers invoking this in a loop must use a compiler barrier,
> + * for example cpu_relax(). Callers must hold producer_lock.
> + */
> +static inline int __ptr_ring_ll_produce(struct ptr_ring_ll *r, void *ptr)
> +{
> +	u32 ret, head, tail, next, slots, mask;
> +
> +	do {
> +		head = READ_ONCE(r->prod_head);
> +		mask = READ_ONCE(r->prod_mask);
> +		tail = READ_ONCE(r->cons_tail);
> +
> +		slots = mask + tail - head;
> +		if (slots < 1)
> +			return -ENOMEM;
> +
> +		next = head + 1;
> +		ret = cmpxchg(&r->prod_head, head, next);
> +	} while (ret != head);


So why is this preferable to a lock?

I suspect it's nothing else than the qspinlock fairness
and polling code complexity. It's all not very useful if you
1. are just doing a couple of instructions under the lock
and
2. use a finite FIFO which is unfair anyway


How about this hack (lifted from virt_spin_lock):

static inline void quick_spin_lock(struct qspinlock *lock)
{
        do {
                while (atomic_read(&lock->val) != 0)
                        cpu_relax();
        } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
}

Or maybe we should even drop the atomic_read in the middle -
worth profiling and comparing:

static inline void quick_spin_lock(struct qspinlock *lock)
{
        while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
		cpu_relax();
}


Then, use quick_spin_lock instead of spin_lock everywhere in
ptr_ring - will that make it more efficient?


> +
> +	r->queue[head & mask] = ptr;
> +	smp_wmb();
> +
> +	while (r->prod_tail != head)
> +		cpu_relax();
> +
> +	r->prod_tail = next;
> +	return 0;
> +}
> +
> +static inline void *__ptr_ring_ll_consume(struct ptr_ring_ll *r)
> +{
> +	u32 ret, head, tail, next, slots, mask;
> +	void *ptr;
> +
> +	do {
> +		head = READ_ONCE(r->cons_head);
> +		mask = READ_ONCE(r->cons_mask);
> +		tail = READ_ONCE(r->prod_tail);
> +
> +		slots = tail - head;
> +		if (slots < 1)
> +			return ERR_PTR(-ENOMEM);
> +
> +		next = head + 1;
> +		ret = cmpxchg(&r->cons_head, head, next);
> +	} while (ret != head);
> +
> +	ptr = r->queue[head & mask];
> +	smp_rmb();
> +
> +	while (r->cons_tail != head)
> +		cpu_relax();
> +
> +	r->cons_tail = next;
> +	return ptr;
> +}
> +
> +static inline void **__ptr_ring_ll_init_queue_alloc(int size, gfp_t gfp)
> +{
> +	return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp);
> +}
> +
> +static inline int ptr_ring_ll_init(struct ptr_ring_ll *r, int size, gfp_t gfp)
> +{
> +	r->queue = __ptr_ring_init_queue_alloc(size, gfp);
> +	if (!r->queue)
> +		return -ENOMEM;
> +
> +	r->prod_size = r->cons_size = size;
> +	r->prod_mask = r->cons_mask = size - 1;
> +	r->prod_tail = r->prod_head = 0;
> +	r->cons_tail = r->prod_tail = 0;
> +
> +	return 0;
> +}
> +
> +static inline void ptr_ring_ll_cleanup(struct ptr_ring_ll *r, void (*destroy)(void *))
> +{
> +	if (destroy) {
> +		void *ptr;
> +
> +		ptr = __ptr_ring_ll_consume(r);
> +		while (!IS_ERR_OR_NULL(ptr)) {
> +			destroy(ptr);
> +			ptr = __ptr_ring_ll_consume(r);
> +		}
> +	}
> +	kfree(r->queue);
> +}
> +
> +#endif /* _LINUX_PTR_RING_LL_H  */
> diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
> index f4dfade..9b43dfd 100644
> --- a/include/linux/skb_array.h
> +++ b/include/linux/skb_array.h
> @@ -22,6 +22,7 @@
>  
>  #ifdef __KERNEL__
>  #include <linux/ptr_ring.h>
> +#include <linux/ptr_ring_ll.h>
>  #include <linux/skbuff.h>
>  #include <linux/if_vlan.h>
>  #endif
> @@ -30,6 +31,10 @@ struct skb_array {
>  	struct ptr_ring ring;
>  };
>  
> +struct skb_array_ll {
> +	struct ptr_ring_ll ring;
> +};
> +
>  /* Might be slightly faster than skb_array_full below, but callers invoking
>   * this in a loop must use a compiler barrier, for example cpu_relax().
>   */
> @@ -43,6 +48,11 @@ static inline bool skb_array_full(struct skb_array *a)
>  	return ptr_ring_full(&a->ring);
>  }
>  
> +static inline int skb_array_ll_produce(struct skb_array_ll *a, struct sk_buff *skb)
> +{
> +	return __ptr_ring_ll_produce(&a->ring, skb);
> +}
> +
>  static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb)
>  {
>  	return ptr_ring_produce(&a->ring, skb);
> @@ -92,6 +102,11 @@ static inline bool skb_array_empty_any(struct skb_array *a)
>  	return ptr_ring_empty_any(&a->ring);
>  }
>  
> +static inline struct sk_buff *skb_array_ll_consume(struct skb_array_ll *a)
> +{
> +	return __ptr_ring_ll_consume(&a->ring);
> +}
> +
>  static inline struct sk_buff *skb_array_consume(struct skb_array *a)
>  {
>  	return ptr_ring_consume(&a->ring);
> @@ -146,6 +161,11 @@ static inline int skb_array_peek_len_any(struct skb_array *a)
>  	return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag);
>  }
>  
> +static inline int skb_array_ll_init(struct skb_array_ll *a, int size, gfp_t gfp)
> +{
> +	return ptr_ring_ll_init(&a->ring, size, gfp);
> +}
> +
>  static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
>  {
>  	return ptr_ring_init(&a->ring, size, gfp);
> @@ -170,6 +190,11 @@ static inline int skb_array_resize_multiple(struct skb_array **rings,
>  					__skb_array_destroy_skb);
>  }
>  
> +static inline void skb_array_ll_cleanup(struct skb_array_ll *a)
> +{
> +	ptr_ring_ll_cleanup(&a->ring, __skb_array_destroy_skb);
> +}
> +
>  static inline void skb_array_cleanup(struct skb_array *a)
>  {
>  	ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);

^ permalink raw reply

* [PATCH v2 iproute2 1/1] tc: print raw qdisc handle.
From: Roman Mashak @ 2016-11-14 22:59 UTC (permalink / raw)
  To: stephen; +Cc: netdev, jhs, Roman Mashak

This is v2 patch with fixed code indentation.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 tc/tc_qdisc.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c
index 93c9a4c..6f60002 100644
--- a/tc/tc_qdisc.c
+++ b/tc/tc_qdisc.c
@@ -231,7 +231,15 @@ int print_qdisc(const struct sockaddr_nl *who,
 	if (n->nlmsg_type == RTM_DELQDISC)
 		fprintf(fp, "deleted ");
 
-	fprintf(fp, "qdisc %s %x: ", rta_getattr_str(tb[TCA_KIND]), t->tcm_handle>>16);
+	if (!show_raw) {
+		fprintf(fp, "qdisc %s %x: ", rta_getattr_str(tb[TCA_KIND]),
+			t->tcm_handle >> 16);
+	} else {
+		fprintf(fp, "qdisc %s %x:[%08x]  ",
+			rta_getattr_str(tb[TCA_KIND]),
+			t->tcm_handle >> 16, t->tcm_handle);
+	}
+
 	if (filter_ifindex == 0)
 		fprintf(fp, "dev %s ", ll_index_to_name(t->tcm_ifindex));
 	if (t->tcm_parent == TC_H_ROOT)
-- 
1.9.1

^ permalink raw reply related

* Re: Debugging Ethernet issues
From: Mason @ 2016-11-14 22:48 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Sebastian Frias, Andrew Lunn, netdev, Mans Rullgard,
	Sergei Shtylyov, Tom Lendacky, Zach Brown, Shaohui Xie, Tim Beale,
	Brian Hill, Vince Bridgers, Balakumaran Kannan, David S. Miller,
	Kirill Kapranov
In-Reply-To: <1fd492b4-3c32-7939-791c-56cf6d8c0078@gmail.com>

On 14/11/2016 22:00, Florian Fainelli wrote:

> No I missed that, way too many emails, really.

Sorry, I was trying to be thorough, but I went overboard.

I'll start a new thread tomorrow, with a smaller CC list
(only those who have participated so far) and I'll try to
remain concise.

Regards.

^ permalink raw reply

* Re: [PATCH net 1/3] bpf, mlx5: fix mlx5e_create_rq taking reference on prog
From: Saeed Mahameed @ 2016-11-14 18:15 UTC (permalink / raw)
  To: Daniel Borkmann, davem
  Cc: alexei.starovoitov, bblanco, tariqt, zhiyisun, ranas, netdev
In-Reply-To: <0447a6161550c466ef799b46095440949ffc7df0.1479080215.git.daniel@iogearbox.net>



On 11/14/2016 02:43 AM, Daniel Borkmann wrote:
> In mlx5e_create_rq(), when creating a new queue, we call bpf_prog_add() but
> without checking the return value. bpf_prog_add() can fail, so we really

Didn't know this, thanks for noticing, I wonder why taking a reference for an object would fail ?
especially when someone is requesting from the driver to take a reference to it ndo_xdp_set ?! sounds like a bad design.

Anyway I will check that later.

> must check it. Take the reference right when we assign it to the rq from
> priv->xdp_prog, and just drop the reference on error path. Destruction in
> mlx5e_destroy_rq() looks good, though.
> 
> Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 14 +++++++++++---
>  kernel/bpf/syscall.c                              |  1 +
>  2 files changed, 12 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 84e8b25..2b83667 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -489,7 +489,16 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
>  	rq->channel = c;
>  	rq->ix      = c->ix;
>  	rq->priv    = c->priv;
> +
>  	rq->xdp_prog = priv->xdp_prog;

Why keeping this assignment ? just test priv->xdp_prog.

> +	if (rq->xdp_prog) {
> +		rq->xdp_prog = bpf_prog_inc(rq->xdp_prog);
> +		if (IS_ERR(rq->xdp_prog)) {
> +			err = PTR_ERR(rq->xdp_prog);
> +			rq->xdp_prog = NULL;
> +			goto err_rq_wq_destroy;
> +		}
> +	}

Try this, simpler and less indentations:
 
rq->xdp_prog = priv->xdp_prog ? bpf_prog_inc(priv->xdp_prog) : NULL;
if (IS_ERR(rq->xdp_prog)) {
	err = PTR_ERR(rq->xdp_prog);
	rq->xdp_prog = NULL;
	goto err_rq_wq_destroy;
}

Saeed.

^ permalink raw reply

* Re: Long delays creating a netns after deleting one (possibly RCU related)
From: Eric Dumazet @ 2016-11-14 22:46 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Paul E. McKenney, Cong Wang, Rolf Neugebauer, LKML,
	Linux Kernel Network Developers, Justin Cormack, Ian Campbell,
	Eric Dumazet
In-Reply-To: <87y40lhfrt.fsf@xmission.com>

On Mon, 2016-11-14 at 16:12 -0600, Eric W. Biederman wrote:

> synchronize_rcu_expidited is not enough if you have multiple network
> devices in play.
> 
> Looking at the code it comes down to this commit, and it appears there
> is a promise add rcu grace period combining by Eric Dumazet.
> 
> Eric since people are hitting noticable stalls because of the rcu grace
> period taking a long time do you think you could look at this code path
> a bit more?
> 
> commit 93d05d4a320cb16712bb3d57a9658f395d8cecb9
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Wed Nov 18 06:31:03 2015 -0800

Absolutely, I will take a loop asap.

Thanks Eric.

^ permalink raw reply

* Re: [PATCH 0/2] amd-xgbe: AMD XGBE driver updates 2016-11-14
From: Tom Lendacky @ 2016-11-14 22:42 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: julia.lawall, christophe.jaillet
In-Reply-To: <20161114222827.25160.24791.stgit@tlendack-t1.amdoffice.net>

On 11/14/2016 4:28 PM, Tom Lendacky wrote:
> This patch series addresses some minor issues found in the recently
> accepted patch series for the AMD XGBE driver.
> 
> The following fixes are included in this driver update series:
> 
> - Fix how a mask is applied to a Clause 37 register value
> - Fix some coccinelle identified warnings
> 
> This patch series is based on net-next.

David,

I forgot to include the net-next repo name in the subject, please
ignore this series and I'll resend.

Thanks,
Tom

> 
> ---
> 
> Tom Lendacky (2):
>       amd-xgbe: Fix mask appliciation for Clause 37 register
>       amd-xgbe: Fix up some coccinelle identified warnings
> 
> 
>  drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |    4 ++--
>  drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    5 ++---
>  2 files changed, 4 insertions(+), 5 deletions(-)
> 

^ permalink raw reply

* [PATCH net] udp: restore UDPlite many-cast delivery
From: Pablo Neira Ayuso @ 2016-11-14 22:40 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, drheld

Honor udptable parameter that is passed to __udp*_lib_mcast_deliver(),
otherwise udplite broadcast/multicast use the wrong table and it breaks.

Fixes: 2dc41cff7545 ("udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
It looks like UDPlite accidentally broke when dealing with long UDP many-cast
chains, give the wrong table is used for udplite. Compile tested only.

 net/ipv4/udp.c | 6 +++---
 net/ipv6/udp.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 195992e0440d..5b5a552a2804 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1731,10 +1731,10 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 
 	if (use_hash2) {
 		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
-			    udp_table.mask;
-		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+			    udptable->mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
-		hslot = &udp_table.hash2[hash2];
+		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a7700bbf6788..a9dd23d5c3c4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -688,10 +688,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 
 	if (use_hash2) {
 		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
-			    udp_table.mask;
-		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+			    udptable->mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
-		hslot = &udp_table.hash2[hash2];
+		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v2 2/2] amd-xgbe: Fix up some coccinelle identified warnings
From: Tom Lendacky @ 2016-11-14 22:39 UTC (permalink / raw)
  To: netdev; +Cc: julia.lawall, christophe.jaillet, David Miller
In-Reply-To: <20161114223856.25437.13649.stgit@tlendack-t1.amdoffice.net>

Fix up some warnings that were identified by coccinelle:

Clean up an if/else block that can look confusing since the same statement
is executed in an "else if" check and the final "else" statement.

Change a variable from unsigned int to int since it is used in an if
statement checking the value to be less than 0.

Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 4ba4332..348cc8c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1766,8 +1766,6 @@ static void xgbe_phy_sfi_mode(struct xgbe_prv_data *pdata)
 			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 1);
 		else if (phy_data->sfp_cable_len <= 3)
 			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 2);
-		else if (phy_data->sfp_cable_len <= 5)
-			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 3);
 		else
 			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 3);
 	}
@@ -2346,7 +2344,8 @@ static bool xgbe_phy_valid_speed(struct xgbe_prv_data *pdata, int speed)
 static int xgbe_phy_link_status(struct xgbe_prv_data *pdata, int *an_restart)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	unsigned int ret, reg;
+	unsigned int reg;
+	int ret;
 
 	*an_restart = 0;
 

^ permalink raw reply related

* [PATCH net-next v2 1/2] amd-xgbe: Fix mask appliciation for Clause 37 register
From: Tom Lendacky @ 2016-11-14 22:39 UTC (permalink / raw)
  To: netdev; +Cc: julia.lawall, christophe.jaillet, David Miller
In-Reply-To: <20161114223856.25437.13649.stgit@tlendack-t1.amdoffice.net>

The application of a mask to clear an area of a clause 37 register value
was not properly applied. Update the code to do the proper application
of the mask.

Reported-by: Marion & Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 0ecae70..4c5b90e 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -943,8 +943,8 @@ static void xgbe_an37_init(struct xgbe_prv_data *pdata)
 
 	/* Set up the Control register */
 	reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL);
-	reg &= XGBE_AN_CL37_TX_CONFIG_MASK;
-	reg &= XGBE_AN_CL37_PCS_MODE_MASK;
+	reg &= ~XGBE_AN_CL37_TX_CONFIG_MASK;
+	reg &= ~XGBE_AN_CL37_PCS_MODE_MASK;
 
 	switch (pdata->an_mode) {
 	case XGBE_AN_MODE_CL37:

^ permalink raw reply related

* [PATCH net-next v2 0/2] amd-xgbe: AMD XGBE driver updates 2016-11-14
From: Tom Lendacky @ 2016-11-14 22:38 UTC (permalink / raw)
  To: netdev; +Cc: julia.lawall, christophe.jaillet, David Miller

(Resending with net-next in the subject)

This patch series addresses some minor issues found in the recently
accepted patch series for the AMD XGBE driver.

The following fixes are included in this driver update series:

- Fix how a mask is applied to a Clause 37 register value
- Fix some coccinelle identified warnings

This patch series is based on net-next.

---

Tom Lendacky (2):
      amd-xgbe: Fix mask appliciation for Clause 37 register
      amd-xgbe: Fix up some coccinelle identified warnings


 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |    4 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

-- 
Tom Lendacky

^ permalink raw reply

* Re: [patch] netlink.7: srcfix Change buffer size in example code about reading netlink message.
From: Rick Jones @ 2016-11-14 22:36 UTC (permalink / raw)
  To: dwilder, mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w
  Cc: linux-man-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <e9409957290af5249750afa0f10de3a6-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

> Lets change the example so others don't propagate the problem further.
>
> Signed-off-by David Wilder <dwilder-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>
> --- man7/netlink.7.orig 2016-11-14 13:30:36.522101156 -0800
> +++ man7/netlink.7      2016-11-14 13:30:51.002086354 -0800
> @@ -511,7 +511,7 @@
>  .in +4n
>  .nf
>  int len;
> -char buf[4096];
> +char buf[8192];

Since there doesn't seem to be a define one could use in the user space 
linux/netlink.h (?), but there are comments in the example code in the 
manpage, how about also including a brief comment to the effect that 
using 8192 bytes will avoid message truncation problems on platforms 
with a large PAGE_SIZE?

/* avoid msg truncation on > 4096 byte PAGE_SIZE platforms */

or something like that.

rick jones
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 2/2] amd-xgbe: Fix up some coccinelle identified warnings
From: Tom Lendacky @ 2016-11-14 22:28 UTC (permalink / raw)
  To: netdev; +Cc: julia.lawall, christophe.jaillet, David Miller
In-Reply-To: <20161114222827.25160.24791.stgit@tlendack-t1.amdoffice.net>

Fix up some warnings that were identified by coccinelle:

Clean up an if/else block that can look confusing since the same statement
is executed in an "else if" check and the final "else" statement.

Change a variable from unsigned int to int since it is used in an if
statement checking the value to be less than 0.

Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 4ba4332..348cc8c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1766,8 +1766,6 @@ static void xgbe_phy_sfi_mode(struct xgbe_prv_data *pdata)
 			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 1);
 		else if (phy_data->sfp_cable_len <= 3)
 			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 2);
-		else if (phy_data->sfp_cable_len <= 5)
-			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 3);
 		else
 			XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, 3);
 	}
@@ -2346,7 +2344,8 @@ static bool xgbe_phy_valid_speed(struct xgbe_prv_data *pdata, int speed)
 static int xgbe_phy_link_status(struct xgbe_prv_data *pdata, int *an_restart)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	unsigned int ret, reg;
+	unsigned int reg;
+	int ret;
 
 	*an_restart = 0;
 

^ permalink raw reply related

* [PATCH 1/2] amd-xgbe: Fix mask appliciation for Clause 37 register
From: Tom Lendacky @ 2016-11-14 22:28 UTC (permalink / raw)
  To: netdev; +Cc: julia.lawall, christophe.jaillet, David Miller
In-Reply-To: <20161114222827.25160.24791.stgit@tlendack-t1.amdoffice.net>

The application of a mask to clear an area of a clause 37 register value
was not properly applied. Update the code to do the proper application
of the mask.

Reported-by: Marion & Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 0ecae70..4c5b90e 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -943,8 +943,8 @@ static void xgbe_an37_init(struct xgbe_prv_data *pdata)
 
 	/* Set up the Control register */
 	reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL);
-	reg &= XGBE_AN_CL37_TX_CONFIG_MASK;
-	reg &= XGBE_AN_CL37_PCS_MODE_MASK;
+	reg &= ~XGBE_AN_CL37_TX_CONFIG_MASK;
+	reg &= ~XGBE_AN_CL37_PCS_MODE_MASK;
 
 	switch (pdata->an_mode) {
 	case XGBE_AN_MODE_CL37:

^ permalink raw reply related

* [PATCH 0/2] amd-xgbe: AMD XGBE driver updates 2016-11-14
From: Tom Lendacky @ 2016-11-14 22:28 UTC (permalink / raw)
  To: netdev; +Cc: julia.lawall, christophe.jaillet, David Miller

This patch series addresses some minor issues found in the recently
accepted patch series for the AMD XGBE driver.

The following fixes are included in this driver update series:

- Fix how a mask is applied to a Clause 37 register value
- Fix some coccinelle identified warnings

This patch series is based on net-next.

---

Tom Lendacky (2):
      amd-xgbe: Fix mask appliciation for Clause 37 register
      amd-xgbe: Fix up some coccinelle identified warnings


 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |    4 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

-- 
Tom Lendacky

^ permalink raw reply

* Re: [PATCH net] ipv4: fix cloning issues in fib_trie_unmerge()
From: Alexander Duyck @ 2016-11-14 22:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Alexander Duyck
In-Reply-To: <1479159274.8455.82.camel@edumazet-glaptop3.roam.corp.google.com>

On Mon, Nov 14, 2016 at 1:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> I had crashes in a DEBUG_PAGEALLOC kernels in fib_table_flush() or
> fib_table_lookup() that I back tracked to a refcounting issue
> happening when we clone struct fib_alias in fib_trie_unmerge()
>
> While fixing this issue, I also noticed a mem leak happening
> if fib_insert_alias() fails.
>
> Fixes: 0ddcf43d5d4a0 ("ipv4: FIB Local/MAIN table collapse")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
>  net/ipv4/fib_trie.c |    7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index 4cff74d4133f..ebf49ab889e8 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -1737,14 +1737,19 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
>                                 goto out;
>
>                         memcpy(new_fa, fa, sizeof(*fa));
> +                       if (fa->fa_info)
> +                               fa->fa_info->fib_treeref++;
>
>                         /* insert clone into table */
>                         if (!local_l)
>                                 local_l = fib_find_node(lt, &local_tp, l->key);
>
>                         if (fib_insert_alias(lt, local_tp, local_l, new_fa,
> -                                            NULL, l->key))
> +                                            NULL, l->key)) {
> +                               kmem_cache_free(fn_alias_kmem, new_fa);
> +                               fib_release_info(fa->fa_info);
>                                 goto out;
> +                       }
>                 }
>
>                 /* stop loop if key wrapped back to 0 */
>
>

Actually I think this creates a reference leak.  If you look the call
to fib_table_flush_external is skipping the call to fib_release_info.
If you add this then you would probably need to update
fib_table_flush_external so that we call fib_release_info like we do
for fib_table_flush.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox