Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH RFC net-next] virtio_net: refill buffer right after being used
From: Mike Waychison @ 2011-07-29 23:58 UTC (permalink / raw)
  To: Shirley Ma; +Cc: Rusty Russell, mst, kvm, virtualization, netdev
In-Reply-To: <1311980131.24300.30.camel@localhost.localdomain>

On Fri, Jul 29, 2011 at 3:55 PM, Shirley Ma <mashirle@us.ibm.com> wrote:
> Resubmit it with a typo fix.
>
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0c7321c..c8201d4 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -429,6 +429,22 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
>        return err;
>  }
>
> +static int fill_one(struct virtnet_info *vi, gfp_t gfp)
> +{
> +       int err;
> +
> +       if (vi->mergeable_rx_bufs)
> +               err = add_recvbuf_mergeable(vi, gfp);
> +       else if (vi->big_packets)
> +               err = add_recvbuf_big(vi, gfp);
> +       else
> +               err = add_recvbuf_small(vi, gfp);
> +
> +       if (err >= 0)
> +               ++vi->num;
> +       return err;
> +}
> +
>  /* Returns false if we couldn't fill entirely (OOM). */
>  static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
>  {
> @@ -436,17 +452,10 @@ static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
>        bool oom;
>
>        do {
> -               if (vi->mergeable_rx_bufs)
> -                       err = add_recvbuf_mergeable(vi, gfp);
> -               else if (vi->big_packets)
> -                       err = add_recvbuf_big(vi, gfp);
> -               else
> -                       err = add_recvbuf_small(vi, gfp);
> -
> +               err = fill_one(vi, gfp);
>                oom = err == -ENOMEM;
>                if (err < 0)
>                        break;
> -               ++vi->num;
>        } while (err > 0);
>        if (unlikely(vi->num > vi->max))
>                vi->max = vi->num;
> @@ -506,13 +515,13 @@ again:
>                receive_buf(vi->dev, buf, len);
>                --vi->num;
>                received++;
> -       }
> -
> -       if (vi->num < vi->max / 2) {
> -               if (!try_fill_recv(vi, GFP_ATOMIC))
> +               if (fill_one(vi, GFP_ATOMIC) < 0)
>                        schedule_delayed_work(&vi->refill, 0);
>        }
>
> +       /* notify buffers are refilled */
> +       virtqueue_kick(vi->rvq);
> +

How does this reduce latency?  We are doing the same amount of work
in both cases, and in both cases the newly available buffers are not
visible to the device until the virtqueue_kick.


>        /* Out of packets? */
>        if (received < budget) {
>                napi_complete(napi);
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: J. Bruce Fields @ 2011-07-29 23:48 UTC (permalink / raw)
  To: NeilBrown
  Cc: Greg Banks, Eric Dumazet, Christoph Hellwig,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David Miller,
	linux-kernel, netdev
In-Reply-To: <20110730093025.716f3f50-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>

On Sat, Jul 30, 2011 at 09:30:25AM +1000, NeilBrown wrote:
> On Sat, 30 Jul 2011 06:34:44 +1000 Greg Banks <gnb-97jfqw80gc6171pxa8y+qA@public.gmane.org> wrote:
> 
> > 
> > 
> > Sent from my iPhone
> > 
> > On 30/07/2011, at 2:53, "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org> wrote:
> > 
> > > On Fri, Jul 29, 2011 at 12:48:36PM -0400, bfields wrote:
> > >> On Fri, Jul 29, 2011 at 11:30:05PM +1000, Greg Banks wrote:
> > >>>
> > >>>
> > >>> Sent from my iPhone
> > >>>
> > >>> On 29/07/2011, at 22:11, Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>  
> > >>> wrote:
> > >>>
> > >>>> Le vendredi 29 juillet 2011 à 21:58 +1000, Greg Banks a écrit :
> > >>>>
> > >>>>>
> > >>>>> Sure, and a whole lot of the callsites are ("..._%d", cpu),  
> > >>>>> hence the
> > >>>>> unfortune :(
> > >>>>
> > >>>> BTW, we could name nfsd threads differently :
> > >>>>
> > >>>> Currently, they all are named : "nfsd"
> > >>>>
> > >>>> If SVC_POOL_PERCPU is selected, we could name them :
> > >>>> nfsd_c0 -> nfsd_cN
> > >>>>
> > >>>> If SVC_POOL_PERNODE is selected, we could name them :
> > >>>> nfsd_n0  -> nfsd_nN
> > >>>>
> > >>>> That would help to check with "ps aux" which cpu/nodes are under
> > >>>> stress.
> > >>>>
> > >>>>
> > >>>
> > >>> I like it!
> > >>
> > >> Yup, patch welcomed.--b.
> > >
> > > (Annoying fact: some initscripts stop nfsd using a rough equivalent of
> > > "killall nfsd".  So the name of the threads is arguably ABI.  I think
> > > those initscripts are nuts and deserve what they get, but that may be
> > > because I'm forgetting the reason they do that.)
> > >
> > 
> > We had this discussion in May-June 2008; it's because the nfsds were  
> > once  many many years ago userspace threads.
> 
> Even when they became kernel threads, 'kill' was the only way to kill them -
> at first.
> 
> > 
> > The "killall nfsd" semantics in those scripts are awful and lead to  
> > problems shutting down when there are lots of threads. It would  
> > probably be an improvement to provide a better shutdown mechanism and  
> > force distros to use it.
> 
> rpc.nfsd 0
> 
> will stop all nfsd threads.  Follow with
> 
>   exportfs -f
> 
> and you should be done.  I'm not 100% sure about the nfsv4 thread though -
> would need to check.

Should be fine.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next 2/5] qlcnic: FW dump related changes
From: Anirban Chakraborty @ 2011-07-29 23:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Anirban Chakraborty
In-Reply-To: <1311982230-10543-1-git-send-email-anirban.chakraborty@qlogic.com>

o Added code to support FW reset without invoking the dump
o Fixed the return value of the dump data size if dump is not available.

Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/net/qlcnic/qlcnic.h         |    1 +
 drivers/net/qlcnic/qlcnic_ethtool.c |   22 +++++++++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index baf646d..4200ef8 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -1344,6 +1344,7 @@ enum op_codes {
 #define QLCNIC_FORCE_FW_DUMP_KEY	0xdeadfeed
 #define QLCNIC_ENABLE_FW_DUMP		0xaddfeed
 #define QLCNIC_DISABLE_FW_DUMP		0xbadfeed
+#define QLCNIC_FORCE_FW_RESET		0xdeaddead
 
 struct qlcnic_dump_operations {
 	enum op_codes opcode;
diff --git a/drivers/net/qlcnic/qlcnic_ethtool.c b/drivers/net/qlcnic/qlcnic_ethtool.c
index 72a723d..7c64f2f 100644
--- a/drivers/net/qlcnic/qlcnic_ethtool.c
+++ b/drivers/net/qlcnic/qlcnic_ethtool.c
@@ -1105,7 +1105,10 @@ qlcnic_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump)
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump;
 
-	dump->len = fw_dump->tmpl_hdr->size + fw_dump->size;
+	if (fw_dump->clr)
+		dump->len = fw_dump->tmpl_hdr->size + fw_dump->size;
+	else
+		dump->len = 0;
 	dump->flag = fw_dump->tmpl_hdr->drv_cap_mask;
 	dump->version = adapter->fw_version;
 	return 0;
@@ -1152,7 +1155,8 @@ qlcnic_set_dump(struct net_device *netdev, struct ethtool_dump *val)
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump;
 
-	if (val->flag == QLCNIC_FORCE_FW_DUMP_KEY) {
+	switch (val->flag) {
+	case QLCNIC_FORCE_FW_DUMP_KEY:
 		if (!fw_dump->enable) {
 			netdev_info(netdev, "FW dump not enabled\n");
 			return ret;
@@ -1164,17 +1168,25 @@ qlcnic_set_dump(struct net_device *netdev, struct ethtool_dump *val)
 		}
 		netdev_info(netdev, "Forcing a FW dump\n");
 		qlcnic_dev_request_reset(adapter);
-	} else if (val->flag == QLCNIC_DISABLE_FW_DUMP) {
+		break;
+	case QLCNIC_DISABLE_FW_DUMP:
 		if (fw_dump->enable) {
 			netdev_info(netdev, "Disabling FW dump\n");
 			fw_dump->enable = 0;
 		}
-	} else if (val->flag == QLCNIC_ENABLE_FW_DUMP) {
+		break;
+	case QLCNIC_ENABLE_FW_DUMP:
 		if (!fw_dump->enable && fw_dump->tmpl_hdr) {
 			netdev_info(netdev, "Enabling FW dump\n");
 			fw_dump->enable = 1;
 		}
-	} else {
+		break;
+	case QLCNIC_FORCE_FW_RESET:
+		netdev_info(netdev, "Forcing a FW reset\n");
+		qlcnic_dev_request_reset(adapter);
+		adapter->flags &= ~QLCNIC_FW_RESET_OWNER;
+		break;
+	default:
 		if (val->flag > QLCNIC_DUMP_MASK_MAX ||
 			val->flag < QLCNIC_DUMP_MASK_MIN) {
 				netdev_info(netdev,
-- 
1.7.4.1



^ permalink raw reply related

* [PATCH net-next 0/5] qlcnic: Fixes and debug support
From: Anirban Chakraborty @ 2011-07-29 23:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Anirban Chakraborty
In-Reply-To: <1311982230-10543-1-git-send-email-anirban.chakraborty@qlogic.com>

Please apply the series to net-next. Thanks.

-Anirban



^ permalink raw reply

* [PATCH net-next 3/5] qlcnic: Fix delay in reset path
From: Anirban Chakraborty @ 2011-07-29 23:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Sritej Velaga
In-Reply-To: <1311982230-10543-1-git-send-email-anirban.chakraborty@qlogic.com>

From: Sritej Velaga <sritej.velaga@qlogic.com>

Driver should not check for heart beat anymore when FW is hung, rather it
should restart the FW.

Signed-off-by: Sritej Velaga <sritej.velaga@qlogic.com>
Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/net/qlcnic/qlcnic.h      |    1 +
 drivers/net/qlcnic/qlcnic_init.c |    3 ++-
 drivers/net/qlcnic/qlcnic_main.c |    5 +++++
 3 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index 4200ef8..5f0141b 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -911,6 +911,7 @@ struct qlcnic_ipaddr {
 #define QLCNIC_PROMISC_DISABLED		0x800
 #define QLCNIC_NEED_FLR			0x1000
 #define QLCNIC_FW_RESET_OWNER		0x2000
+#define QLCNIC_FW_HANG			0x4000
 #define QLCNIC_IS_MSI_FAMILY(adapter) \
 	((adapter)->flags & (QLCNIC_MSI_ENABLED | QLCNIC_MSIX_ENABLED))
 
diff --git a/drivers/net/qlcnic/qlcnic_init.c b/drivers/net/qlcnic/qlcnic_init.c
index ee8a398..3b6741e 100644
--- a/drivers/net/qlcnic/qlcnic_init.c
+++ b/drivers/net/qlcnic/qlcnic_init.c
@@ -1056,7 +1056,8 @@ qlcnic_check_fw_hearbeat(struct qlcnic_adapter *adapter)
 int
 qlcnic_need_fw_reset(struct qlcnic_adapter *adapter)
 {
-	if (qlcnic_check_fw_hearbeat(adapter)) {
+	if ((adapter->flags & QLCNIC_FW_HANG) ||
+			qlcnic_check_fw_hearbeat(adapter)) {
 		qlcnic_rom_lock_recovery(adapter);
 		return 1;
 	}
diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index 5ca1b56..248ebbd 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -2682,6 +2682,7 @@ qlcnic_clr_all_drv_state(struct qlcnic_adapter *adapter, u8 failed)
 	qlcnic_api_unlock(adapter);
 err:
 	adapter->fw_fail_cnt = 0;
+	adapter->flags &= ~QLCNIC_FW_HANG;
 	clear_bit(__QLCNIC_START_FW, &adapter->state);
 	clear_bit(__QLCNIC_RESETTING, &adapter->state);
 }
@@ -2859,6 +2860,7 @@ skip_ack_check:
 		    (adapter->flags & QLCNIC_FW_RESET_OWNER)) {
 			QLCDB(adapter, DRV, "Take FW dump\n");
 			qlcnic_dump_fw(adapter);
+			adapter->flags |= QLCNIC_FW_HANG;
 		}
 		rtnl_unlock();
 
@@ -3046,6 +3048,7 @@ attach:
 done:
 	netif_device_attach(netdev);
 	adapter->fw_fail_cnt = 0;
+	adapter->flags &= ~QLCNIC_FW_HANG;
 	clear_bit(__QLCNIC_RESETTING, &adapter->state);
 
 	if (!qlcnic_clr_drv_state(adapter))
@@ -3090,6 +3093,8 @@ qlcnic_check_health(struct qlcnic_adapter *adapter)
 	if (++adapter->fw_fail_cnt < FW_FAIL_THRESH)
 		return 0;
 
+	adapter->flags |= QLCNIC_FW_HANG;
+
 	qlcnic_dev_request_reset(adapter);
 
 	if (auto_fw_reset)
-- 
1.7.4.1



^ permalink raw reply related

* [PATCH net-next 1/5] qlcnic: Fix environment variable for udev event generation during FW dump
From: Anirban Chakraborty @ 2011-07-29 23:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Anirban Chakraborty

Driver was not generating the environment variable for the FW dump event correctly.
Fix it by formatting it properly.

Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/net/qlcnic/qlcnic_hw.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic_hw.c b/drivers/net/qlcnic/qlcnic_hw.c
index 4055c21..74e9d7b 100644
--- a/drivers/net/qlcnic/qlcnic_hw.c
+++ b/drivers/net/qlcnic/qlcnic_hw.c
@@ -1773,8 +1773,8 @@ int qlcnic_dump_fw(struct qlcnic_adapter *adapter)
 		goto error;
 	} else {
 		fw_dump->clr = 1;
-		snprintf(mesg, sizeof(mesg), "FW dump for device: %d\n",
-			adapter->pdev->devfn);
+		snprintf(mesg, sizeof(mesg), "FW_DUMP=%s",
+			adapter->netdev->name);
 		dev_info(&adapter->pdev->dev, "Dump data, %d bytes captured\n",
 			fw_dump->size);
 		/* Send a udev event to notify availability of FW dump */
-- 
1.7.4.1



^ permalink raw reply related

* [PATCH net-next 4/5] qlcnic: Move get template from probe to start fw
From: Anirban Chakraborty @ 2011-07-29 23:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Sritej Velaga
In-Reply-To: <1311982230-10543-1-git-send-email-anirban.chakraborty@qlogic.com>

From: Sritej Velaga <sritej.velaga@qlogic.com>

The code that fetches the FW dump template has been moved to the FW restart
path so that the driver can check whether a newer FW version is available and,
in that case, replace the existing FW dump template with the newer template.

Signed-off-by: Sritej Velaga <sritej.velaga@qlogic.com>
Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/net/qlcnic/qlcnic_main.c |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index 248ebbd..d287a2b 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -643,8 +643,11 @@ static void get_brd_name(struct qlcnic_adapter *adapter, char *name)
 static void
 qlcnic_check_options(struct qlcnic_adapter *adapter)
 {
-	u32 fw_major, fw_minor, fw_build;
+	u32 fw_major, fw_minor, fw_build, prev_fw_version;
 	struct pci_dev *pdev = adapter->pdev;
+	struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump;
+
+	prev_fw_version = adapter->fw_version;
 
 	fw_major = QLCRD32(adapter, QLCNIC_FW_VERSION_MAJOR);
 	fw_minor = QLCRD32(adapter, QLCNIC_FW_VERSION_MINOR);
@@ -652,6 +655,17 @@ qlcnic_check_options(struct qlcnic_adapter *adapter)
 
 	adapter->fw_version = QLCNIC_VERSION_CODE(fw_major, fw_minor, fw_build);
 
+	if (adapter->op_mode != QLCNIC_NON_PRIV_FUNC) {
+		if (fw_dump->tmpl_hdr == NULL ||
+				adapter->fw_version > prev_fw_version) {
+			if (fw_dump->tmpl_hdr)
+				vfree(fw_dump->tmpl_hdr);
+			if (!qlcnic_fw_cmd_get_minidump_temp(adapter))
+				dev_info(&pdev->dev,
+					"Supports FW dump capability\n");
+		}
+	}
+
 	dev_info(&pdev->dev, "firmware v%d.%d.%d\n",
 			fw_major, fw_minor, fw_build);
 	if (adapter->ahw->port_type == QLCNIC_XGBE) {
@@ -1610,12 +1624,6 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out_decr_ref;
 	}
 
-	/* Get FW dump template and store it */
-	if (adapter->op_mode != QLCNIC_NON_PRIV_FUNC)
-		if (!qlcnic_fw_cmd_get_minidump_temp(adapter))
-			dev_info(&pdev->dev,
-				"Supports FW dump capability\n");
-
 	if (qlcnic_read_mac_addr(adapter))
 		dev_warn(&pdev->dev, "failed to read mac addr\n");
 
-- 
1.7.4.1



^ permalink raw reply related

* [PATCH net-next 5/5] qlcnic: Added debug info
From: Anirban Chakraborty @ 2011-07-29 23:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Sritej Velaga
In-Reply-To: <1311982230-10543-1-git-send-email-anirban.chakraborty@qlogic.com>

From: Sritej Velaga <sritej.velaga@qlogic.com>

Now printing states of essential registers once fw hang has been detected.
Bumped up the driver version to 5.0.22

Signed-off-by: Sritej Velaga <sritej.velaga@qlogic.com>
Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/net/qlcnic/qlcnic.h      |    4 ++--
 drivers/net/qlcnic/qlcnic_main.c |   13 ++++++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index 5f0141b..53c6e5d 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -36,8 +36,8 @@
 
 #define _QLCNIC_LINUX_MAJOR 5
 #define _QLCNIC_LINUX_MINOR 0
-#define _QLCNIC_LINUX_SUBVERSION 21
-#define QLCNIC_LINUX_VERSIONID  "5.0.21"
+#define _QLCNIC_LINUX_SUBVERSION 22
+#define QLCNIC_LINUX_VERSIONID  "5.0.22"
 #define QLCNIC_DRV_IDC_VER  0x01
 #define QLCNIC_DRIVER_VERSION  ((_QLCNIC_LINUX_MAJOR << 16) |\
 		 (_QLCNIC_LINUX_MINOR << 8) | (_QLCNIC_LINUX_SUBVERSION))
diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index d287a2b..ec8ef72 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -3109,7 +3109,18 @@ qlcnic_check_health(struct qlcnic_adapter *adapter)
 		clear_bit(__QLCNIC_FW_ATTACHED, &adapter->state);
 
 	dev_info(&netdev->dev, "firmware hang detected\n");
-
+	dev_info(&adapter->pdev->dev, "Dumping hw/fw registers\n"
+			"PEG_HALT_STATUS1: 0x%x, PEG_HALT_STATUS2: 0x%x,\n"
+			"PEG_NET_0_PC: 0x%x, PEG_NET_1_PC: 0x%x,\n"
+			"PEG_NET_2_PC: 0x%x, PEG_NET_3_PC: 0x%x,\n"
+			"PEG_NET_4_PC: 0x%x\n",
+			QLCRD32(adapter, QLCNIC_PEG_HALT_STATUS1),
+			QLCRD32(adapter, QLCNIC_PEG_HALT_STATUS2),
+			QLCRD32(adapter, QLCNIC_CRB_PEG_NET_0 + 0x3c),
+			QLCRD32(adapter, QLCNIC_CRB_PEG_NET_1 + 0x3c),
+			QLCRD32(adapter, QLCNIC_CRB_PEG_NET_2 + 0x3c),
+			QLCRD32(adapter, QLCNIC_CRB_PEG_NET_3 + 0x3c),
+			QLCRD32(adapter, QLCNIC_CRB_PEG_NET_4 + 0x3c));
 detach:
 	adapter->dev_state = (state == QLCNIC_DEV_NEED_QUISCENT) ? state :
 		QLCNIC_DEV_NEED_RESET;
-- 
1.7.4.1



^ permalink raw reply related

* Re: [PATCH] sunrpc: use better NUMA affinities
From: NeilBrown @ 2011-07-29 23:30 UTC (permalink / raw)
  To: Greg Banks
  Cc: J. Bruce Fields, Eric Dumazet, Christoph Hellwig,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David Miller,
	linux-kernel, netdev
In-Reply-To: <F562C6F4-3FC2-4952-A903-8E87E9EF2D9D-97jfqw80gc6171pxa8y+qA@public.gmane.org>

On Sat, 30 Jul 2011 06:34:44 +1000 Greg Banks <gnb-97jfqw80gc6171pxa8y+qA@public.gmane.org> wrote:

> 
> 
> Sent from my iPhone
> 
> On 30/07/2011, at 2:53, "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org> wrote:
> 
> > On Fri, Jul 29, 2011 at 12:48:36PM -0400, bfields wrote:
> >> On Fri, Jul 29, 2011 at 11:30:05PM +1000, Greg Banks wrote:
> >>>
> >>>
> >>> Sent from my iPhone
> >>>
> >>> On 29/07/2011, at 22:11, Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>  
> >>> wrote:
> >>>
> >>>> Le vendredi 29 juillet 2011 à 21:58 +1000, Greg Banks a écrit :
> >>>>
> >>>>>
> >>>>> Sure, and a whole lot of the callsites are ("..._%d", cpu),  
> >>>>> hence the
> >>>>> unfortune :(
> >>>>
> >>>> BTW, we could name nfsd threads differently :
> >>>>
> >>>> Currently, they all are named : "nfsd"
> >>>>
> >>>> If SVC_POOL_PERCPU is selected, we could name them :
> >>>> nfsd_c0 -> nfsd_cN
> >>>>
> >>>> If SVC_POOL_PERNODE is selected, we could name them :
> >>>> nfsd_n0  -> nfsd_nN
> >>>>
> >>>> That would help to check with "ps aux" which cpu/nodes are under
> >>>> stress.
> >>>>
> >>>>
> >>>
> >>> I like it!
> >>
> >> Yup, patch welcomed.--b.
> >
> > (Annoying fact: some initscripts stop nfsd using a rough equivalent of
> > "killall nfsd".  So the name of the threads is arguably ABI.  I think
> > those initscripts are nuts and deserve what they get, but that may be
> > because I'm forgetting the reason they do that.)
> >
> 
> We had this discussion in May-June 2008; it's because the nfsds were  
> once  many many years ago userspace threads.

Even when they became kernel threads, 'kill' was the only way to kill them -
at first.

> 
> The "killall nfsd" semantics in those scripts are awful and lead to  
> problems shutting down when there are lots of threads. It would  
> probably be an improvement to provide a better shutdown mechanism and  
> force distros to use it.

rpc.nfsd 0

will stop all nfsd threads.  Follow with

  exportfs -f

and you should be done.  I'm not 100% sure about the nfsv4 thread though -
would need to check.

And yes - I would love it if distros could standardise on start/stop scripts
so upstreams could maintain them sensibly.  This is my personal number 1
reason for liking systemd - it pushes for this standardisation.

> 
> Or, you could preserve the effective semantics by having a single  
> "nfsd" thread whose purpose is to notice that it's being signalled and  
> perform a clean shutdown (perhaps blocking the thread doing the kill()  
> call until the shutdown has completed).

That's kinda neat.  It would be an ugly wart to have to keep around, but
sometimes that's the price we pay for "no regressions".

NeilBrown
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH RFC net-next] virtio_net: refill buffer right after being used
From: Shirley Ma @ 2011-07-29 22:55 UTC (permalink / raw)
  To: Rusty Russell; +Cc: mst, kvm, virtualization, netdev
In-Reply-To: <1311979448.24300.28.camel@localhost.localdomain>

Resubmit it with a typo fix.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0c7321c..c8201d4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -429,6 +429,22 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
 	return err;
 }
 
+static int fill_one(struct virtnet_info *vi, gfp_t gfp)
+{
+	int err;
+
+	if (vi->mergeable_rx_bufs)
+		err = add_recvbuf_mergeable(vi, gfp);
+	else if (vi->big_packets)
+		err = add_recvbuf_big(vi, gfp);
+	else
+		err = add_recvbuf_small(vi, gfp);
+
+	if (err >= 0)
+		++vi->num;
+	return err;
+}
+
 /* Returns false if we couldn't fill entirely (OOM). */
 static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 {
@@ -436,17 +452,10 @@ static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 	bool oom;
 
 	do {
-		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(vi, gfp);
-		else if (vi->big_packets)
-			err = add_recvbuf_big(vi, gfp);
-		else
-			err = add_recvbuf_small(vi, gfp);
-
+		err = fill_one(vi, gfp);
 		oom = err == -ENOMEM;
 		if (err < 0)
 			break;
-		++vi->num;
 	} while (err > 0);
 	if (unlikely(vi->num > vi->max))
 		vi->max = vi->num;
@@ -506,13 +515,13 @@ again:
 		receive_buf(vi->dev, buf, len);
 		--vi->num;
 		received++;
-	}
-
-	if (vi->num < vi->max / 2) {
-		if (!try_fill_recv(vi, GFP_ATOMIC))
+		if (fill_one(vi, GFP_ATOMIC) < 0)
 			schedule_delayed_work(&vi->refill, 0);
 	}
 
+	/* notify buffers are refilled */
+	virtqueue_kick(vi->rvq);
+
 	/* Out of packets? */
 	if (received < budget) {
 		napi_complete(napi);



^ permalink raw reply related

* [PATCH RFC net-next] virtio_net: refill buffer right after being used
From: Shirley Ma @ 2011-07-29 22:44 UTC (permalink / raw)
  To: Rusty Russell, mst; +Cc: kvm, virtualization, netdev

To even the latency, refill buffer right after being used.

Sign-off-by: Shirley Ma <xma@us.ibm.com>
---

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0c7321c..c8201d4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -429,6 +429,22 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
 	return err;
 }
 
+static bool fill_one(struct virtio_net *vi, gfp_t gfp)
+{
+	int err;
+
+	if (vi->mergeable_rx_bufs)
+		err = add_recvbuf_mergeable(vi, gfp);
+	else if (vi->big_packets)
+		err = add_recvbuf_big(vi, gfp);
+	else
+		err = add_recvbuf_small(vi, gfp);
+
+	if (err >= 0)
+		++vi->num;
+	return err;
+}
+
 /* Returns false if we couldn't fill entirely (OOM). */
 static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 {
@@ -436,17 +452,10 @@ static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 	bool oom;
 
 	do {
-		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(vi, gfp);
-		else if (vi->big_packets)
-			err = add_recvbuf_big(vi, gfp);
-		else
-			err = add_recvbuf_small(vi, gfp);
-
+		err = fill_one(vi, gfp);
 		oom = err == -ENOMEM;
 		if (err < 0)
 			break;
-		++vi->num;
 	} while (err > 0);
 	if (unlikely(vi->num > vi->max))
 		vi->max = vi->num;
@@ -506,13 +515,13 @@ again:
 		receive_buf(vi->dev, buf, len);
 		--vi->num;
 		received++;
-	}
-
-	if (vi->num < vi->max / 2) {
-		if (!try_fill_recv(vi, GFP_ATOMIC))
+		if (fill_one(vi, GFP_ATOMIC) < 0)
 			schedule_delayed_work(&vi->refill, 0);
 	}
 
+	/* notify buffers are refilled */
+	virtqueue_kick(vi->rvq);
+
 	/* Out of packets? */
 	if (received < budget) {
 		napi_complete(napi);



^ permalink raw reply related

* [PATCH] cfg80211: Update REG_DBG_PRINT macro and uses
From: Joe Perches @ 2011-07-29 21:51 UTC (permalink / raw)
  To: Johannes Berg, John W. Linville
  Cc: David S. Miller, linux-wireless, netdev, linux-kernel

Several uses were missing terminating newlines.
Typo fix and macro neatening.

Signed-off-by: Joe Perches <joe@perches.com>
---
 net/wireless/reg.c |   16 +++++++---------
 1 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 1ad0f39..b22c324 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -49,10 +49,8 @@
 #include "nl80211.h"
 
 #ifdef CONFIG_CFG80211_REG_DEBUG
-#define REG_DBG_PRINT(format, args...) \
-	do { \
-		printk(KERN_DEBUG pr_fmt(format), ##args);	\
-	} while (0)
+#define REG_DBG_PRINT(format, args...)			\
+	printk(KERN_DEBUG pr_fmt(format), ##args)
 #else
 #define REG_DBG_PRINT(args...)
 #endif
@@ -890,7 +888,7 @@ static bool ignore_reg_update(struct wiphy *wiphy,
 	    wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) {
 		REG_DBG_PRINT("Ignoring regulatory request %s "
 			      "since the driver uses its own custom "
-			      "regulatory domain ",
+			      "regulatory domain\n",
 			      reg_initiator_name(initiator));
 		return true;
 	}
@@ -903,8 +901,8 @@ static bool ignore_reg_update(struct wiphy *wiphy,
 	    initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
 	    !is_world_regdom(last_request->alpha2)) {
 		REG_DBG_PRINT("Ignoring regulatory request %s "
-			      "since the driver requires its own regulaotry "
-			      "domain to be set first",
+			      "since the driver requires its own regulatory "
+			      "domain to be set first\n",
 			      reg_initiator_name(initiator));
 		return true;
 	}
@@ -1473,7 +1471,7 @@ static void reg_process_pending_hints(void)
 	/* When last_request->processed becomes true this will be rescheduled */
 	if (last_request && !last_request->processed) {
 		REG_DBG_PRINT("Pending regulatory request, waiting "
-			      "for it to be processed...");
+			      "for it to be processed...\n");
 		goto out;
 	}
 
@@ -2186,7 +2184,7 @@ out:
 static void reg_timeout_work(struct work_struct *work)
 {
 	REG_DBG_PRINT("Timeout while waiting for CRDA to reply, "
-		      "restoring regulatory settings");
+		      "restoring regulatory settings\n");
 	restore_regulatory_settings(true);
 }
 
-- 
1.7.6.131.g99019

^ permalink raw reply related

* Re: vpnc-script fix for changed iproute output with newer kernels
From: David Woodhouse @ 2011-07-29 21:26 UTC (permalink / raw)
  To: David Miller; +Cc: jsbronder, netdev, shemminger
In-Reply-To: <1311944244.17528.76.camel@i7.infradead.org>

On Fri, 2011-07-29 at 13:57 +0100, David Woodhouse wrote:
> > You're going to have to be knowledgable about which attributes are
> > part of the route, whether you want to do this with iproute2 as a tool
> > or whether you do this directly with C code using netlink.
> 
> I don't think I really want to try shipping vpnc-script with C code.
> 
> The 'opt-in' approach seems like the best one for now, then. I suppose
> we want just the 'via' and 'dev' and 'src' attributes... anything else?

This should do it for now, I suppose:

--- a/vpnc-script
+++ b/vpnc-script
@@ -139,8 +139,9 @@ destroy_tun_device() {
 
 if [ -n "$IPROUTE" ]; then
 	fix_ip_get_output () {
-		sed 's/cache//;s/metric \?[0-9]\+ [0-9]\+//g;s/hoplimit [0-9]\+//g;s/ipid 0x....//g'
+		sed -e 's/ /\n/g' | \
+		    sed -ne '1p;/via/{N;p};/dev/{N;p};/src/{N;p};/mtu/{N;p}'
 	}
 
 	set_vpngateway_route() {
 		$IPROUTE route add `$IPROUTE route get "$VPNGATEWAY" | fix_ip_get_output`

I'm still not happy with it, since I'm not 100% convinced I'm
preserving all the attributes that need to be preserved, and will need
to be preserved in future. I managed to keep 'src', but what else might
there be? I just don't want to have to know.

On trying to torture-test it, I also noticed that 'ip route get' doesn't
do what I'd want in the case of the following route:

default  src 90.155.92.214 
	nexthop via 81.2.98.173  dev eth1 weight 1
	nexthop dev ppp1 weight 1
[root@solos ~]# ip route get 131.111.8.42
131.111.8.42 via 81.2.98.173 dev eth1  src 90.155.92.214 
    cache  mtu 1500 advmss 1460 hoplimit 64


-- 
dwmw2



^ permalink raw reply

* Re: pull request: wireless-next-2.6 2011-07-27
From: Rafał Miłecki @ 2011-07-29 21:20 UTC (permalink / raw)
  To: David Miller; +Cc: linville, linux-wireless, netdev, linux-kernel
In-Reply-To: <20110727.222247.2000159983502866405.davem@davemloft.net>

2011/7/28 David Miller <davem@davemloft.net>:
> From: "John W. Linville" <linville@tuxdriver.com>
> Date: Wed, 27 Jul 2011 14:49:21 -0400
>
>> Here is a handful of fixes intended for 3.1.  This includes a
>> user-visible typo fix, a fix for a use after free in the new pn533
>> NFC driver, a cfg80211 fix for a possible NULL pointer dereference,
>> a fix for an invalid memory access in b43, and another b43 fix for
>> a memory corruption problem.
>>
>> On top of that b43 memory corruption fix, there is a patch to remove
>> BROKEN from the B43_BCMA Kconfig entry, which is key to enabling
>> support for some of the more modern Broadcom wireless hardware.
>> I'm sure the Rafał (and a number of others) would love to see that
>> merged while the 3.1 merge window is still open as well.
>>
>> Please let me know if there are problems...
>
> Yep, removing BROKEN from b43 seems reasonable.
>
> Pulled, thanks!

Thanks a lot John and David, hope to get some testers now :)

-- 
Rafał

^ permalink raw reply

* Re: [PATCH] Fix cdc-phonet build
From: Chris Clayton @ 2011-07-29 20:49 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20110727.224117.389682636642939049.davem@davemloft.net>

On Thursday 28 July 2011 06:41:17 David Miller wrote:
> From: Chris Clayton <chris2553@googlemail.com>
> Date: Tue, 26 Jul 2011 23:20:22 +0100
>
> > cdc-phonet does not presently build on linux-3.0 because there is no
> > entry for it in drivers/net/Makefile. This patch adds that entry.
> >
> > Signed-off-by: Chris Clayton <chris2553@googlemail.com>
>
> Applied, thanks.

Actually, I've just checked and the same problem exists in 2.6.39.3, so once the patch is in Linus' 
tree, I guess it should go in stable too. Do you have that in hand, David or should I send to 
stable once the patch is applied to mainline?

-- 
The more I see, the more I know. The more I know, the less I understand. Changing Man - Paul Weller

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: Greg Banks @ 2011-07-29 20:39 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Eric Dumazet, Trond Myklebust, Neil Brown, David Miller,
	linux-nfs@vger.kernel.org, netdev, linux-kernel,
	bmarson@redhat.com
In-Reply-To: <20110729180844.GA28947@fieldses.org>



Sent from my iPhone

On 30/07/2011, at 4:08, "J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Fri, Jul 29, 2011 at 08:02:05PM +0200, Eric Dumazet wrote:
>> Le vendredi 29 juillet 2011 à 12:42 -0400, J. Bruce Fields a écrit :
>>> On Thu, Jul 28, 2011 at 08:04:09PM +0200, Eric Dumazet wrote:
>>>> Use NUMA aware allocations to reduce latencies and increase throughput.
>>>>
>>>> sunrpc kthreads can use kthread_create_on_node() if pool_mode is
>>>> "percpu" or "pernode", and svc_prepare_thread()/svc_init_buffer() can
>>>> also take into account NUMA node affinity for memory allocations.
>>> ...
>>>> @@ -662,14 +675,16 @@ svc_set_num_threads(struct svc_serv *serv,  
>>>> struct svc_pool *pool, int nrservs)
>>>>        nrservs--;
>>>>        chosen_pool = choose_pool(serv, pool, &state);
>>>>
>>>> -        rqstp = svc_prepare_thread(serv, chosen_pool);
>>>> +        node = svc_pool_map_get_node(chosen_pool->sp_id);
>>>> +        rqstp = svc_prepare_thread(serv, chosen_pool, node);
>>>
>>> The only correct value for the third argument there is
>>> svc_pool_map_get_node(chosen_pool->sp_id), so let's have
>>> svc_prepare_thread() call that itself.
>>>
>>
>> I have no idea of what you mean ;)
>>
>> I need 'node' for the following kthread_create_on_node()
>
> Doh, of course--apologies.
>
>>> Seems OK otherwise.
>>>
>>> Any suggestions on how we should test this?
>>
>> I did tests on my machine, seems good.
>>
>> I checked that stacks were now correct using :
>> "echo t > /proc/sysrq-trigger"
>
> I was wondering more about good tests of nfsd's performance on numa;
> that might be more of a question for Greg.
>

To really show a big difference you need a much bigger box, or slower  
NUMA interconnects than today's. You also want network cards locally  
attached to each node and a metadata heavy (i.e. high rpc call rate)  
load.

Greg.--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: Greg Banks @ 2011-07-29 20:34 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Eric Dumazet, Christoph Hellwig, NeilBrown,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David Miller,
	linux-kernel, netdev
In-Reply-To: <20110729165345.GM23194-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>



Sent from my iPhone

On 30/07/2011, at 2:53, "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org> wrote:

> On Fri, Jul 29, 2011 at 12:48:36PM -0400, bfields wrote:
>> On Fri, Jul 29, 2011 at 11:30:05PM +1000, Greg Banks wrote:
>>>
>>>
>>> Sent from my iPhone
>>>
>>> On 29/07/2011, at 22:11, Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>  
>>> wrote:
>>>
>>>> Le vendredi 29 juillet 2011 à 21:58 +1000, Greg Banks a écrit :
>>>>
>>>>>
>>>>> Sure, and a whole lot of the callsites are ("..._%d", cpu),  
>>>>> hence the
>>>>> unfortune :(
>>>>
>>>> BTW, we could name nfsd threads differently :
>>>>
>>>> Currently, they all are named : "nfsd"
>>>>
>>>> If SVC_POOL_PERCPU is selected, we could name them :
>>>> nfsd_c0 -> nfsd_cN
>>>>
>>>> If SVC_POOL_PERNODE is selected, we could name them :
>>>> nfsd_n0  -> nfsd_nN
>>>>
>>>> That would help to check with "ps aux" which cpu/nodes are under
>>>> stress.
>>>>
>>>>
>>>
>>> I like it!
>>
>> Yup, patch welcomed.--b.
>
> (Annoying fact: some initscripts stop nfsd using a rough equivalent of
> "killall nfsd".  So the name of the threads is arguably ABI.  I think
> those initscripts are nuts and deserve what they get, but that may be
> because I'm forgetting the reason they do that.)
>

We had this discussion in May-June 2008; it's because the nfsds were
once, many years ago, userspace threads.

The "killall nfsd" semantics in those scripts are awful and lead to  
problems shutting down when there are lots of threads. It would  
probably be an improvement to provide a better shutdown mechanism and  
force distros to use it.

Or, you could preserve the effective semantics by having a single  
"nfsd" thread whose purpose is to notice that it's being signalled and  
perform a clean shutdown (perhaps blocking the thread doing the kill()  
call until the shutdown has completed).

Greg.--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: Greg Banks @ 2011-07-29 20:24 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Eric Dumazet, NeilBrown, linux-nfs@vger.kernel.org, David Miller,
	linux-kernel, netdev
In-Reply-To: <20110729164553.GJ23194@fieldses.org>



Sent from my iPhone

On 30/07/2011, at 2:45, "J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Fri, Jul 29, 2011 at 04:53:21PM +1000, Greg Banks wrote:
>> On 29/07/11 16:30, Eric Dumazet wrote:
>>> Le vendredi 29 juillet 2011 à 16:05 +1000, Greg Banks a écrit :
>>>> On 29/07/11 15:32, NeilBrown wrote:
>>>>
>>>> I seem to remember coming to the conclusion that Jeff eventually
>>>> addressed this problem...am I misremembering or did something  
>>>> regress?
>>>>
>>> Currently, all nfsd kthreads use memory for their kernel stack and
>>> various initial data from a _single_ node, even if you use
>>> sunrpc.pool_mode=pernode  (or percpu)
>>
>> That's just plain broken and I'm very pleased to see you fix it.
>
> Should I take that as a "Reviewed-by"?

If you like.

>
>> [...] In ToT
>> svc_pool_map_set_cpumask() is called *after* kthread_create() and
>> applies to the child thread, *after* its stack has been allocated
>> on the wrong node.  In the working SGI code,
>> svc_pool_map_set_cpumask() is called by the parent node on itself
>> *before* calling kernel_thread() or doing any of the data structure
>> allocations, thus ensuring that everything gets allocated using the
>> default memory allocation policy, which on SGI NFS servers was
>> globally tuned to be "node-local".
>
> OK, so would it be enough to just move the svc_pool_map_set_cpumask()
> back a few lines, or do we want Eric's approach, in order to have
> something that will work better with other memory allocation policies?
>

Relying on global policy was easy but not a great idea, I prefer  
Eric's approach of doing it explicitly.

Greg.
>>

^ permalink raw reply

* Re: Kernel IPSec Questions
From: Andreas Steffen @ 2011-07-29 20:20 UTC (permalink / raw)
  To: T C; +Cc: netdev
In-Reply-To: <CAL0-=WwDFeCJX62_Bgty615MtjuBY3rBp6BtHP7p9_3NvJ3u+g@mail.gmail.com>

Hello Terry,

each IPsec SA in the kernel has a lifetime configuration consisting
of both a soft and a hard limit for the number of bytes, number of
packets and time:

	lifetime config:
	  limit: soft (INF)(bytes), hard (INF)(bytes)
	  limit: soft (INF)(packets), hard (INF)(packets)
	  expire add: soft 903(sec), hard 1200(sec)
	  expire use: soft 0(sec), hard 0(sec)

Each time one of the soft or hard limits is reached, the Linux kernel
generates an XFRM_MSG_EXPIRE message to which the charon daemon
subscribes when creating the NETLINK_XFRM socket:

http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.c;h=06720a0f4bddf9fde60288f796df0eca647ae995;hb=HEAD#l2664

The callback function receive_events() is triggered by these
subscribed XFRM messages:

http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.c;h=06720a0f4bddf9fde60288f796df0eca647ae995;hb=HEAD#l939

In the case of XFRM_MSG_EXPIRE the function process_expire() is
called:

http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.c;h=06720a0f4bddf9fde60288f796df0eca647ae995;hb=HEAD#l939

which in turn calls hydra->kernel_interface->expire():

http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/kernel/kernel_interface.c;h=ebe653ec4582ef2c16024d1cc5711d51c8b45970;hb=HEAD#l388

All registered expire listeners are notified, in our case the libcharon
listener:

http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libcharon/kernel/kernel_handler.c;h=51fccb1acd6d7813bb83517428fc8f7b15f841d5;hb=HEAD#l75

As you can see, if a soft limit was reached then a CHILD_SA rekeying
job is scheduled

  job = (job_t*)rekey_child_sa_job_create(reqid, proto, spi);

and if a hard limit is reached (which should not happen with rekey=yes)
then the CHILD_SA is deleted

  job = (job_t*)delete_child_sa_job_create(reqid, proto, spi);

Best regards

Andreas

On 29.07.2011 19:56, T C wrote:
> Hi Andreas,
> 
> Thanks for the URLs.  I'll look thru them.
> 
> As far as strongswan is concerned, Martin has been very helpful in
> explaining all the active actions that StrongSwan takes from
> the user side.  So actions taken by IKE daemon based on configuration
> files I already have info on that.  However,
> the part that remains mostly unfamiliar is those actions taken by the
> kernel during rekeying by sending messages back
> from the kernel to the IKE daemon.  Do you happen to know anything
> about that?  How are those actions trigged and what
> happens to the communication channels during rekeying is what I am
> most interested in finding out.  If your URLs already
> contain something that'll point to those, I'll find out from them.  If
> there is additional info on this, could you share them
> as well?
> 
> Thanks,
> Terry
> 
> On Fri, Jul 29, 2011 at 12:03 AM, Andreas Steffen
> <andreas.steffen@strongswan.org> wrote:
>> Hello Terry,
>>
>> here a repost of my email including the netdev list and fixing
>> the last URL which was wrong.
>>
>> Here the definition of strongSwan's IPsec high level kernel interface
>>
>> http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/kernel/kernel_ipsec.h;h=986e21fca1bbd109445e95d86dbf458095299573;hb=HEAD
>>
>> and here the link to the kernel-netlink plugin which implements
>> configuration and management of IPsec Policies and SAs via XFRM
>>
>> http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.c;h=06720a0f4bddf9fde60288f796df0eca647ae995;hb=HEAD
>>
>> Our plugin of course relies on the ipsec.h, netlink.h, rtnetlink.h,
>> and xfrm.h Linux header files which define the API of the XFRM Netlink
>> kernel interface
>>
>> http://git.strongswan.org/?p=strongswan.git;a=tree;f=src/include/linux;h=a41d3e9a10954c47aff2efeb06576f323c039483;hb=HEAD
>>
>> Much more documentation than the Linux header files and the XFRM kernel
>> source code itself does not exist.
>>
>> Finally a link which shows how strongSwan installs, updates, queries
>> and deletes IPsec Policies and SAs
>>
>> http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libcharon/sa/child_sa.c;h=cda150f8736d010cf8d897071427daf8a02a337a;hb=HEAD
>>
>> Just look for all "hydra->kernel_interface" function calls.
>>
>> Best regards
>>
>> Andreas

======================================================================
Andreas Steffen                         andreas.steffen@strongswan.org
strongSwan - the Linux VPN Solution!                www.strongswan.org
Institute for Internet Technologies and Applications
University of Applied Sciences Rapperswil
CH-8640 Rapperswil (Switzerland)
===========================================================[ITA-HSR]==

^ permalink raw reply

* Re: IP over 802.2 with LLC/SNAP
From: Alan Ott @ 2011-07-29 18:42 UTC (permalink / raw)
  To: Alan Cox; +Cc: linux-kernel, netdev
In-Reply-To: <20110727133209.0d9dd6c4@lxorguk.ukuu.org.uk>

On 07/27/2011 08:32 AM, Alan Cox wrote:
>> So the question is, does Linux support IP over 802.2 with LLC/SNAP? 
> 
> It should be a 'simple matter of hacking' and there are example devices
> that fiddle with packets and add headers etc you can nick a lot of code
> from.

That sounds like it could be a fun project actually. It looks like I can
use what net/ipx does as a pattern, at least to get the receive going.
The send sounds a bit trickier but doable. I'll take a look when I get
some time (but it'll be a couple weeks at least).

> I figure if you can write an IP stack for an ancient Gould system
> you can probably do that.

I haven't written a fully-featured IP stack on the Gould, but it does
handle sending/receiving of UDP, which is what I need for my purposes.

> 
> The other way is to use the Linux raw packet interfaces, open a raw
> socket, push a BPF filter onto it to just get the frames for IP/SNAP and
> ARP/SNAP then modify them and feed them to the kernel tun/tap interface.
> 

Since I just needed to read the one packet on the Linux side, I ended up
just using libpcap to grab the raw packets and parse out the header for
now to get it working.

Thanks for the response and information,

Alan.

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: Eric Dumazet @ 2011-07-29 18:15 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Greg Banks, Christoph Hellwig, NeilBrown,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David Miller,
	linux-kernel, netdev
In-Reply-To: <20110729165345.GM23194-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>

Le vendredi 29 juillet 2011 à 12:53 -0400, J. Bruce Fields a écrit :

> (Annoying fact: some initscripts stop nfsd using a rough equivalent of
> "killall nfsd".  So the name of the threads is arguably ABI.  I think
> those initscripts are nuts and deserve what they get, but that may be
> because I'm forgetting the reason they do that.)
> 

Yes, it seems we can't change them...

I tried on a RHEL4 distro and "service nfs stop" could not stop my nfsd
threads if I changed their names as described.

Next, I am going to try nfsd/...  

By the way, any idea why after "service nfs stop" I cannot unload sunrpc
module ?

# lsmod
Module                  Size  Used by
nfsd                  285597  3 
lockd                  76103  1 nfsd
auth_rpcgss            45002  1 nfsd
sunrpc                231915  6 nfsd,lockd,auth_rpcgss
exportfs                3813  1 nfsd


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: J. Bruce Fields @ 2011-07-29 18:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Trond Myklebust, Neil Brown, David Miller,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA, netdev, linux-kernel,
	Greg Banks, bmarson-H+wXaHxf7aLQT0dZR+AlfA
In-Reply-To: <1311962525.2873.6.camel@edumazet-laptop>

On Fri, Jul 29, 2011 at 08:02:05PM +0200, Eric Dumazet wrote:
> Le vendredi 29 juillet 2011 à 12:42 -0400, J. Bruce Fields a écrit :
> > On Thu, Jul 28, 2011 at 08:04:09PM +0200, Eric Dumazet wrote:
> > > Use NUMA aware allocations to reduce latencies and increase throughput.
> > > 
> > > sunrpc kthreads can use kthread_create_on_node() if pool_mode is
> > > "percpu" or "pernode", and svc_prepare_thread()/svc_init_buffer() can
> > > also take into account NUMA node affinity for memory allocations.
> > ...
> > > @@ -662,14 +675,16 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
> > >  		nrservs--;
> > >  		chosen_pool = choose_pool(serv, pool, &state);
> > >  
> > > -		rqstp = svc_prepare_thread(serv, chosen_pool);
> > > +		node = svc_pool_map_get_node(chosen_pool->sp_id);
> > > +		rqstp = svc_prepare_thread(serv, chosen_pool, node);
> > 
> > The only correct value for the third argument there is
> > svc_pool_map_get_node(chosen_pool->sp_id), so let's have
> > svc_prepare_thread() call that itself.
> > 
> 
> I have no idea of what you mean ;)
> 
> I need 'node' for the following kthread_create_on_node()

Doh, of course--apologies.

> > Seems OK otherwise.
> > 
> > Any suggestions on how we should test this?
> 
> I did tests on my machine, seems good.
> 
> I checked that stacks were now correct using :
> "echo t > /proc/sysrq-trigger"

I was wondering more about good tests of nfsd's performance on numa;
that might be more of a question for Greg.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] sunrpc: use better NUMA affinities
From: Eric Dumazet @ 2011-07-29 18:02 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Trond Myklebust, Neil Brown, David Miller,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA, netdev, linux-kernel,
	Greg Banks
In-Reply-To: <20110729164214.GI23194-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>

Le vendredi 29 juillet 2011 à 12:42 -0400, J. Bruce Fields a écrit :
> On Thu, Jul 28, 2011 at 08:04:09PM +0200, Eric Dumazet wrote:
> > Use NUMA aware allocations to reduce latencies and increase throughput.
> > 
> > sunrpc kthreads can use kthread_create_on_node() if pool_mode is
> > "percpu" or "pernode", and svc_prepare_thread()/svc_init_buffer() can
> > also take into account NUMA node affinity for memory allocations.
> ...
> > @@ -662,14 +675,16 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
> >  		nrservs--;
> >  		chosen_pool = choose_pool(serv, pool, &state);
> >  
> > -		rqstp = svc_prepare_thread(serv, chosen_pool);
> > +		node = svc_pool_map_get_node(chosen_pool->sp_id);
> > +		rqstp = svc_prepare_thread(serv, chosen_pool, node);
> 
> The only correct value for the third argument there is
> svc_pool_map_get_node(chosen_pool->sp_id), so let's have
> svc_prepare_thread() call that itself.
> 

I have no idea of what you mean ;)

I need 'node' for the following kthread_create_on_node()


> Seems OK otherwise.
> 
> Any suggestions on how we should test this?

I did tests on my machine, seems good.

I checked that stacks were now correct using :
"echo t > /proc/sysrq-trigger"



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Kernel IPSec Questions
From: T C @ 2011-07-29 17:56 UTC (permalink / raw)
  To: Andreas Steffen; +Cc: netdev
In-Reply-To: <4E325B58.6030202@strongswan.org>

Hi Andreas,

Thanks for the URLs.  I'll look thru them.

As far as strongswan is concerned, Martin has been very helpful in
explaining all the active actions that StrongSwan takes from
the user side.  So actions taken by IKE daemon based on configuration
files I already have info on that.  However,
the part that remains mostly unfamiliar is those actions taken by the
kernel during rekeying by sending messages back
from the kernel to the IKE daemon.  Do you happen to know anything
about that?  How are those actions triggered and what
happens to the communication channels during rekeying is what I am
most interested in finding out.  If your URLs already
contain something that'll point to those, I'll find out from them.  If
there is additional info on this, could you share them
as well?

Thanks,
Terry

On Fri, Jul 29, 2011 at 12:03 AM, Andreas Steffen
<andreas.steffen@strongswan.org> wrote:
> Hello Terry,
>
> here a repost of my email including the netdev list and fixing
> the last URL which was wrong.
>
> Here the definition of strongSwan's IPsec high level kernel interface
>
> http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/kernel/kernel_ipsec.h;h=986e21fca1bbd109445e95d86dbf458095299573;hb=HEAD
>
> and here the link to the kernel-netlink plugin which implements
> configuration and management of IPsec Policies and SAs via XFRM
>
> http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.c;h=06720a0f4bddf9fde60288f796df0eca647ae995;hb=HEAD
>
> Our plugin of course relies on the ipsec.h, netlink.h, rtnetlink.h,
> and xfrm.h Linux header files which define the API of the XFRM Netlink
> kernel interface
>
> http://git.strongswan.org/?p=strongswan.git;a=tree;f=src/include/linux;h=a41d3e9a10954c47aff2efeb06576f323c039483;hb=HEAD
>
> Much more documentation than the Linux header files and the XFRM kernel
> source code itself does not exist.
>
> Finally a link which shows how strongSwan installs, updates, queries
> and deletes IPsec Policies and SAs
>
> http://git.strongswan.org/?p=strongswan.git;a=blob;f=src/libcharon/sa/child_sa.c;h=cda150f8736d010cf8d897071427daf8a02a337a;hb=HEAD
>
> Just look for all "hydra->kernel_interface" function calls.
>
> Best regards
>
> Andreas
>
> On 07/29/2011 07:40 AM, T C wrote:
>> Hi all,
>>
>> I have some questions on how IPSec logic works in the kernel.  There might be
>> a difference between when XFRM was introduced and prior.  If possible,
>> I like to know both scenarios.  If not, at least from XFRM perspective would
>> be very helpful.
>>
>> Specifically, I am interested in knowing how does IPSec obtain the initial keys
>> from IKE exchange (and likely from XFRM) to set up the SA.   Also what happens
>> during rekeying?  Does the SA have to be terminated first, or somehow it can be
>> rekey'ed and continue as the same SA?  I'll be using strongswan for IKE.
>>
>> Function names and if possible some flow graphs would be greatly appreciated.
>>
>> Thanks,
>> Terry
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
> --
> ======================================================================
> Andreas Steffen                         andreas.steffen@strongswan.org
> strongSwan - the Linux VPN Solution!                www.strongswan.org
> Institute for Internet Technologies and Applications
> University of Applied Sciences Rapperswil
> CH-8640 Rapperswil (Switzerland)
> ===========================================================[ITA-HSR]==
>

^ permalink raw reply

* [PATCH 02/14] allow root in container to copy namespaces (v3)
From: Serge E. Hallyn @ 2011-07-29 17:27 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-kernel, netdev, containers, dhowells
In-Reply-To: <m1hb67fh9l.fsf@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> Serge Hallyn <serge@hallyn.com> writes:
> 
> > From: Serge E. Hallyn <serge.hallyn@canonical.com>
> >
> > Othewise nested containers with user namespaces won't be possible.
> >
> > It's true that user namespaces are not yet fully isolated, but for
> > that same reason there are far worse things that root in a child
> > user ns can do.  Spawning a child user ns is not in itself bad.
> >
> > This patch also allows setns for root in a container:
> > @Eric Biederman: are there gotchas in allowing setns from child
> > userns?
> 
> Yes.  We need to ensure that the target namespaces are namespaces
> that have been created in from user_namespace or from a child of this
> user_namespace.
> 
> Aka we need to ensure that we have CAP_SYS_ADMIN for the new namespace.

[New patch below]

Otherwise nested containers with user namespaces won't be possible.

It's true that user namespaces are not yet fully isolated, but for
that same reason there are far worse things that root in a child
user ns can do.  Spawning a child user ns is not in itself bad.

This patch also allows setns for root in a container:
@Eric Biederman: are there gotchas in allowing setns from child
userns?

Changelog:
  Jul 29: setns: target capability check for setns
          When changing to another namespace, make sure that we have
          the CAP_SYS_ADMIN capability targeted at the user namespace
          owning the new ns.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 ipc/namespace.c          |    3 +++
 kernel/fork.c            |    4 ++--
 kernel/nsproxy.c         |    7 ++-----
 kernel/utsname.c         |    3 +++
 net/core/net_namespace.c |    3 +++
 5 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/ipc/namespace.c b/ipc/namespace.c
index ce0a647..f527e49 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -163,6 +163,9 @@ static void ipcns_put(void *ns)
 
 static int ipcns_install(struct nsproxy *nsproxy, void *ns)
 {
+	struct ipc_namespace *newns = ns;
+	if (!ns_capable(newns->user_ns, CAP_SYS_ADMIN))
+		return -1;
 	/* Ditch state from the old ipc namespace */
 	exit_sem(current);
 	put_ipc_ns(nsproxy->ipc_ns);
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..f9fac70 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1488,8 +1488,8 @@ long do_fork(unsigned long clone_flags,
 		/* hopefully this check will go away when userns support is
 		 * complete
 		 */
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
-				!capable(CAP_SETGID))
+		if (!nsown_capable(CAP_SYS_ADMIN) || !nsown_capable(CAP_SETUID) ||
+				!nsown_capable(CAP_SETGID))
 			return -EPERM;
 	}
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b..cadcee0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -134,7 +134,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!nsown_capable(CAP_SYS_ADMIN)) {
 		err = -EPERM;
 		goto out;
 	}
@@ -191,7 +191,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 			       CLONE_NEWNET)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!nsown_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	*new_nsp = create_new_namespaces(unshare_flags, current,
@@ -241,9 +241,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	struct file *file;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	file = proc_ns_fget(fd);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index bff131b..8f648cc 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -104,6 +104,9 @@ static void utsns_put(void *ns)
 
 static int utsns_install(struct nsproxy *nsproxy, void *ns)
 {
+	struct uts_namespace *newns = ns;
+	if (!ns_capable(newns->user_ns, CAP_SYS_ADMIN))
+		return -1;
 	get_uts_ns(ns);
 	put_uts_ns(nsproxy->uts_ns);
 	nsproxy->uts_ns = ns;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 5bbdbf0..90c97f6 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -620,6 +620,9 @@ static void netns_put(void *ns)
 
 static int netns_install(struct nsproxy *nsproxy, void *ns)
 {
+	struct net *net = ns;
+	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN))
+		return -1;
 	put_net(nsproxy->net_ns);
 	nsproxy->net_ns = get_net(ns);
 	return 0;
-- 
1.7.5.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox