Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 5/5] be2net: Fix to allow set/get of debug levels in the Firmware.
From: Somnath Kotur @ 2012-04-30  6:05 UTC (permalink / raw)
  To: netdev; +Cc: Somnath Kotur, Suresh Reddy


Suggested-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Suresh Reddy <Suresh.Reddy@emulex.com>
Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_cmds.c    |   60 +++++++++++++
 drivers/net/ethernet/emulex/benet/be_cmds.h    |   70 +++++++++++++++
 drivers/net/ethernet/emulex/benet/be_ethtool.c |  108 ++++++++++++++++++++++++
 3 files changed, 238 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 43167e8..29bc5d5 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -2590,3 +2590,63 @@ err:
 	pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
 	return status;
 }
+
+int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter,
+				   struct be_dma_mem *cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_ext_fat_caps *req;
+	int status;
+
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd->va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_EXT_FAT_CAPABILITES,
+			       cmd->size, wrb, cmd);
+	/*
+	* Retrieve currently set parameters
+	*/
+	req->parameter_type = cpu_to_le32(1);
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter,
+				   struct be_dma_mem *cmd,
+				   struct be_fat_conf_params *configs)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_ext_fat_caps *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd->va;
+	memcpy(&req->set_params, configs, sizeof(struct be_fat_conf_params));
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_EXT_FAT_CAPABILITES,
+			       cmd->size, wrb, cmd);
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h
index 944f031..b244e22 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.h
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -189,6 +189,8 @@ struct be_mcc_mailbox {
 #define OPCODE_COMMON_GET_PHY_DETAILS			102
 #define OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP		103
 #define OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES	121
+#define OPCODE_COMMON_GET_EXT_FAT_CAPABILITES		125
+#define OPCODE_COMMON_SET_EXT_FAT_CAPABILITES		126
 #define OPCODE_COMMON_GET_MAC_LIST			147
 #define OPCODE_COMMON_SET_MAC_LIST			148
 #define OPCODE_COMMON_GET_HSW_CONFIG			152
@@ -1602,6 +1604,69 @@ static inline void *be_erx_stats_from_cmd(struct be_adapter *adapter)
 	}
 }
 
+
+/************** get fat capabilites *******************/
+#define MAX_MODULES 27
+#define MAX_MODES 4
+#define MODE_UART 0
+
+/* Trace level Valid values */
+enum {
+	TRACE_LEVEL_ALL = 0x0,
+	TRACE_LEVEL_ENTER_FUNCTION = 0x1,
+	TRACE_LEVEL_VERBOSE = 0x4,
+	TRACE_LEVEL_INFO = 0x5,
+	TRACE_LEVEL_TEST = 0x8,
+	TRACE_LEVEL_SHOW = 0x10,
+	TRACE_LEVEL_WARNING = 0x20,
+	TRACE_LEVEL_ERROR = 0x30,
+	TRACE_LEVEL_FATAL = 0x40,
+	TRACE_LEVEL_ALWAYS = 0x80,
+	TRACE_LEVEL_DISABLED = 0xff
+};
+
+struct ext_fat_mode {
+	u8 mode;
+	u8 rsvd0;
+	u16 port_mask;
+	u32 dbg_lvl;
+	u64 fun_mask;
+} __packed;
+
+struct ext_fat_modules {
+	u8 modules_str[32];
+	u32 modules_id;
+	u32 num_modes;
+	struct ext_fat_mode trace_lvl[MAX_MODES];
+} __packed;
+
+struct be_fat_conf_params {
+	u32 max_log_entries;
+	u32 log_entry_size;
+	u8 log_type;
+	u8 max_log_funs;
+	u8 max_log_ports;
+	u8 rsvd0;
+	u32 supp_modes;
+	u32 num_modules;
+	struct ext_fat_modules module[MAX_MODULES];
+} __packed;
+
+struct be_cmd_req_get_ext_fat_caps {
+	struct be_cmd_req_hdr hdr;
+	u32 parameter_type;
+};
+
+struct be_cmd_resp_get_ext_fat_caps {
+	struct be_cmd_resp_hdr hdr;
+	struct be_fat_conf_params get_params;
+};
+
+struct be_cmd_req_set_ext_fat_caps {
+	struct be_cmd_req_hdr hdr;
+	struct be_fat_conf_params set_params;
+};
+
 extern int be_pci_fnum_get(struct be_adapter *adapter);
 extern int be_cmd_POST(struct be_adapter *adapter);
 extern int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
@@ -1707,4 +1772,9 @@ extern int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid,
 extern int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid,
 			u32 domain, u16 intf_id);
 extern int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter);
+extern int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter,
+					  struct be_dma_mem *cmd);
+extern int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter,
+					  struct be_dma_mem *cmd,
+					  struct be_fat_conf_params *cfgs);
 
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index 747f68f..10ef946 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -878,6 +878,112 @@ be_read_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom,
 	return status;
 }
 
+static u32 be_get_msglevel(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_dma_mem extfat_cmd;
+	struct be_fat_conf_params *cfgs;
+	u32 data = TRACE_LEVEL_ERROR;
+	int status, j;
+
+	if (lancer_chip(adapter)) {
+		dev_err(&adapter->pdev->dev, "Operation not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	memset(&extfat_cmd, 0, sizeof(struct be_dma_mem));
+	extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps);
+	extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size,
+					     &extfat_cmd.dma);
+	if (!extfat_cmd.va) {
+		dev_err(&adapter->pdev->dev, "%s: Memory allocation failure\n",
+			__func__);
+		goto err;
+	}
+
+	status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd);
+	if (!status) {
+		cfgs = (struct be_fat_conf_params *)(extfat_cmd.va +
+					sizeof(struct be_cmd_resp_hdr));
+		for (j = 0; j < cfgs->module[0].num_modes; j++) {
+			if (cfgs->module[0].trace_lvl[j].mode ==
+							MODE_UART)
+				data = cfgs->module[0].trace_lvl[j].dbg_lvl;
+		}
+	}
+	pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va,
+			    extfat_cmd.dma);
+err:
+	return data;
+}
+
+static void be_set_msglevel(struct net_device *netdev, u32 data)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_dma_mem extfat_cmd;
+	struct be_fat_conf_params *cfgs;
+	int status;
+	int i, j;
+
+	if (lancer_chip(adapter)) {
+		dev_err(&adapter->pdev->dev, "Operation not supported\n");
+		return;
+	}
+
+	switch (data) {
+	case TRACE_LEVEL_ALL:
+	case TRACE_LEVEL_ENTER_FUNCTION:
+	case TRACE_LEVEL_VERBOSE:
+	case TRACE_LEVEL_INFO:
+	case TRACE_LEVEL_TEST:
+	case TRACE_LEVEL_SHOW:
+	case TRACE_LEVEL_WARNING:
+	case TRACE_LEVEL_ERROR:
+	case TRACE_LEVEL_FATAL:
+	case TRACE_LEVEL_ALWAYS:
+	case TRACE_LEVEL_DISABLED:
+		break;
+	default:
+		dev_err(&adapter->pdev->dev, "Invalid message level value\n");
+		return;
+	}
+
+	memset(&extfat_cmd, 0, sizeof(struct be_dma_mem));
+	extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps);
+	extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size,
+					     &extfat_cmd.dma);
+	if (!extfat_cmd.va) {
+		dev_err(&adapter->pdev->dev, "%s: Memory allocation failure\n",
+			__func__);
+		goto err;
+	}
+	status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd);
+	if (!status) {
+		cfgs = (struct be_fat_conf_params *)(extfat_cmd.va +
+					sizeof(struct be_cmd_resp_hdr));
+		for (i = 0; i < cfgs->num_modules; i++) {
+			for (j = 0; j < cfgs->module[i].num_modes; j++) {
+				if (cfgs->module[i].trace_lvl[j].mode ==
+								MODE_UART)
+					cfgs->module[i].trace_lvl[j].dbg_lvl =
+							cpu_to_le32(data);
+			}
+		}
+		status = be_cmd_set_ext_fat_capabilites(adapter, &extfat_cmd,
+									cfgs);
+		if (status)
+			dev_err(&adapter->pdev->dev,
+				"Message level set failed\n");
+	} else {
+		dev_err(&adapter->pdev->dev, "Message level get failed\n");
+	}
+
+	pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va,
+			    extfat_cmd.dma);
+err:
+	return;
+}
+
 const struct ethtool_ops be_ethtool_ops = {
 	.get_settings = be_get_settings,
 	.get_drvinfo = be_get_drvinfo,
@@ -893,6 +999,8 @@ const struct ethtool_ops be_ethtool_ops = {
 	.set_pauseparam = be_set_pauseparam,
 	.get_strings = be_get_stat_strings,
 	.set_phys_id = be_set_phys_id,
+	.get_msglevel = be_get_msglevel,
+	.set_msglevel = be_set_msglevel,
 	.get_sset_count = be_get_sset_count,
 	.get_ethtool_stats = be_get_ethtool_stats,
 	.get_regs_len = be_get_reg_len,
-- 
1.5.6.1

^ permalink raw reply related

* [PATCH net-next 4/5] be2net: Fix EEH error reset before a flash dump completes
From: Somnath Kotur @ 2012-04-30  6:05 UTC (permalink / raw)
  To: netdev; +Cc: Somnath Kotur, Sathya Perla

An EEH error can cause the FW to trigger a flash debug dump.
Resetting the card while flash dump is in progress can cause it not to recover.
Wait for it to finish before letting EEH flow to reset the card.

Signed-off-by: Sathya Perla <Sathya.Perla@emulex.com>
Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index b7bc905..6d5d30b 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -3821,6 +3821,11 @@ static pci_ers_result_t be_eeh_err_detected(struct pci_dev *pdev,
 
 	pci_disable_device(pdev);
 
+	/* The error could cause the FW to trigger a flash debug dump.
+	 * Resetting the card while flash dump is in progress
+	 * can cause it not to recover; wait for it to finish
+	 */
+	ssleep(30);
 	return PCI_ERS_RESULT_NEED_RESET;
 }
 
-- 
1.5.6.1

^ permalink raw reply related

* [PATCH net-next 3/5] be2net: Record receive queue index in skb to aid RPS.
From: Somnath Kotur @ 2012-04-30  6:04 UTC (permalink / raw)
  To: netdev; +Cc: Somnath Kotur, Sarveshwar Bandi


Signed-off-by: Sarveshwar Bandi <Sarveshwar.Bandi@emulex.com>
Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index c8f7b3a..b7bc905 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1259,6 +1259,7 @@ static void be_rx_compl_process(struct be_rx_obj *rxo,
 		skb_checksum_none_assert(skb);
 
 	skb->protocol = eth_type_trans(skb, netdev);
+	skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]);
 	if (netdev->features & NETIF_F_RXHASH)
 		skb->rxhash = rxcp->rss_hash;
 
@@ -1315,6 +1316,7 @@ void be_rx_compl_process_gro(struct be_rx_obj *rxo, struct napi_struct *napi,
 	skb->len = rxcp->pkt_size;
 	skb->data_len = rxcp->pkt_size;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]);
 	if (adapter->netdev->features & NETIF_F_RXHASH)
 		skb->rxhash = rxcp->rss_hash;
 
-- 
1.5.6.1

^ permalink raw reply related

* [PATCH net-next 2/5] be2net: Fix to apply duplex value as unknown when link is down.
From: Somnath Kotur @ 2012-04-30  6:04 UTC (permalink / raw)
  To: netdev; +Cc: Somnath Kotur, Sarveshwar Bandi


Suggested-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Sarveshwar Bandi <sarveshwar.bandi@emulex.com>
Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_ethtool.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index c9ba2cb..747f68f 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -618,7 +618,7 @@ static int be_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
 		ecmd->supported = adapter->phy.supported;
 	}
 
-	ecmd->duplex = DUPLEX_FULL;
+	ecmd->duplex = netif_carrier_ok(netdev) ? DUPLEX_FULL : DUPLEX_UNKNOWN;
 	ecmd->phy_address = adapter->port_num;
 
 	return 0;
-- 
1.5.6.1

^ permalink raw reply related

* [PATCH net-next 0/5] be2net fixes
From: Somnath Kotur @ 2012-04-30  6:03 UTC (permalink / raw)
  To: netdev; +Cc: Somnath Kotur

Re-posting patches after incorporating Ben Hutchings's 
review comments in patches 2 and one of the comments for patch 5

Somnath Kotur (5):
  be2net: Fix to not set link speed for disabled functions of a UMC
    card
  be2net: Fix to apply duplex value as unknown when link is down.
  be2net: Record receive queue index in skb to aid RPS.
  be2net: Fix EEH error reset before a flash dump completes
  be2net: Fix to allow set/get of debug levels in the Firmware.

 drivers/net/ethernet/emulex/benet/be_cmds.c    |   60 +++++++++++++
 drivers/net/ethernet/emulex/benet/be_cmds.h    |   70 +++++++++++++++
 drivers/net/ethernet/emulex/benet/be_ethtool.c |  112 +++++++++++++++++++++++-
 drivers/net/ethernet/emulex/benet/be_main.c    |    7 ++
 4 files changed, 247 insertions(+), 2 deletions(-)

^ permalink raw reply

* [PATCH net-next 1/5] be2net: Fix to not set link speed for disabled functions of a UMC card
From: Somnath Kotur @ 2012-04-30  6:04 UTC (permalink / raw)
  To: netdev; +Cc: Somnath Kotur

This renders the interface view somewhat inconsistent from the Host OS POV
considering the rest of the interfaces are showing their respective speeds
based on the bandwidth assigned to them.

Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_ethtool.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index 7b06f35..c9ba2cb 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -558,7 +558,7 @@ static int be_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
 				be_link_status_update(adapter, link_status);
 			if (link_speed)
 				et_speed = link_speed * 10;
-			else
+			else if (link_status)
 				et_speed = convert_to_et_speed(port_speed);
 		} else {
 			et_speed = adapter->phy.forced_port_speed;
-- 
1.5.6.1

^ permalink raw reply related

* linux-next: build failure after merge of the final tree (net-next tree related)
From: Stephen Rothwell @ 2012-04-30  5:58 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: linux-next, linux-kernel, Benjamin LaHaise

[-- Attachment #1: Type: text/plain, Size: 559 bytes --]

Hi all,

After merging the final tree, today's linux-next build (powerpc
allyesconfig) failed like this:

net/l2tp/l2tp_core.c: In function 'l2tp_verify_udp_checksum':
net/l2tp/l2tp_core.c:464:7: error: implicit declaration of function 'csum_ipv6_magic' [-Werror=implicit-function-declaration]

Caused by commit d2cf3361677e ("net/l2tp: add support for L2TP over IPv6
UDP").  Include file missing.

I have reverted that commit for today.

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH] net: skb_set_dev do not unconditionally drop ref to dst
From: Eric Dumazet @ 2012-04-30  5:51 UTC (permalink / raw)
  To: Frank Blaschka; +Cc: davem, netdev, linux-s390, Arnd Bergmann
In-Reply-To: <20120430053804.GA59677@tuxmaker.boeblingen.de.ibm.com>

On Mon, 2012-04-30 at 07:38 +0200, Frank Blaschka wrote:
> From: Frank Blaschka <frank.blaschka@de.ibm.com>
> 
> commit 8a83a00b0735190384a348156837918271034144 unconditionally
> drops dst reference when skb->dev is set. This causes a regression
> with VLAN and the qeth_l3 network driver. qeth_l3 can not get gw
> information from the skb coming from the vlan driver. It is only
> valid to drop the dst in case of different name spaces.
> 
> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> ---
>  net/core/dev.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1881,8 +1881,8 @@ EXPORT_SYMBOL(netif_device_attach);
>  #ifdef CONFIG_NET_NS
>  void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
>  {
> -	skb_dst_drop(skb);
>  	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
> +		skb_dst_drop(skb);
>  		secpath_reset(skb);
>  		nf_reset(skb);
>  		skb_init_secmark(skb);
> 

You forgot CC Arnd Bergmann <arnd@arndb.de> ?

But we do want to do the skb_dst_drop() in dev_forward_skb()

Your patch breaks dev_forward_skb() then.

But apparently this forward path was alredy broken in Arnd patch...

^ permalink raw reply

* [PATCH] net: skb_set_dev do not unconditionally drop ref to dst
From: Frank Blaschka @ 2012-04-30  5:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390

From: Frank Blaschka <frank.blaschka@de.ibm.com>

commit 8a83a00b0735190384a348156837918271034144 unconditionally
drops dst reference when skb->dev is set. This causes a regression
with VLAN and the qeth_l3 network driver. qeth_l3 can not get gw
information from the skb coming from the vlan driver. It is only
valid to drop the dst in case of different name spaces.

Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
---
 net/core/dev.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1881,8 +1881,8 @@ EXPORT_SYMBOL(netif_device_attach);
 #ifdef CONFIG_NET_NS
 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
 {
-	skb_dst_drop(skb);
 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
+		skb_dst_drop(skb);
 		secpath_reset(skb);
 		nf_reset(skb);
 		skb_init_secmark(skb);

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  5:26 UTC (permalink / raw)
  To: kaffeemonster
  Cc: David Miller, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <4F9E188E.80503@googlemail.com>


> No idea, i was going by the old saying:
> "Thou shall not include kernel header, or you will feel the wrath of angry
> kernel gurus."

Heh :-)

Well, googling around, it looks like there's a mix of both type of
programs out there. Those using a struct bpf_program and those using a
struct soft_fprog.

Only the latter will work on BE machines as far as I can tell.

David, what's the right way to fix that ?

Cheers,
Ben.

^ permalink raw reply

* [REGRESSION][PATCH V5 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  5:02 UTC (permalink / raw)
  To: netdev
  Cc: Benjamin Herrenschmidt, David Miller, eric.dumazet, matt,
	linux-kernel, linuxppc-dev
In-Reply-To: <1335759088.20866.32.camel@pasglop>

Now the helper function from filter.c for negative offsets is exported,
it can be used it in the jit to handle negative offsets.

First modify the asm load helper functions to handle:
- know positive offsets
- know negative offsets
- any offset

then the compiler can be modified to explicitly use these helper
when appropriate.

This fixes the case of a negative X register and allows to lift
the restriction that bpf programs with negative offsets can't
be jited.

Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
---
 arch/powerpc/net/bpf_jit.h      |    8 +++-
 arch/powerpc/net/bpf_jit_64.S   |  108 ++++++++++++++++++++++++++++++++++-----
 arch/powerpc/net/bpf_jit_comp.c |   26 +++------
 3 files changed, 111 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index af1ab5e..5c3cf2d 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -48,7 +48,13 @@
 /*
  * Assembly helpers from arch/powerpc/net/bpf_jit.S:
  */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+#define DECLARE_LOAD_FUNC(func)	\
+	extern u8 func[], func##_negative_offset[], func##_positive_offset[]
+
+DECLARE_LOAD_FUNC(sk_load_word);
+DECLARE_LOAD_FUNC(sk_load_half);
+DECLARE_LOAD_FUNC(sk_load_byte);
+DECLARE_LOAD_FUNC(sk_load_byte_msh);
 
 #define FUNCTION_DESCR_SIZE	24
 
diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S
index ff4506e..55ba385 100644
--- a/arch/powerpc/net/bpf_jit_64.S
+++ b/arch/powerpc/net/bpf_jit_64.S
@@ -31,14 +31,13 @@
  * then branch directly to slow_path_XXX if required.  (In fact, could
  * load a spare GPR with the address of slow_path_generic and pass size
  * as an argument, making the call site a mtlr, li and bllr.)
- *
- * Technically, the "is addr < 0" check is unnecessary & slowing down
- * the ABS path, as it's statically checked on generation.
  */
 	.globl	sk_load_word
 sk_load_word:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_word_neg
+	.globl	sk_load_word_positive_offset
+sk_load_word_positive_offset:
 	/* Are we accessing past headlen? */
 	subi	r_scratch1, r_HL, 4
 	cmpd	r_scratch1, r_addr
@@ -51,7 +50,9 @@ sk_load_word:
 	.globl	sk_load_half
 sk_load_half:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_half_neg
+	.globl	sk_load_half_positive_offset
+sk_load_half_positive_offset:
 	subi	r_scratch1, r_HL, 2
 	cmpd	r_scratch1, r_addr
 	blt	bpf_slow_path_half
@@ -61,7 +62,9 @@ sk_load_half:
 	.globl	sk_load_byte
 sk_load_byte:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_byte_neg
+	.globl	sk_load_byte_positive_offset
+sk_load_byte_positive_offset:
 	cmpd	r_HL, r_addr
 	ble	bpf_slow_path_byte
 	lbzx	r_A, r_D, r_addr
@@ -69,22 +72,20 @@ sk_load_byte:
 
 /*
  * BPF_S_LDX_B_MSH: ldxb  4*([offset]&0xf)
- * r_addr is the offset value, already known positive
+ * r_addr is the offset value
  */
 	.globl sk_load_byte_msh
 sk_load_byte_msh:
+	cmpdi	r_addr, 0
+	blt	bpf_slow_path_byte_msh_neg
+	.globl sk_load_byte_msh_positive_offset
+sk_load_byte_msh_positive_offset:
 	cmpd	r_HL, r_addr
 	ble	bpf_slow_path_byte_msh
 	lbzx	r_X, r_D, r_addr
 	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
 	blr
 
-bpf_error:
-	/* Entered with cr0 = lt */
-	li	r3, 0
-	/* Generated code will 'blt epilogue', returning 0. */
-	blr
-
 /* Call out to skb_copy_bits:
  * We'll need to back up our volatile regs first; we have
  * local variable space at r1+(BPF_PPC_STACK_BASIC).
@@ -136,3 +137,84 @@ bpf_slow_path_byte_msh:
 	lbz	r_X, BPF_PPC_STACK_BASIC+(2*8)(r1)
 	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
 	blr
+
+/* Call out to bpf_internal_load_pointer_neg_helper:
+ * We'll need to back up our volatile regs first; we have
+ * local variable space at r1+(BPF_PPC_STACK_BASIC).
+ * Allocate a new stack frame here to remain ABI-compliant in
+ * stashing LR.
+ */
+#define sk_negative_common(SIZE)				\
+	mflr	r0;						\
+	std	r0, 16(r1);					\
+	/* R3 goes in parameter space of caller's frame */	\
+	std	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	std	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	std	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	stdu	r1, -BPF_PPC_SLOWPATH_FRAME(r1);		\
+	/* R3 = r_skb, as passed */				\
+	mr	r4, r_addr;					\
+	li	r5, SIZE;					\
+	bl	bpf_internal_load_pointer_neg_helper;		\
+	/* R3 != 0 on success */				\
+	addi	r1, r1, BPF_PPC_SLOWPATH_FRAME;			\
+	ld	r0, 16(r1);					\
+	ld	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	ld	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	mtlr	r0;						\
+	cmpldi	r3, 0;						\
+	beq	bpf_error_slow;	/* cr0 = EQ */			\
+	mr	r_addr, r3;					\
+	ld	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	/* Great success! */
+
+bpf_slow_path_word_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_word_negative_offset
+sk_load_word_negative_offset:
+	sk_negative_common(4)
+	lwz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_half_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_half_negative_offset
+sk_load_half_negative_offset:
+	sk_negative_common(2)
+	lhz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_byte_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_byte_negative_offset
+sk_load_byte_negative_offset:
+	sk_negative_common(1)
+	lbz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_byte_msh_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_byte_msh_negative_offset
+sk_load_byte_msh_negative_offset:
+	sk_negative_common(1)
+	lbz	r_X, 0(r_addr)
+	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
+	blr
+
+bpf_error_slow:
+	/* fabricate a cr0 = lt */
+	li	r_scratch1, -1
+	cmpdi	r_scratch1, 0
+bpf_error:
+	/* Entered with cr0 = lt */
+	li	r3, 0
+	/* Generated code will 'blt epilogue', returning 0. */
+	blr
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 73619d3..2dc8b14 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -127,6 +127,9 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 	PPC_BLR();
 }
 
+#define CHOOSE_LOAD_FUNC(K, func) \
+	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
+
 /* Assemble the body code between the prologue & epilogue. */
 static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 			      struct codegen_context *ctx,
@@ -391,21 +394,16 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 
 			/*** Absolute loads from packet header/data ***/
 		case BPF_S_LD_W_ABS:
-			func = sk_load_word;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_word);
 			goto common_load;
 		case BPF_S_LD_H_ABS:
-			func = sk_load_half;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_half);
 			goto common_load;
 		case BPF_S_LD_B_ABS:
-			func = sk_load_byte;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_byte);
 		common_load:
-			/*
-			 * Load from [K].  Reference with the (negative)
-			 * SKF_NET_OFF/SKF_LL_OFF offsets is unsupported.
-			 */
+			/* Load from [K]. */
 			ctx->seen |= SEEN_DATAREF;
-			if ((int)K < 0)
-				return -ENOTSUPP;
 			PPC_LI64(r_scratch1, func);
 			PPC_MTLR(r_scratch1);
 			PPC_LI32(r_addr, K);
@@ -429,7 +427,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 		common_load_ind:
 			/*
 			 * Load from [X + K].  Negative offsets are tested for
-			 * in the helper functions, and result in a 'ret 0'.
+			 * in the helper functions.
 			 */
 			ctx->seen |= SEEN_DATAREF | SEEN_XREG;
 			PPC_LI64(r_scratch1, func);
@@ -443,13 +441,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 			break;
 
 		case BPF_S_LDX_B_MSH:
-			/*
-			 * x86 version drops packet (RET 0) when K<0, whereas
-			 * interpreter does allow K<0 (__load_pointer, special
-			 * ancillary data).  common_load returns ENOTSUPP if K<0,
-			 * so we fall back to interpreter & filter works.
-			 */
-			func = sk_load_byte_msh;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
 			goto common_load;
 			break;
 

^ permalink raw reply related

* Re: inconsistent lock/deadlock crash, vanilla 3.3.4, 32bit, tcp
From: Glauber Costa @ 2012-04-30  4:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Denys Fedoryshchenko, netdev, davem, jmorris, yoshfuji, kaber,
	linux-kernel
In-Reply-To: <1335689052.2900.86.camel@edumazet-glaptop>

On 04/29/2012 05:44 AM, Eric Dumazet wrote:
> On Sun, 2012-04-29 at 10:40 +0200, Eric Dumazet wrote:
>> On Sun, 2012-04-29 at 10:27 +0200, Eric Dumazet wrote:
>>> On Sun, 2012-04-29 at 10:41 +0300, Denys Fedoryshchenko wrote:
>>>> I apologize for late night post, and a lot of trash left in report.
>>>> I will cleanup it up now, and send with CC to maintainers.
>>>>
>>>> Server job are proxy, with very high rate of new connections.
>>>> Deadlock at peaktime can be easily reproduced in 10-15 minutes.
>>>>
>>>> Deadlock occured on almost all 3.3-stable versions (tried 3.3.3 -
>>>> 3.3.4). It is not easy to try older kernel,
>>>> but if required i can try.
>>>> Usually also, because SYN rate very high, i can see:
>>>> [   51.612987] TCP: Possible SYN flooding on port 8080. Sending
>>>> cookies.  Check SNMP counters.
>>>>
>>>>    [  762.903868]
>>>>    [  762.903880] =================================
>>>>    [  762.903890] [ INFO: inconsistent lock state ]
>>>>    [  762.903903] 3.3.4-build-0061 #8 Not tainted
>>>>    [  762.904133] ---------------------------------
>>>>    [  762.904344] inconsistent {IN-SOFTIRQ-W} ->  {SOFTIRQ-ON-W} usage.
>>>>    [  762.904542] squid/1603 [HC0[0]:SC0[0]:HE1:SE1] takes:
>>>>    [  762.904542]  (key#3){+.?...}, at: [<c0232cc4>]
>>>> __percpu_counter_sum+0xd/0x58
>>>>    [  762.904542] {IN-SOFTIRQ-W} state was registered at:
>>>>    [  762.904542]   [<c0158b84>] __lock_acquire+0x284/0xc26
>>>>    [  762.904542]   [<c01598e8>] lock_acquire+0x71/0x85
>>>>    [  762.904542]   [<c0349765>] _raw_spin_lock+0x33/0x40
>>>>    [  762.904542]   [<c0232c93>] __percpu_counter_add+0x58/0x7c
>>>>    [  762.904542]   [<c02cfde1>] sk_clone_lock+0x1e5/0x200
>>>>    [  762.904542]   [<c0303ee4>] inet_csk_clone_lock+0xe/0x78
>>>>    [  762.904542]   [<c0315778>] tcp_create_openreq_child+0x1b/0x404
>>>>    [  762.904542]   [<c031339c>] tcp_v4_syn_recv_sock+0x32/0x1c1
>>>>    [  762.904542]   [<c031615a>] tcp_check_req+0x1fd/0x2d7
>>>>    [  762.904542]   [<c0313f77>] tcp_v4_do_rcv+0xab/0x194
>>>>    [  762.904542]   [<c03153bb>] tcp_v4_rcv+0x3b3/0x5cc
>>>>    [  762.904542]   [<c02fc0c4>] ip_local_deliver_finish+0x13a/0x1e9
>>>>    [  762.904542]   [<c02fc539>] NF_HOOK.clone.11+0x46/0x4d
>>>>    [  762.904542]   [<c02fc652>] ip_local_deliver+0x41/0x45
>>>>    [  762.904542]   [<c02fc4d1>] ip_rcv_finish+0x31a/0x33c
>>>>    [  762.904542]   [<c02fc539>] NF_HOOK.clone.11+0x46/0x4d
>>>>    [  762.904542]   [<c02fc857>] ip_rcv+0x201/0x23e
>>>>    [  762.904542]   [<c02daa3a>] __netif_receive_skb+0x319/0x368
>>>>    [  762.904542]   [<c02dac07>] netif_receive_skb+0x4e/0x7d
>>>>    [  762.904542]   [<c02dacf6>] napi_skb_finish+0x1e/0x34
>>>>    [  762.904542]   [<c02db122>] napi_gro_receive+0x20/0x24
>>>>    [  762.904542]   [<f85d1743>] e1000_receive_skb+0x3f/0x45 [e1000e]
>>>>    [  762.904542]   [<f85d3464>] e1000_clean_rx_irq+0x1f9/0x284 [e1000e]
>>>>    [  762.904542]   [<f85d3926>] e1000_clean+0x62/0x1f4 [e1000e]
>>>>    [  762.904542]   [<c02db228>] net_rx_action+0x90/0x160
>>>>    [  762.904542]   [<c012a445>] __do_softirq+0x7b/0x118
>>>>    [  762.904542] irq event stamp: 156915469
>>>>    [  762.904542] hardirqs last  enabled at (156915469): [<c019b4f4>]
>>>> __slab_alloc.clone.58.clone.63+0xc4/0x2de
>>>>    [  762.904542] hardirqs last disabled at (156915468): [<c019b452>]
>>>> __slab_alloc.clone.58.clone.63+0x22/0x2de
>>>>    [  762.904542] softirqs last  enabled at (156915466): [<c02ce677>]
>>>> lock_sock_nested+0x64/0x6c
>>>>    [  762.904542] softirqs last disabled at (156915464): [<c0349914>]
>>>> _raw_spin_lock_bh+0xe/0x45
>>>>    [  762.904542]
>>>>    [  762.904542] other info that might help us debug this:
>>>>    [  762.904542]  Possible unsafe locking scenario:
>>>>    [  762.904542]
>>>>    [  762.904542]        CPU0
>>>>    [  762.904542]        ----
>>>>    [  762.904542]   lock(key#3);
>>>>    [  762.904542]<Interrupt>
>>>>    [  762.904542]     lock(key#3);
>>>>    [  762.904542]
>>>>    [  762.904542]  *** DEADLOCK ***
>>>>    [  762.904542]
>>>>    [  762.904542] 1 lock held by squid/1603:
>>>>    [  762.904542]  #0:  (sk_lock-AF_INET){+.+.+.}, at: [<c03055c0>]
>>>> lock_sock+0xa/0xc
>>>>    [  762.904542]
>>>>    [  762.904542] stack backtrace:
>>>>    [  762.904542] Pid: 1603, comm: squid Not tainted 3.3.4-build-0061 #8
>>>>    [  762.904542] Call Trace:
>>>>    [  762.904542]  [<c0347b73>] ? printk+0x18/0x1d
>>>>    [  762.904542]  [<c015873a>] valid_state+0x1f6/0x201
>>>>    [  762.904542]  [<c0158816>] mark_lock+0xd1/0x1bb
>>>>    [  762.904542]  [<c015876b>] ? mark_lock+0x26/0x1bb
>>>>    [  762.904542]  [<c015805d>] ? check_usage_forwards+0x77/0x77
>>>>    [  762.904542]  [<c0158bf8>] __lock_acquire+0x2f8/0xc26
>>>>    [  762.904542]  [<c0159b8e>] ? mark_held_locks+0x5d/0x7b
>>>>    [  762.904542]  [<c0159cf6>] ? trace_hardirqs_on+0xb/0xd
>>>>    [  762.904542]  [<c0158dd4>] ? __lock_acquire+0x4d4/0xc26
>>>>    [  762.904542]  [<c01598e8>] lock_acquire+0x71/0x85
>>>>    [  762.904542]  [<c0232cc4>] ? __percpu_counter_sum+0xd/0x58
>>>>    [  762.904542]  [<c0349765>] _raw_spin_lock+0x33/0x40
>>>>    [  762.904542]  [<c0232cc4>] ? __percpu_counter_sum+0xd/0x58
>>>>    [  762.904542]  [<c0232cc4>] __percpu_counter_sum+0xd/0x58
>>>>    [  762.904542]  [<c02cebc4>] __sk_mem_schedule+0xdd/0x1c7
>>>>    [  762.904542]  [<c02d178d>] ? __alloc_skb+0x76/0x100
>>>>    [  762.904542]  [<c0305e8e>] sk_wmem_schedule+0x21/0x2d
>>>>    [  762.904542]  [<c0306370>] sk_stream_alloc_skb+0x42/0xaa
>>>>    [  762.904542]  [<c0306567>] tcp_sendmsg+0x18f/0x68b
>>>>    [  762.904542]  [<c031f3dc>] ? ip_fast_csum+0x30/0x30
>>>>    [  762.904542]  [<c0320193>] inet_sendmsg+0x53/0x5a
>>>>    [  762.904542]  [<c02cb633>] sock_aio_write+0xd2/0xda
>>>>    [  762.904542]  [<c015876b>] ? mark_lock+0x26/0x1bb
>>>>    [  762.904542]  [<c01a1017>] do_sync_write+0x9f/0xd9
>>>>    [  762.904542]  [<c01a2111>] ? file_free_rcu+0x2f/0x2f
>>>>    [  762.904542]  [<c01a17a1>] vfs_write+0x8f/0xab
>>>>    [  762.904542]  [<c01a284d>] ? fget_light+0x75/0x7c
>>>>    [  762.904542]  [<c01a1900>] sys_write+0x3d/0x5e
>>>>    [  762.904542]  [<c0349ec9>] syscall_call+0x7/0xb
>>>>    [  762.904542]  [<c0340000>] ? rp_sidt+0x41/0x83
>>>>
>>>
>>>
>>> OK, so when we have memory pressure we can call
>>> percpu_counter_read_positive() with SOFTIRQ enabled, and lockdep
>>> complains...
>>>
>>> This bug was probably added in 2008, in commit 1748376b6626a
>>> (net: Use a percpu_counter for sockets_allocated)
>>
>> Hmm, no this patch was fine.
>>
>> Bug was in fact added by Glauber Costa in commit 180d8cd942ce336b2c869
>> (foundations of per-cgroup memory pressure controlling.)
>>
>> Because he replaced the safe percpu_counter_read_positive() call to
>> unsafe percpu_counter_sum_positive() in
>> sk_sockets_allocated_read_positive()
>>
>> But anyway the patch I sent should fix the problem.
>
> Thinking again, I am not sure why Glauber did this change. He probably
> made a typo or something.
>

Indeed.

It wasn't my intent to change that, it was a mistake.

Very sorry.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  4:43 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: David Miller, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335760199.20866.33.camel@pasglop>

Benjamin Herrenschmidt schrieb:
> On Mon, 2012-04-30 at 06:27 +0200, Jan Seiffert wrote:
>> Benjamin Herrenschmidt schrieb:
>>> On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:
>>>
>>>>> Matt's having a look at powerpc
>>>>
>>>> Ok, he hasn't so I'll dig a bit.
>>>>
>>>> No obvious wrongness (but I'm not very familiar with bpf), though I do
>>>> have a comment: sk_negative_common() and bpf_slow_path_common() should
>>>> be made one and single macro which takes the fallback function as an
>>>> argument.
>>>
>>> Ok, with the compile fix below it seems to work for me:
>>>
>>> (Feel free to fold that into the original patch)
>>>
>>
>> Should i resend the complete patch with the compile fix?
> 
> Won't hurt...
> 

Ok

> BTW. Any idea about that bpf_program vs. sock_fprog issue I mentioned
> earlier ?
> 

No idea, i was going by the old saying:
"Thou shall not include kernel header, or you will feel the wrath of angry
kernel gurus."

> Cheers,
> Ben.
> 

Greetings
	Jan

-- 
The OO-Hype keeps on spinning, C stays.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  4:29 UTC (permalink / raw)
  To: kaffeemonster
  Cc: David Miller, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <4F9E1496.9060603@googlemail.com>

On Mon, 2012-04-30 at 06:27 +0200, Jan Seiffert wrote:
> Benjamin Herrenschmidt schrieb:
> > On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:
> > 
> >>> Matt's having a look at powerpc
> >>
> >> Ok, he hasn't so I'll dig a bit.
> >>
> >> No obvious wrongness (but I'm not very familiar with bpf), though I do
> >> have a comment: sk_negative_common() and bpf_slow_path_common() should
> >> be made one and single macro which takes the fallback function as an
> >> argument.
> > 
> > Ok, with the compile fix below it seems to work for me:
> > 
> > (Feel free to fold that into the original patch)
> > 
> 
> Should i resend the complete patch with the compile fix?

Won't hurt...

BTW. Any idea about that bpf_program vs. sock_fprog issue I mentioned
earlier ?

Cheers,
Ben.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  4:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: David Miller, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335759088.20866.32.camel@pasglop>

Benjamin Herrenschmidt schrieb:
> On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:
> 
>>> Matt's having a look at powerpc
>>
>> Ok, he hasn't so I'll dig a bit.
>>
>> No obvious wrongness (but I'm not very familiar with bpf), though I do
>> have a comment: sk_negative_common() and bpf_slow_path_common() should
>> be made one and single macro which takes the fallback function as an
>> argument.
> 
> Ok, with the compile fix below it seems to work for me:
> 
> (Feel free to fold that into the original patch)
> 

Should i resend the complete patch with the compile fix?

[snip]
>  
> Cheers,
> Ben.
> 


Greetings
	Jan


PS: I am sure i compile tested the orig. patch here, hmmm, must have
lost that part when moving trees, #GitIsNotMyFriend

-- 
en.gin.eer en-ji-nir n 1: a mechanism for converting caffeine into designs.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  4:11 UTC (permalink / raw)
  To: David Miller
  Cc: kaffeemonster, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335753820.20866.27.camel@pasglop>

On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:

> > Matt's having a look at powerpc
> 
> Ok, he hasn't so I'll dig a bit.
> 
> No obvious wrongness (but I'm not very familiar with bpf), though I do
> have a comment: sk_negative_common() and bpf_slow_path_common() should
> be made one and single macro which takes the fallback function as an
> argument.

Ok, with the compile fix below it seems to work for me:

(Feel free to fold that into the original patch)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index af1ab5e..5c3cf2d 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -48,7 +48,13 @@
 /*
  * Assembly helpers from arch/powerpc/net/bpf_jit.S:
  */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+#define DECLARE_LOAD_FUNC(func)	\
+	extern u8 func[], func##_negative_offset[], func##_positive_offset[]
+
+DECLARE_LOAD_FUNC(sk_load_word);
+DECLARE_LOAD_FUNC(sk_load_half);
+DECLARE_LOAD_FUNC(sk_load_byte);
+DECLARE_LOAD_FUNC(sk_load_byte_msh);
 
 #define FUNCTION_DESCR_SIZE	24
 
Cheers,
Ben.

^ permalink raw reply related

* Re: [PATCH] tcp: fix infinite cwnd in tcp_complete_cwr()
From: Neal Cardwell @ 2012-04-30  3:49 UTC (permalink / raw)
  To: Yuchung Cheng; +Cc: davem, ilpo.jarvinen, nanditad, netdev
In-Reply-To: <1335639894-18569-1-git-send-email-ycheng@google.com>

On Sat, Apr 28, 2012 at 3:04 PM, Yuchung Cheng <ycheng@google.com> wrote:
> When the cwnd reduction is done, ssthresh may be infinite
> if TCP enters CWR via ECN or F-RTO. If cwnd is not undone, i.e.,
> undo_marker is set, tcp_complete_cwr() falsely set cwnd to the
> infinite ssthresh value. The correct operation is to keep cwnd
> intact because it has been updated in ECN or F-RTO.
>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> ---
>  net/ipv4/tcp_input.c |    8 +++++---
>  1 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c93b0cb..ac417de 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2868,11 +2868,13 @@ static inline void tcp_complete_cwr(struct sock *sk)
>
>        /* Do not moderate cwnd if it's already undone in cwr or recovery. */
>        if (tp->undo_marker) {
> -               if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
> +               if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
>                        tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);

In this TCP_CA_CWR case the patch needs to add the assignment:

     tp->snd_cwnd_stamp = tcp_time_stamp;

I know that was in earlier versions of your patch; looks like it just
got accidentally dropped on this latest version.

Otherwise, looks good to me.

neal

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  3:43 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: David Miller, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335753820.20866.27.camel@pasglop>

Benjamin Herrenschmidt schrieb:
> On Wed, 2012-04-04 at 08:11 +1000, Benjamin Herrenschmidt wrote:
>> On Tue, 2012-04-03 at 18:03 -0400, David Miller wrote:
>> 
>>>> Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
>>>> 
>>>> I have only compile tested this, -ENOHARDWARE. Can someone with
>>>> more powerpc kung-fu review and maybe test this? Esp. powerpc
>>>> asm is not my strong point. I think i botched the stack frame
>>>> in the call setup. Help?
>>> 
>>> I'm not applying this until a powerpc person tests it.
>>> 
>>> Also, we have an ARM JIT in the tree which probably needs to be
>>> fixed similarly.
>> 
>> Matt's having a look at powerpc
> 
> Ok, he hasn't so I'll dig a bit.
> 

That would be great Benjamin!

> No obvious wrongness (but I'm not very familiar with bpf),

As long as you know PPC ASM you are my man ;-)

> though I do have a comment: sk_negative_common() and
> bpf_slow_path_common() should be made one and single macro which
> takes the fallback function as an argument.
> 

I don't know if this is possible.
The return value is different (one returns 0 on success, the other != 0,
the return value of != is needed). I didn't wanted to change to much,
because i'm not fluent in ppc.

> I'll mess around & try to test using Jan test case & will come back 
> with an updated patch.
> 

Would be great!

> Cheers, Ben.
> 

Greetings
	Jan

-- 
A UDP packet walks into a

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  3:40 UTC (permalink / raw)
  To: David Miller
  Cc: kaffeemonster, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335753820.20866.27.camel@pasglop>

On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:

> Ok, he hasn't so I'll dig a bit.
> 
> No obvious wrongness (but I'm not very familiar with bpf), though I do
> have a comment: sk_negative_common() and bpf_slow_path_common() should
> be made one and single macro which takes the fallback function as an
> argument.
> 
> I'll mess around & try to test using Jan test case & will come back
> with an updated patch.

Wow, hit that nasty along the way: The test program will not work
on big endian machines because of a nasty difference between
the kernel struct sock_fprog and libpcap struct bpf_program:

Kernel expects:

struct sock_fprog {     /* Required for SO_ATTACH_FILTER. */
        unsigned short          len;    /* Number of filter blocks */
        struct sock_filter __user *filter;
};

libpcap provides:

struct bpf_program {
        u_int bf_len;
        struct bpf_insn *bf_insns;
};

Note the unsigned short vs. unsigned int there ? This totally
breaks it here.

Is it expected that one can pass a struct bpf_program directly
to the kernel or should it be "converted" by the library in which
case it's just a bug in Jan's test program ?

Cheers,
Ben.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  2:43 UTC (permalink / raw)
  To: David Miller
  Cc: kaffeemonster, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1333491102.3040.12.camel@pasglop>

On Wed, 2012-04-04 at 08:11 +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2012-04-03 at 18:03 -0400, David Miller wrote:
> 
> > > Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
> > > 
> > > I have only compile tested this, -ENOHARDWARE.
> > > Can someone with more powerpc kung-fu review and maybe test this?
> > > Esp. powerpc asm is not my strong point. I think i botched the
> > > stack frame in the call setup. Help?
> > 
> > I'm not applying this until a powerpc person tests it.
> > 
> > Also, we have an ARM JIT in the tree which probably needs to
> > be fixed similarly.
> 
> Matt's having a look at powerpc

Ok, he hasn't so I'll dig a bit.

No obvious wrongness (but I'm not very familiar with bpf), though I do
have a comment: sk_negative_common() and bpf_slow_path_common() should
be made one and single macro which takes the fallback function as an
argument.

I'll mess around & try to test using Jan test case & will come back
with an updated patch.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH 3/5] ipvs: null check of net->ipvs in lblc(r) shedulers
From: Pablo Neira Ayuso @ 2012-04-29 23:43 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer
In-Reply-To: <1335488039-13471-4-git-send-email-horms@verge.net.au>

On Fri, Apr 27, 2012 at 09:53:57AM +0900, Simon Horman wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> 
> Avoid crash when registering shedulers after
> the IPVS core initialization for netns fails. Do this by
> checking for present core (net->ipvs).
> 
> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Julian Anastasov <ja@ssi.bg>

This incomplete tag has slipped through, right?

Looking at the archive, I don't see any comment from Julian on this,
so I'll just remove it from the patch description.

I'll fix this minor nitppick myself. No need to resend. But you'll
have to rebase your tree, sorry.

^ permalink raw reply

* Re: [net-next 1/4 (V3)] net: ethtool: add the EEE support
From: Ben Hutchings @ 2012-04-29 21:56 UTC (permalink / raw)
  To: Yuval Mintz; +Cc: Giuseppe CAVALLARO, netdev, davem
In-Reply-To: <4F9D07FB.1010208@broadcom.com>

On Sun, 2012-04-29 at 12:20 +0300, Yuval Mintz wrote:
> On 04/27/2012 05:11 PM, Giuseppe CAVALLARO wrote:
> 
> > On 4/26/2012 7:17 PM, Ben Hutchings wrote:
> >> On Thu, 2012-04-26 at 09:48 +0200, Giuseppe CAVALLARO wrote:
> >>> Hello Ben
> >>>
> >>> On 4/19/2012 5:30 PM, Ben Hutchings wrote:
> >>> [snip]
> >>>>> I'm changing the code for getting/setting the EEE capability and trying
> >>>>> to follow your suggestions.
> >>>>>
> >>>>> The "get" will show the following things; this is a bit different of the
> >>>>> points "a" "b" and "c" we had discussed. Maybe, this could also be a
> >>>>> more complete (*) .
> >>>>> The ethtool (see output below as example) could report the phy
> >>>>> (supported/advertised/lp_advertised) and mac eee capabilities separately.
> >>>> Sounds reasonable.
> >>>>
> >>>>> The "set" will be useful for some eth devices (like the stmmac) that can
> >>>>> stop/enable internally the eee capability (at mac level).
> >>>> I don't know much about EEE, but shouldn't the driver take care of
> >>>> configuring the MAC for this whenever the PHY is set to advertise EEE
> >>>> capability?
> >>> Yes indeed this can be done at driver level. So could I definitely
> >>> remove it from ethtool? What do you suggest?
> >>>
> >>> In case of the stmmac I could add a specific driver option via sys to
> >>> enable/disable the eee and set timer.
> >> Generally, ethtool doesn't distinguish MAC and PHY settings because they
> >> have to be configured consistently for the device to do anything useful.
> >> If there is some good use for enabling EEE in the MAC and not the PHY,
> >> or vice versa, then this should be exposed in the ethtool interface.
> >> But if not then I don't believe it needs to be in either an ethtool or a
> >> driver-specific interface.
> > Thanks Ben for this clarification: in case of the stmmac the option is
> > useful to stop a timer to enter in lpi state for the tx.
> > So it's worth having that and from ethtool.

I think I finally get it.  If we negotiate a 100BASE-TX link (or one of
the various backplane modes) with EEE enabled, we allow the link partner
to assert LPI but we might still not want to assert it in the transmit
direction.  Right?  (Whereas for 1000BASE-T and 10GBASE-T this would be
useless, since both sides must assert LPI before any transition can
happen.)

> How will a user turn off EEE support using this implementation?

At the ethtool API level this would be done by clearing the EEE
advertising mask.  At the command-line level there could be a shortcut
for this, just as you can use 'autoneg on' and 'autoneg off' rather than
specifying a mask of link modes.

> Are you suggesting a "set" that works similarly to the control of the pause
> parameters - that is, a user could either shutdown EEE or only Tx, which
> will mean to the driver "don't enter Tx LPI mode"?
>
> Keep in mind that if later an interface controlling the LPI timers would be
> added (as a measure of user control to the power saving vs. latency issue),
> it could make this 'partial' closure interface redundant.
>
> Perhaps "set" should only turn the EEE feature on/off entirely (adv. them or
> not, since clearly the link will have to be re-established afterwards), and
> we should have a different function that prevents entry into LPI mode in Tx
> - one whose functionality could later on be extended.

It sounds like this might as well be included, even if not all
drivers/hardware would allow the values to be changed.  So the command
structure would have at least:

1. EEE link mode supported flags (get-only)
2. EEE link mode advertising flags (get/set)
3. Ditto for link partner (get-only)
4. TX LPI enable flag (get/set)
5. TX LPI timer values (get/set but driver may reject changes)

But if it's not yet clear exactly what timer parameters will be useful,
we could leave some reserved space and then later define them along with
flags to indicate whether the driver understands them.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: inconsistent lock/deadlock crash, vanilla 3.3.4, 32bit, tcp
From: Neal Cardwell @ 2012-04-29 21:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Denys Fedoryshchenko, netdev, davem, jmorris, yoshfuji, kaber,
	linux-kernel, Glauber Costa
In-Reply-To: <1335691316.2900.100.camel@edumazet-glaptop>

On Sun, Apr 29, 2012 at 5:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> [PATCH] net: fix sk_sockets_allocated_read_positive
>
> Denys Fedoryshchenko reported frequent crashes on a proxy server and kindly
> provided a lockdep report that explains it all :
>
>
>  [  762.903868]
>  [  762.903880] =================================
>  [  762.903890] [ INFO: inconsistent lock state ]
>  [  762.903903] 3.3.4-build-0061 #8 Not tainted
>  [  762.904133] ---------------------------------
>  [  762.904344] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
>  [  762.904542] squid/1603 [HC0[0]:SC0[0]:HE1:SE1] takes:
>  [  762.904542]  (key#3){+.?...}, at: [<c0232cc4>]
> __percpu_counter_sum+0xd/0x58
>  [  762.904542] {IN-SOFTIRQ-W} state was registered at:
>  [  762.904542]   [<c0158b84>] __lock_acquire+0x284/0xc26
>  [  762.904542]   [<c01598e8>] lock_acquire+0x71/0x85
>  [  762.904542]   [<c0349765>] _raw_spin_lock+0x33/0x40
>  [  762.904542]   [<c0232c93>] __percpu_counter_add+0x58/0x7c
>  [  762.904542]   [<c02cfde1>] sk_clone_lock+0x1e5/0x200
>  [  762.904542]   [<c0303ee4>] inet_csk_clone_lock+0xe/0x78
>  [  762.904542]   [<c0315778>] tcp_create_openreq_child+0x1b/0x404
>  [  762.904542]   [<c031339c>] tcp_v4_syn_recv_sock+0x32/0x1c1
>  [  762.904542]   [<c031615a>] tcp_check_req+0x1fd/0x2d7
>  [  762.904542]   [<c0313f77>] tcp_v4_do_rcv+0xab/0x194
>  [  762.904542]   [<c03153bb>] tcp_v4_rcv+0x3b3/0x5cc
>  [  762.904542]   [<c02fc0c4>] ip_local_deliver_finish+0x13a/0x1e9
>  [  762.904542]   [<c02fc539>] NF_HOOK.clone.11+0x46/0x4d
>  [  762.904542]   [<c02fc652>] ip_local_deliver+0x41/0x45
>  [  762.904542]   [<c02fc4d1>] ip_rcv_finish+0x31a/0x33c
>  [  762.904542]   [<c02fc539>] NF_HOOK.clone.11+0x46/0x4d
>  [  762.904542]   [<c02fc857>] ip_rcv+0x201/0x23e
>  [  762.904542]   [<c02daa3a>] __netif_receive_skb+0x319/0x368
>  [  762.904542]   [<c02dac07>] netif_receive_skb+0x4e/0x7d
>  [  762.904542]   [<c02dacf6>] napi_skb_finish+0x1e/0x34
>  [  762.904542]   [<c02db122>] napi_gro_receive+0x20/0x24
>  [  762.904542]   [<f85d1743>] e1000_receive_skb+0x3f/0x45 [e1000e]
>  [  762.904542]   [<f85d3464>] e1000_clean_rx_irq+0x1f9/0x284 [e1000e]
>  [  762.904542]   [<f85d3926>] e1000_clean+0x62/0x1f4 [e1000e]
>  [  762.904542]   [<c02db228>] net_rx_action+0x90/0x160
>  [  762.904542]   [<c012a445>] __do_softirq+0x7b/0x118
>  [  762.904542] irq event stamp: 156915469
>  [  762.904542] hardirqs last  enabled at (156915469): [<c019b4f4>]
> __slab_alloc.clone.58.clone.63+0xc4/0x2de
>  [  762.904542] hardirqs last disabled at (156915468): [<c019b452>]
> __slab_alloc.clone.58.clone.63+0x22/0x2de
>  [  762.904542] softirqs last  enabled at (156915466): [<c02ce677>]
> lock_sock_nested+0x64/0x6c
>  [  762.904542] softirqs last disabled at (156915464): [<c0349914>]
> _raw_spin_lock_bh+0xe/0x45
>  [  762.904542]
>  [  762.904542] other info that might help us debug this:
>  [  762.904542]  Possible unsafe locking scenario:
>  [  762.904542]
>  [  762.904542]        CPU0
>  [  762.904542]        ----
>  [  762.904542]   lock(key#3);
>  [  762.904542]   <Interrupt>
>  [  762.904542]     lock(key#3);
>  [  762.904542]
>  [  762.904542]  *** DEADLOCK ***
>  [  762.904542]
>  [  762.904542] 1 lock held by squid/1603:
>  [  762.904542]  #0:  (sk_lock-AF_INET){+.+.+.}, at: [<c03055c0>]
> lock_sock+0xa/0xc
>  [  762.904542]
>  [  762.904542] stack backtrace:
>  [  762.904542] Pid: 1603, comm: squid Not tainted 3.3.4-build-0061 #8
>  [  762.904542] Call Trace:
>  [  762.904542]  [<c0347b73>] ? printk+0x18/0x1d
>  [  762.904542]  [<c015873a>] valid_state+0x1f6/0x201
>  [  762.904542]  [<c0158816>] mark_lock+0xd1/0x1bb
>  [  762.904542]  [<c015876b>] ? mark_lock+0x26/0x1bb
>  [  762.904542]  [<c015805d>] ? check_usage_forwards+0x77/0x77
>  [  762.904542]  [<c0158bf8>] __lock_acquire+0x2f8/0xc26
>  [  762.904542]  [<c0159b8e>] ? mark_held_locks+0x5d/0x7b
>  [  762.904542]  [<c0159cf6>] ? trace_hardirqs_on+0xb/0xd
>  [  762.904542]  [<c0158dd4>] ? __lock_acquire+0x4d4/0xc26
>  [  762.904542]  [<c01598e8>] lock_acquire+0x71/0x85
>  [  762.904542]  [<c0232cc4>] ? __percpu_counter_sum+0xd/0x58
>  [  762.904542]  [<c0349765>] _raw_spin_lock+0x33/0x40
>  [  762.904542]  [<c0232cc4>] ? __percpu_counter_sum+0xd/0x58
>  [  762.904542]  [<c0232cc4>] __percpu_counter_sum+0xd/0x58
>  [  762.904542]  [<c02cebc4>] __sk_mem_schedule+0xdd/0x1c7
>  [  762.904542]  [<c02d178d>] ? __alloc_skb+0x76/0x100
>  [  762.904542]  [<c0305e8e>] sk_wmem_schedule+0x21/0x2d
>  [  762.904542]  [<c0306370>] sk_stream_alloc_skb+0x42/0xaa
>  [  762.904542]  [<c0306567>] tcp_sendmsg+0x18f/0x68b
>  [  762.904542]  [<c031f3dc>] ? ip_fast_csum+0x30/0x30
>  [  762.904542]  [<c0320193>] inet_sendmsg+0x53/0x5a
>  [  762.904542]  [<c02cb633>] sock_aio_write+0xd2/0xda
>  [  762.904542]  [<c015876b>] ? mark_lock+0x26/0x1bb
>  [  762.904542]  [<c01a1017>] do_sync_write+0x9f/0xd9
>  [  762.904542]  [<c01a2111>] ? file_free_rcu+0x2f/0x2f
>  [  762.904542]  [<c01a17a1>] vfs_write+0x8f/0xab
>  [  762.904542]  [<c01a284d>] ? fget_light+0x75/0x7c
>  [  762.904542]  [<c01a1900>] sys_write+0x3d/0x5e
>  [  762.904542]  [<c0349ec9>] syscall_call+0x7/0xb
>  [  762.904542]  [<c0340000>] ? rp_sidt+0x41/0x83
>
>
> Bug is that sk_sockets_allocated_read_positive() calls
> percpu_counter_sum_positive() without BH being disabled.
>
> This bug was added in commit 180d8cd942ce33
> (foundations of per-cgroup memory pressure controlling.), since previous
> code was using percpu_counter_read_positive() which is IRQ safe.
>
> In __sk_mem_schedule() we dont need the precise count of allocated
> sockets and can revert to previous behavior.
>
> Reported-by: Denys Fedoryshchenko <denys@visp.net.lb>
> Sined-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Glauber Costa <glommer@parallels.com>
> ---

Acked-by: Neal Cardwell <ncardwell@google.com>

neal

^ permalink raw reply

* [PATCH] netem: fix possible skb leak
From: Eric Dumazet @ 2012-04-29 19:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Stephen Hemminger

From: Eric Dumazet <edumazet@google.com>

skb_checksum_help(skb) can return an error, we must free skb in this
case. qdisc_drop(skb, sch) can also be feeded with a NULL skb (if
skb_unshare() failed), so lets use this generic helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <shemminger@osdl.org>
---
 net/sched/sch_netem.c |    6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5da548f..ebd2296 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -408,10 +408,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
-		     skb_checksum_help(skb))) {
-			sch->qstats.drops++;
-			return NET_XMIT_DROP;
-		}
+		     skb_checksum_help(skb)))
+			return qdisc_drop(skb, sch);
 
 		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 	}

^ permalink raw reply related

* Re: [PATCH RFC 0/2] e1000e: 82574 also needs ASPM L1 completely disabled
From: Chris Boot @ 2012-04-29 18:03 UTC (permalink / raw)
  To: Nix; +Cc: e1000-devel, netdev, Jesse Brandeburg, linux-kernel
In-Reply-To: <87y5pe1o89.fsf@spindle.srvr.nix>

On 29/04/2012 17:45, Nix wrote:
> On 24 Apr 2012, Jesse Brandeburg outgrape:
>
>> Please let us know the results of your testing, we will let you know if
>> we see any issues as well.

Right, I have finally managed to test my patch on my servers. I've had a 
really tough week with them due to my cluster falling over inexplicably 
so I didn't want to change too much too soon after everything came back up.

The patch does properly disable ASPM L1 as well as L0s as before. Unlike 
for Nix, these do remain disabled. I'll keep running with the patch now 
but I'm confident this will solve my NIC lockups just as Nix's setpci 
incantations did.

Please apply the patches. I'd also really like to have them CCed to 
stable so that Debian will pick them up in time.

> Alas, it has no effect at all here; L0s and L1 claim to be being
> disabled at boot time, but if you ask with lspci you see that they are
> not. I strongly suspect that they *are* being disabled, but then get
> re-enabled by something else, because even if I force them off with
> setpci in the boot scripts, by the time the scripts have finished
> executing and I've got to a root prompt where I can run setpci, L0s and
> L1 are always back on again.

Indeed our troubles must be different. My patch definitely disables ASPM 
fully on the NIC and the upstream device as evidenced by lspci.

Here are extracts from the boot logs and lspci before my patch:

[    3.305372] e1000e: Intel(R) PRO/1000 Network Driver - 1.5.1-k
[    3.317015] e1000e: Copyright(c) 1999 - 2011 Intel Corporation.
[    3.328436] e1000e 0000:00:19.0: PCI INT A -> GSI 20 (level, low) -> 
IRQ 20
[    3.328482] e1000e 0000:00:19.0: setting latency timer to 64
[    3.329493] e1000e 0000:00:19.0: irq 45 for MSI/MSI-X
[    3.679153] e1000e 0000:00:19.0: eth1: (PCI Express:2.5GT/s:Width x1) 
00:25:90:56:ac:d1
[    3.691391] e1000e 0000:00:19.0: eth1: Intel(R) PRO/1000 Network 
Connection
[    3.703689] e1000e 0000:00:19.0: eth1: MAC: 10, PHY: 11, PBA No: 
FFFFFF-0FF
[    3.715639] e1000e 0000:05:00.0: Disabling ASPM L0s
[    4.156806] e1000e 0000:05:00.0: PCI INT A -> GSI 16 (level, low) -> 
IRQ 16
[    4.371659] e1000e 0000:05:00.0: setting latency timer to 64
[    4.371928] e1000e 0000:05:00.0: irq 65 for MSI/MSI-X
[    4.371933] e1000e 0000:05:00.0: irq 66 for MSI/MSI-X
[    4.371937] e1000e 0000:05:00.0: irq 67 for MSI/MSI-X
[    4.485505] e1000e 0000:05:00.0: eth3: (PCI Express:2.5GT/s:Width x1) 
00:25:90:56:ac:d0
[    4.485507] e1000e 0000:05:00.0: eth3: Intel(R) PRO/1000 Network 
Connection
[    4.485647] e1000e 0000:05:00.0: eth3: MAC: 3, PHY: 8, PBA No: FFFFFF-0FF
[   14.237551] e1000e 0000:00:19.0: irq 45 for MSI/MSI-X
[   14.293193] e1000e 0000:00:19.0: irq 45 for MSI/MSI-X
[   16.160177] e1000e: eth2 NIC Link is Up 100 Mbps Full Duplex, Flow 
Control: None
[   16.174293] e1000e 0000:05:00.0: eth2: 10/100 speed: disabling TSO

tidyup ~ # lspci -vvv -s 05:00.0 | grep ASPM
                 LnkCap: Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, 
Latency L0 <128ns, L1 <64us
                 LnkCtl: ASPM L1 Enabled; RCB 64 bytes Disabled- 
Retrain- CommClk+
tidyup ~ # lspci -vvv -s 00:1c.4 | grep ASPM
                 LnkCap: Port #5, Speed 5GT/s, Width x1, ASPM L0s L1, 
Latency L0 <512ns, L1 <4us
                 LnkCtl: ASPM L1 Enabled; RCB 64 bytes Disabled- 
Retrain- CommClk+

And now the same kernel with the patch applied:

[    3.310165] e1000e: Intel(R) PRO/1000 Network Driver - 1.5.1-k
[    3.321625] e1000e: Copyright(c) 1999 - 2011 Intel Corporation.
[    3.332996] e1000e 0000:00:19.0: PCI INT A -> GSI 20 (level, low) -> 
IRQ 20
[    3.413898] e1000e 0000:00:19.0: setting latency timer to 64
[    3.426699] e1000e 0000:00:19.0: irq 54 for MSI/MSI-X
[    3.731112] e1000e 0000:00:19.0: eth2: (PCI Express:2.5GT/s:Width x1) 
00:25:90:56:ac:d1
[    3.743437] e1000e 0000:00:19.0: eth2: Intel(R) PRO/1000 Network 
Connection
[    3.755918] e1000e 0000:00:19.0: eth2: MAC: 10, PHY: 11, PBA No: 
FFFFFF-0FF
[    3.768758] e1000e 0000:05:00.0: Disabling ASPM L0s L1
[    3.794095] e1000e 0000:05:00.0: PCI INT A -> GSI 16 (level, low) -> 
IRQ 16
[    3.794178] e1000e 0000:05:00.0: setting latency timer to 64
[    3.795074] e1000e 0000:05:00.0: irq 64 for MSI/MSI-X
[    3.795088] e1000e 0000:05:00.0: irq 65 for MSI/MSI-X
[    3.795107] e1000e 0000:05:00.0: irq 66 for MSI/MSI-X
[    3.912691] e1000e 0000:05:00.0: eth3: (PCI Express:2.5GT/s:Width x1) 
00:25:90:56:ac:d0
[    3.912693] e1000e 0000:05:00.0: eth3: Intel(R) PRO/1000 Network 
Connection
[    3.912842] e1000e 0000:05:00.0: eth3: MAC: 3, PHY: 8, PBA No: FFFFFF-0FF
[   14.454955] e1000e 0000:00:19.0: irq 54 for MSI/MSI-X
[   14.507724] e1000e 0000:00:19.0: irq 54 for MSI/MSI-X
[   15.944706] e1000e: eth2 NIC Link is Up 100 Mbps Full Duplex, Flow 
Control: None
[   15.956279] e1000e 0000:05:00.0: eth2: 10/100 speed: disabling TSO

tidyup ~ # lspci -vvv -s 05:00.0 | grep ASPM
                 LnkCap: Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, 
Latency L0 <128ns, L1 <64us
                 LnkCtl: ASPM Disabled; RCB 64 bytes Disabled- Retrain- 
CommClk+
tidyup ~ # lspci -vvv -s 00:1c.4 | grep ASPM
                 LnkCap: Port #5, Speed 5GT/s, Width x1, ASPM L0s L1, 
Latency L0 <512ns, L1 <4us
                 LnkCtl: ASPM Disabled; RCB 64 bytes Disabled- Retrain- 
CommClk+

Cheers,
Chris

-- 
Chris Boot
bootc@bootc.net


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox