Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next 11/15] i40e/i40evf: find partition_id in npar mode
From: Jeff Kirsher @ 2015-01-13 11:33 UTC (permalink / raw)
  To: davem; +Cc: Shannon Nelson, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <1421148811-9763-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

When in NPAR mode the driver instance might be controlling the base
partition or one of the other "fake" PFs.  There are some things that
can only be done by the base partition, aka partition_id 1.  This code
does a bit of work to find how many partitions there are per port and
what the current partition_id is.

Change-ID: Iba427f020a1983d02147d86f121b3627e20ee21d
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_common.c    | 66 ++++++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_prototype.h |  3 ++
 drivers/net/ethernet/intel/i40e/i40e_type.h      |  7 ++-
 drivers/net/ethernet/intel/i40evf/i40e_type.h    |  7 ++-
 4 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 3d741ee..b16fc03 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2035,6 +2035,43 @@ i40e_status i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
 }
 
 /**
+ * i40e_aq_debug_read_register
+ * @hw: pointer to the hw struct
+ * @reg_addr: register address
+ * @reg_val: register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Read the register using the admin queue commands
+ **/
+i40e_status i40e_aq_debug_read_register(struct i40e_hw *hw,
+				u32  reg_addr, u64 *reg_val,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_debug_reg_read_write *cmd_resp =
+		(struct i40e_aqc_debug_reg_read_write *)&desc.params.raw;
+	i40e_status status;
+
+	if (reg_val == NULL)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_debug_read_reg);
+
+	cmd_resp->address = cpu_to_le32(reg_addr);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status) {
+		*reg_val = ((u64)cmd_resp->value_high << 32) |
+			    (u64)cmd_resp->value_low;
+		*reg_val = le64_to_cpu(*reg_val);
+	}
+
+	return status;
+}
+
+/**
  * i40e_aq_debug_write_register
  * @hw: pointer to the hw struct
  * @reg_addr: register address
@@ -2292,6 +2329,7 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 				     enum i40e_admin_queue_opc list_type_opc)
 {
 	struct i40e_aqc_list_capabilities_element_resp *cap;
+	u32 valid_functions, num_functions;
 	u32 number, logical_id, phys_id;
 	struct i40e_hw_capabilities *p;
 	u32 i = 0;
@@ -2427,6 +2465,34 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 	if (p->npar_enable || p->mfp_mode_1)
 		p->fcoe = false;
 
+	/* count the enabled ports (aka the "not disabled" ports) */
+	hw->num_ports = 0;
+	for (i = 0; i < 4; i++) {
+		u32 port_cfg_reg = I40E_PRTGEN_CNF + (4 * i);
+		u64 port_cfg = 0;
+
+		/* use AQ read to get the physical register offset instead
+		 * of the port relative offset
+		 */
+		i40e_aq_debug_read_register(hw, port_cfg_reg, &port_cfg, NULL);
+		if (!(port_cfg & I40E_PRTGEN_CNF_PORT_DIS_MASK))
+			hw->num_ports++;
+	}
+
+	valid_functions = p->valid_functions;
+	num_functions = 0;
+	while (valid_functions) {
+		if (valid_functions & 1)
+			num_functions++;
+		valid_functions >>= 1;
+	}
+
+	/* partition id is 1-based, and functions are evenly spread
+	 * across the ports as partitions
+	 */
+	hw->partition_id = (hw->pf_id / hw->num_ports) + 1;
+	hw->num_partitions = num_functions / hw->num_ports;
+
 	/* additional HW specific goodies that might
 	 * someday be HW version specific
 	 */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 2fb4306..d1c7d63 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -71,6 +71,9 @@ i40e_status i40e_aq_get_firmware_version(struct i40e_hw *hw,
 i40e_status i40e_aq_debug_write_register(struct i40e_hw *hw,
 					u32 reg_addr, u64 reg_val,
 					struct i40e_asq_cmd_details *cmd_details);
+i40e_status i40e_aq_debug_read_register(struct i40e_hw *hw,
+				u32  reg_addr, u64 *reg_val,
+				struct i40e_asq_cmd_details *cmd_details);
 i40e_status i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
 				struct i40e_asq_cmd_details *cmd_details);
 i40e_status i40e_aq_set_default_vsi(struct i40e_hw *hw, u16 vsi_id,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index c1f2eb9..611de3e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -431,7 +431,7 @@ struct i40e_hw {
 	u8 __iomem *hw_addr;
 	void *back;
 
-	/* function pointer structs */
+	/* subsystem structs */
 	struct i40e_phy_info phy;
 	struct i40e_mac_info mac;
 	struct i40e_bus_info bus;
@@ -458,6 +458,11 @@ struct i40e_hw {
 	u8  pf_id;
 	u16 main_vsi_seid;
 
+	/* for multi-function MACs */
+	u16 partition_id;
+	u16 num_partitions;
+	u16 num_ports;
+
 	/* Closest numa node to the device */
 	u16 numa_node;
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index 68aec11..d1c2b5a 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -425,7 +425,7 @@ struct i40e_hw {
 	u8 __iomem *hw_addr;
 	void *back;
 
-	/* function pointer structs */
+	/* subsystem structs */
 	struct i40e_phy_info phy;
 	struct i40e_mac_info mac;
 	struct i40e_bus_info bus;
@@ -452,6 +452,11 @@ struct i40e_hw {
 	u8  pf_id;
 	u16 main_vsi_seid;
 
+	/* for multi-function MACs */
+	u16 partition_id;
+	u16 num_partitions;
+	u16 num_ports;
+
 	/* Closest numa node to the device */
 	u16 numa_node;
 
-- 
1.9.3

^ permalink raw reply related

* [net-next 12/15] i40e: Adding function for reading PBA String
From: Jeff Kirsher @ 2015-01-13 11:33 UTC (permalink / raw)
  To: davem; +Cc: Kamil Krawczyk, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <1421148811-9763-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Kamil Krawczyk <kamil.krawczyk@intel.com>

Function will read PBA Block from Shadow RAM and return it in a string format.

Change-ID: I4ee7059f6e21bd0eba38687da15e772e0b4ab36e
Signed-off-by: Kamil Krawczyk <kamil.krawczyk@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_common.c    | 59 ++++++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_prototype.h |  2 +
 drivers/net/ethernet/intel/i40e/i40e_type.h      |  2 +
 3 files changed, 63 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index b16fc03..4f4d9d1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -742,6 +742,65 @@ i40e_status i40e_get_san_mac_addr(struct i40e_hw *hw, u8 *mac_addr)
 #endif
 
 /**
+ *  i40e_read_pba_string - Reads part number string from EEPROM
+ *  @hw: pointer to hardware structure
+ *  @pba_num: stores the part number string from the EEPROM
+ *  @pba_num_size: part number string buffer length
+ *
+ *  Reads the part number string from the EEPROM.
+ **/
+i40e_status i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num,
+				 u32 pba_num_size)
+{
+	i40e_status status = 0;
+	u16 pba_word = 0;
+	u16 pba_size = 0;
+	u16 pba_ptr = 0;
+	u16 i = 0;
+
+	status = i40e_read_nvm_word(hw, I40E_SR_PBA_FLAGS, &pba_word);
+	if (status || (pba_word != 0xFAFA)) {
+		hw_dbg(hw, "Failed to read PBA flags or flag is invalid.\n");
+		return status;
+	}
+
+	status = i40e_read_nvm_word(hw, I40E_SR_PBA_BLOCK_PTR, &pba_ptr);
+	if (status) {
+		hw_dbg(hw, "Failed to read PBA Block pointer.\n");
+		return status;
+	}
+
+	status = i40e_read_nvm_word(hw, pba_ptr, &pba_size);
+	if (status) {
+		hw_dbg(hw, "Failed to read PBA Block size.\n");
+		return status;
+	}
+
+	/* Subtract one to get PBA word count (PBA Size word is included in
+	 * total size)
+	 */
+	pba_size--;
+	if (pba_num_size < (((u32)pba_size * 2) + 1)) {
+		hw_dbg(hw, "Buffer to small for PBA data.\n");
+		return I40E_ERR_PARAM;
+	}
+
+	for (i = 0; i < pba_size; i++) {
+		status = i40e_read_nvm_word(hw, (pba_ptr + 1) + i, &pba_word);
+		if (status) {
+			hw_dbg(hw, "Failed to read PBA Block word %d.\n", i);
+			return status;
+		}
+
+		pba_num[(i * 2)] = (pba_word >> 8) & 0xFF;
+		pba_num[(i * 2) + 1] = pba_word & 0xFF;
+	}
+	pba_num[(pba_size * 2)] = '\0';
+
+	return status;
+}
+
+/**
  * i40e_get_media_type - Gets media type
  * @hw: pointer to the hardware structure
  **/
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index d1c7d63..68e852a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -248,6 +248,8 @@ void i40e_clear_pxe_mode(struct i40e_hw *hw);
 bool i40e_get_link_status(struct i40e_hw *hw);
 i40e_status i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr);
 i40e_status i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr);
+i40e_status i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num,
+				 u32 pba_num_size);
 i40e_status i40e_validate_mac_addr(u8 *mac_addr);
 void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable);
 #ifdef I40E_FCOE
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 611de3e..ff121fe 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -1140,6 +1140,8 @@ struct i40e_hw_port_stats {
 /* Checksum and Shadow RAM pointers */
 #define I40E_SR_NVM_CONTROL_WORD		0x00
 #define I40E_SR_EMP_MODULE_PTR			0x0F
+#define I40E_SR_PBA_FLAGS			0x15
+#define I40E_SR_PBA_BLOCK_PTR			0x16
 #define I40E_SR_NVM_IMAGE_VERSION		0x18
 #define I40E_SR_NVM_WAKE_ON_LAN			0x19
 #define I40E_SR_ALTERNATE_SAN_MAC_ADDRESS_PTR	0x27
-- 
1.9.3

^ permalink raw reply related

* [net-next 10/15] i40e: remove VN2VN related mac filters
From: Jeff Kirsher @ 2015-01-13 11:33 UTC (permalink / raw)
  To: davem; +Cc: Vasu Dev, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <1421148811-9763-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Vasu Dev <vasu.dev@intel.com>

These MAC addresses are already added by the FCoE stack above netdev,
therefore adding them here is redundant.

Change-ID: Ia5b59f426f57efd20f8945f7c6cc5d741fbe06e5
Signed-off-by: Vasu Dev <vasu.dev@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_fcoe.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
index a8b8bd9..2cd841b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
@@ -1515,8 +1515,6 @@ void i40e_fcoe_config_netdev(struct net_device *netdev, struct i40e_vsi *vsi)
 	i40e_add_filter(vsi, (u8[6]) FC_FCOE_FLOGI_MAC, 0, false, false);
 	i40e_add_filter(vsi, FIP_ALL_FCOE_MACS, 0, false, false);
 	i40e_add_filter(vsi, FIP_ALL_ENODE_MACS, 0, false, false);
-	i40e_add_filter(vsi, FIP_ALL_VN2VN_MACS, 0, false, false);
-	i40e_add_filter(vsi, FIP_ALL_P2P_MACS, 0, false, false);
 
 	/* use san mac */
 	ether_addr_copy(netdev->dev_addr, hw->mac.san_addr);
-- 
1.9.3

^ permalink raw reply related

* [net-next 13/15] i40e: limit WoL and link settings to partition 1
From: Jeff Kirsher @ 2015-01-13 11:33 UTC (permalink / raw)
  To: davem; +Cc: Shannon Nelson, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <1421148811-9763-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

When in multi-function mode, e.g. Dell's NPAR, only partition 1
of each MAC is allowed to set WoL, speed, and flow control.

Change-ID: I87a9debc7479361c55a71f0120294ea319f23588
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 43 +++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 951e876..b8230dc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -219,6 +219,16 @@ static const char i40e_gstrings_test[][ETH_GSTRING_LEN] = {
 #define I40E_TEST_LEN (sizeof(i40e_gstrings_test) / ETH_GSTRING_LEN)
 
 /**
+ * i40e_partition_setting_complaint - generic complaint for MFP restriction
+ * @pf: the PF struct
+ **/
+static void i40e_partition_setting_complaint(struct i40e_pf *pf)
+{
+	dev_info(&pf->pdev->dev,
+		 "The link settings are allowed to be changed only from the first partition of a given port. Please switch to the first partition in order to change the setting.\n");
+}
+
+/**
  * i40e_get_settings - Get Link Speed and Duplex settings
  * @netdev: network interface device structure
  * @ecmd: ethtool command
@@ -485,6 +495,14 @@ static int i40e_set_settings(struct net_device *netdev,
 	u8 autoneg;
 	u32 advertise;
 
+	/* Changing port settings is not supported if this isn't the
+	 * port's controlling PF
+	 */
+	if (hw->partition_id != 1) {
+		i40e_partition_setting_complaint(pf);
+		return -EOPNOTSUPP;
+	}
+
 	if (vsi != pf->vsi[pf->lan_vsi])
 		return -EOPNOTSUPP;
 
@@ -687,6 +705,14 @@ static int i40e_set_pauseparam(struct net_device *netdev,
 	u8 aq_failures;
 	int err = 0;
 
+	/* Changing the port's flow control is not supported if this isn't the
+	 * port's controlling PF
+	 */
+	if (hw->partition_id != 1) {
+		i40e_partition_setting_complaint(pf);
+		return -EOPNOTSUPP;
+	}
+
 	if (vsi != pf->vsi[pf->lan_vsi])
 		return -EOPNOTSUPP;
 
@@ -1503,7 +1529,7 @@ static void i40e_get_wol(struct net_device *netdev,
 
 	/* NVM bit on means WoL disabled for the port */
 	i40e_read_nvm_word(hw, I40E_SR_NVM_WAKE_ON_LAN, &wol_nvm_bits);
-	if ((1 << hw->port) & wol_nvm_bits) {
+	if ((1 << hw->port) & wol_nvm_bits || hw->partition_id != 1) {
 		wol->supported = 0;
 		wol->wolopts = 0;
 	} else {
@@ -1512,13 +1538,28 @@ static void i40e_get_wol(struct net_device *netdev,
 	}
 }
 
+/**
+ * i40e_set_wol - set the WakeOnLAN configuration
+ * @netdev: the netdev in question
+ * @wol: the ethtool WoL setting data
+ **/
 static int i40e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_pf *pf = np->vsi->back;
+	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_hw *hw = &pf->hw;
 	u16 wol_nvm_bits;
 
+	/* WoL not supported if this isn't the controlling PF on the port */
+	if (hw->partition_id != 1) {
+		i40e_partition_setting_complaint(pf);
+		return -EOPNOTSUPP;
+	}
+
+	if (vsi != pf->vsi[pf->lan_vsi])
+		return -EOPNOTSUPP;
+
 	/* NVM bit on means WoL disabled for the port */
 	i40e_read_nvm_word(hw, I40E_SR_NVM_WAKE_ON_LAN, &wol_nvm_bits);
 	if (((1 << hw->port) & wol_nvm_bits))
-- 
1.9.3

^ permalink raw reply related

* [net-next 14/15] i40e: Don't exit link event early if link speed has changed
From: Jeff Kirsher @ 2015-01-13 11:33 UTC (permalink / raw)
  To: davem; +Cc: Catherine Sullivan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <1421148811-9763-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Catherine Sullivan <catherine.sullivan@intel.com>

Previously we were only checking if the link up state had changed,
and if it hadn't, exiting the link event routine early. We should
also check if speed has changed, and if it has, stay and finish
processing the link event.

Change-ID: I9c8e0991b3f0279108a7858898c3c5ce0a9856b8
Signed-off-by: Catherine Sullivan <catherine.sullivan@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7c14973..80430b0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5503,14 +5503,18 @@ static void i40e_link_event(struct i40e_pf *pf)
 {
 	bool new_link, old_link;
 	struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
+	u8 new_link_speed, old_link_speed;
 
 	/* set this to force the get_link_status call to refresh state */
 	pf->hw.phy.get_link_info = true;
 
 	old_link = (pf->hw.phy.link_info_old.link_info & I40E_AQ_LINK_UP);
 	new_link = i40e_get_link_status(&pf->hw);
+	old_link_speed = pf->hw.phy.link_info_old.link_speed;
+	new_link_speed = pf->hw.phy.link_info.link_speed;
 
 	if (new_link == old_link &&
+	    new_link_speed == old_link_speed &&
 	    (test_bit(__I40E_DOWN, &vsi->state) ||
 	     new_link == netif_carrier_ok(vsi->netdev)))
 		return;
-- 
1.9.3

^ permalink raw reply related

* [net-next 15/15] i40e: limit sriov to partition 1 of NPAR configurations
From: Jeff Kirsher @ 2015-01-13 11:33 UTC (permalink / raw)
  To: davem; +Cc: Shannon Nelson, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <1421148811-9763-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

Make sure we only allow SR/IOV on the master PF of a port in multifunction
mode.  This should be the case anyway based on the num_vfs configured in
the NVM, but this will help make sure there's no question.  If we're not
in multifunction mode the partition_id will always be 1.

Change-ID: I8b2592366fe6782f15301bde2ebd1d4da240109d
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Tested-by: Jim Young <james.m.young@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 80430b0..f3b036d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7319,7 +7319,7 @@ static int i40e_sw_init(struct i40e_pf *pf)
 
 #endif /* I40E_FCOE */
 #ifdef CONFIG_PCI_IOV
-	if (pf->hw.func_caps.num_vfs) {
+	if (pf->hw.func_caps.num_vfs && pf->hw.partition_id == 1) {
 		pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF;
 		pf->flags |= I40E_FLAG_SRIOV_ENABLED;
 		pf->num_req_vfs = min_t(int,
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH net-next v2 2/2] vxlan: Remote checksum offload
From: Thomas Graf @ 2015-01-13 11:44 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <20150113012626.GD20387@casper.infradead.org>

On 01/13/15 at 01:26am, Thomas Graf wrote:
> On 01/12/15 at 05:00pm, Tom Herbert wrote:
> > +	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
> > +		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni);
> > +		if (!vxh)
> > +			goto drop;
> > +
> > +		flags &= ~VXLAN_HF_RCO;
> > +		vni &= VXLAN_VID_MASK;
> > +	}
> 
> Nice.
> 
> Would you mind basing this on top off the extension framework being put
> in place by GBP? I think that all VXLAN extensions should be exposed as
> such in a universal way to user space.

Doing so would also fix the missing "don't share UDP port on extension
mismatch" functionality as explained in "[PATCH 2/6] vxlan: Group Policy
extension".

^ permalink raw reply

* Re: why are IPv6 addresses removed on link down
From: YOSHIFUJI Hideaki @ 2015-01-13 11:58 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Stephen Hemminger
  Cc: hideaki.yoshifuji, David Ahern, netdev@vger.kernel.org
In-Reply-To: <1421145346.13626.12.camel@redhat.com>

Hi,

Hannes Frederic Sowa wrote:
> On Mo, 2015-01-12 at 23:10 -0800, Stephen Hemminger wrote:
>> On Mon, 12 Jan 2015 22:06:44 -0700
>> David Ahern <dsahern@gmail.com> wrote:
>>
>>> We noticed that IPv6 addresses are removed on a link down. e.g.,
>>>     ip link set dev eth1
>>>
>>>
>>> Looking at the code it appears to be this code path in addrconf.c:
>>>
>>>           case NETDEV_DOWN:
>>>           case NETDEV_UNREGISTER:
>>>                   /*
>>>                    *      Remove all addresses from this interface.
>>>                    */
>>>                   addrconf_ifdown(dev, event != NETDEV_DOWN);
>>>                   break;
>>>
>>> IPv4 addresses are NOT removed on a link down. Is there a particular
>>> reason IPv6 addresses are?
>>>
>>> Thanks,
>>> David
>>
>> See RFC's which describes how IPv6 does Duplicate Address Detection.
>> Address is not valid when link is down, since DAD is not possible.
>
> It should be no problem if the kernel would reacquire them on ifup and
> do proper DAD. We simply must not use them while the interface is dead
> (also making sure they don't get used for loopback routing).
>
> The problem the IPv6 addresses get removed is much more a historical
> artifact nowadays, I think. It is part of user space API and scripts
> deal with that already.

We might have another "detached" state which essentially drops
outgoing packets while the link is down.  Just after the link recovers,
we could start receiving packets from the link and perform optimistic
DAD. And then, after it succeeds, we may start sending packets.

Since "detached" state is like the state just before completing
Optimistic DAD, it is not so difficult to implement this extended
behavior, I guess.

-- 
Hideaki Yoshifuji <hideaki.yoshifuji@miraclelinux.com>
Technical Division, MIRACLE LINUX CORPORATION

^ permalink raw reply

* Re: [RFC PATCH v2 2/2] net: ixgbe: implement af_packet direct queue mappings
From: Hannes Frederic Sowa @ 2015-01-13 12:05 UTC (permalink / raw)
  To: John Fastabend
  Cc: netdev, danny.zhou, nhorman, dborkman, john.ronciak, brouer
In-Reply-To: <20150113043542.29985.15658.stgit@nitbit.x32>

On Mo, 2015-01-12 at 20:35 -0800, John Fastabend wrote:
> +static int
> +ixgbe_ndo_qpair_page_map(struct vm_area_struct *vma, struct net_device *dev)
> +{
> +	struct ixgbe_adapter *adapter = netdev_priv(dev);
> +	phys_addr_t phy_addr = pci_resource_start(adapter->pdev, 0);
> +	unsigned long pfn_rx = (phy_addr + RX_DESC_ADDR_OFFSET) >> PAGE_SHIFT;
> +	unsigned long pfn_tx = (phy_addr + TX_DESC_ADDR_OFFSET) >> PAGE_SHIFT;
> +	unsigned long dummy_page_phy;
> +	pgprot_t pre_vm_page_prot;
> +	unsigned long start;
> +	unsigned int i;
> +	int err;
> +
> +	if (!dummy_page_buf) {
> +		dummy_page_buf = kzalloc(PAGE_SIZE_4K, GFP_KERNEL);
> +		if (!dummy_page_buf)
> +			return -ENOMEM;
> +
> +		for (i = 0; i < PAGE_SIZE_4K / sizeof(unsigned int); i++)
> +			dummy_page_buf[i] = 0xdeadbeef;
> +	}
> +
> +	dummy_page_phy = virt_to_phys(dummy_page_buf);
> +	pre_vm_page_prot = vma->vm_page_prot;
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +	/* assume the vm_start is 4K aligned address */
> +	for (start = vma->vm_start;
> +	     start < vma->vm_end;
> +	     start += PAGE_SIZE_4K) {
> +		if (start == vma->vm_start + RX_DESC_ADDR_OFFSET) {
> +			err = remap_pfn_range(vma, start, pfn_rx, PAGE_SIZE_4K,
> +					      vma->vm_page_prot);
> +			if (err)
> +				return -EAGAIN;
> +		} else if (start == vma->vm_start + TX_DESC_ADDR_OFFSET) {
> +			err = remap_pfn_range(vma, start, pfn_tx, PAGE_SIZE_4K,
> +					      vma->vm_page_prot);
> +			if (err)
> +				return -EAGAIN;
> +		} else {
> +			unsigned long addr = dummy_page_phy > PAGE_SHIFT;

I guess you have forgotten to delete this line?

> +
> +			err = remap_pfn_range(vma, start, addr, PAGE_SIZE_4K,
> +					      pre_vm_page_prot);
> +			if (err)
> +				return -EAGAIN;
> +		}
> +	}
> +	return 0;
> +}

^ permalink raw reply

* [net PATCH 1/1] drivers: net: cpsw: fix multicast flush in dual emac mode
From: Mugunthan V N @ 2015-01-13 12:05 UTC (permalink / raw)
  To: netdev; +Cc: davem, Mugunthan V N, stable

Since the ALE table is a common resource for both interfaces in Dual EMAC
mode, when the second interface is brought up, cpsw_ndo_set_rx_mode()
flushes out all the multicast entries added by the first interface and
only the second interface's multicast addresses are added. Fix this by
flushing multicast addresses based on the dual EMAC port VLANs, which does
not affect the other EMAC port's multicast addresses.

Fixes: d9ba8f9 (driver: net: ethernet: cpsw: dual emac interface implementation)
Cc: <stable@vger.kernel.org> # v3.9+
Signed-off-by: Mugunthan V N <mugunthanvnm@ti.com>
---
 drivers/net/ethernet/ti/cpsw.c     | 11 +++++++++--
 drivers/net/ethernet/ti/cpsw_ale.c | 10 +++++++++-
 drivers/net/ethernet/ti/cpsw_ale.h |  2 +-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index e61ee83..64d1cef 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -610,7 +610,7 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 
 			/* Clear all mcast from ALE */
 			cpsw_ale_flush_multicast(ale, ALE_ALL_PORTS <<
-						 priv->host_port);
+						 priv->host_port, -1);
 
 			/* Flood All Unicast Packets to Host port */
 			cpsw_ale_control_set(ale, 0, ALE_P0_UNI_FLOOD, 1);
@@ -634,6 +634,12 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
+	int vid;
+
+	if (priv->data.dual_emac)
+		vid = priv->slaves[priv->emac_port].port_vlan;
+	else
+		vid = priv->data.default_vlan;
 
 	if (ndev->flags & IFF_PROMISC) {
 		/* Enable promiscuous mode */
@@ -649,7 +655,8 @@ static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
 	cpsw_ale_set_allmulti(priv->ale, priv->ndev->flags & IFF_ALLMULTI);
 
 	/* Clear all mcast from ALE */
-	cpsw_ale_flush_multicast(priv->ale, ALE_ALL_PORTS << priv->host_port);
+	cpsw_ale_flush_multicast(priv->ale, ALE_ALL_PORTS << priv->host_port,
+				 vid);
 
 	if (!netdev_mc_empty(ndev)) {
 		struct netdev_hw_addr *ha;
diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index 097ebe7..5246b3a 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -234,7 +234,7 @@ static void cpsw_ale_flush_mcast(struct cpsw_ale *ale, u32 *ale_entry,
 		cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE);
 }
 
-int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask)
+int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask, int vid)
 {
 	u32 ale_entry[ALE_ENTRY_WORDS];
 	int ret, idx;
@@ -245,6 +245,14 @@ int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask)
 		if (ret != ALE_TYPE_ADDR && ret != ALE_TYPE_VLAN_ADDR)
 			continue;
 
+		/* if vid passed is -1 then remove all multicast entry from
+		 * the table irrespective of vlan id, if a valid vlan id is
+		 * passed then remove only multicast added to that vlan id.
+		 * if vlan id doesn't match then move on to next entry.
+		 */
+		if (vid != -1 && cpsw_ale_get_vlan_id(ale_entry) != vid)
+			continue;
+
 		if (cpsw_ale_get_mcast(ale_entry)) {
 			u8 addr[6];
 
diff --git a/drivers/net/ethernet/ti/cpsw_ale.h b/drivers/net/ethernet/ti/cpsw_ale.h
index c0d4127..af1e7ec 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.h
+++ b/drivers/net/ethernet/ti/cpsw_ale.h
@@ -92,7 +92,7 @@ void cpsw_ale_stop(struct cpsw_ale *ale);
 
 int cpsw_ale_set_ageout(struct cpsw_ale *ale, int ageout);
 int cpsw_ale_flush(struct cpsw_ale *ale, int port_mask);
-int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask);
+int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask, int vid);
 int cpsw_ale_add_ucast(struct cpsw_ale *ale, u8 *addr, int port,
 		       int flags, u16 vid);
 int cpsw_ale_del_ucast(struct cpsw_ale *ale, u8 *addr, int port,
-- 
2.2.1.62.g3f15098

^ permalink raw reply related

* Re: why are IPv6 addresses removed on link down
From: YOSHIFUJI Hideaki @ 2015-01-13 12:15 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Stephen Hemminger
  Cc: hideaki.yoshifuji, David Ahern, netdev@vger.kernel.org
In-Reply-To: <54B50873.4090907@miraclelinux.com>

YOSHIFUJI Hideaki wrote:
> Hi,
>
> Hannes Frederic Sowa wrote:
>> On Mo, 2015-01-12 at 23:10 -0800, Stephen Hemminger wrote:
>>> On Mon, 12 Jan 2015 22:06:44 -0700
>>> David Ahern <dsahern@gmail.com> wrote:
>>>
>>>> We noticed that IPv6 addresses are removed on a link down. e.g.,
>>>>     ip link set dev eth1
>>>>
>>>>
>>>> Looking at the code it appears to be this code path in addrconf.c:
>>>>
>>>>           case NETDEV_DOWN:
>>>>           case NETDEV_UNREGISTER:
>>>>                   /*
>>>>                    *      Remove all addresses from this interface.
>>>>                    */
>>>>                   addrconf_ifdown(dev, event != NETDEV_DOWN);
>>>>                   break;
>>>>
>>>> IPv4 addresses are NOT removed on a link down. Is there a particular
>>>> reason IPv6 addresses are?
>>>>
>>>> Thanks,
>>>> David
>>>
>>> See RFC's which describes how IPv6 does Duplicate Address Detection.
>>> Address is not valid when link is down, since DAD is not possible.
>>
>> It should be no problem if the kernel would reacquire them on ifup and
>> do proper DAD. We simply must not use them while the interface is dead
>> (also making sure they don't get used for loopback routing).
>>
>> The problem the IPv6 addresses get removed is much more a historical
>> artifact nowadays, I think. It is part of user space API and scripts
>> deal with that already.
>
> We might have another "detached" state which essentially drops
> outgoing packets while link is down.  Just after recovering link,
> we could start receiving packet from the link and perform optimistic
> DAD. And then, after it succeeds, we may start sending packets.
>
> Since "detached" state is like the state just before completing
> Optimistic DAD, it is not so difficult to implement this extended
> behavior, I guess.
>

Note that node is allowed to send packets to neighbours or default
routers if the node knows their link-layer addresses during Optimistic
DAD.

-- 
Hideaki Yoshifuji <hideaki.yoshifuji@miraclelinux.com>
Technical Division, MIRACLE LINUX CORPORATION

^ permalink raw reply

* Re: [3.19-rc3] tg3: BUG: sleeping function called from invalid context
From: Peter Hurley @ 2015-01-13 12:30 UTC (permalink / raw)
  To: Prashant Sreedharan; +Cc: Michael Chan, netdev, Linux kernel
In-Reply-To: <1421116255.16485.14.camel@prashant>

On 01/12/2015 09:30 PM, Prashant Sreedharan wrote:
> On Mon, 2015-01-12 at 19:59 -0500, Peter Hurley wrote:
>> On 3.19-rc3, I'm seeing this might_sleep() warning [1] from the tg3_open()
>> call stack. Let me know if I need to bisect this.
>>
>> Regards,
>> Peter Hurley
>>
>> [1]
>>
>> [   17.203009] BUG: sleeping function called from invalid context at /home/peter/src/kernels/mainline/kernel/irq/manage.c:104
>> [   17.203067] in_atomic(): 1, irqs_disabled(): 0, pid: 1106, name: ip
>> [   17.203092] 2 locks held by ip/1106:
>> [   17.205255]  #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff816adf1f>] rtnetlink_rcv+0x1f/0x40
>> [   17.207445]  #1:  (&(&tp->lock)->rlock){+.....}, at: [<ffffffffa01073e6>] tg3_start+0xc06/0x11f0 [tg3]
>> [   17.209725] CPU: 2 PID: 1106 Comm: ip Not tainted 3.19.0-rc3+wip-xeon+lockdep #rc3+wip
>> [   17.211900] Hardware name: Dell Inc. Precision WorkStation T5400  /0RW203, BIOS A11 04/30/2012
>> [   17.214086]  0000000000000068 ffff8802ac823498 ffffffff817af7e8 0000000000000005
>> [   17.216265]  ffffffff81a9be78 ffff8802ac8234a8 ffffffff810998a5 ffff8802ac8234d8
>> [   17.218446]  ffffffff8109991a ffff8802ac8234c8 ffff8802af0aae00 ffffffffa00ed000
>> [   17.220636] Call Trace:
>> [   17.222743]  [<ffffffff817af7e8>] dump_stack+0x4f/0x7b
>> [   17.224808]  [<ffffffff810998a5>] ___might_sleep+0x105/0x140
>> [   17.226842]  [<ffffffff8109991a>] __might_sleep+0x3a/0xa0
>> [   17.228869]  [<ffffffffa00ed000>] ? 0xffffffffa00ed000
>> [   17.230939]  [<ffffffff810d7d78>] synchronize_irq+0x38/0xa0
>> [   17.232967]  [<ffffffffa00ed000>] ? 0xffffffffa00ed000
>> [   17.234991]  [<ffffffffa010105f>] tg3_chip_reset+0x13f/0x9c0 [tg3]
>> [   17.236988]  [<ffffffffa01020ae>] tg3_reset_hw+0x7e/0x2d20 [tg3]
>> [   17.238996]  [<ffffffff813bfaff>] ? __udelay+0x2f/0x40
>> [   17.241007]  [<ffffffffa00ef2f7>] ? _tw32_flush+0x47/0x80 [tg3]
>> [   17.243066]  [<ffffffffa0104dac>] tg3_init_hw+0x5c/0x70 [tg3]
>> [   17.245438]  [<ffffffffa010740b>] tg3_start+0xc2b/0x11f0 [tg3]
>> [   17.247444]  [<ffffffffa0107ad7>] ? tg3_open+0x107/0x2e0 [tg3]
>> [   17.249556]  [<ffffffff810c338d>] ? trace_hardirqs_on+0xd/0x10
>> [   17.251581]  [<ffffffff8107806f>] ? __local_bh_enable_ip+0x6f/0x100
>> [   17.253710]  [<ffffffffa0107af8>] tg3_open+0x128/0x2e0 [tg3]
>> [   17.255758]  [<ffffffff816ba3f5>] ? netpoll_poll_disable+0x5/0xa0
>> [   17.257932]  [<ffffffff816a14af>] __dev_open+0xbf/0x140
>> [   17.260091]  [<ffffffff816a17c1>] __dev_change_flags+0xa1/0x160
>> [   17.262222]  [<ffffffff816a18a9>] dev_change_flags+0x29/0x60
>> [   17.264360]  [<ffffffff816b0e02>] do_setlink+0x2f2/0xa30
>> [   17.266431]  [<ffffffff816b1b7f>] rtnl_newlink+0x51f/0x750
>> [   17.268485]  [<ffffffff816b1749>] ? rtnl_newlink+0xe9/0x750
>> [   17.270483]  [<ffffffff811869c2>] ? free_pages_prepare+0x1d2/0x270
>> [   17.272507]  [<ffffffff810c32bd>] ? trace_hardirqs_on_caller+0x11d/0x1e0
>> [   17.274531]  [<ffffffff813dd1b2>] ? nla_parse+0x32/0x120
>> [   17.276531]  [<ffffffff81021ab5>] ? native_sched_clock+0x35/0xa0
>> [   17.278514]  [<ffffffff816adfd5>] rtnetlink_rcv_msg+0x95/0x250
>> [   17.280485]  [<ffffffff8109f699>] ? preempt_count_sub+0x49/0x50
>> [   17.282448]  [<ffffffff817b4a02>] ? mutex_lock_nested+0x382/0x530
>> [   17.284402]  [<ffffffff816adf1f>] ? rtnetlink_rcv+0x1f/0x40
>> [   17.286290]  [<ffffffff816adf1f>] ? rtnetlink_rcv+0x1f/0x40
>> [   17.288142]  [<ffffffff816adf40>] ? rtnetlink_rcv+0x40/0x40
>> [   17.290031]  [<ffffffff816cedc1>] netlink_rcv_skb+0xc1/0xe0
>> [   17.291836]  [<ffffffff816adf2e>] rtnetlink_rcv+0x2e/0x40
>> [   17.293615]  [<ffffffff816ce473>] netlink_unicast+0xf3/0x1d0
>> [   17.295420]  [<ffffffff816ce863>] netlink_sendmsg+0x313/0x690
>> [   17.297132]  [<ffffffff811ada4f>] ? might_fault+0x5f/0xb0
>> [   17.298799]  [<ffffffff8168253c>] do_sock_sendmsg+0x8c/0x100
>> [   17.300493]  [<ffffffff81681e3e>] ? copy_msghdr_from_user+0x15e/0x1f0
>> [   17.302173]  [<ffffffff81682aeb>] ___sys_sendmsg+0x30b/0x320
>> [   17.303798]  [<ffffffff81021ab5>] ? native_sched_clock+0x35/0xa0
>> [   17.305431]  [<ffffffff810bdee0>] ? cpuacct_account_field+0x80/0xb0
>> [   17.307085]  [<ffffffff81021ab5>] ? native_sched_clock+0x35/0xa0
>> [   17.308744]  [<ffffffff810a4f35>] ? sched_clock_local+0x25/0x90
>> [   17.310375]  [<ffffffff810a5dc1>] ? vtime_account_user+0x91/0xa0
>> [   17.311948]  [<ffffffff810a5198>] ? sched_clock_cpu+0xb8/0xe0
>> [   17.313509]  [<ffffffff810bf8be>] ? put_lock_stats.isra.26+0xe/0x30
>> [   17.315069]  [<ffffffff810c007e>] ? lock_release_holdtime.part.27+0x12e/0x1b0
>> [   17.316618]  [<ffffffff810a5dc1>] ? vtime_account_user+0x91/0xa0
>> [   17.318162]  [<ffffffff8109f5d1>] ? get_parent_ip+0x11/0x50
>> [   17.319703]  [<ffffffff8109f699>] ? preempt_count_sub+0x49/0x50
>> [   17.321235]  [<ffffffff811807e5>] ? context_tracking_user_exit+0x55/0x130
>> [   17.322732]  [<ffffffff811807e5>] ? context_tracking_user_exit+0x55/0x130
>> [   17.324197]  [<ffffffff816834f2>] __sys_sendmsg+0x42/0x80
>> [   17.325634]  [<ffffffff81683542>] SyS_sendmsg+0x12/0x20
>> [   17.327048]  [<ffffffff817ba12d>] system_call_fastpath+0x16/0x1b
> 
> Please bisect, there hasn't been tg3 code changes in this path that
> might cause this. It would help to know the commit changes that is
> triggering the problem.

Ok, will do.

> Also could you provide the device details, from
> syslog look for "Tigon3 [partno(BCMxxxxx) rev xxxxxxx]". Thanks.

[    1.430884] tg3 0000:08:00.0 eth0: Tigon3 [partno(BCM95754) rev b002] (PCI Express) MAC address xx:xx:xx:xx:xx:xx
[    1.431095] tg3 0000:08:00.0 eth0: attached PHY is 5787 (10/100/1000Base-T Ethernet) (WireSpeed[1], EEE[0])
[    1.431295] tg3 0000:08:00.0 eth0: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[0] TSOcap[1]
[    1.431488] tg3 0000:08:00.0 eth0: dma_rwctrl[76180000] dma_mask[64-bit]

Regards,
Peter Hurley

^ permalink raw reply

* Re: [RFC PATCH v2 1/2] net: af_packet support for direct ring access in user space
From: Hannes Frederic Sowa @ 2015-01-13 12:35 UTC (permalink / raw)
  To: John Fastabend
  Cc: netdev, danny.zhou, nhorman, dborkman, john.ronciak, brouer
In-Reply-To: <20150113043509.29985.33515.stgit@nitbit.x32>

On Mo, 2015-01-12 at 20:35 -0800, John Fastabend wrote:
> This patch adds net_device ops to split off a set of driver queues
> from the driver and map the queues into user space via mmap. This
> allows the queues to be directly manipulated from user space. For
> raw packet interface this removes any overhead from the kernel network
> stack.
> 
> With these operations we bypass the network stack and packet_type
> handlers that would typically send traffic to an af_packet socket.
> This means hardware must do the forwarding. To do this we can use
> the ETHTOOL_SRXCLSRLINS ops in the ethtool command set. It is
> currently supported by multiple drivers including sfc, mlx4, niu,
> ixgbe, and i40e. Supporting some way to steer traffic to a queue
> is the _only_ hardware requirement to support this interface.
> 
> A follow on patch adds support for ixgbe but we expect at least
> the subset of drivers implementing ETHTOOL_SRXCLSRLINS can be
> implemented later.
> 
> The high level flow, leveraging the af_packet control path, looks
> like:
> 
> 	bind(fd, &sockaddr, sizeof(sockaddr));
> 
> 	/* Get the device type and info */
> 	getsockopt(fd, SOL_PACKET, PACKET_DEV_DESC_INFO, &def_info,
> 		   &optlen);
> 
> 	/* With device info we can look up descriptor format */
> 
> 	/* Get the layout of ring space offset, page_sz, cnt */
> 	getsockopt(fd, SOL_PACKET, PACKET_DEV_QPAIR_MAP_REGION_INFO,
> 		   &info, &optlen);
> 
> 	/* request some queues from the driver */
> 	setsockopt(fd, SOL_PACKET, PACKET_RXTX_QPAIRS_SPLIT,
> 		   &qpairs_info, sizeof(qpairs_info));
> 
> 	/* if we let the driver pick us queues learn which queues
>          * we were given
>          */
> 	getsockopt(fd, SOL_PACKET, PACKET_RXTX_QPAIRS_SPLIT,
> 		   &qpairs_info, sizeof(qpairs_info));
> 
> 	/* And mmap queue pairs to user space */
> 	mmap(NULL, info.tp_dev_bar_sz, PROT_READ | PROT_WRITE,
> 	     MAP_SHARED, fd, 0);
> 
> 	/* Now we have some user space queues to read/write to*/
> 
> There is one critical difference when running with these interfaces
> vs running without them. In the normal case the af_packet module
> uses a standard descriptor format exported by the af_packet user
> space headers. In this model because we are working directly with
> driver queues the descriptor format maps to the descriptor format
> used by the device. User space applications can learn device
> information from the socket option PACKET_DEV_DESC_INFO. These
> are described by giving the vendor/deviceid and a descriptor layout
> in offset/length/width/alignment/byte_ordering.
> 
> To protect against arbitrary DMA writes IOMMU devices put memory
> in a single domain to stop arbitrary DMA to memory. Note it would
> be possible to DMA into another socket's pages because most NIC
> devices only support a single domain. This would require being
> able to guess another socket's page layout. However, the socket
> operation does require CAP_NET_ADMIN privileges.
> 
> Additionally we have a set of DPDK patches to enable DPDK with this
> interface. DPDK can be downloaded @ dpdk.org although as I hope is
> clear from above DPDK is just our particular test environment; we
> expect other libraries could be built on this interface.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  include/linux/netdevice.h      |   79 ++++++++
>  include/uapi/linux/if_packet.h |   88 +++++++++
>  net/packet/af_packet.c         |  397 ++++++++++++++++++++++++++++++++++++++++
>  net/packet/internal.h          |   10 +
>  4 files changed, 573 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 679e6e9..b71c97d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -52,6 +52,8 @@
>  #include <linux/neighbour.h>
>  #include <uapi/linux/netdevice.h>
>  
> +#include <linux/if_packet.h>
> +
>  struct netpoll_info;
>  struct device;
>  struct phy_device;
> @@ -1030,6 +1032,54 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state);
>   *	Called to notify switch device port of bridge port STP
>   *	state change.
> + *
> + * int (*ndo_split_queue_pairs) (struct net_device *dev,
> + *				 unsigned int qpairs_start_from,
> + *				 unsigned int qpairs_num,
> + *				 struct sock *sk)
> + *	Called to request a set of queues from the driver to be handed to the
> + *	callee for management. After this returns the driver will not use the
> + *	queues.
> + *
> + * int (*ndo_get_split_queue_pairs) (struct net_device *dev,
> + *				 unsigned int *qpairs_start_from,
> + *				 unsigned int *qpairs_num,
> + *				 struct sock *sk)
> + *	Called to get the location of queues that have been split for user
> + *	space to use. The socket must have previously requested the queues via
> + *	ndo_split_queue_pairs successfully.
> + *
> + * int (*ndo_return_queue_pairs) (struct net_device *dev,
> + *				  struct sock *sk)
> + *	Called to return a set of queues identified by sock to the driver. The
> + *	socket must have previously requested the queues via
> + *	ndo_split_queue_pairs for this action to be performed.
> + *
> + * int (*ndo_get_device_qpair_map_region_info) (struct net_device *dev,
> + *				struct tpacket_dev_qpair_map_region_info *info)
> + *	Called to return mapping of queue memory region.
> + *
> + * int (*ndo_get_device_desc_info) (struct net_device *dev,
> + *				    struct tpacket_dev_info *dev_info)
> + *	Called to get device specific information. This should uniquely identify
> + *	the hardware so that descriptor formats can be learned by the stack/user
> + *	space.
> + *
> + * int (*ndo_direct_qpair_page_map) (struct vm_area_struct *vma,
> + *				     struct net_device *dev)
> + *	Called to map queue pair range from split_queue_pairs into mmap region.
> + *
> + * int (*ndo_direct_validate_dma_mem_region_map)
> + *					(struct net_device *dev,
> + *					 struct tpacket_dma_mem_region *region,
> + *					 struct sock *sk)
> + *	Called to validate DMA address remaping for userspace memory region
> + *
> + * int (*ndo_get_dma_region_info)
> + *				 (struct net_device *dev,
> + *				  struct tpacket_dma_mem_region *region,
> + *				  struct sock *sk)
> + *	Called to get dma region' information such as iova.
>   */
>  struct net_device_ops {
>  	int			(*ndo_init)(struct net_device *dev);
> @@ -1190,6 +1240,35 @@ struct net_device_ops {
>  	int			(*ndo_switch_port_stp_update)(struct net_device *dev,
>  							      u8 state);
>  #endif
> +	int			(*ndo_split_queue_pairs)(struct net_device *dev,
> +					 unsigned int qpairs_start_from,
> +					 unsigned int qpairs_num,
> +					 struct sock *sk);
> +	int			(*ndo_get_split_queue_pairs)
> +					(struct net_device *dev,
> +					 unsigned int *qpairs_start_from,
> +					 unsigned int *qpairs_num,
> +					 struct sock *sk);
> +	int			(*ndo_return_queue_pairs)
> +					(struct net_device *dev,
> +					 struct sock *sk);
> +	int			(*ndo_get_device_qpair_map_region_info)
> +					(struct net_device *dev,
> +					 struct tpacket_dev_qpair_map_region_info *info);
> +	int			(*ndo_get_device_desc_info)
> +					(struct net_device *dev,
> +					 struct tpacket_dev_info *dev_info);
> +	int			(*ndo_direct_qpair_page_map)
> +					(struct vm_area_struct *vma,
> +					 struct net_device *dev);
> +	int			(*ndo_validate_dma_mem_region_map)
> +					(struct net_device *dev,
> +					 struct tpacket_dma_mem_region *region,
> +					 struct sock *sk);
> +	int			(*ndo_get_dma_region_info)
> +					(struct net_device *dev,
> +					 struct tpacket_dma_mem_region *region,
> +					 struct sock *sk);
>  };
>  
>  /**
> diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
> index da2d668..eb7a727 100644
> --- a/include/uapi/linux/if_packet.h
> +++ b/include/uapi/linux/if_packet.h
> @@ -54,6 +54,13 @@ struct sockaddr_ll {
>  #define PACKET_FANOUT			18
>  #define PACKET_TX_HAS_OFF		19
>  #define PACKET_QDISC_BYPASS		20
> +#define PACKET_RXTX_QPAIRS_SPLIT	21
> +#define PACKET_RXTX_QPAIRS_RETURN	22
> +#define PACKET_DEV_QPAIR_MAP_REGION_INFO	23
> +#define PACKET_DEV_DESC_INFO		24
> +#define PACKET_DMA_MEM_REGION_MAP       25
> +#define PACKET_DMA_MEM_REGION_RELEASE   26
> +
>  
>  #define PACKET_FANOUT_HASH		0
>  #define PACKET_FANOUT_LB		1
> @@ -64,6 +71,87 @@ struct sockaddr_ll {
>  #define PACKET_FANOUT_FLAG_ROLLOVER	0x1000
>  #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
>  
> +#define PACKET_MAX_NUM_MAP_MEMORY_REGIONS 64
> +#define PACKET_MAX_NUM_DESC_FORMATS	  8
> +#define PACKET_MAX_NUM_DESC_FIELDS	  64
> +#define PACKET_NIC_DESC_FIELD(fseq, foffset, fwidth, falign, fbo) \
> +		.seqn = (__u8)fseq,				\
> +		.offset = (__u8)foffset,			\
> +		.width = (__u8)fwidth,				\
> +		.align = (__u8)falign,				\
> +		.byte_order = (__u8)fbo

Are the __u8 necessary? They seem to hide compiler warnings?

> +
> +#define MAX_MAP_MEMORY_REGIONS	64
> +
> +/* setsockopt takes addr, size ,direction parametner, getsockopt takes
> + * iova, size, direction.
> + * */
> +struct tpacket_dma_mem_region {
> +	void *addr;		/* userspace virtual address */
> +	__u64 phys_addr;	/* physical address */
> +	__u64 iova;		/* IO virtual address used for DMA */
> +	unsigned long size;	/* size of region */
> +	int direction;		/* dma data direction */
> +};

Have you tested this with 32 bit user space and 32 bit kernel, too?
I don't have any problem with only supporting 64 bit kernels for this
feature, but looking through the code I wonder if we handle the __u64
addresses correctly in all situations.

The other question I have, would it make sense to move the

+#ifdef CONFIG_DMA_MEMORY_PROTECTION
+	/* IOVA not equal to physical address means IOMMU takes effect */
+	if (region->phys_addr == region->iova)
+		return -EFAULT;
+#endif

check from the ixgbe driver into the kernel core, so we never expose
memory mapped io which is not protected by its own memory domain?

Thanks,
Hannes

^ permalink raw reply

* Re: why are IPv6 addresses removed on link down
From: Hannes Frederic Sowa @ 2015-01-13 12:36 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki; +Cc: Stephen Hemminger, David Ahern, netdev@vger.kernel.org
In-Reply-To: <54B50C71.7090007@miraclelinux.com>

Hi,

On Di, 2015-01-13 at 21:15 +0900, YOSHIFUJI Hideaki wrote:
> YOSHIFUJI Hideaki wrote:
> > Hi,
> >
> > Hannes Frederic Sowa wrote:
> >> On Mo, 2015-01-12 at 23:10 -0800, Stephen Hemminger wrote:
> >>> On Mon, 12 Jan 2015 22:06:44 -0700
> >>> David Ahern <dsahern@gmail.com> wrote:
> >>>
> >>>> We noticed that IPv6 addresses are removed on a link down. e.g.,
> >>>>     ip link set dev eth1
> >>>>
> >>>>
> >>>> Looking at the code it appears to be this code path in addrconf.c:
> >>>>
> >>>>           case NETDEV_DOWN:
> >>>>           case NETDEV_UNREGISTER:
> >>>>                   /*
> >>>>                    *      Remove all addresses from this interface.
> >>>>                    */
> >>>>                   addrconf_ifdown(dev, event != NETDEV_DOWN);
> >>>>                   break;
> >>>>
> >>>> IPv4 addresses are NOT removed on a link down. Is there a particular
> >>>> reason IPv6 addresses are?
> >>>>
> >>>> Thanks,
> >>>> David
> >>>
> >>> See RFC's which describes how IPv6 does Duplicate Address Detection.
> >>> Address is not valid when link is down, since DAD is not possible.
> >>
> >> It should be no problem if the kernel would reacquire them on ifup and
> >> do proper DAD. We simply must not use them while the interface is dead
> >> (also making sure they don't get used for loopback routing).
> >>
> >> The problem the IPv6 addresses get removed is much more a historical
> >> artifact nowadays, I think. It is part of user space API and scripts
> >> deal with that already.
> >
> > We might have another "detached" state which essentially drops
> > outgoing packets while link is down.  Just after recovering link,
> > we could start receiving packet from the link and perform optimistic
> > DAD. And then, after it succeeds, we may start sending packets.
> >
> > Since "detached" state is like the state just before completing
> > Optimistic DAD, it is not so difficult to implement this extended
> > behavior, I guess.
> >
> 
> Note that node is allowed to send packets to neighbours or default
> routers if the node knows their link-layer addresses during Optimistic
> DAD.
> 

I don't think it should be a problem from internal state handling of the
addresses.

I am much more concerned with scripts expecting the addresses to be
flushed on interface down/up and not reacting appropriately.

Bye,
Hannes

^ permalink raw reply

* Re: [3.19-rc3] tg3: BUG: sleeping function called from invalid context
From: Peter Hurley @ 2015-01-13 12:47 UTC (permalink / raw)
  To: Michael Chan; +Cc: Prashant Sreedharan, netdev, Linux kernel
In-Reply-To: <1421131762.7208.31.camel@LTIRV-MCHAN1.corp.ad.broadcom.com>

On 01/13/2015 01:49 AM, Michael Chan wrote:
> On Mon, 2015-01-12 at 19:59 -0500, Peter Hurley wrote: 
>> [   17.203009] BUG: sleeping function called from invalid context at /home/peter/src/kernels/mainline/kernel/irq/manage.c:104
>> [   17.203067] in_atomic(): 1, irqs_disabled(): 0, pid: 1106, name: ip
>> [   17.203092] 2 locks held by ip/1106:
>> [   17.205255]  #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff816adf1f>] rtnetlink_rcv+0x1f/0x40
>> [   17.207445]  #1:  (&(&tp->lock)->rlock){+.....}, at: [<ffffffffa01073e6>] tg3_start+0xc06/0x11f0 [tg3]
>> [   17.209725] CPU: 2 PID: 1106 Comm: ip Not tainted 3.19.0-rc3+wip-xeon+lockdep #rc3+wip
>> [   17.211900] Hardware name: Dell Inc. Precision WorkStation T5400  /0RW203, BIOS A11 04/30/2012
>> [   17.214086]  0000000000000068 ffff8802ac823498 ffffffff817af7e8 0000000000000005
>> [   17.216265]  ffffffff81a9be78 ffff8802ac8234a8 ffffffff810998a5 ffff8802ac8234d8
>> [   17.218446]  ffffffff8109991a ffff8802ac8234c8 ffff8802af0aae00 ffffffffa00ed000
>> [   17.220636] Call Trace:
>> [   17.222743]  [<ffffffff817af7e8>] dump_stack+0x4f/0x7b
>> [   17.224808]  [<ffffffff810998a5>] ___might_sleep+0x105/0x140
>> [   17.226842]  [<ffffffff8109991a>] __might_sleep+0x3a/0xa0
>> [   17.228869]  [<ffffffffa00ed000>] ? 0xffffffffa00ed000
>> [   17.230939]  [<ffffffff810d7d78>] synchronize_irq+0x38/0xa0
>> [   17.232967]  [<ffffffffa00ed000>] ? 0xffffffffa00ed000
>> [   17.234991]  [<ffffffffa010105f>] tg3_chip_reset+0x13f/0x9c0 [tg3]
>> [   17.236988]  [<ffffffffa01020ae>] tg3_reset_hw+0x7e/0x2d20 [tg3] 
> 
> tp->lock is held in this code path.  If synchronize_irq() sleeps in
> wait_event(desc->wait_for_threads, ...), we'll get the warning.
> 
> The synchronize_irq() call is to wait for any tg3 irq handler to finish
> so that it is guaranteed that next time it will see the CHIP_RESETTING
> flag and do nothing.
> 
> Not sure if we can drop the tp->lock before we call synchronize_irq()
> and then take it again after synchronize_irq().

Well, this device [1] is using MSI (INTx disabled) so if the synchronize_irq()
is _only_ for the CHIP_RESETTING logic then it would seem ok to skip it (the
synchronize_irq()).

Regards,
Peter Hurley


[1] lspci -vv

08:00.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5754 Gigabit Ethernet PCI Express (rev 02)
	Subsystem: Dell Precision T5400
	Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 0, Cache Line Size: 64 bytes
	Interrupt: pin A routed to IRQ 31
	Region 0: Memory at d3ff0000 (64-bit, non-prefetchable) [size=64K]
	Expansion ROM at <ignored> [disabled]
	Capabilities: [48] Power Management version 3
		Flags: PMEClk- DSI- D1- D2- AuxCurrent=0mA PME(D0-,D1-,D2-,D3hot+,D3cold+)
		Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=1 PME-
	Capabilities: [50] Vital Product Data
		Product Name: Broadcom NetLink Gigabit Ethernet Controller
		Read-only fields:
			[PN] Part number: BCM95754
			[EC] Engineering changes: 106679-15
			[SN] Serial number: 0123456789
			[MN] Manufacture ID: 31 34 65 34
			[RV] Reserved: checksum good, 30 byte(s) reserved
		Read/write fields:
			[YA] Asset tag: XYZ01234567
			[RW] Read-write area: 107 byte(s) free
		End
	Capabilities: [58] Vendor Specific Information: Len=78 <?>
	Capabilities: [e8] MSI: Enable+ Count=1/1 Maskable- 64bit+
		Address: 00000000fee0400c  Data: 41a2
	Capabilities: [d0] Express (v1) Endpoint, MSI 00
		DevCap:	MaxPayload 128 bytes, PhantFunc 0, Latency L0s <4us, L1 unlimited
			ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
		DevCtl:	Report errors: Correctable- Non-Fatal+ Fatal+ Unsupported-
			RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
			MaxPayload 128 bytes, MaxReadReq 512 bytes
		DevSta:	CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend-
		LnkCap:	Port #0, Speed 2.5GT/s, Width x1, ASPM L0s, Exit Latency L0s <4us, L1 <64us
			ClockPM- Surprise- LLActRep- BwNot-
		LnkCtl:	ASPM Disabled; RCB 64 bytes Disabled- CommClk+
			ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
		LnkSta:	Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
	Capabilities: [100 v1] Advanced Error Reporting
		UESta:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
		UEMsk:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
		UESvrt:	DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
		CESta:	RxErr+ BadTLP- BadDLLP+ Rollover- Timeout- NonFatalErr-
		CEMsk:	RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr+
		AERCap:	First Error Pointer: 00, GenCap+ CGenEn- ChkCap+ ChkEn-
	Capabilities: [13c v1] Virtual Channel
		Caps:	LPEVC=0 RefClk=100ns PATEntryBits=1
		Arb:	Fixed- WRR32- WRR64- WRR128-
		Ctrl:	ArbSelect=Fixed
		Status:	InProgress-
		VC0:	Caps:	PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
			Arb:	Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
			Ctrl:	Enable+ ID=0 ArbSelect=Fixed TC/VC=ff
			Status:	NegoPending- InProgress-
	Capabilities: [160 v1] Device Serial Number xx-xx-xx-xx-xx-xx-xx-xx
	Capabilities: [16c v1] Power Budgeting <?>
	Kernel driver in use: tg3

^ permalink raw reply

* Re: [RFC PATCH v2 1/2] net: af_packet support for direct ring access in user space
From: Daniel Borkmann @ 2015-01-13 13:21 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: John Fastabend, netdev, danny.zhou, nhorman, john.ronciak, brouer
In-Reply-To: <1421152510.13626.22.camel@stressinduktion.org>

On 01/13/2015 01:35 PM, Hannes Frederic Sowa wrote:
> On Mo, 2015-01-12 at 20:35 -0800, John Fastabend wrote:
...
>> +/* setsockopt takes addr, size ,direction parametner, getsockopt takes
>> + * iova, size, direction.
>> + * */
>> +struct tpacket_dma_mem_region {
>> +	void *addr;		/* userspace virtual address */
>> +	__u64 phys_addr;	/* physical address */
>> +	__u64 iova;		/* IO virtual address used for DMA */
>> +	unsigned long size;	/* size of region */
>> +	int direction;		/* dma data direction */
>> +};
>
> Have you tested this with 32 bit user space and 32 bit kernel, too?
> I don't have any problem with only supporting 64 bit kernels for this
> feature, but looking through the code I wonder if we handle the __u64
> addresses correctly in all situations.

Given this is placed into uapi and transferred via setsockopt(2), this
would also need some form of compat handling, also for the case of mixed
environments (e.g. 64 bit kernel, 32 bit user space).

^ permalink raw reply

* Re: [PATCH net-next 4/8] net: dsa: cleanup resources upon module removal
From: Sergei Shtylyov @ 2015-01-13 13:31 UTC (permalink / raw)
  To: Florian Fainelli, netdev; +Cc: davem, buytenh
In-Reply-To: <1421099866-3184-5-git-send-email-f.fainelli@gmail.com>

Hello.

On 1/13/2015 12:57 AM, Florian Fainelli wrote:

> We were not doing anything while removing the dsa module, which means
> that we were leaving dangling network devices without any sort of driver
> backing them, leading to all sorts of crashes. Make sure that we do
> cleanup the slave network devices, slave MII bus we created, and
> unassign the master_netdev dsa_ptr to make the packet processing go
> through the regulard Ethernet receive path.

    Regular. :-)

> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
[...]

> diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
> index de77c83cfd9a..df7ec066ac64 100644
> --- a/net/dsa/dsa.c
> +++ b/net/dsa/dsa.c
> @@ -316,10 +316,22 @@ out:
>
>   static void dsa_switch_destroy(struct dsa_switch *ds)
>   {
> +	int i;

    Need empty line here.

>   #ifdef CONFIG_NET_DSA_HWMON
>   	if (ds->hwmon_dev)
>   		hwmon_device_unregister(ds->hwmon_dev);
>   #endif

[...]

WBR, Sergei

^ permalink raw reply

* [RFC PATCHv1 net-next] xen-netback: always fully coalesce guest Rx packets
From: David Vrabel @ 2015-01-13 14:05 UTC (permalink / raw)
  To: netdev; +Cc: David Vrabel, xen-devel, Ian Campbell, Wei Liu

Always fully coalesce guest Rx packets into the minimum number of ring
slots.  Reducing the number of slots per packet has significant
performance benefits (e.g., 7.2 Gbit/s to 11 Gbit/s in an off-host
receive test).

However, this does increase the number of grant ops per packet which
decreases performance with some workloads (intrahost VM to VM)
/unless/ grant copy has been optimized for adjacent ops with the same
source or destination (see "grant-table: defer releasing pages
acquired in a grant copy"[1]).

Do we need to retain the existing path and make the always coalesce
path conditional on a suitable version of Xen?

[1] http://lists.xen.org/archives/html/xen-devel/2015-01/msg01118.html

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 drivers/net/xen-netback/common.h  |    1 -
 drivers/net/xen-netback/netback.c |  106 ++-----------------------------------
 2 files changed, 3 insertions(+), 104 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 5f1fda4..589fa25 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -251,7 +251,6 @@ struct xenvif {
 struct xenvif_rx_cb {
 	unsigned long expires;
 	int meta_slots_used;
-	bool full_coalesce;
 };
 
 #define XENVIF_RX_CB(skb) ((struct xenvif_rx_cb *)(skb)->cb)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 908e65e..568238d 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -233,51 +233,6 @@ static void xenvif_rx_queue_drop_expired(struct xenvif_queue *queue)
 	}
 }
 
-/*
- * Returns true if we should start a new receive buffer instead of
- * adding 'size' bytes to a buffer which currently contains 'offset'
- * bytes.
- */
-static bool start_new_rx_buffer(int offset, unsigned long size, int head,
-				bool full_coalesce)
-{
-	/* simple case: we have completely filled the current buffer. */
-	if (offset == MAX_BUFFER_OFFSET)
-		return true;
-
-	/*
-	 * complex case: start a fresh buffer if the current frag
-	 * would overflow the current buffer but only if:
-	 *     (i)   this frag would fit completely in the next buffer
-	 * and (ii)  there is already some data in the current buffer
-	 * and (iii) this is not the head buffer.
-	 * and (iv)  there is no need to fully utilize the buffers
-	 *
-	 * Where:
-	 * - (i) stops us splitting a frag into two copies
-	 *   unless the frag is too large for a single buffer.
-	 * - (ii) stops us from leaving a buffer pointlessly empty.
-	 * - (iii) stops us leaving the first buffer
-	 *   empty. Strictly speaking this is already covered
-	 *   by (ii) but is explicitly checked because
-	 *   netfront relies on the first buffer being
-	 *   non-empty and can crash otherwise.
-	 * - (iv) is needed for skbs which can use up more than MAX_SKB_FRAGS
-	 *   slot
-	 *
-	 * This means we will effectively linearise small
-	 * frags but do not needlessly split large buffers
-	 * into multiple copies tend to give large frags their
-	 * own buffers as before.
-	 */
-	BUG_ON(size > MAX_BUFFER_OFFSET);
-	if ((offset + size > MAX_BUFFER_OFFSET) && offset && !head &&
-	    !full_coalesce)
-		return true;
-
-	return false;
-}
-
 struct netrx_pending_operations {
 	unsigned copy_prod, copy_cons;
 	unsigned meta_prod, meta_cons;
@@ -336,24 +291,13 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb
 		BUG_ON(offset >= PAGE_SIZE);
 		BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
 
-		bytes = PAGE_SIZE - offset;
+		if (npo->copy_off == MAX_BUFFER_OFFSET)
+			meta = get_next_rx_buffer(queue, npo);
 
+		bytes = PAGE_SIZE - offset;
 		if (bytes > size)
 			bytes = size;
 
-		if (start_new_rx_buffer(npo->copy_off,
-					bytes,
-					*head,
-					XENVIF_RX_CB(skb)->full_coalesce)) {
-			/*
-			 * Netfront requires there to be some data in the head
-			 * buffer.
-			 */
-			BUG_ON(*head);
-
-			meta = get_next_rx_buffer(queue, npo);
-		}
-
 		if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
 			bytes = MAX_BUFFER_OFFSET - npo->copy_off;
 
@@ -652,60 +596,16 @@ static void xenvif_rx_action(struct xenvif_queue *queue)
 
 	while (xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX)
 	       && (skb = xenvif_rx_dequeue(queue)) != NULL) {
-		RING_IDX max_slots_needed;
 		RING_IDX old_req_cons;
 		RING_IDX ring_slots_used;
 		int i;
 
 		queue->last_rx_time = jiffies;
 
-		/* We need a cheap worse case estimate for the number of
-		 * slots we'll use.
-		 */
-
-		max_slots_needed = DIV_ROUND_UP(offset_in_page(skb->data) +
-						skb_headlen(skb),
-						PAGE_SIZE);
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			unsigned int size;
-			unsigned int offset;
-
-			size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
-			offset = skb_shinfo(skb)->frags[i].page_offset;
-
-			/* For a worse-case estimate we need to factor in
-			 * the fragment page offset as this will affect the
-			 * number of times xenvif_gop_frag_copy() will
-			 * call start_new_rx_buffer().
-			 */
-			max_slots_needed += DIV_ROUND_UP(offset + size,
-							 PAGE_SIZE);
-		}
-
-		/* To avoid the estimate becoming too pessimal for some
-		 * frontends that limit posted rx requests, cap the estimate
-		 * at MAX_SKB_FRAGS. In this case netback will fully coalesce
-		 * the skb into the provided slots.
-		 */
-		if (max_slots_needed > MAX_SKB_FRAGS) {
-			max_slots_needed = MAX_SKB_FRAGS;
-			XENVIF_RX_CB(skb)->full_coalesce = true;
-		} else {
-			XENVIF_RX_CB(skb)->full_coalesce = false;
-		}
-
-		/* We may need one more slot for GSO metadata */
-		if (skb_is_gso(skb) &&
-		   (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 ||
-		    skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6))
-			max_slots_needed++;
-
 		old_req_cons = queue->rx.req_cons;
 		XENVIF_RX_CB(skb)->meta_slots_used = xenvif_gop_skb(skb, &npo, queue);
 		ring_slots_used = queue->rx.req_cons - old_req_cons;
 
-		BUG_ON(ring_slots_used > max_slots_needed);
-
 		__skb_queue_tail(&rxq, skb);
 	}
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] rtlwifi/rtl8192de: remove redundant else if check
From: Colin King @ 2015-01-13 14:07 UTC (permalink / raw)
  To: Larry Finger, Chaoming Li, Kalle Valo, John W. Linville,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA

From: Colin Ian King <colin.king-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>

The else-if condition is the exact opposite of the if condition,
hence the else-if check is redundant and can be replaced with a
simple else:

if (rtlpriv->rtlhal.macphymode == SINGLEMAC_SINGLEPHY) {
	..
} else if (rtlpriv->rtlhal.macphymode != SINGLEMAC_SINGLEPHY) {
	..
}

replaced with:

if (rtlpriv->rtlhal.macphymode == SINGLEMAC_SINGLEPHY) {
	..
} else {
	..
}

Signed-off-by: Colin Ian King <colin.king-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 drivers/net/wireless/rtlwifi/rtl8192de/hw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rtlwifi/rtl8192de/hw.c b/drivers/net/wireless/rtlwifi/rtl8192de/hw.c
index 280c3da..01bcc2d 100644
--- a/drivers/net/wireless/rtlwifi/rtl8192de/hw.c
+++ b/drivers/net/wireless/rtlwifi/rtl8192de/hw.c
@@ -546,7 +546,7 @@ static bool _rtl92de_llt_table_init(struct ieee80211_hw *hw)
 		txpktbuf_bndy = 246;
 		value8 = 0;
 		value32 = 0x80bf0d29;
-	} else if (rtlpriv->rtlhal.macphymode != SINGLEMAC_SINGLEPHY) {
+	} else {
 		maxPage = 127;
 		txpktbuf_bndy = 123;
 		value8 = 0;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] bridge: only provide proxy ARP when CONFIG_INET is enabled
From: Arnd Bergmann @ 2015-01-13 14:10 UTC (permalink / raw)
  To: netdev; +Cc: davem, Kyeyoon Park, bridge, Stephen Hemminger

When IPv4 support is disabled, we cannot call arp_send from
the bridge code, as doing so results in a kernel link error:

net/built-in.o: In function `br_handle_frame_finish':
:(.text+0x59914): undefined reference to `arp_send'
:(.text+0x59a50): undefined reference to `arp_tbl'

This makes the newly added proxy ARP support in the bridge
code depend on the CONFIG_INET symbol and lets the compiler
optimize the code out to avoid the link error.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 958501163ddd ("bridge: Add support for IEEE 802.11 Proxy ARP")
Cc: Kyeyoon Park <kyeyoonp@codeaurora.org>

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 1f1de715197c..e2aa7be3a847 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -154,7 +154,8 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	dst = NULL;
 
 	if (is_broadcast_ether_addr(dest)) {
-		if (p->flags & BR_PROXYARP &&
+		if (IS_ENABLED(CONFIG_INET) &&
+		    p->flags & BR_PROXYARP &&
 		    skb->protocol == htons(ETH_P_ARP))
 			br_do_proxy_arp(skb, br, vid);
 

^ permalink raw reply related

* Re: [PATCH 8/8] ath10k: fix error return code
From: Kalle Valo @ 2015-01-13 14:16 UTC (permalink / raw)
  To: Julia Lawall
  Cc: linux-wireless, kernel-janitors, linux-kernel, ath10k, netdev
In-Reply-To: <1419872683-32709-9-git-send-email-Julia.Lawall@lip6.fr>

Julia Lawall <Julia.Lawall@lip6.fr> writes:

> Return a negative error code on failure.
>
> A simplified version of the semantic match that finds this problem is as
> follows: (http://coccinelle.lip6.fr/)
>
> // <smpl>
> @@
> identifier ret; expression e1,e2;
> @@
> (
> if (\(ret < 0\|ret != 0\))
>  { ... return ret; }
> |
> ret = 0
> )
> ... when != ret = e1
>     when != &ret
> *if(...)
> {
>   ... when != ret = e2
>       when forall
>  return ret;
> }
> // </smpl>
>
> Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>

Thanks, applied to ath.git.

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCH v3] ath10k: fixup wait_for_completion_timeout return handling
From: Kalle Valo @ 2015-01-13 14:20 UTC (permalink / raw)
  To: Nicholas Mc Guire
  Cc: Chun-Yeow Yeoh, Sergei Shtylyov, netdev, linux-wireless,
	linux-kernel, ath10k, Michal Kazior, Yanbo Li, Ben Greear
In-Reply-To: <1420720054-27870-1-git-send-email-der.herr@hofr.at>

Nicholas Mc Guire <der.herr@hofr.at> writes:

> wait_for_completion_timeout does not return negative values so the tests
> for <= 0 are not needed and the case differentiation in the error handling
> path unnecessary.
>
> Signed-off-by: Nicholas Mc Guire <der.herr@hofr.at>

Thanks, applied to ath.git.

-- 
Kalle Valo

^ permalink raw reply

* Re: [net-next 03/15] i40evf: Remove some scary log messages
From: Sergei Shtylyov @ 2015-01-13 14:22 UTC (permalink / raw)
  To: Jeff Kirsher, davem; +Cc: Mitch A Williams, netdev, nhorman, sassmann, jogreene
In-Reply-To: <1421148811-9763-4-git-send-email-jeffrey.t.kirsher@intel.com>

Hello.

On 1/13/2015 2:33 PM, Jeff Kirsher wrote:

> From: Mitch A Williams <mitch.a.williams@intel.com>

> These messages may be triggered during normal init of the driver if the
> PF or FW take a long time to respond. There's nothing really wrong, so
> don't freak people out logging messages.

> If the communication channel really is dead, then we'll retry a few
> times and give up. This will log a different more scary message that
> should cause consternation. This allows the user to more easily detect a
> genuine failure.

> Change-ID: I6e2b758d4234a3a09c1015c82c8f2442a697cbdb
> Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
> Acked-by: Shannon Nelson <shannon.nelson@intel.com>
> Tested-by: Jim Young <james.m.young@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
[...]

> diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
> index ee0db59..f8f1d26 100644
> --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
> +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
> @@ -2026,10 +2026,7 @@ static void i40evf_init_task(struct work_struct *work)
>   		/* aq msg sent, awaiting reply */
>   		err = i40evf_verify_api_ver(adapter);
>   		if (err) {
> -			dev_info(&pdev->dev, "Unable to verify API version (%d), retrying\n",
> -				 err);
>   			if (err == I40E_ERR_ADMIN_QUEUE_NO_WORK) {
> -				dev_info(&pdev->dev, "Resending request\n");
>   				err = i40evf_send_api_ver(adapter);
>   			}

    {} not needed anymore, should have removed them.

[...]

WBR, Sergei

^ permalink raw reply

* [PATCH] rocker: fix harmless warning on 32-bit machines
From: Arnd Bergmann @ 2015-01-13 14:23 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Jiri Pirko, Scott Feldman, linux-arm-kernel

The rocker driver tries to assign a pointer to a 64-bit integer
and then back to a pointer. This is safe on all architectures,
but causes a compiler warning when pointers are shorter than
64-bit:

rocker/rocker.c: In function 'rocker_desc_cookie_ptr_get':
rocker/rocker.c:809:9: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  return (void *) desc_info->desc->cookie;
         ^

This adds another cast to uintptr_t to tell the compiler
that it's safe.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 2f398fa4b9e6..cad8cf962cdf 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -806,13 +806,13 @@ static bool rocker_desc_gen(struct rocker_desc_info *desc_info)
 
 static void *rocker_desc_cookie_ptr_get(struct rocker_desc_info *desc_info)
 {
-	return (void *) desc_info->desc->cookie;
+	return (void *)(uintptr_t)desc_info->desc->cookie;
 }
 
 static void rocker_desc_cookie_ptr_set(struct rocker_desc_info *desc_info,
 				       void *ptr)
 {
-	desc_info->desc->cookie = (long) ptr;
+	desc_info->desc->cookie = (uintptr_t) ptr;
 }
 
 static struct rocker_desc_info *

^ permalink raw reply related

* Re: [RFC PATCH v2 2/2] net: ixgbe: implement af_packet direct queue mappings
From: Daniel Borkmann @ 2015-01-13 14:26 UTC (permalink / raw)
  To: John Fastabend; +Cc: netdev, danny.zhou, nhorman, john.ronciak, hannes, brouer
In-Reply-To: <20150113043542.29985.15658.stgit@nitbit.x32>

On 01/13/2015 05:35 AM, John Fastabend wrote:
...
> +static int ixgbe_ndo_split_queue_pairs(struct net_device *dev,
> +				       unsigned int start_from,
> +				       unsigned int qpairs_num,
> +				       struct sock *sk)
> +{
> +	struct ixgbe_adapter *adapter = netdev_priv(dev);
> +	unsigned int qpair_index;

Shouldn't we return -EINVAL when qpairs_num is 0, while still
inside the setsockopt call?

> +	/* allocate whatever available qpairs */
> +	if (start_from == -1) {

I guess we should express the notion of auto-select as a uapi
define instead of a bare -1, which might not be overly obvious.

Anyway, extending Documentation/networking/packet_mmap.txt with
API details/examples at least for a non-RFC version is encouraged. ;)

> +		unsigned int count = 0;
> +
> +		for (qpair_index = adapter->num_rx_queues;
> +		     qpair_index < MAX_RX_QUEUES;
> +		     qpair_index++) {
> +			if (!adapter->user_queue_info[qpair_index].sk_handle) {
> +				count++;
> +				if (count == qpairs_num) {
> +					start_from = qpair_index - count + 1;
> +					break;
> +				}
> +			} else {
> +				count = 0;
> +			}
> +		}
> +	}
> +
> +	/* otherwise the caller specified exact queues */
> +	if ((start_from > MAX_TX_QUEUES) ||
> +	    (start_from > MAX_RX_QUEUES) ||
> +	    (start_from + qpairs_num > MAX_TX_QUEUES) ||
> +	    (start_from + qpairs_num > MAX_RX_QUEUES))
> +		return -EINVAL;

Shouldn't this be '>=' if I see this correctly?

> +	/* If the qpairs are being used by the driver do not let user space
> +	 * consume the queues. Also if the queue has already been allocated
> +	 * to a socket do fail the request.
> +	 */
> +	for (qpair_index = start_from;
> +	     qpair_index < start_from + qpairs_num;
> +	     qpair_index++) {
> +		if ((qpair_index < adapter->num_tx_queues) ||
> +		    (qpair_index < adapter->num_rx_queues))
> +			return -EINVAL;
> +
> +		if (adapter->user_queue_info[qpair_index].sk_handle)
> +			return -EBUSY;
> +	}
> +
> +	/* remember the sk handle for each queue pair */
> +	for (qpair_index = start_from;
> +	     qpair_index < start_from + qpairs_num;
> +	     qpair_index++) {
> +		adapter->user_queue_info[qpair_index].sk_handle = sk;
> +		adapter->user_queue_info[qpair_index].num_of_regions = 0;
> +	}
> +
> +	return 0;
> +}

I guess many drivers would need to implement similar code, do you see
a chance to move generic parts to the core, at least for some helper
functions?

Thanks,
Daniel

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox