Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 3/4] qed: Add support for multi function mode with 802.1ad tagging.
From: Sudarsana Reddy Kalluru @ 2018-05-06  1:43 UTC (permalink / raw)
  To: davem; +Cc: netdev, Ariel.Elior
In-Reply-To: <20180506014302.29110-1-sudarsana.kalluru@cavium.com>

The patch adds support for new Multi function mode wherein the traffic
classification is done based on the 802.1ad tagging and the outer vlan tag
provided by the management firmware.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 64 ++++++++++++++++-------
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  5 ++
 2 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 9b07d7f..95d00cb 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1668,6 +1668,18 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 		if (rc)
 			return rc;
 
+		if (IS_PF(cdev) && test_bit(QED_MF_8021AD_TAGGING,
+					    &cdev->mf_bits)) {
+			STORE_RT_REG(p_hwfn, PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET,
+				     ETH_P_8021AD);
+			STORE_RT_REG(p_hwfn, NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET,
+				     ETH_P_8021AD);
+			STORE_RT_REG(p_hwfn, PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET,
+				     ETH_P_8021AD);
+			STORE_RT_REG(p_hwfn, DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET,
+				     ETH_P_8021AD);
+		}
+
 		qed_fill_load_req_params(&load_req_params,
 					 p_params->p_drv_load_params);
 		rc = qed_mcp_load_req(p_hwfn, p_hwfn->p_main_ptt,
@@ -2630,39 +2642,51 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		   link->pause.autoneg,
 		   p_caps->default_eee, p_caps->eee_lpi_timer);
 
-	/* Read Multi-function information from shmem */
-	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
-	       offsetof(struct nvm_cfg1, glob) +
-	       offsetof(struct nvm_cfg1_glob, generic_cont0);
+	if (IS_LEAD_HWFN(p_hwfn)) {
+		struct qed_dev *cdev = p_hwfn->cdev;
 
-	generic_cont0 = qed_rd(p_hwfn, p_ptt, addr);
+		/* Read Multi-function information from shmem */
+		addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
+		       offsetof(struct nvm_cfg1, glob) +
+		       offsetof(struct nvm_cfg1_glob, generic_cont0);
 
-	mf_mode = (generic_cont0 & NVM_CFG1_GLOB_MF_MODE_MASK) >>
-		  NVM_CFG1_GLOB_MF_MODE_OFFSET;
+		generic_cont0 = qed_rd(p_hwfn, p_ptt, addr);
 
-	switch (mf_mode) {
-	case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
-		p_hwfn->cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
-		break;
-	case NVM_CFG1_GLOB_MF_MODE_NPAR1_0:
-		p_hwfn->cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
+		mf_mode = (generic_cont0 & NVM_CFG1_GLOB_MF_MODE_MASK) >>
+			  NVM_CFG1_GLOB_MF_MODE_OFFSET;
+
+		switch (mf_mode) {
+		case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
+			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
+			break;
+		case NVM_CFG1_GLOB_MF_MODE_BD:
+			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS) |
+					BIT(QED_MF_LLH_PROTO_CLSS) |
+					BIT(QED_MF_8021AD_TAGGING);
+			break;
+		case NVM_CFG1_GLOB_MF_MODE_NPAR1_0:
+			cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
 					BIT(QED_MF_LLH_PROTO_CLSS) |
 					BIT(QED_MF_LL2_NON_UNICAST) |
 					BIT(QED_MF_INTER_PF_SWITCH);
-		break;
-	case NVM_CFG1_GLOB_MF_MODE_DEFAULT:
-		p_hwfn->cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
+			break;
+		case NVM_CFG1_GLOB_MF_MODE_DEFAULT:
+			cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
 					BIT(QED_MF_LLH_PROTO_CLSS) |
 					BIT(QED_MF_LL2_NON_UNICAST);
-		if (QED_IS_BB(p_hwfn->cdev))
-			p_hwfn->cdev->mf_bits |= BIT(QED_MF_NEED_DEF_PF);
-		break;
+			if (QED_IS_BB(p_hwfn->cdev))
+				cdev->mf_bits |= BIT(QED_MF_NEED_DEF_PF);
+			break;
+		}
+
+		DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n",
+			cdev->mf_bits);
 	}
 
 	DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n",
 		p_hwfn->cdev->mf_bits);
 
-	/* Read Multi-function information from shmem */
+	/* Read device capabilities information from shmem */
 	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
 		offsetof(struct nvm_cfg1, glob) +
 		offsetof(struct nvm_cfg1_glob, device_capabilities);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index fbb3172..26bed26 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -346,6 +346,11 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	p_ramrod->outer_tag_config.outer_tag.tci =
 		cpu_to_le16(p_hwfn->hw_info.ovlan);
+	if (test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits)) {
+		p_ramrod->outer_tag_config.outer_tag.tpid = ETH_P_8021AD;
+		p_ramrod->outer_tag_config.enable_stag_pri_change = 1;
+	}
+
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 4/4] qed: Add support for Unified Fabric Port.
From: Sudarsana Reddy Kalluru @ 2018-05-06  1:43 UTC (permalink / raw)
  To: davem; +Cc: netdev, Ariel.Elior
In-Reply-To: <20180506014302.29110-1-sudarsana.kalluru@cavium.com>

This patch adds driver changes for supporting the Unified Fabric Port
(UFP). This is a new paritioning mode wherein MFW provides the set of
parameters to be used by the device such as traffic class, outer-vlan
tag value, priority type etc. Drivers receives this info via notifications
from mfw and configures the hardware accordingly.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed.h             | 20 ++++++
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c        | 14 +++-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 32 ++++++++--
 drivers/net/ethernet/qlogic/qed/qed_fcoe.c        |  3 +
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         | 28 ++++++++
 drivers/net/ethernet/qlogic/qed/qed_ll2.c         | 40 ++++++++----
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         | 78 +++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         |  8 +++
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  9 +++
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 57 ++++++++++++++++-
 drivers/scsi/qedf/qedf_fip.c                      |  7 +-
 drivers/scsi/qedf/qedf_main.c                     |  2 +-
 drivers/scsi/qedi/qedi_iscsi.c                    |  2 +-
 include/linux/qed/qed_ll2_if.h                    | 10 ++-
 14 files changed, 283 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index c8f3507..adcff49 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -474,6 +474,24 @@ enum qed_mf_mode_bit {
 	QED_MF_DSCP_TO_TC_MAP,
 };
 
+enum qed_ufp_mode {
+	QED_UFP_MODE_ETS,
+	QED_UFP_MODE_VNIC_BW,
+	QED_UFP_MODE_UNKNOWN
+};
+
+enum qed_ufp_pri_type {
+	QED_UFP_PRI_OS,
+	QED_UFP_PRI_VNIC,
+	QED_UFP_PRI_UNKNOWN
+};
+
+struct qed_ufp_info {
+	enum qed_ufp_pri_type pri_type;
+	enum qed_ufp_mode mode;
+	u8 tc;
+};
+
 enum BAR_ID {
 	BAR_ID_0,		/* used for GRC */
 	BAR_ID_1		/* Used for doorbells */
@@ -582,6 +600,8 @@ struct qed_hwfn {
 
 	struct qed_dcbx_info		*p_dcbx_info;
 
+	struct qed_ufp_info		ufp_info;
+
 	struct qed_dmae_info		dmae_info;
 
 	/* QM init */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index 449777f..8f31406 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -274,8 +274,8 @@ static bool qed_dcbx_roce_v2_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee)
 		     u32 pri_tc_tbl, int count, u8 dcbx_version)
 {
 	enum dcbx_protocol_type type;
+	bool enable, ieee, eth_tlv;
 	u8 tc, priority_map;
-	bool enable, ieee;
 	u16 protocol_id;
 	int priority;
 	int i;
@@ -283,6 +283,7 @@ static bool qed_dcbx_roce_v2_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee)
 	DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Num APP entries = %d\n", count);
 
 	ieee = (dcbx_version == DCBX_CONFIG_VERSION_IEEE);
+	eth_tlv = false;
 	/* Parse APP TLV */
 	for (i = 0; i < count; i++) {
 		protocol_id = QED_MFW_GET_FIELD(p_tbl[i].entry,
@@ -304,13 +305,22 @@ static bool qed_dcbx_roce_v2_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee)
 			 * indication, but we only got here if there was an
 			 * app tlv for the protocol, so dcbx must be enabled.
 			 */
-			enable = !(type == DCBX_PROTOCOL_ETH);
+			if (type == DCBX_PROTOCOL_ETH) {
+				enable = false;
+				eth_tlv = true;
+			} else {
+				enable = true;
+			}
 
 			qed_dcbx_update_app_info(p_data, p_hwfn, enable,
 						 priority, tc, type);
 		}
 	}
 
+	/* If Eth TLV is not detected, use UFP TC as default TC */
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) && !eth_tlv)
+		p_data->arr[DCBX_PROTOCOL_ETH].tc = p_hwfn->ufp_info.tc;
+
 	/* Update ramrod protocol data and hw_info fields
 	 * with default info when corresponding APP TLV's are not detected.
 	 * The enabled field has a different logic for ethernet as only for
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 95d00cb..5605289 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1499,6 +1499,11 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET, 1);
 		STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET,
 			     p_hwfn->hw_info.ovlan);
+
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "Configuring LLH_FUNC_FILTER_HDR_SEL\n");
+		STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET,
+			     1);
 	}
 
 	/* Enable classification by MAC if needed */
@@ -1635,6 +1640,7 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 	bool b_default_mtu = true;
 	struct qed_hwfn *p_hwfn;
 	int rc = 0, mfw_rc, i;
+	u16 ether_type;
 
 	if ((p_params->int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) {
 		DP_NOTICE(cdev, "MSI mode is not supported for CMT devices\n");
@@ -1668,16 +1674,22 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 		if (rc)
 			return rc;
 
-		if (IS_PF(cdev) && test_bit(QED_MF_8021AD_TAGGING,
-					    &cdev->mf_bits)) {
+		if (IS_PF(cdev) && (test_bit(QED_MF_8021Q_TAGGING,
+					     &cdev->mf_bits) ||
+				    test_bit(QED_MF_8021AD_TAGGING,
+					     &cdev->mf_bits))) {
+			if (test_bit(QED_MF_8021Q_TAGGING, &cdev->mf_bits))
+				ether_type = ETH_P_8021Q;
+			else
+				ether_type = ETH_P_8021AD;
 			STORE_RT_REG(p_hwfn, PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 			STORE_RT_REG(p_hwfn, NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 			STORE_RT_REG(p_hwfn, PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 			STORE_RT_REG(p_hwfn, DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 		}
 
 		qed_fill_load_req_params(&load_req_params,
@@ -2659,6 +2671,12 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
 			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
 			break;
+		case NVM_CFG1_GLOB_MF_MODE_UFP:
+			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS) |
+					BIT(QED_MF_LLH_PROTO_CLSS) |
+					BIT(QED_MF_UFP_SPECIFIC) |
+					BIT(QED_MF_8021Q_TAGGING);
+			break;
 		case NVM_CFG1_GLOB_MF_MODE_BD:
 			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS) |
 					BIT(QED_MF_LLH_PROTO_CLSS) |
@@ -2879,6 +2897,8 @@ static void qed_get_eee_caps(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		qed_mcp_cmd_port_init(p_hwfn, p_ptt);
 
 		qed_get_eee_caps(p_hwfn, p_ptt);
+
+		qed_mcp_read_ufp_config(p_hwfn, p_ptt);
 	}
 
 	if (qed_mcp_is_init(p_hwfn)) {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_fcoe.c b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
index 2dc9b31..cc1b373 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
@@ -313,6 +313,9 @@ struct qed_fcoe_conn {
 	p_data->d_id.addr_mid = p_conn->d_id.addr_mid;
 	p_data->d_id.addr_lo = p_conn->d_id.addr_lo;
 	p_data->flags = p_conn->flags;
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+		SET_FIELD(p_data->flags,
+			  FCOE_CONN_OFFLOAD_RAMROD_DATA_B_SINGLE_VLAN, 1);
 	p_data->def_q_idx = p_conn->def_q_idx;
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 7f5ec42..b5f70ef 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -11993,6 +11993,16 @@ struct public_port {
 #define EEE_REMOTE_TW_TX_OFFSET 0
 #define EEE_REMOTE_TW_RX_MASK   0xffff0000
 #define EEE_REMOTE_TW_RX_OFFSET 16
+
+	u32 oem_cfg_port;
+#define OEM_CFG_CHANNEL_TYPE_MASK                       0x00000003
+#define OEM_CFG_CHANNEL_TYPE_OFFSET                     0
+#define OEM_CFG_CHANNEL_TYPE_VLAN_PARTITION             0x1
+#define OEM_CFG_CHANNEL_TYPE_STAGGED                    0x2
+#define OEM_CFG_SCHED_TYPE_MASK                         0x0000000C
+#define OEM_CFG_SCHED_TYPE_OFFSET                       2
+#define OEM_CFG_SCHED_TYPE_ETS                          0x1
+#define OEM_CFG_SCHED_TYPE_VNIC_BW                      0x2
 };
 
 struct public_func {
@@ -12069,6 +12079,23 @@ struct public_func {
 #define DRV_ID_DRV_INIT_HW_MASK		0x80000000
 #define DRV_ID_DRV_INIT_HW_SHIFT	31
 #define DRV_ID_DRV_INIT_HW_FLAG		(1 << DRV_ID_DRV_INIT_HW_SHIFT)
+
+	u32 oem_cfg_func;
+#define OEM_CFG_FUNC_TC_MASK                    0x0000000F
+#define OEM_CFG_FUNC_TC_OFFSET                  0
+#define OEM_CFG_FUNC_TC_0                       0x0
+#define OEM_CFG_FUNC_TC_1                       0x1
+#define OEM_CFG_FUNC_TC_2                       0x2
+#define OEM_CFG_FUNC_TC_3                       0x3
+#define OEM_CFG_FUNC_TC_4                       0x4
+#define OEM_CFG_FUNC_TC_5                       0x5
+#define OEM_CFG_FUNC_TC_6                       0x6
+#define OEM_CFG_FUNC_TC_7                       0x7
+
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_MASK         0x00000030
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_OFFSET       4
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_VNIC         0x1
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_OS           0x2
 };
 
 struct mcp_mac {
@@ -12495,6 +12522,7 @@ enum MFW_DRV_MSG_TYPE {
 	MFW_DRV_MSG_BW_UPDATE10,
 	MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE,
 	MFW_DRV_MSG_BW_UPDATE11,
+	MFW_DRV_MSG_OEM_CFG_UPDATE,
 	MFW_DRV_MSG_MAX
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 3ac0ede..9f4f2cc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -919,6 +919,10 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg;
 	p_ramrod->inner_vlan_stripping_en =
 		p_ll2_conn->input.rx_vlan_removal_en;
+
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) &&
+	    p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE)
+		p_ramrod->report_outer_vlan = 1;
 	p_ramrod->queue_id = p_ll2_conn->queue_id;
 	p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0;
 
@@ -1493,11 +1497,12 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
 	qed_ll2_establish_connection_ooo(p_hwfn, p_ll2_conn);
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE) {
+		if (!test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+			qed_llh_add_protocol_filter(p_hwfn, p_ptt,
+						    ETH_P_FCOE, 0,
+						    QED_LLH_FILTER_ETHERTYPE);
 		qed_llh_add_protocol_filter(p_hwfn, p_ptt,
-					    0x8906, 0,
-					    QED_LLH_FILTER_ETHERTYPE);
-		qed_llh_add_protocol_filter(p_hwfn, p_ptt,
-					    0x8914, 0,
+					    ETH_P_FIP, 0,
 					    QED_LLH_FILTER_ETHERTYPE);
 	}
 
@@ -1653,11 +1658,16 @@ static void qed_ll2_prepare_tx_packet_set(struct qed_hwfn *p_hwfn,
 
 	start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain);
 	if (QED_IS_IWARP_PERSONALITY(p_hwfn) &&
-	    p_ll2->input.conn_type == QED_LL2_TYPE_OOO)
+	    p_ll2->input.conn_type == QED_LL2_TYPE_OOO) {
 		start_bd->nw_vlan_or_lb_echo =
 		    cpu_to_le16(IWARP_LL2_IN_ORDER_TX_QUEUE);
-	else
+	} else {
 		start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
+		if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) &&
+		    p_ll2->input.conn_type == QED_LL2_TYPE_FCOE)
+			pkt->remove_stag = true;
+	}
+
 	SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W,
 		  cpu_to_le16(pkt->l4_hdr_offset_w));
 	SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest);
@@ -1668,6 +1678,9 @@ static void qed_ll2_prepare_tx_packet_set(struct qed_hwfn *p_hwfn,
 	SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_CSUM, !!(pkt->enable_ip_cksum));
 	SET_FIELD(bd_data, CORE_TX_BD_DATA_L4_CSUM, !!(pkt->enable_l4_cksum));
 	SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_LEN, !!(pkt->calc_ip_len));
+	SET_FIELD(bd_data, CORE_TX_BD_DATA_DISABLE_STAG_INSERTION,
+		  !!(pkt->remove_stag));
+
 	start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data);
 	DMA_REGPAIR_LE(start_bd->addr, pkt->first_frag);
 	start_bd->nbytes = cpu_to_le16(pkt->first_frag_len);
@@ -1884,11 +1897,12 @@ int qed_ll2_terminate_connection(void *cxt, u8 connection_handle)
 		qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info);
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE) {
+		if (!test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+			qed_llh_remove_protocol_filter(p_hwfn, p_ptt,
+						       ETH_P_FCOE, 0,
+						      QED_LLH_FILTER_ETHERTYPE);
 		qed_llh_remove_protocol_filter(p_hwfn, p_ptt,
-					       0x8906, 0,
-					       QED_LLH_FILTER_ETHERTYPE);
-		qed_llh_remove_protocol_filter(p_hwfn, p_ptt,
-					       0x8914, 0,
+					       ETH_P_FIP, 0,
 					       QED_LLH_FILTER_ETHERTYPE);
 	}
 
@@ -2360,7 +2374,8 @@ static int qed_ll2_stop(struct qed_dev *cdev)
 	return -EINVAL;
 }
 
-static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb)
+static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb,
+			      unsigned long xmit_flags)
 {
 	struct qed_ll2_tx_pkt_info pkt;
 	const skb_frag_t *frag;
@@ -2405,6 +2420,9 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb)
 	pkt.first_frag = mapping;
 	pkt.first_frag_len = skb->len;
 	pkt.cookie = skb;
+	if (test_bit(QED_MF_UFP_SPECIFIC, &cdev->mf_bits) &&
+	    test_bit(QED_LL2_XMIT_FLAGS_FIP_DISCOVERY, &xmit_flags))
+		pkt.remove_stag = true;
 
 	rc = qed_ll2_prepare_tx_packet(&cdev->hwfns[0], cdev->ll2->handle,
 				       &pkt, 1);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 0550f0e..e80f5e7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -40,6 +40,7 @@
 #include <linux/string.h>
 #include <linux/etherdevice.h>
 #include "qed.h"
+#include "qed_cxt.h"
 #include "qed_dcbx.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
@@ -1486,6 +1487,80 @@ static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		    &resp, &param);
 }
 
+void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct public_func shmem_info;
+	u32 port_cfg, val;
+
+	if (!test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+		return;
+
+	memset(&p_hwfn->ufp_info, 0, sizeof(p_hwfn->ufp_info));
+	port_cfg = qed_rd(p_hwfn, p_ptt, p_hwfn->mcp_info->port_addr +
+			  offsetof(struct public_port, oem_cfg_port));
+	val = (port_cfg & OEM_CFG_CHANNEL_TYPE_MASK) >>
+		OEM_CFG_CHANNEL_TYPE_OFFSET;
+	if (val != OEM_CFG_CHANNEL_TYPE_STAGGED)
+		DP_NOTICE(p_hwfn, "Incorrect UFP Channel type  %d\n", val);
+
+	val = (port_cfg & OEM_CFG_SCHED_TYPE_MASK) >> OEM_CFG_SCHED_TYPE_OFFSET;
+	if (val == OEM_CFG_SCHED_TYPE_ETS) {
+		p_hwfn->ufp_info.mode = QED_UFP_MODE_ETS;
+	} else if (val == OEM_CFG_SCHED_TYPE_VNIC_BW) {
+		p_hwfn->ufp_info.mode = QED_UFP_MODE_VNIC_BW;
+	} else {
+		p_hwfn->ufp_info.mode = QED_UFP_MODE_UNKNOWN;
+		DP_NOTICE(p_hwfn, "Unknown UFP scheduling mode %d\n", val);
+	}
+
+	qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn));
+	val = (port_cfg & OEM_CFG_FUNC_TC_MASK) >> OEM_CFG_FUNC_TC_OFFSET;
+	p_hwfn->ufp_info.tc = (u8)val;
+	val = (port_cfg & OEM_CFG_FUNC_HOST_PRI_CTRL_MASK) >>
+		OEM_CFG_FUNC_HOST_PRI_CTRL_OFFSET;
+	if (val == OEM_CFG_FUNC_HOST_PRI_CTRL_VNIC) {
+		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_VNIC;
+	} else if (val == OEM_CFG_FUNC_HOST_PRI_CTRL_OS) {
+		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_OS;
+	} else {
+		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_UNKNOWN;
+		DP_NOTICE(p_hwfn, "Unknown Host priority control %d\n", val);
+	}
+
+	DP_NOTICE(p_hwfn,
+		  "UFP shmem config: mode = %d tc = %d pri_type = %d\n",
+		  p_hwfn->ufp_info.mode,
+		  p_hwfn->ufp_info.tc, p_hwfn->ufp_info.pri_type);
+}
+
+static int
+qed_mcp_handle_ufp_event(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	qed_mcp_read_ufp_config(p_hwfn, p_ptt);
+
+	if (p_hwfn->ufp_info.mode == QED_UFP_MODE_VNIC_BW) {
+		p_hwfn->qm_info.ooo_tc = p_hwfn->ufp_info.tc;
+		p_hwfn->hw_info.offload_tc = p_hwfn->ufp_info.tc;
+
+		qed_qm_reconf(p_hwfn, p_ptt);
+	} else if (p_hwfn->ufp_info.mode == QED_UFP_MODE_ETS) {
+		/* Merge UFP TC with the dcbx TC data */
+		qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+					  QED_DCBX_OPERATIONAL_MIB);
+	} else {
+		DP_ERR(p_hwfn, "Invalid sched type, discard the UFP config\n");
+		return -EINVAL;
+	}
+
+	/* update storm FW with negotiation results */
+	qed_sp_pf_update_ufp(p_hwfn);
+
+	/* update stag pcp value */
+	qed_sp_pf_update_stag(p_hwfn);
+
+	return 0;
+}
+
 int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt)
 {
@@ -1529,6 +1604,9 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
 						  QED_DCBX_OPERATIONAL_MIB);
 			break;
+		case MFW_DRV_MSG_OEM_CFG_UPDATE:
+			qed_mcp_handle_ufp_event(p_hwfn, p_ptt);
+			break;
 		case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE:
 			qed_mcp_handle_transceiver_change(p_hwfn, p_ptt);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 3af3896..250579b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -1005,6 +1005,14 @@ void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock,
 int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 
 /**
+ * @brief Read ufp config from the shared memory.
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ */
+void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+
+/**
  * @brief Populate the nvm info shadow in the given hardware function
  *
  * @param p_hwfn
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 7680222..e95431f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -462,6 +462,15 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
  * @return int
  */
 
+/**
+ * @brief qed_sp_pf_update_ufp - PF ufp update Ramrod
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+int qed_sp_pf_update_ufp(struct qed_hwfn *p_hwfn);
+
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn);
 
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 26bed26..8de644b4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -314,7 +314,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
 	int rc = -EINVAL;
-	u8 page_cnt;
+	u8 page_cnt, i;
 
 	/* update initial eq producer */
 	qed_eq_prod_update(p_hwfn,
@@ -345,12 +345,30 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		p_ramrod->mf_mode = MF_NPAR;
 
 	p_ramrod->outer_tag_config.outer_tag.tci =
-		cpu_to_le16(p_hwfn->hw_info.ovlan);
-	if (test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits)) {
+				cpu_to_le16(p_hwfn->hw_info.ovlan);
+	if (test_bit(QED_MF_8021Q_TAGGING, &p_hwfn->cdev->mf_bits)) {
+		p_ramrod->outer_tag_config.outer_tag.tpid = ETH_P_8021Q;
+	} else if (test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits)) {
 		p_ramrod->outer_tag_config.outer_tag.tpid = ETH_P_8021AD;
 		p_ramrod->outer_tag_config.enable_stag_pri_change = 1;
 	}
 
+	p_ramrod->outer_tag_config.pri_map_valid = 1;
+	for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++)
+		p_ramrod->outer_tag_config.inner_to_outer_pri_map[i] = i;
+
+	/* enable_stag_pri_change should be set if port is in BD mode or,
+	 * UFP with Host Control mode.
+	 */
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits)) {
+		if (p_hwfn->ufp_info.pri_type == QED_UFP_PRI_OS)
+			p_ramrod->outer_tag_config.enable_stag_pri_change = 1;
+		else
+			p_ramrod->outer_tag_config.enable_stag_pri_change = 0;
+
+		p_ramrod->outer_tag_config.outer_tag.tci |=
+		    cpu_to_le16(((u16)p_hwfn->ufp_info.tc << 13));
+	}
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
@@ -431,6 +449,39 @@ int qed_sp_pf_update(struct qed_hwfn *p_hwfn)
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
+int qed_sp_pf_update_ufp(struct qed_hwfn *p_hwfn)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EOPNOTSUPP;
+
+	if (p_hwfn->ufp_info.pri_type == QED_UFP_PRI_UNKNOWN) {
+		DP_INFO(p_hwfn, "Invalid priority type %d\n",
+			p_hwfn->ufp_info.pri_type);
+		return -EINVAL;
+	}
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_CB;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	p_ent->ramrod.pf_update.update_enable_stag_pri_change = true;
+	if (p_hwfn->ufp_info.pri_type == QED_UFP_PRI_OS)
+		p_ent->ramrod.pf_update.enable_stag_pri_change = 1;
+	else
+		p_ent->ramrod.pf_update.enable_stag_pri_change = 0;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 /* Set pf update ramrod command params */
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 			      struct qed_ptt *p_ptt,
diff --git a/drivers/scsi/qedf/qedf_fip.c b/drivers/scsi/qedf/qedf_fip.c
index 773558f..16d1a21 100644
--- a/drivers/scsi/qedf/qedf_fip.c
+++ b/drivers/scsi/qedf/qedf_fip.c
@@ -23,6 +23,7 @@ void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf)
 	struct fip_vlan *vlan;
 #define MY_FIP_ALL_FCF_MACS        ((__u8[6]) { 1, 0x10, 0x18, 1, 0, 2 })
 	static u8 my_fcoe_all_fcfs[ETH_ALEN] = MY_FIP_ALL_FCF_MACS;
+	unsigned long flags = 0;
 
 	skb = dev_alloc_skb(sizeof(struct fip_vlan));
 	if (!skb)
@@ -65,7 +66,9 @@ void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf)
 		kfree_skb(skb);
 		return;
 	}
-	qed_ops->ll2->start_xmit(qedf->cdev, skb);
+
+	set_bit(QED_LL2_XMIT_FLAGS_FIP_DISCOVERY, &flags);
+	qed_ops->ll2->start_xmit(qedf->cdev, skb, flags);
 }
 
 static void qedf_fcoe_process_vlan_resp(struct qedf_ctx *qedf,
@@ -139,7 +142,7 @@ void qedf_fip_send(struct fcoe_ctlr *fip, struct sk_buff *skb)
 		print_hex_dump(KERN_WARNING, "fip ", DUMP_PREFIX_OFFSET, 16, 1,
 		    skb->data, skb->len, false);
 
-	qed_ops->ll2->start_xmit(qedf->cdev, skb);
+	qed_ops->ll2->start_xmit(qedf->cdev, skb, 0);
 }
 
 /* Process incoming FIP frames. */
diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c
index 284ccb5..6c19015 100644
--- a/drivers/scsi/qedf/qedf_main.c
+++ b/drivers/scsi/qedf/qedf_main.c
@@ -994,7 +994,7 @@ static int qedf_xmit(struct fc_lport *lport, struct fc_frame *fp)
 	if (qedf_dump_frames)
 		print_hex_dump(KERN_WARNING, "fcoe: ", DUMP_PREFIX_OFFSET, 16,
 		    1, skb->data, skb->len, false);
-	qed_ops->ll2->start_xmit(qedf->cdev, skb);
+	qed_ops->ll2->start_xmit(qedf->cdev, skb, 0);
 
 	return 0;
 }
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index 7ec7f6e..ff21aa1 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -1150,7 +1150,7 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid)
 	if (vlanid)
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
 
-	rc = qedi_ops->ll2->start_xmit(cdev, skb);
+	rc = qedi_ops->ll2->start_xmit(cdev, skb, 0);
 	if (rc) {
 		QEDI_ERR(&qedi->dbg_ctx, "ll2 start_xmit returned %d\n",
 			 rc);
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 266c1fb..5eb0229 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -202,6 +202,7 @@ struct qed_ll2_tx_pkt_info {
 	bool enable_ip_cksum;
 	bool enable_l4_cksum;
 	bool calc_ip_len;
+	bool remove_stag;
 };
 
 #define QED_LL2_UNUSED_HANDLE   (0xff)
@@ -220,6 +221,11 @@ struct qed_ll2_params {
 	u8 ll2_mac_address[ETH_ALEN];
 };
 
+enum qed_ll2_xmit_flags {
+	/* FIP discovery packet */
+	QED_LL2_XMIT_FLAGS_FIP_DISCOVERY
+};
+
 struct qed_ll2_ops {
 /**
  * @brief start - initializes ll2
@@ -245,10 +251,12 @@ struct qed_ll2_ops {
  *
  * @param cdev
  * @param skb
+ * @param xmit_flags - Transmit options defined by the enum qed_ll2_xmit_flags.
  *
  * @return 0 on success, otherwise error value.
  */
-	int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb);
+	int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb,
+			  unsigned long xmit_flags);
 
 /**
  * @brief register_cb_ops - protocol driver register the callback for Rx/Tx
-- 
1.8.3.1

^ permalink raw reply related

* Re: [Intel-wired-lan] [PATCH v2 1/1] drivers core: multi-threading device shutdown
From: kbuild test robot @ 2018-05-06  1:56 UTC (permalink / raw)
  To: Pavel Tatashin
  Cc: kbuild-all, pasha.tatashin, steven.sistare, daniel.m.jordan,
	linux-kernel, jeffrey.t.kirsher, intel-wired-lan, netdev, gregkh,
	alexander.duyck, tobin
In-Reply-To: <20180505154040.28614-2-pasha.tatashin@oracle.com>

Hi Pavel,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on driver-core/driver-core-testing]
[also build test WARNING on v4.17-rc3 next-20180504]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Pavel-Tatashin/multi-threading-device-shutdown/20180506-035150
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> drivers/base/core.c:2877:25: sparse: symbol 'device_root_tasks_done' was not declared. Should it be static?
   drivers/base/core.c:62:5: sparse: context imbalance in 'device_links_read_lock' - wrong count at exit
   drivers/base/core.c:67:6: sparse: context imbalance in 'device_links_read_unlock' - unexpected unlock

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* [RFC PATCH] drivers core: device_root_tasks_done can be static
From: kbuild test robot @ 2018-05-06  1:56 UTC (permalink / raw)
  To: Pavel Tatashin
  Cc: kbuild-all, pasha.tatashin, steven.sistare, daniel.m.jordan,
	linux-kernel, jeffrey.t.kirsher, intel-wired-lan, netdev, gregkh,
	alexander.duyck, tobin
In-Reply-To: <20180505154040.28614-2-pasha.tatashin@oracle.com>


Fixes: 6d54ba128905 ("drivers core: multi-threading device shutdown")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
 core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 110d797..666c163 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2874,7 +2874,7 @@ static int device_shutdown_child_task(void *data);
  */
 static atomic_t		device_root_tasks_finished;
 static atomic_t		device_root_tasks_started;
-struct completion	device_root_tasks_done;
+static struct completion	device_root_tasks_done;
 
 /**
  * Shutdown device tree with root started in dev. If dev has no children

^ permalink raw reply related

* (unknown), 
From: Shane Missler @ 2018-05-05 22:07 UTC (permalink / raw)




Hallo, Sie haben eine Spende von 5.800.000,00EUR, Am Shane Missler der Gewinner des Powerball-Jackpots im Wert von $ 451 Millionen. Ich spende einen Teil meiner Lotterie an fünf glückliche Menschen und Wohltätigkeitseinrichtungen zum Gedenken an meine verstorbene Mutter, die an Krebs gestorben ist. Kontaktieren Sie mich für weitere Informationen unter: shanemissler84@gmail.com 

^ permalink raw reply

* Re: [PATCH 4/8] rhashtable: fix race in nested_table_alloc()
From: Herbert Xu @ 2018-05-06  5:18 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <871sepepuj.fsf@notabene.neil.brown.name>

On Sun, May 06, 2018 at 07:48:20AM +1000, NeilBrown wrote:
>
> The spinlock protects 2 or more buckets.  The nested table contains at
> least 512 buckets, maybe more.
> It is quite possible for two insertions into 2 different buckets to both
> get their spinlock and both try to instantiate the same nested table.

I think you missed the fact that when we use nested tables the spin
lock table is limited to just a single page and hence corresponds
to the first level in the nested table.  Therefore it's always safe.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 8/8] rhashtable: don't hold lock on first table throughout insertion.
From: Herbert Xu @ 2018-05-06  5:20 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <87sh75dapa.fsf@notabene.neil.brown.name>

On Sun, May 06, 2018 at 08:00:49AM +1000, NeilBrown wrote:
>
> The insert function must (and does) take the lock on the bucket before
> testing if there is a "next" table.
> If one inserter finds that it has locked the "last" table (because there
> is no next) and successfully inserts, then the other inserter cannot
> have locked that table yet, else it would have inserted.  When it does,
> it will find what the first inserter inserted. 

If you release the lock to the first table then it may be deleted
by the resize thread.  Hence the other inserter may not have even
started from the same place.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] net/mlx5: Fix mlx5_get_vector_affinity function
From: Thomas Gleixner @ 2018-05-06  7:33 UTC (permalink / raw)
  To: Guenter Roeck
  Cc: Israel Rukshin, Max Gurtovoy, Matan Barak, Doug Ledford,
	linux-rdma, linux-kernel, netdev
In-Reply-To: <20180505143838.GA12621@roeck-us.net>

[-- Attachment #1: Type: text/plain, Size: 2165 bytes --]

On Sat, 5 May 2018, Guenter Roeck wrote:

> On Thu, Apr 12, 2018 at 09:49:11AM +0000, Israel Rukshin wrote:
> > Adding the vector offset when calling to mlx5_vector2eqn() is wrong.
> > This is because mlx5_vector2eqn() checks if EQ index is equal to vector number
> > and the fact that the internal completion vectors that mlx5 allocates
> > don't get an EQ index.
> > 
> > The second problem here is that using effective_affinity_mask gives the same
> > CPU for different vectors.
> > This leads to unmapped queues when calling it from blk_mq_rdma_map_queues().
> > This doesn't happen when using affinity_hint mask.
> > 
> Except that affinity_hint is only defined if SMP is enabled. Without:
> 
> include/linux/mlx5/driver.h: In function ‘mlx5_get_vector_affinity_hint’:
> include/linux/mlx5/driver.h:1299:13: error:
>         ‘struct irq_desc’ has no member named ‘affinity_hint’
> 
> Note that this is the only use of affinity_hint outside kernel/irq.
> Don't other drivers have similar problems ?

Aside of that.

> >  static inline const struct cpumask *
> > -mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector)
> > +mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector)
> >  {
> > -	const struct cpumask *mask;
> >  	struct irq_desc *desc;
> >  	unsigned int irq;
> >  	int eqn;
> >  	int err;
> >  
> > -	err = mlx5_vector2eqn(dev, MLX5_EQ_VEC_COMP_BASE + vector, &eqn, &irq);
> > +	err = mlx5_vector2eqn(dev, vector, &eqn, &irq);
> >  	if (err)
> >  		return NULL;
> >  
> >  	desc = irq_to_desc(irq);
> > -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
> > -	mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
> > -#else
> > -	mask = desc->irq_common_data.affinity;
> > -#endif
> > -	return mask;
> > +	return desc->affinity_hint;

NAK.

Nothing in regular device drivers is supposed to ever fiddle with struct
irq_desc. The existing code is already a violation of that rule and needs
to be fixed, but not in that way.

The logic here is completely screwed. affinity_hint is set by the driver,
so the driver already knows what it is. If the driver does not set it, then
the thing is NULL.

Thanks,

	tglx


^ permalink raw reply

* Re: [PATCH] net/mlx5: Fix mlx5_get_vector_affinity function
From: Thomas Gleixner @ 2018-05-06  7:43 UTC (permalink / raw)
  To: Guenter Roeck
  Cc: Israel Rukshin, Max Gurtovoy, Matan Barak, Doug Ledford,
	linux-rdma, linux-kernel, netdev
In-Reply-To: <alpine.DEB.2.21.1805060929540.1685@nanos.tec.linutronix.de>

On Sun, 6 May 2018, Thomas Gleixner wrote:
> On Sat, 5 May 2018, Guenter Roeck wrote:
> > > -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
> > > -	mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
> > > -#else
> > > -	mask = desc->irq_common_data.affinity;
> > > -#endif
> > > -	return mask;
> > > +	return desc->affinity_hint;
> 
> NAK.
> 
> Nothing in regular device drivers is supposed to ever fiddle with struct
> irq_desc. The existing code is already a violation of that rule and needs
> to be fixed, but not in that way.
> 
> The logic here is completely screwed. affinity_hint is set by the driver,
> so the driver already knows what it is. If the driver does not set it, then
> the thing is NULL.

And this completely insane fiddling with irq_desc is in MLX4 as
well. Dammit, why can't people respect subsytem boundaries and just fiddle
in everything just because they can? If there is something missing at the
core level then please talk to the maintainers instead of hacking utter
crap into your driver.

Yours grumpy

      tglx

^ permalink raw reply

* [PATCH] mwifiex: delete unneeded include
From: Julia Lawall @ 2018-05-06  7:53 UTC (permalink / raw)
  To: Amitkumar Karwar
  Cc: kernel-janitors, Nishant Sarmukadam, Ganapathi Bhat, Xinming Hu,
	Kalle Valo, David S. Miller, linux-wireless, netdev, linux-kernel

Nothing that is defined in 11ac.h is referenced in cmdevt.c.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>

---
 drivers/net/wireless/marvell/mwifiex/cmdevt.c |    1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
index 7014f44..9cfcdf6 100644
--- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c
+++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
@@ -25,7 +25,6 @@
 #include "main.h"
 #include "wmm.h"
 #include "11n.h"
-#include "11ac.h"
 
 static void mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter);
 

^ permalink raw reply related

* [PATCH 0/9] tree-wide: fix typo 'can by' to 'can be'
From: Wolfram Sang @ 2018-05-06 11:23 UTC (permalink / raw)
  To: trivial
  Cc: linux-renesas-soc, Wolfram Sang, devicetree, linux-hwmon,
	linux-i2c, linux-input, linux-kernel, linux-ntb, linuxppc-dev,
	netdev, reiserfs-devel

I found this kind of typo when reading the documentation for device_remove().
So, I checked the tree for it.

CCing all the subsystems directly, and I'd think the leftover ones could be
picked up by the trivial tree. Or would it be more convenient if trivial would
pick up all? I don't mind.

Based on v4.17-rc3.

Wolfram Sang (9):
  dt-bindings: i2c: fix typo 'can by' to 'can be'
  powerpc/watchdog: fix typo 'can by' to 'can be'
  base: core: fix typo 'can by' to 'can be'
  hwmon: fschmd: fix typo 'can by' to 'can be'
  input: ati_remote2: fix typo 'can by' to 'can be'
  NTB: ntb_hw_idt: fix typo 'can by' to 'can be'
  reiserfs: journal: fix typo 'can by' to 'can be'
  net: flow_dissector: fix typo 'can by' to 'can be'
  objtool: fix typo 'can by' to 'can be'

 Documentation/devicetree/bindings/i2c/i2c-davinci.txt | 2 +-
 arch/powerpc/kernel/watchdog.c                        | 2 +-
 drivers/base/core.c                                   | 2 +-
 drivers/hwmon/fschmd.c                                | 2 +-
 drivers/input/misc/ati_remote2.c                      | 2 +-
 drivers/ntb/hw/idt/ntb_hw_idt.c                       | 2 +-
 fs/reiserfs/journal.c                                 | 2 +-
 include/net/flow_dissector.h                          | 2 +-
 tools/objtool/Documentation/stack-validation.txt      | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH 8/9] net: flow_dissector: fix typo 'can by' to 'can be'
From: Wolfram Sang @ 2018-05-06 11:23 UTC (permalink / raw)
  To: trivial
  Cc: linux-renesas-soc, Wolfram Sang, David S. Miller, netdev,
	linux-kernel
In-Reply-To: <20180506112404.24872-1-wsa+renesas@sang-engineering.com>

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/net/flow_dissector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 9a074776f70b66..d1fcf2442a423b 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -251,7 +251,7 @@ extern struct flow_dissector flow_keys_buf_dissector;
  * This structure is used to hold a digest of the full flow keys. This is a
  * larger "hash" of a flow to allow definitively matching specific flows where
  * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
- * that it can by used in CB of skb (see sch_choke for an example).
+ * that it can be used in CB of skb (see sch_choke for an example).
  */
 #define FLOW_KEYS_DIGEST_LEN	16
 struct flow_keys_digest {
-- 
2.11.0

^ permalink raw reply related

* Locking in network code
From: Jacob S. Moroni @ 2018-05-06 13:43 UTC (permalink / raw)
  To: netdev

Hello,

I have a stupid question regarding which variant of spin_lock to use
throughout the network stack, and inside RX handlers specifically.

It's my understanding that skbuffs are normally passed into the stack
from soft IRQ context if the device is using NAPI, and hard IRQ
context if it's not using NAPI (and I guess process context too if the
driver does it's own workqueue thing). 

So, that means that handlers registered with netdev_rx_handler_register
may end up being called from any context.

However, the RX handler in the macvlan code calls ip_check_defrag,
which could eventually lead to a call to ip_defrag, which ends
up taking a regular spin_lock around the call to ip_frag_queue.

Is this a risk of deadlock, and if not, why?

What if you're running a system with one CPU and a packet fragment
arrives on a NAPI interface, then, while the spin_lock is held,
another fragment somehow arrives on another interface which does
its processing in hard IRQ context?

-- 
  Jacob S. Moroni
  mail@jakemoroni.com

^ permalink raw reply

* [PATCH bpf-next v3 0/6] ipv6: sr: introduce seg6local End.BPF action
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov

As of Linux 4.14, it is possible to define advanced local processing for
IPv6 packets with a Segment Routing Header through the seg6local LWT
infrastructure. This LWT implements the network programming principles
defined in the IETF “SRv6 Network Programming” draft.

The implemented operations are generic, and it would be very interesting to
be able to implement user-specific seg6local actions, without having to
modify the kernel directly. To do so, this patchset adds an End.BPF action
to seg6local, powered by some specific Segment Routing-related helpers,
which provide SR functionalities that can be applied on the packet. This
BPF hook would then allow to implement specific actions at native kernel
speed such as OAM features, advanced SR SDN policies, SRv6 actions like
Segment Routing Header (SRH) encapsulation depending on the content of
the packet, etc ... 

This patchset is divided in 6 patches, whose main features are :

- A new seg6local action End.BPF with the corresponding new BPF program
  type BPF_PROG_TYPE_LWT_SEG6LOCAL. Such attached BPF program can be
  passed to the LWT seg6local through netlink, the same way as the LWT
  BPF hook operates.
- 3 new BPF helpers for the seg6local BPF hook, allowing to edit/grow/
  shrink a SRH and apply on a packet some of the generic SRv6 actions.
- 1 new BPF helper for the LWT BPF IN hook, allowing to add a SRH through
  encapsulation (via IPv6 encapsulation or inlining if the packet contains
  already an IPv6 header).

As this patchset adds a new LWT BPF hook, I took into account the result of
the discussions when the LWT BPF infrastructure got merged. Hence, the
seg6local BPF hook doesn’t allow write access to skb->data directly, only
the SRH can be modified through specific helpers, which ensures that the
integrity of the packet is maintained.
More details are available in the related patches messages.

The performances of this BPF hook have been assessed with the BPF JIT
enabled on a Intel Xeon X3440 processors with 4 cores and 8 threads
clocked at 2.53 GHz. No throughput losses are noted with the seg6local
BPF hook when the BPF program does nothing (440kpps). Adding a 8-bytes
TLV (1 call each to bpf_lwt_seg6_adjust_srh and bpf_lwt_seg6_store_bytes)
drops the throughput to 410kpps, and inlining a SRH via
bpf_lwt_seg6_action drops the throughput to 420kpps.
All throughputs are stable.

-------
v2: move the SRH integrity state from skb->cb to a per-cpu buffer
v3: - document helpers in man-page style
    - fix kbuild bugs
    - un-break BPF LWT out hook
    - bpf_push_seg6_encap is now static
    - preempt_enable is now called when the packet is dropped in
      input_action_end_bpf

Thanks.

Mathieu Xhonneux (6):
  ipv6: sr: make seg6.h includable without IPv6
  ipv6: sr: export function lookup_nexthop
  bpf: Add IPv6 Segment Routing helpers
  bpf: Split lwt inout verifier structures
  ipv6: sr: Add seg6local action End.BPF
  selftests/bpf: test for seg6local End.BPF action

 include/linux/bpf_types.h                         |   7 +-
 include/net/seg6.h                                |   7 +-
 include/net/seg6_local.h                          |  32 ++
 include/uapi/linux/bpf.h                          |  96 ++++-
 include/uapi/linux/seg6_local.h                   |   3 +
 kernel/bpf/verifier.c                             |   1 +
 net/core/filter.c                                 | 390 ++++++++++++++++---
 net/ipv6/seg6_local.c                             | 180 ++++++++-
 tools/include/uapi/linux/bpf.h                    |  97 ++++-
 tools/testing/selftests/bpf/Makefile              |   5 +-
 tools/testing/selftests/bpf/bpf_helpers.h         |  12 +
 tools/testing/selftests/bpf/test_lwt_seg6local.c  | 438 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_lwt_seg6local.sh | 140 +++++++
 13 files changed, 1335 insertions(+), 73 deletions(-)
 create mode 100644 include/net/seg6_local.h
 create mode 100644 tools/testing/selftests/bpf/test_lwt_seg6local.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_seg6local.sh

-- 
2.16.1

^ permalink raw reply

* [PATCH bpf-next v3 1/6] ipv6: sr: make seg6.h includable without IPv6
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov
In-Reply-To: <cover.1525626429.git.m.xhonneux@gmail.com>

include/net/seg6.h cannot be included in a source file if CONFIG_IPV6 is
not enabled:
   include/net/seg6.h: In function 'seg6_pernet':
>> include/net/seg6.h:52:14: error: 'struct net' has no member named
                                        'ipv6'; did you mean 'ipv4'?
     return net->ipv6.seg6_data;
                 ^~~~
                 ipv4

This commit makes seg6_pernet return NULL if IPv6 is not compiled, hence
allowing seg6.h to be included regardless of the configuration.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
---
 include/net/seg6.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/net/seg6.h b/include/net/seg6.h
index 099bad59dc90..70b4cfac52d7 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -49,7 +49,11 @@ struct seg6_pernet_data {
 
 static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
 {
+#if IS_ENABLED(CONFIG_IPV6)
 	return net->ipv6.seg6_data;
+#else
+	return NULL;
+#endif
 }
 
 extern int seg6_init(void);
-- 
2.16.1

^ permalink raw reply related

* [PATCH bpf-next v3 2/6] ipv6: sr: export function lookup_nexthop
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov
In-Reply-To: <cover.1525626429.git.m.xhonneux@gmail.com>

The function lookup_nexthop is essential to implement most of the seg6local
actions. As we want to provide a BPF helper allowing to apply some of these
actions on the packet being processed, the helper should be able to call
this function, hence the need to make it public.

Moreover, if one argument is incorrect or if the next hop can not be found,
an error should be returned by the BPF helper so the BPF program can adapt
its processing of the packet (return an error, properly force the drop,
...). This patch hence makes this function return dst->error to indicate a
possible error.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
---
 include/net/seg6.h       |  3 ++-
 include/net/seg6_local.h | 24 ++++++++++++++++++++++++
 net/ipv6/seg6_local.c    | 20 +++++++++++---------
 3 files changed, 37 insertions(+), 10 deletions(-)
 create mode 100644 include/net/seg6_local.h

diff --git a/include/net/seg6.h b/include/net/seg6.h
index 70b4cfac52d7..e029e301faa5 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -67,5 +67,6 @@ extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
 extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
 			     int proto);
 extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh);
-
+extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			       u32 tbl_id);
 #endif
diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
new file mode 100644
index 000000000000..57498b23085d
--- /dev/null
+++ b/include/net/seg6_local.h
@@ -0,0 +1,24 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Authors:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *  eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_LOCAL_H
+#define _NET_SEG6_LOCAL_H
+
+#include <linux/net.h>
+#include <linux/ipv6.h>
+
+extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			       u32 tbl_id);
+
+#endif
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 45722327375a..e9b23fb924ad 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -30,6 +30,7 @@
 #ifdef CONFIG_IPV6_SEG6_HMAC
 #include <net/seg6_hmac.h>
 #endif
+#include <net/seg6_local.h>
 #include <linux/etherdevice.h>
 
 struct seg6_local_lwt;
@@ -140,8 +141,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
 	*daddr = *addr;
 }
 
-static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
-			   u32 tbl_id)
+int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			u32 tbl_id)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -187,6 +188,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 
 	skb_dst_drop(skb);
 	skb_dst_set(skb, dst);
+	return dst->error;
 }
 
 /* regular endpoint function */
@@ -200,7 +202,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 
 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	lookup_nexthop(skb, NULL, 0);
+	seg6_lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
@@ -220,7 +222,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 
 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	lookup_nexthop(skb, &slwt->nh6, 0);
+	seg6_lookup_nexthop(skb, &slwt->nh6, 0);
 
 	return dst_input(skb);
 
@@ -239,7 +241,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 
 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	lookup_nexthop(skb, NULL, slwt->table);
+	seg6_lookup_nexthop(skb, NULL, slwt->table);
 
 	return dst_input(skb);
 
@@ -331,7 +333,7 @@ static int input_action_end_dx6(struct sk_buff *skb,
 	if (!ipv6_addr_any(&slwt->nh6))
 		nhaddr = &slwt->nh6;
 
-	lookup_nexthop(skb, nhaddr, 0);
+	seg6_lookup_nexthop(skb, nhaddr, 0);
 
 	return dst_input(skb);
 drop:
@@ -380,7 +382,7 @@ static int input_action_end_dt6(struct sk_buff *skb,
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	lookup_nexthop(skb, NULL, slwt->table);
+	seg6_lookup_nexthop(skb, NULL, slwt->table);
 
 	return dst_input(skb);
 
@@ -406,7 +408,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
-	lookup_nexthop(skb, NULL, 0);
+	seg6_lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
@@ -438,7 +440,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
-	lookup_nexthop(skb, NULL, 0);
+	seg6_lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
-- 
2.16.1

^ permalink raw reply related

* [PATCH bpf-next v3 3/6] bpf: Add IPv6 Segment Routing helpers
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov
In-Reply-To: <cover.1525626429.git.m.xhonneux@gmail.com>

The BPF seg6local hook should be powerful enough to enable users to
implement most of the use-cases one could think of. After some thinking,
we figured out that the following actions should be possible on a SRv6
packet, requiring 3 specific helpers :
    - bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH
    - bpf_lwt_seg6_adjust_srh: Allow to grow or shrink a SRH
                               (to add/delete TLVs)
    - bpf_lwt_seg6_action: Apply some SRv6 network programming actions
                           (specifically End.X, End.T, End.B6 and
                            End.B6.Encap)

The specifications of these helpers are provided in the patch (see
include/uapi/linux/bpf.h).

The non-sensitive fields of the SRH are the following : flags, tag and
TLVs. The other fields can not be modified, to maintain the SRH
integrity. Flags, tag and TLVs can easily be modified as their validity
can be checked afterwards via seg6_validate_srh. It is not allowed to
modify the segments directly. If one wants to add segments on the path,
he should stack a new SRH using the End.B6 action via
bpf_lwt_seg6_action.

Growing, shrinking or editing TLVs via the helpers will flag the SRH as
invalid, and it will have to be re-validated before re-entering the IPv6
layer. This flag is stored in a per-CPU buffer, along with the current
header length in bytes.

Storing the SRH len in bytes in the control block is mandatory when using
bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH
len rounded to 8 bytes (a padding TLV can be inserted to ensure the 8-bytes
boundary). When adding/deleting TLVs within the BPF program, the SRH may
temporary be in an invalid state where its length cannot be rounded to 8
bytes without remainder, hence the need to store the length in bytes
separately. The caller of the BPF program can then ensure that the SRH's
final length is valid using this value. Again, a final SRH modified by a
BPF program which doesn’t respect the 8-bytes boundary will be discarded
as it will be considered as invalid.

Finally, a fourth helper is provided, bpf_lwt_push_encap, which is
available from the LWT BPF IN hook, but not from the seg6local BPF one.
This helper allows to encapsulate a Segment Routing Header (either with
a new outer IPv6 header, or by inlining it directly in the existing IPv6
header) into a non-SRv6 packet. This helper is required if we want to
offer the possibility to dynamically encapsulate a SRH for non-SRv6 packet,
as the BPF seg6local hook only works on traffic already containing a SRH.
This is the BPF equivalent of the seg6 LWT infrastructure, which achieves
the same purpose but with a static SRH per route.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
---
 include/net/seg6_local.h |   8 ++
 include/uapi/linux/bpf.h |  95 +++++++++++++++-
 net/core/filter.c        | 282 +++++++++++++++++++++++++++++++++++++++++++----
 net/ipv6/seg6_local.c    |   2 +
 4 files changed, 363 insertions(+), 24 deletions(-)

diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
index 57498b23085d..661fd5b4d3e0 100644
--- a/include/net/seg6_local.h
+++ b/include/net/seg6_local.h
@@ -15,10 +15,18 @@
 #ifndef _NET_SEG6_LOCAL_H
 #define _NET_SEG6_LOCAL_H
 
+#include <linux/percpu.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
 
 extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 			       u32 tbl_id);
 
+struct seg6_bpf_srh_state {
+	bool valid;
+	u16 hdrlen;
+};
+
+DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 93d5a4eeec2a..df14a31500eb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1826,6 +1826,89 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1896,7 +1979,11 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),    \
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1963,6 +2050,12 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index 6877426c23a6..5a0c03ec22ac 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -60,6 +60,10 @@
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3322,28 +3326,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
-bool bpf_helper_changes_pkt_data(void *func)
-{
-	if (func == bpf_skb_vlan_push ||
-	    func == bpf_skb_vlan_pop ||
-	    func == bpf_skb_store_bytes ||
-	    func == bpf_skb_change_proto ||
-	    func == bpf_skb_change_head ||
-	    func == bpf_skb_change_tail ||
-	    func == bpf_skb_adjust_room ||
-	    func == bpf_skb_pull_data ||
-	    func == bpf_clone_redirect ||
-	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta ||
-	    func == bpf_msg_pull_data ||
-	    func == bpf_xdp_adjust_tail)
-		return true;
-
-	return false;
-}
-
 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
 				  unsigned long off, unsigned long len)
 {
@@ -4032,6 +4014,261 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL)
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+	int err;
+	struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+	if (!seg6_validate_srh(srh, len))
+		return -EINVAL;
+
+	switch (type) {
+	case BPF_LWT_ENCAP_SEG6_INLINE:
+		if (skb->protocol != htons(ETH_P_IPV6))
+			return -EBADMSG;
+
+		err = seg6_do_srh_inline(skb, srh);
+		break;
+	case BPF_LWT_ENCAP_SEG6:
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+		err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (err)
+		return err;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_LWTUNNEL */
+
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+	   u32, len)
+{
+	switch (type) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL)
+	case BPF_LWT_ENCAP_SEG6:
+	case BPF_LWT_ENCAP_SEG6_INLINE:
+		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
+	.func		= bpf_lwt_push_encap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+	   const void *, from, u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_tlvs, *srh_end, *ptr;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+	srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
+
+	ptr = skb->data + offset;
+	if (ptr >= srh_tlvs && ptr + len <= srh_end)
+		srh_state->valid = 0;
+	else if (ptr < (void *)&srh->flags ||
+		 ptr + len > (void *)&srh->segments)
+		return -EFAULT;
+
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
+		return -EFAULT;
+
+	memcpy(ptr, from, len);
+	return 0;
+#else /* CONFIG_IPV6_SEG6_LWTUNNEL */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+	.func		= bpf_lwt_seg6_store_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+	   u32, action, void *, param, u32, param_len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int err;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	if (!srh_state->valid) {
+		if (unlikely((srh_state->hdrlen & 7) != 0))
+			return -EBADMSG;
+
+		srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
+		if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+			return -EBADMSG;
+
+		srh_state->valid = 1;
+	}
+
+	switch (action) {
+	case SEG6_LOCAL_ACTION_END_X:
+		if (param_len != sizeof(struct in6_addr))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+	case SEG6_LOCAL_ACTION_END_T:
+		if (param_len != sizeof(int))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+	case SEG6_LOCAL_ACTION_END_B6:
+		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
+					  param, param_len);
+		if (!err)
+			srh_state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
+					  param, param_len);
+		if (!err)
+			srh_state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	default:
+		return -EINVAL;
+	}
+#else /* CONFIG_IPV6_SEG6_LWTUNNEL */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+	.func		= bpf_lwt_seg6_action,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+	   s32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_end, *srh_tlvs, *ptr;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6hdr *hdr;
+	int srhoff = 0;
+	int ret;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
+			((srh->first_segment + 1) << 4));
+	srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
+			srh_state->hdrlen);
+	ptr = skb->data + offset;
+
+	if (unlikely(ptr < srh_tlvs || ptr > srh_end))
+		return -EFAULT;
+	if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
+		return -EFAULT;
+
+	if (len > 0) {
+		ret = skb_cow_head(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		ret = bpf_skb_net_hdr_push(skb, offset, len);
+	} else {
+		ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+	}
+	if (unlikely(ret < 0))
+		return ret;
+
+	hdr = (struct ipv6hdr *)skb->data;
+	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+	bpf_compute_data_pointers(skb);
+	srh_state->hdrlen += len;
+	srh_state->valid = 0;
+	return 0;
+#else /* CONFIG_IPV6_SEG6_LWTUNNEL */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
+	.func		= bpf_lwt_seg6_adjust_srh,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
+{
+	if (func == bpf_skb_vlan_push ||
+	    func == bpf_skb_vlan_pop ||
+	    func == bpf_skb_store_bytes ||
+	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_head ||
+	    func == bpf_skb_change_tail ||
+	    func == bpf_skb_adjust_room ||
+	    func == bpf_skb_pull_data ||
+	    func == bpf_clone_redirect ||
+	    func == bpf_l3_csum_replace ||
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data ||
+	    func == bpf_xdp_adjust_tail ||
+	    func == bpf_lwt_push_encap ||
+	    func == bpf_lwt_seg6_store_bytes ||
+	    func == bpf_lwt_seg6_adjust_srh ||
+	    func == bpf_lwt_seg6_action
+	    )
+		return true;
+
+	return false;
+}
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4436,7 +4673,6 @@ static bool lwt_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
-
 /* Attach type specific accesses */
 static bool __sock_filter_check_attach_type(int off,
 					    enum bpf_access_type access_type,
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index e9b23fb924ad..ae68c1ef8fb0 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -449,6 +449,8 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 	return err;
 }
 
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
-- 
2.16.1

^ permalink raw reply related

* [PATCH bpf-next v3 4/6] bpf: Split lwt inout verifier structures
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov
In-Reply-To: <cover.1525626429.git.m.xhonneux@gmail.com>

The new bpf_lwt_push_encap helper should only be accessible within the
LWT BPF IN hook, and not the OUT one, as this may lead to a skb under
panic.

At the moment, both LWT BPF IN and OUT share the same list of helpers,
whose calls are authorized by the verifier. This patch separates the
verifier ops for the IN and OUT hooks, and allows the IN hook to call the
bpf_lwt_push_encap helper.

This patch is also the occasion to put all lwt_*_func_proto functions
together for clarity. At the moment, socks_op_func_proto is in the middle
of lwt_inout_func_proto and lwt_xmit_func_proto.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
---
 include/linux/bpf_types.h |  4 +--
 net/core/filter.c         | 83 +++++++++++++++++++++++++++++------------------
 2 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d7df1b323082..cc9d7e031330 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -9,8 +9,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 5a0c03ec22ac..2aa83e0f40ce 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4448,33 +4448,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
-static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
-	switch (func_id) {
-	case BPF_FUNC_skb_load_bytes:
-		return &bpf_skb_load_bytes_proto;
-	case BPF_FUNC_skb_pull_data:
-		return &bpf_skb_pull_data_proto;
-	case BPF_FUNC_csum_diff:
-		return &bpf_csum_diff_proto;
-	case BPF_FUNC_get_cgroup_classid:
-		return &bpf_get_cgroup_classid_proto;
-	case BPF_FUNC_get_route_realm:
-		return &bpf_get_route_realm_proto;
-	case BPF_FUNC_get_hash_recalc:
-		return &bpf_get_hash_recalc_proto;
-	case BPF_FUNC_perf_event_output:
-		return &bpf_skb_event_output_proto;
-	case BPF_FUNC_get_smp_processor_id:
-		return &bpf_get_smp_processor_id_proto;
-	case BPF_FUNC_skb_under_cgroup:
-		return &bpf_skb_under_cgroup_proto;
-	default:
-		return bpf_base_func_proto(func_id);
-	}
-}
-
 static const struct bpf_func_proto *
 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4534,6 +4507,44 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_push_encap_proto;
+	default:
+		return lwt_out_func_proto(func_id, prog);
+	}
+}
+
 static const struct bpf_func_proto *
 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4565,7 +4576,7 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
 	default:
-		return lwt_inout_func_proto(func_id, prog);
+		return lwt_out_func_proto(func_id, prog);
 	}
 }
 
@@ -6131,13 +6142,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops lwt_inout_verifier_ops = {
-	.get_func_proto		= lwt_inout_func_proto,
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+	.get_func_proto		= lwt_in_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_in_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = {
+	.get_func_proto		= lwt_out_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
-const struct bpf_prog_ops lwt_inout_prog_ops = {
+const struct bpf_prog_ops lwt_out_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-- 
2.16.1

^ permalink raw reply related

* [PATCH bpf-next v3 5/6] ipv6: sr: Add seg6local action End.BPF
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov
In-Reply-To: <cover.1525626429.git.m.xhonneux@gmail.com>

This patch adds the End.BPF action to the LWT seg6local infrastructure.
This action works like any other seg6local End action, meaning that an IPv6
header with SRH is needed, whose DA has to be equal to the SID of the
action. It will also advance the SRH to the next segment, the BPF program
does not have to take care of this.

Since the BPF program may not be a source of instability in the kernel, it
is important to ensure that the integrity of the packet is maintained
before yielding it back to the IPv6 layer. The hook hence keeps track if
the SRH has been altered through the helpers, and re-validates its
content if needed with seg6_validate_srh. The state kept for validation is
stored in a per-CPU buffer. The BPF program is not allowed to directly
write into the packet, and only some fields of the SRH can be altered
through the helper bpf_lwt_seg6_store_bytes.

Performances profiling has shown that the SRH re-validation does not induce
a significant overhead. If the altered SRH is deemed as invalid, the packet
is dropped.

This validation is also done before executing any action through
bpf_lwt_seg6_action, and will not be performed again if the SRH is not
modified after calling the action.

The BPF program may return 3 types of return codes:
    - BPF_OK: the End.BPF action will look up the next destination through
             seg6_lookup_nexthop.
    - BPF_REDIRECT: if an action has been executed through the
          bpf_lwt_seg6_action helper, the BPF program should return this
          value, as the skb's destination is already set and the default
          lookup should not be performed.
    - BPF_DROP : the packet will be dropped.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
---
 include/linux/bpf_types.h       |   3 +
 include/uapi/linux/bpf.h        |   1 +
 include/uapi/linux/seg6_local.h |   3 +
 kernel/bpf/verifier.c           |   1 +
 net/core/filter.c               |  25 +++++++
 net/ipv6/seg6_local.c           | 158 +++++++++++++++++++++++++++++++++++++++-
 6 files changed, 188 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cc9d7e031330..5b732bfff8a3 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -12,6 +12,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
+#endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index df14a31500eb..8c42297bf117 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -139,6 +139,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h
index ef2d8c3e76c1..aadcc11fb918 100644
--- a/include/uapi/linux/seg6_local.h
+++ b/include/uapi/linux/seg6_local.h
@@ -25,6 +25,7 @@ enum {
 	SEG6_LOCAL_NH6,
 	SEG6_LOCAL_IIF,
 	SEG6_LOCAL_OIF,
+	SEG6_LOCAL_BPF,
 	__SEG6_LOCAL_MAX,
 };
 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
@@ -59,6 +60,8 @@ enum {
 	SEG6_LOCAL_ACTION_END_AS	= 13,
 	/* forward to SR-unaware VNF with masquerading */
 	SEG6_LOCAL_ACTION_END_AM	= 14,
+	/* custom BPF action */
+	SEG6_LOCAL_ACTION_END_BPF	= 15,
 
 	__SEG6_LOCAL_ACTION_MAX,
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d5e1a6c4165d..bb6e4a17ce3d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index 2aa83e0f40ce..592dec8c781c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4580,6 +4580,21 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_lwt_seg6_store_bytes:
+		return &bpf_lwt_seg6_store_bytes_proto;
+	case BPF_FUNC_lwt_seg6_action:
+		return &bpf_lwt_seg6_action_proto;
+	case BPF_FUNC_lwt_seg6_adjust_srh:
+		return &bpf_lwt_seg6_adjust_srh_proto;
+	default:
+		return lwt_out_func_proto(func_id, prog);
+	}
+}
+
 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
 				    const struct bpf_prog *prog,
 				    struct bpf_insn_access_aux *info)
@@ -6173,6 +6188,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+	.get_func_proto		= lwt_seg6local_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
 const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index ae68c1ef8fb0..2ac887da63e2 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1,8 +1,9 @@
 /*
  *  SR-IPv6 implementation
  *
- *  Author:
+ *  Authors:
  *  David Lebrun <david.lebrun@uclouvain.be>
+ *  eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
  *
  *
  *  This program is free software; you can redistribute it and/or
@@ -32,6 +33,7 @@
 #endif
 #include <net/seg6_local.h>
 #include <linux/etherdevice.h>
+#include <linux/bpf.h>
 
 struct seg6_local_lwt;
 
@@ -42,6 +44,11 @@ struct seg6_action_desc {
 	int static_headroom;
 };
 
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
 struct seg6_local_lwt {
 	int action;
 	struct ipv6_sr_hdr *srh;
@@ -50,6 +57,7 @@ struct seg6_local_lwt {
 	struct in6_addr nh6;
 	int iif;
 	int oif;
+	struct bpf_lwt_prog bpf;
 
 	int headroom;
 	struct seg6_action_desc *desc;
@@ -451,6 +459,69 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 
 DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
 
+static int input_action_end_bpf(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	struct seg6_bpf_srh_state local_srh_state;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int ret;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+	/* preempt_disable is needed to protect the per-CPU buffer srh_state,
+	 * which is also accessed by the bpf_lwt_seg6_* helpers
+	 */
+	preempt_disable();
+	srh_state->hdrlen = srh->hdrlen << 3;
+	srh_state->valid = 1;
+
+	rcu_read_lock();
+	bpf_compute_data_pointers(skb);
+	ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
+	rcu_read_unlock();
+
+	local_srh_state = *srh_state;
+	preempt_enable();
+
+	switch (ret) {
+	case BPF_OK:
+	case BPF_REDIRECT:
+		break;
+	case BPF_DROP:
+		goto drop;
+	default:
+		pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
+		goto drop;
+	}
+
+	if (unlikely((local_srh_state.hdrlen & 7) != 0))
+		goto drop;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		goto drop;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
+
+	if (!local_srh_state.valid &&
+	    unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+		goto drop;
+
+	if (ret != BPF_REDIRECT)
+		seg6_lookup_nexthop(skb, NULL, 0);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
@@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = {
 		.attrs		= (1 << SEG6_LOCAL_SRH),
 		.input		= input_action_end_b6_encap,
 		.static_headroom	= sizeof(struct ipv6hdr),
-	}
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_BPF,
+		.attrs		= (1 << SEG6_LOCAL_BPF),
+		.input		= input_action_end_bpf,
+	},
+
 };
 
 static struct seg6_action_desc *__get_action_desc(int action)
@@ -542,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
 				    .len = sizeof(struct in6_addr) },
 	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
 	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_BPF]	= { .type = NLA_NESTED },
 };
 
 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -719,6 +797,71 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
 	return 0;
 }
 
+#define MAX_PROG_NAME 256
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
+	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				.len = MAX_PROG_NAME },
+};
+
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attrs[SEG6_LOCAL_BPF],
+			       bpf_prog_policy, NULL);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+		return -EINVAL;
+
+	slwt->bpf.name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+	if (!slwt->bpf.name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+	p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	slwt->bpf.prog = p;
+
+	return 0;
+}
+
+static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *nest;
+
+	if (!slwt->bpf.prog)
+		return 0;
+
+	nest = nla_nest_start(skb, SEG6_LOCAL_BPF);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (slwt->bpf.name &&
+	    nla_put_string(skb, LWT_BPF_PROG_NAME, slwt->bpf.name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	if (!a->bpf.name && !b->bpf.name)
+		return 0;
+
+	if (!a->bpf.name || !b->bpf.name)
+		return 1;
+
+	return strcmp(a->bpf.name, b->bpf.name);
+}
+
 struct seg6_action_param {
 	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
 	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -749,6 +892,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
 	[SEG6_LOCAL_OIF]	= { .parse = parse_nla_oif,
 				    .put = put_nla_oif,
 				    .cmp = cmp_nla_oif },
+
+	[SEG6_LOCAL_BPF]	= { .parse = parse_nla_bpf,
+				    .put = put_nla_bpf,
+				    .cmp = cmp_nla_bpf },
+
 };
 
 static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -797,7 +945,6 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
 
 	err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy,
 			       extack);
-
 	if (err < 0)
 		return err;
 
@@ -886,6 +1033,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
 	if (attrs & (1 << SEG6_LOCAL_OIF))
 		nlsize += nla_total_size(4);
 
+	if (attrs & (1 << SEG6_LOCAL_BPF))
+		nlsize += nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) +
+		       nla_total_size(4);
+
 	return nlsize;
 }
 
-- 
2.16.1

^ permalink raw reply related

* [PATCH bpf-next v3 6/6] selftests/bpf: test for seg6local End.BPF action
From: Mathieu Xhonneux @ 2018-05-06 17:27 UTC (permalink / raw)
  To: netdev; +Cc: dlebrun, alexei.starovoitov
In-Reply-To: <cover.1525626429.git.m.xhonneux@gmail.com>

Add a new test for the seg6local End.BPF action. The following helpers
are also tested :

- bpf_lwt_push_encap within the LWT BPF IN hook
- bpf_lwt_seg6_action
- bpf_lwt_seg6_adjust_srh
- bpf_lwt_seg6_store_bytes

A chain of End.BPF actions is built. The SRH is injected through a LWT
BPF IN hook before the chain. Each End.BPF action validates the previous
one, otherwise the packet is dropped.
The test succeeds if the last node in the chain receives the packet and
the UDP datagram contained can be retrieved from userspace.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
---
 tools/include/uapi/linux/bpf.h                    |  97 ++++-
 tools/testing/selftests/bpf/Makefile              |   5 +-
 tools/testing/selftests/bpf/bpf_helpers.h         |  12 +
 tools/testing/selftests/bpf/test_lwt_seg6local.c  | 438 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_lwt_seg6local.sh | 140 +++++++
 5 files changed, 689 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_seg6local.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_seg6local.sh

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 83a95ae388dd..8c42297bf117 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
@@ -138,6 +139,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 };
 
 enum bpf_attach_type {
@@ -1825,6 +1827,89 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1895,7 +1980,11 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),    \
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1962,6 +2051,12 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9d762184b805..e7455e0e43b6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -33,7 +33,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
-	test_get_stack_rawtp.o
+	test_get_stack_rawtp.o test_lwt_seg6local.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -42,7 +42,8 @@ TEST_PROGS := test_kmod.sh \
 	test_xdp_meta.sh \
 	test_offload.py \
 	test_sock_addr.sh \
-	test_tunnel.sh
+	test_tunnel.sh \
+	test_lwt_seg6local.sh
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..a3d556819559 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,18 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr,
+				 unsigned int len) =
+	(void *) BPF_FUNC_lwt_push_encap;
+static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset,
+				       void *from, unsigned int len) =
+	(void *) BPF_FUNC_lwt_seg6_store_bytes;
+static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param,
+				  unsigned int param_len) =
+	(void *) BPF_FUNC_lwt_seg6_action;
+static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset,
+				      unsigned int len) =
+	(void *) BPF_FUNC_lwt_seg6_adjust_srh;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.c b/tools/testing/selftests/bpf/test_lwt_seg6local.c
new file mode 100644
index 000000000000..d752bc1fe81c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_seg6local.c
@@ -0,0 +1,438 @@
+#include <stddef.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <linux/seg6_local.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define bpf_printk(fmt, ...)				\
+({							\
+	char ____fmt[] = fmt;				\
+	bpf_trace_printk(____fmt, sizeof(____fmt),	\
+			##__VA_ARGS__);			\
+})
+
+/* Packet parsing state machine helpers. */
+#define cursor_advance(_cursor, _len) \
+	({ void *_tmp = _cursor; _cursor += _len; _tmp; })
+
+#define SR6_FLAG_ALERT (1 << 4)
+
+#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \
+				0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32))
+#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \
+				0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32))
+#define BPF_PACKET_HEADER __attribute__((packed))
+
+struct ip6_t {
+	unsigned int ver:4;
+	unsigned int priority:8;
+	unsigned int flow_label:20;
+	unsigned short payload_len;
+	unsigned char next_header;
+	unsigned char hop_limit;
+	unsigned long long src_hi;
+	unsigned long long src_lo;
+	unsigned long long dst_hi;
+	unsigned long long dst_lo;
+} BPF_PACKET_HEADER;
+
+struct ip6_addr_t {
+	unsigned long long hi;
+	unsigned long long lo;
+} BPF_PACKET_HEADER;
+
+struct ip6_srh_t {
+	unsigned char nexthdr;
+	unsigned char hdrlen;
+	unsigned char type;
+	unsigned char segments_left;
+	unsigned char first_segment;
+	unsigned char flags;
+	unsigned short tag;
+
+	struct ip6_addr_t segments[0];
+} BPF_PACKET_HEADER;
+
+struct sr6_tlv_t {
+	unsigned char type;
+	unsigned char len;
+	unsigned char value[0];
+} BPF_PACKET_HEADER;
+
+__attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb)
+{
+	void *cursor, *data_end;
+	struct ip6_srh_t *srh;
+	struct ip6_t *ip;
+	uint8_t *ipver;
+
+	data_end = (void *)(long)skb->data_end;
+	cursor = (void *)(long)skb->data;
+	ipver = (uint8_t *)cursor;
+
+	if ((void *)ipver + sizeof(*ipver) > data_end)
+		return NULL;
+
+	if ((*ipver >> 4) != 6)
+		return NULL;
+
+	ip = cursor_advance(cursor, sizeof(*ip));
+	if ((void *)ip + sizeof(*ip) > data_end)
+		return NULL;
+
+	if (ip->next_header != 43)
+		return NULL;
+
+	srh = cursor_advance(cursor, sizeof(*srh));
+	if ((void *)srh + sizeof(*srh) > data_end)
+		return NULL;
+
+	if (srh->type != 4)
+		return NULL;
+
+	return srh;
+}
+
+__attribute__((always_inline))
+int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad,
+		   uint32_t old_pad, uint32_t pad_off)
+{
+	int err;
+
+	if (new_pad != old_pad) {
+		err = bpf_lwt_seg6_adjust_srh(skb, pad_off,
+					  (int) new_pad - (int) old_pad);
+		if (err)
+			return err;
+	}
+
+	if (new_pad > 0) {
+		char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+					0, 0, 0};
+		struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf;
+
+		pad_tlv->type = SR6_TLV_PADDING;
+		pad_tlv->len = new_pad - 2;
+
+		err = bpf_lwt_seg6_store_bytes(skb, pad_off,
+					       (void *)pad_tlv_buf, new_pad);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+__attribute__((always_inline))
+int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh,
+			  uint32_t *tlv_off, uint32_t *pad_size,
+			  uint32_t *pad_off)
+{
+	uint32_t srh_off, cur_off;
+	int offset_valid = 0;
+	int err;
+
+	srh_off = (char *)srh - (char *)(long)skb->data;
+	// cur_off = end of segments, start of possible TLVs
+	cur_off = srh_off + sizeof(*srh) +
+		sizeof(struct ip6_addr_t) * (srh->first_segment + 1);
+
+	*pad_off = 0;
+
+	// we can only go as far as ~10 TLVs due to the BPF max stack size
+	#pragma clang loop unroll(full)
+	for (int i = 0; i < 10; i++) {
+		struct sr6_tlv_t tlv;
+
+		if (cur_off == *tlv_off)
+			offset_valid = 1;
+
+		if (cur_off >= srh_off + ((srh->hdrlen + 1) << 3))
+			break;
+
+		err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv));
+		if (err)
+			return err;
+
+		if (tlv.type == SR6_TLV_PADDING) {
+			*pad_size = tlv.len + sizeof(tlv);
+			*pad_off = cur_off;
+
+			if (*tlv_off == srh_off) {
+				*tlv_off = cur_off;
+				offset_valid = 1;
+			}
+			break;
+
+		} else if (tlv.type == SR6_TLV_HMAC) {
+			break;
+		}
+
+		cur_off += sizeof(tlv) + tlv.len;
+	} // we reached the padding or HMAC TLVs, or the end of the SRH
+
+	if (*pad_off == 0)
+		*pad_off = cur_off;
+
+	if (*tlv_off == -1)
+		*tlv_off = cur_off;
+	else if (!offset_valid)
+		return -EINVAL;
+
+	return 0;
+}
+
+__attribute__((always_inline))
+int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off,
+	    struct sr6_tlv_t *itlv, uint8_t tlv_size)
+{
+	uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+	uint8_t len_remaining, new_pad;
+	uint32_t pad_off = 0;
+	uint32_t pad_size = 0;
+	uint32_t partial_srh_len;
+	int err;
+
+	if (tlv_off != -1)
+		tlv_off += srh_off;
+
+	if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC)
+		return -EINVAL;
+
+	err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+	if (err)
+		return err;
+
+	err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len);
+	if (err)
+		return err;
+
+	err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size);
+	if (err)
+		return err;
+
+	// the following can't be moved inside update_tlv_pad because the
+	// bpf verifier has some issues with it
+	pad_off += sizeof(*itlv) + itlv->len;
+	partial_srh_len = pad_off - srh_off;
+	len_remaining = partial_srh_len % 8;
+	new_pad = 8 - len_remaining;
+
+	if (new_pad == 1) // cannot pad for 1 byte only
+		new_pad = 9;
+	else if (new_pad == 8)
+		new_pad = 0;
+
+	return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+
+__attribute__((always_inline))
+int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh,
+	       uint32_t tlv_off)
+{
+	uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+	uint8_t len_remaining, new_pad;
+	uint32_t partial_srh_len;
+	uint32_t pad_off = 0;
+	uint32_t pad_size = 0;
+	struct sr6_tlv_t tlv;
+	int err;
+
+	tlv_off += srh_off;
+
+	err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+	if (err)
+		return err;
+
+	err = bpf_skb_load_bytes(skb, tlv_off, &tlv, sizeof(tlv));
+	if (err)
+		return err;
+
+	err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, -(sizeof(tlv) + tlv.len));
+	if (err)
+		return err;
+
+	pad_off -= sizeof(tlv) + tlv.len;
+	partial_srh_len = pad_off - srh_off;
+	len_remaining = partial_srh_len % 8;
+	new_pad = 8 - len_remaining;
+	if (new_pad == 1) // cannot pad for 1 byte only
+		new_pad = 9;
+	else if (new_pad == 8)
+		new_pad = 0;
+
+	return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+
+__attribute__((always_inline))
+int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh)
+{
+	int tlv_offset = sizeof(struct ip6_t) + sizeof(struct ip6_srh_t) +
+		((srh->first_segment + 1) << 4);
+	struct sr6_tlv_t tlv;
+
+	if (bpf_skb_load_bytes(skb, tlv_offset, &tlv, sizeof(struct sr6_tlv_t)))
+		return 0;
+
+	if (tlv.type == SR6_TLV_EGRESS && tlv.len == 18) {
+		struct ip6_addr_t egr_addr;
+
+		if (bpf_skb_load_bytes(skb, tlv_offset + 4, &egr_addr, 16))
+			return 0;
+
+		// check if egress TLV value is correct
+		if (ntohll(egr_addr.hi) == 0xfd00000000000000 &&
+				ntohll(egr_addr.lo) == 0x4)
+			return 1;
+	}
+
+	return 0;
+}
+
+// This function will push a SRH with segments fd00::1, fd00::2, fd00::3,
+// fd00::4
+SEC("encap_srh")
+int __encap_srh(struct __sk_buff *skb)
+{
+	bpf_printk("got pkt\n");
+	unsigned long long hi = 0xfd00000000000000;
+	struct ip6_addr_t *seg;
+	struct ip6_srh_t *srh;
+	char srh_buf[72]; // room for 4 segments
+	int err;
+
+	srh = (struct ip6_srh_t *)srh_buf;
+	srh->nexthdr = 0;
+	srh->hdrlen = 8;
+	srh->type = 4;
+	srh->segments_left = 3;
+	srh->first_segment = 3;
+	srh->flags = 0;
+	srh->tag = 0;
+
+	seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh));
+
+	#pragma clang loop unroll(full)
+	for (unsigned long long lo = 0; lo < 4; lo++) {
+		seg->lo = htonll(4 - lo);
+		seg->hi = htonll(hi);
+		seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg));
+	}
+
+	err = bpf_lwt_push_encap(skb, 0, (void *)srh, sizeof(srh_buf));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_REDIRECT;
+}
+
+// Add an Egress TLV fc00::4, add the flag A,
+// and apply End.X action to fc42::1
+SEC("add_egr_x")
+int __add_egr_x(struct __sk_buff *skb)
+{
+	unsigned long long hi = 0xfc42000000000000;
+	unsigned long long lo = 0x1;
+	struct ip6_srh_t *srh = get_srh(skb);
+	uint8_t new_flags = SR6_FLAG_ALERT;
+	struct ip6_addr_t addr;
+	int err, offset;
+
+	if (srh == NULL)
+		return BPF_DROP;
+
+	uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+			   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4};
+
+	err = add_tlv(skb, srh, (srh->hdrlen+1) << 3,
+		      (struct sr6_tlv_t *)&tlv, 20);
+	if (err)
+		return BPF_DROP;
+
+	offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags);
+	err = bpf_lwt_seg6_store_bytes(skb, offset,
+				       (void *)&new_flags, sizeof(new_flags));
+	if (err)
+		return BPF_DROP;
+
+	addr.lo = htonll(lo);
+	addr.hi = htonll(hi);
+	err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
+				  (void *)&addr, sizeof(addr));
+	if (err)
+		return BPF_DROP;
+	return BPF_REDIRECT;
+}
+
+// Pop the Egress TLV, reset the flags, change the tag 2442 and finally do a
+// simple End action
+SEC("pop_egr")
+int __pop_egr(struct __sk_buff *skb)
+{
+	struct ip6_srh_t *srh = get_srh(skb);
+	uint16_t new_tag = bpf_htons(2442);
+	uint8_t new_flags = 0;
+	int err, offset;
+
+	if (srh == NULL)
+		return BPF_DROP;
+
+	if (srh->flags != SR6_FLAG_ALERT)
+		return BPF_DROP;
+
+	if (srh->hdrlen != 11) // 4 segments + Egress TLV + Padding TLV
+		return BPF_DROP;
+
+	if (!has_egr_tlv(skb, srh))
+		return BPF_DROP;
+
+	err = delete_tlv(skb, srh, 8 + (srh->first_segment + 1) * 16);
+	if (err)
+		return BPF_DROP;
+
+	offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags);
+	if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_flags,
+				     sizeof(new_flags)))
+		return BPF_DROP;
+
+	offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, tag);
+	if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_tag,
+				     sizeof(new_tag)))
+		return BPF_DROP;
+
+	return BPF_OK;
+}
+
+// Inspect if the Egress TLV and flag have been removed, if the tag is correct,
+// then apply a End.T action to reach the last segment
+SEC("inspect_t")
+int __inspect_t(struct __sk_buff *skb)
+{
+	struct ip6_srh_t *srh = get_srh(skb);
+	int table = 117;
+	int err;
+
+	if (srh == NULL)
+		return BPF_DROP;
+
+	if (srh->flags != 0)
+		return BPF_DROP;
+
+	if (srh->tag != bpf_htons(2442))
+		return BPF_DROP;
+
+	if (srh->hdrlen != 8) // 4 segments
+		return BPF_DROP;
+
+	err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T,
+				  (void *)&table, sizeof(table));
+
+	if (err)
+		return BPF_DROP;
+
+	return BPF_REDIRECT;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
new file mode 100755
index 000000000000..1c77994b5e71
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# Connects 6 network namespaces through veths.
+# Each NS may have different IPv6 global scope addresses :
+#   NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6
+# fb00::1           fd00::1  fd00::2  fd00::3  fb00::6
+#                   fc42::1           fd00::4
+#
+# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a
+# IPv6 header with a Segment Routing Header, with segments :
+# 	fd00::1 -> fd00::2 -> fd00::3 -> fd00::4
+#
+# 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions :
+# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1
+# - fd00::2 : remove the TLV, change the flags, add a tag
+# - fd00::3 : apply an End.T action to fd00::4, through routing table 117
+#
+# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet.
+# Each End.BPF action will validate the operations applied on the SRH by the
+# previous BPF program in the chain, otherwise the packet is dropped.
+#
+# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this
+# datagram can be read on NS6 when binding to fb00::6.
+
+TMP_FILE="/tmp/selftest_lwt_seg6local.txt"
+
+cleanup()
+{
+	if [ "$?" = "0" ]; then
+		echo "selftests: test_lwt_seg6local [PASS]";
+	else
+		echo "selftests: test_lwt_seg6local [FAILED]";
+	fi
+
+	set +e
+	ip netns del ns1 2> /dev/null
+	ip netns del ns2 2> /dev/null
+	ip netns del ns3 2> /dev/null
+	ip netns del ns4 2> /dev/null
+	ip netns del ns5 2> /dev/null
+	ip netns del ns6 2> /dev/null
+	rm -f $TMP_FILE
+}
+
+set -e
+
+ip netns add ns1
+ip netns add ns2
+ip netns add ns3
+ip netns add ns4
+ip netns add ns5
+ip netns add ns6
+
+trap cleanup 0 2 3 6 9
+
+ip link add veth1 type veth peer name veth2
+ip link add veth3 type veth peer name veth4
+ip link add veth5 type veth peer name veth6
+ip link add veth7 type veth peer name veth8
+ip link add veth9 type veth peer name veth10
+
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+ip link set veth3 netns ns2
+ip link set veth4 netns ns3
+ip link set veth5 netns ns3
+ip link set veth6 netns ns4
+ip link set veth7 netns ns4
+ip link set veth8 netns ns5
+ip link set veth9 netns ns5
+ip link set veth10 netns ns6
+
+ip netns exec ns1 ip link set dev veth1 up
+ip netns exec ns2 ip link set dev veth2 up
+ip netns exec ns2 ip link set dev veth3 up
+ip netns exec ns3 ip link set dev veth4 up
+ip netns exec ns3 ip link set dev veth5 up
+ip netns exec ns4 ip link set dev veth6 up
+ip netns exec ns4 ip link set dev veth7 up
+ip netns exec ns5 ip link set dev veth8 up
+ip netns exec ns5 ip link set dev veth9 up
+ip netns exec ns6 ip link set dev veth10 up
+ip netns exec ns6 ip link set dev lo up
+
+# All link scope addresses and routes required between veths
+ip netns exec ns1 ip -6 addr add fb00::12/16 dev veth1 scope link
+ip netns exec ns1 ip -6 route add fb00::21 dev veth1 scope link
+ip netns exec ns2 ip -6 addr add fb00::21/16 dev veth2 scope link
+ip netns exec ns2 ip -6 addr add fb00::34/16 dev veth3 scope link
+ip netns exec ns2 ip -6 route add fb00::43 dev veth3 scope link
+ip netns exec ns3 ip -6 route add fb00::65 dev veth5 scope link
+ip netns exec ns3 ip -6 addr add fb00::43/16 dev veth4 scope link
+ip netns exec ns3 ip -6 addr add fb00::56/16 dev veth5 scope link
+ip netns exec ns4 ip -6 addr add fb00::65/16 dev veth6 scope link
+ip netns exec ns4 ip -6 addr add fb00::78/16 dev veth7 scope link
+ip netns exec ns4 ip -6 route add fb00::87 dev veth7 scope link
+ip netns exec ns5 ip -6 addr add fb00::87/16 dev veth8 scope link
+ip netns exec ns5 ip -6 addr add fb00::910/16 dev veth9 scope link
+ip netns exec ns5 ip -6 route add fb00::109 dev veth9 scope link
+ip netns exec ns5 ip -6 route add fb00::109 table 117 dev veth9 scope link
+ip netns exec ns6 ip -6 addr add fb00::109/16 dev veth10 scope link
+
+ip netns exec ns1 ip -6 addr add fb00::1/16 dev lo
+ip netns exec ns1 ip -6 route add fb00::6 dev veth1 via fb00::21
+
+ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2
+ip netns exec ns2 ip -6 route add fd00::1 dev veth3 via fb00::43 scope link
+
+ip netns exec ns3 ip -6 route add fc42::1 dev veth5 via fb00::65
+ip netns exec ns3 ip -6 route add fd00::1 encap seg6local action End.BPF obj test_lwt_seg6local.o sec add_egr_x dev veth4
+
+ip netns exec ns4 ip -6 route add fd00::2 encap seg6local action End.BPF obj test_lwt_seg6local.o sec pop_egr dev veth6
+ip netns exec ns4 ip -6 addr add fc42::1 dev lo
+ip netns exec ns4 ip -6 route add fd00::3 dev veth7 via fb00::87
+
+ip netns exec ns5 ip -6 route add fd00::4 table 117 dev veth9 via fb00::109
+ip netns exec ns5 ip -6 route add fd00::3 encap seg6local action End.BPF obj test_lwt_seg6local.o sec inspect_t dev veth8
+
+ip netns exec ns6 ip -6 addr add fb00::6/16 dev lo
+ip netns exec ns6 ip -6 addr add fd00::4/16 dev lo
+
+ip netns exec ns1 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns2 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns3 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns4 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns5 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+ip netns exec ns6 sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null
+ip netns exec ns6 sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null
+ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null
+
+ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE &
+ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
+sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment
+kill -INT $!
+
+if [[ $(< $TMP_FILE) != "foobar" ]]; then
+	exit 1
+fi
+
+exit 0
-- 
2.16.1

^ permalink raw reply related

* Re: Locking in network code
From: Alexander Duyck @ 2018-05-06 16:16 UTC (permalink / raw)
  To: Jacob S. Moroni; +Cc: Netdev
In-Reply-To: <1525614224.300611.1362511632.7D50FB8C@webmail.messagingengine.com>

On Sun, May 6, 2018 at 6:43 AM, Jacob S. Moroni <mail@jakemoroni.com> wrote:
> Hello,
>
> I have a stupid question regarding which variant of spin_lock to use
> throughout the network stack, and inside RX handlers specifically.
>
> It's my understanding that skbuffs are normally passed into the stack
> from soft IRQ context if the device is using NAPI, and hard IRQ
> context if it's not using NAPI (and I guess process context too if the
> driver does it's own workqueue thing).
>
> So, that means that handlers registered with netdev_rx_handler_register
> may end up being called from any context.

I am pretty sure the Rx handlers are all called from softirq context.
The hard IRQ will just call netif_rx which will queue the packet up to
be handles in the soft IRQ later.

> However, the RX handler in the macvlan code calls ip_check_defrag,
> which could eventually lead to a call to ip_defrag, which ends
> up taking a regular spin_lock around the call to ip_frag_queue.
>
> Is this a risk of deadlock, and if not, why?
>
> What if you're running a system with one CPU and a packet fragment
> arrives on a NAPI interface, then, while the spin_lock is held,
> another fragment somehow arrives on another interface which does
> its processing in hard IRQ context?
>
> --
>   Jacob S. Moroni
>   mail@jakemoroni.com

Take a look at the netif_rx code and it should answer most of your
questions. Basically everything is handed off from the hard IRQ to the
soft IRQ via a backlog queue and then handled in net_rx_action.

- Alex

^ permalink raw reply

* Re: [RFC PATCH ghak32 V2 01/13] audit: add container id
From: Richard Guy Briggs @ 2018-05-06 16:51 UTC (permalink / raw)
  To: Paul Moore
  Cc: simo, jlayton, carlos, linux-api, containers, LKML, Eric Paris,
	dhowells, Linux-Audit Mailing List, ebiederm, luto, netdev,
	linux-fsdevel, cgroups, serge, viro
In-Reply-To: <CAHC9VhTyvxxj2e2Gn+iyW6iLLeYB7hp8a+JvfeMmJ2nUPqtEaw@mail.gmail.com>

On 2018-04-18 19:47, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > Implement the proc fs write to set the audit container ID of a process,
> > emitting an AUDIT_CONTAINER record to document the event.
> >
> > This is a write from the container orchestrator task to a proc entry of
> > the form /proc/PID/containerid where PID is the process ID of the newly
> > created task that is to become the first task in a container, or an
> > additional task added to a container.
> >
> > The write expects up to a u64 value (unset: 18446744073709551615).
> >
> > This will produce a record such as this:
> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
> >
> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
> > the orchestrator while the "opid" field is the object's PID, the process
> > being "contained".  Old and new container ID values are given in the
> > "contid" fields, while res indicates its success.
> >
> > It is not permitted to self-set, unset or re-set the container ID.  A
> > child inherits its parent's container ID, but then can be set only once
> > after.
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/32
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  fs/proc/base.c             | 37 ++++++++++++++++++++
> >  include/linux/audit.h      | 16 +++++++++
> >  include/linux/init_task.h  |  4 ++-
> >  include/linux/sched.h      |  1 +
> >  include/uapi/linux/audit.h |  2 ++
> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
> >  6 files changed, 143 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 60316b5..6ce4fbe 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
> >         .read           = proc_sessionid_read,
> >         .llseek         = generic_file_llseek,
> >  };
> > +
> > +static ssize_t proc_containerid_write(struct file *file, const char __user *buf,
> > +                                  size_t count, loff_t *ppos)
> > +{
> > +       struct inode *inode = file_inode(file);
> > +       u64 containerid;
> > +       int rv;
> > +       struct task_struct *task = get_proc_task(inode);
> > +
> > +       if (!task)
> > +               return -ESRCH;
> > +       if (*ppos != 0) {
> > +               /* No partial writes. */
> > +               put_task_struct(task);
> > +               return -EINVAL;
> > +       }
> > +
> > +       rv = kstrtou64_from_user(buf, count, 10, &containerid);
> > +       if (rv < 0) {
> > +               put_task_struct(task);
> > +               return rv;
> > +       }
> > +
> > +       rv = audit_set_containerid(task, containerid);
> > +       put_task_struct(task);
> > +       if (rv < 0)
> > +               return rv;
> > +       return count;
> > +}
> > +
> > +static const struct file_operations proc_containerid_operations = {
> > +       .write          = proc_containerid_write,
> > +       .llseek         = generic_file_llseek,
> > +};
> > +
> >  #endif
> >
> >  #ifdef CONFIG_FAULT_INJECTION
> > @@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
> >  #ifdef CONFIG_AUDITSYSCALL
> >         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +       REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> > @@ -3355,6 +3391,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
> >  #ifdef CONFIG_AUDITSYSCALL
> >         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +       REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),

...

> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d258826..1b82191 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -796,6 +796,7 @@ struct task_struct {
> >  #ifdef CONFIG_AUDITSYSCALL
> >         kuid_t                          loginuid;
> >         unsigned int                    sessionid;
> > +       u64                             containerid;
> 
> This one line addition to the task_struct scares me the most of
> anything in this patchset.  Why?  It's a field named "containerid" in
> a perhaps one of the most widely used core kernel structures; the
> possibilities for abuse are endless, and it's foolish to think we
> would ever be able to adequately police this.
> 
> Unfortunately, we can't add the field to audit_context as things
> currently stand because we don't always allocate an audit_context,
> it's dependent on the system's configuration, and we need to track the
> audit container ID for a given process, regardless of the audit
> configuration.  Pretty much the same reason why loginuid and sessionid
> are located directly in task_struct now.  As I stressed during the
> design phase, I really want to keep this as an *audit* container ID
> and not a general purpose kernel wide container ID.  If the kernel
> ever grows a general purpose container ID token, I'll be the first in
> line to convert the audit code, but I don't want audit to be that
> general purpose mechanism ... audit is hated enough as-is ;)
> 
> I think the right solution to this is to create another new struct,
> audit_task_info (or similar, the name really isn't that important),
> which would be stored as a pointer in task_struct and would replace
> the audit_context pointer, loginuid, sessionid, and the newly proposed
> containerid.  The new audit_task_info would always be allocated in the
> audit_alloc() function (please use kmem_cache), and the audit_context
> pointer included inside would continue to be allocated based on the
> existing conditions.  By keeping audit_task_info as a pointer inside
> task_struct we could hide the structure definition inside
> kernel/audit*.c and make it much more difficult for other subsystems
> to abuse it.[1]
> 
>   struct audit_task_info {
>     kuid_t loginuid;
>     unsigned int sessionid;
>     u64 containerid;
>     struct audit_context *ctx;
>   }
> 
> Actually, we might even want to consider storing audit_context in
> audit_task_info (no pointer), or making it a zero length array
> (ctx[0]) and going with a variable sized allocation of audit_task_info
> ... but all that could be done as a follow up optimization once we get
> the basic idea sorted.

I tried statically allocating struct audit_task_info (with a pointer to
struct audit_context) in addition to dynamically allocating struct
audit_task_info due to a bug I'd introduced while dynamically allocating
audit_task_info, so I now have proof-of-concepts for working static and
almost working dynamic allocated struct audit_task_info.

Statically allocating it required a new header file, so I'm not that
crazy about it, but it proved it works.

Dynamically allocating it isn't quite as clean as was hoped since
init/init_task.c still needs initializaiton values for loginuid and
sessionid which could be supplied by a statically allocated struct
audit_task_info and still needs to know the internals of that struct to
do so.  Dynamic allocation is also more disruptive initially, but in the
long run will be more stable to the rest of the kernel.

I'm not crazy about the idea of dynamically (or even statically)
allocating struct audit_task_info which includes allocated space for
struct audit_context since the latter is far larger than the former.

> [1] If for some reason allocating audit_task_info becomes too much
> overhead to bear (somewhat doubtful since we would only do it at task
> creation), we could do some ugly tricks to directly include an
> audit_task_struct chunk in task_struct but I'd like to avoid that if
> possible (and I think we can).

On allocation, I don't see too much of a problem.  When calling
audit_free() if there is no audit context it is pretty lightweight, but
gets heavier if we eliminate the inline audit_free() and rename
__audit_free() back to audit_free().  Having struct audit_task_info
directly in struct task_struct would be faster and also allow defaults
to be set in init/init_task.c (which has recently been populated from
include/linux/init_task.h).  I'm not sure this is enough of a reason to
avoid a pointer from task_struct.

(As an aside, converting allocation of audit_context could also benefit
from kmem_cache... and maybe even struct audit_names)

> >  #endif
> >         struct seccomp                  seccomp;
> 
> ...

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* Re: BUG?: receiving on a packet socket with .sll_protocoll and bridging
From: Willem de Bruijn @ 2018-05-06 16:58 UTC (permalink / raw)
  To: Uwe Kleine-König; +Cc: Network Development
In-Reply-To: <20180505085700.25vuy44mjpamuot2@pengutronix.de>

On Sat, May 5, 2018 at 10:57 AM, Uwe Kleine-König
<u.kleine-koenig@pengutronix.de> wrote:
> Hello,
>
> my eventual goal is to implement MRP and for that I started to program a
> bit and stumbled over a problem I don't understand.
>
> For testing purposes I created a veth device pair (veth0 veth1), open a
> socket for each of the devices and send packets around between them. In
> tcpdump a typical package looks as follows:
>
> 10:36:34.755208 ae:a9:da:50:48:db (oui Unknown) > 01:15:e4:00:00:01 (oui Unknown), ethertype Unknown (0x88e3), length 58:
>         0x0000:  0001 0212 8000 aea9 da50 48db 0000 0000  .........PH.....
>         0x0010:  0000 0589 40f2 6574 6800 0000 0000 0000  ....@.eth.......
>         0x0020:  0000 0100 0a80 3d38 4c5e 0000            ......=8L^..
>
> The socket to receive these packages is opened using:
>
>         #define ETH_P_MRP 0x88e3
>
>         struct sockaddr_ll sa_ll = {
>                 .sll_family = AF_PACKET,
>                 .sll_protocol = htons(ETH_P_MRP),
>                 .sll_ifindex = if_nametoindex("veth0")
>         };
>
>         fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_MRP));
>         bind(fd, (struct sockaddr *)&sa_ll, sizeof(sa_ll));
>
> So far everything works fine and I can receive the packets I send.
>
> If now I add veth0 to a bridge (e.g.
>
>         ip link add br0 type bridge
>         ip link set dev veth0 master br0
>
> ) and continue to send on veth1 and receive on veth0 I don't receive
> the packets any more. The other direction (veth0 sending, veth1
> receiving) still works fine.
>
> Each of the following changes allow to
> receive again:
>
>  a) take veth0 out of the bridge
>  b) bind(2) the receiving socket to br0 instead of veth0
>  c) use .sll_protocol = htons(ETH_P_ALL) for bind(2)
>
> In the end only c) could be sensible (because I need to know the port
> the packet entered the stack and that might well be bridged), but I
> wonder why .sll_protocol = htons(ETH_P_MRP) seems to do the right thing
> for an unbridged device but not for a bridged one.
>
> Is this a bug or a feature I don't understand?

Packets are redirected to the bridge device in __netif_receive_skb_core
at the rx_handler hook.

This happens after packets are passed to packet types attached to
list ptype_all, which includes packet sockets with protocol ETH_P_ALL.
But before packets are passed to protocol specific packet types (and
sockets) attached to ptype_base[].

^ permalink raw reply

* Re: [net-next PATCH v2 4/8] udp: Do not pass checksum as a parameter to GSO segmentation
From: Willem de Bruijn @ 2018-05-06 17:17 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAKgT0UdEE7=eQDmOM+aWjNWc1QdYOWCPz+SAfBNZQT6AS5q-hA@mail.gmail.com>

On Sat, May 5, 2018 at 7:39 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> On Sat, May 5, 2018 at 3:01 AM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Fri, May 4, 2018 at 8:30 PM, Alexander Duyck
>> <alexander.duyck@gmail.com> wrote:
>>> From: Alexander Duyck <alexander.h.duyck@intel.com>
>>>
>>> This patch is meant to allow us to avoid having to recompute the checksum
>>> from scratch and have it passed as a parameter.
>>>
>>> Instead of taking that approach we can take advantage of the fact that the
>>> length that was used to compute the existing checksum is included in the
>>> UDP header. If we cancel that out by adding the value XOR with 0xFFFF we
>>> can then just add the new length in and fold that into the new result.
>>>
>>> I think this may be fixing a checksum bug in the original code as well
>>> since the checksum that was passed included the UDP header in the checksum
>>> computation, but then excluded it for the adjustment on the last frame. I
>>> believe this may have an effect on things in the cases where the two differ
>>> by bits that would result in things crossing the byte boundaries.
>>
>> The replacement code, below, subtracts original payload size then adds
>> the new payload size. mss here excludes the udp header size.
>>
>>>                 /* last packet can be partial gso_size */
>>> -               if (!seg->next)
>>> -                       csum_replace2(&uh->check, htons(mss),
>>> -                                     htons(seg->len - hdrlen - sizeof(*uh)));
>
> That is my point. When you calculated your checksum you included the
> UDP header in the calculation.
>
> -       return __udp_gso_segment(gso_skb, features,
> -                                udp_v4_check(sizeof(struct udphdr) + mss,
> -                                             iph->saddr, iph->daddr, 0));
>
> Basically the problem is in one spot you are adding the sizeof(struct
> udphdr) + mss and then in another you are cancelling it out as mss and
> trying to account for it by also dropping the UDP header from the
> payload length of the value you are adding. That works in the cases
> where the effect doesn't cause any issues with the byte ordering,
> however I think when mss + 8 crosses a byte boundary it can lead to
> issues since the calculation is done on a byte swapped value.

Do you mean that the issue is that the arithmetic operations
on a __be16 in csum_replace2 may be incorrect if they exceed
the least significant byte?

csum_replace2 is used in many locations in the stack to adjust a network
byte order csum when the payload length changes (e.g., iph->tot_len in
inet_gro_complete).

Or am I missing something specific about the udphdr calculations?

^ permalink raw reply

* Re: simplify procfs code for seq_file instances V2
From: Alexey Dobriyan @ 2018-05-06 17:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-rtc, Alessandro Zummo, Alexandre Belloni, devel, linux-scsi,
	linux-ide, Greg Kroah-Hartman, jfs-discussion, linux-kernel,
	linux-acpi, netdev, netfilter-devel, Alexander Viro, Jiri Slaby,
	Andrew Morton, linux-ext4, linux-afs, megaraidlinux.pdl, drbd-dev
In-Reply-To: <20180425154827.32251-1-hch@lst.de>

On Wed, Apr 25, 2018 at 05:47:47PM +0200, Christoph Hellwig wrote:
> Changes since V1:
>  - open code proc_create_data to avoid setting not fully initialized
>    entries live
>  - use unsigned int for state_size

Need this to maintain sizeof(struct proc_dir_entry):

Otherwise ACK fs/proc/ part.

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 6d171485c45b..a318ae5b36b4 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -48,8 +48,8 @@ struct proc_dir_entry {
 		const struct seq_operations *seq_ops;
 		int (*single_show)(struct seq_file *, void *);
 	};
-	unsigned int state_size;
 	void *data;
+	unsigned int state_size;
 	unsigned int low_ino;
 	nlink_t nlink;
 	kuid_t uid;
@@ -62,9 +62,9 @@ struct proc_dir_entry {
 	umode_t mode;
 	u8 namelen;
 #ifdef CONFIG_64BIT
-#define SIZEOF_PDE_INLINE_NAME	(192-139)
+#define SIZEOF_PDE_INLINE_NAME	(192-155)
 #else
-#define SIZEOF_PDE_INLINE_NAME	(128-87)
+#define SIZEOF_PDE_INLINE_NAME	(128-95)
 #endif
 	char inline_name[SIZEOF_PDE_INLINE_NAME];
 } __randomize_layout;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index baf1994289ce..7d94fa005b0d 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -40,7 +40,7 @@ static struct net *get_proc_net(const struct inode *inode)
 
 static int seq_open_net(struct inode *inode, struct file *file)
 {
-	size_t state_size = PDE(inode)->state_size;
+	unsigned int state_size = PDE(inode)->state_size;
 	struct seq_net_private *p;
 	struct net *net;

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox