Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 01/11] qede: Optimize aggregation information size
From: Yuval Mintz @ 2016-11-27 14:51 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480258273-24973-1-git-send-email-Yuval.Mintz@cavium.com>

Driver needs to maintain a structure per-each concurrent possible
open aggregation, but the structure storing that metadata is far from
being optimized - biggest waste in it is that there are 2 buffer metadata,
one for a replacement buffer when the aggregation begins and the other for
holding the first aggregation's buffer after it begins [as firmware might
still update it]. Those 2 can safely be united into a single metadata
structure.

struct qede_agg_info changes the following:

	/* size: 120, cachelines: 2, members: 9 */
	/* sum members: 114, holes: 1, sum holes: 4 */
	/* padding: 2 */
	/* paddings: 2, sum paddings: 8 */
	/* last cacheline: 56 bytes */
 -->
	/* size: 48, cachelines: 1, members: 9 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 48 bytes */

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h      | 29 +++++++++----
 drivers/net/ethernet/qlogic/qede/qede_main.c | 63 ++++++++++++----------------
 2 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 0cba21b..1d4c7e0 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -225,15 +225,30 @@ enum qede_agg_state {
 };
 
 struct qede_agg_info {
-	struct sw_rx_data replace_buf;
-	dma_addr_t replace_buf_mapping;
-	struct sw_rx_data start_buf;
-	dma_addr_t start_buf_mapping;
-	struct eth_fast_path_rx_tpa_start_cqe start_cqe;
-	enum qede_agg_state agg_state;
+	/* rx_buf is a data buffer that can be placed /consumed from rx bd
+	 * chain. It has two purposes: We will preallocate the data buffer
+	 * for each aggregation when we open the interface and will place this
+	 * buffer on the rx-bd-ring when we receive TPA_START. We don't want
+	 * to be in a state where allocation fails, as we can't reuse the
+	 * consumer buffer in the rx-chain since FW may still be writing to it
+	 * (since header needs to be modified for TPA).
+	 * The second purpose is to keep a pointer to the bd buffer during
+	 * aggregation.
+	 */
+	struct sw_rx_data buffer;
+	dma_addr_t buffer_mapping;
+
 	struct sk_buff *skb;
-	int frag_id;
+
+	/* We need some structs from the start cookie until termination */
 	u16 vlan_tag;
+	u16 start_cqe_bd_len;
+	u8 start_cqe_placement_offset;
+
+	u8 state;
+	u8 frag_id;
+
+	u8 tunnel_type;
 };
 
 struct qede_rx_queue {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b84a2c4..653be22 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1058,7 +1058,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[tpa_agg_index];
 	struct sk_buff *skb = tpa_info->skb;
 
-	if (unlikely(tpa_info->agg_state != QEDE_AGG_STATE_START))
+	if (unlikely(tpa_info->state != QEDE_AGG_STATE_START))
 		goto out;
 
 	/* Add one frag and update the appropriate fields in the skb */
@@ -1084,7 +1084,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	return 0;
 
 out:
-	tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+	tpa_info->state = QEDE_AGG_STATE_ERROR;
 	qede_recycle_rx_bd_ring(rxq, edev, 1);
 	return -ENOMEM;
 }
@@ -1096,8 +1096,8 @@ static void qede_tpa_start(struct qede_dev *edev,
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[cqe->tpa_agg_index];
 	struct eth_rx_bd *rx_bd_cons = qed_chain_consume(&rxq->rx_bd_ring);
 	struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring);
-	struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
-	dma_addr_t mapping = tpa_info->replace_buf_mapping;
+	struct sw_rx_data *replace_buf = &tpa_info->buffer;
+	dma_addr_t mapping = tpa_info->buffer_mapping;
 	struct sw_rx_data *sw_rx_data_cons;
 	struct sw_rx_data *sw_rx_data_prod;
 	enum pkt_hash_types rxhash_type;
@@ -1122,11 +1122,11 @@ static void qede_tpa_start(struct qede_dev *edev,
 	/* move partial skb from cons to pool (don't unmap yet)
 	 * save mapping, incase we drop the packet later on.
 	 */
-	tpa_info->start_buf = *sw_rx_data_cons;
+	tpa_info->buffer = *sw_rx_data_cons;
 	mapping = HILO_U64(le32_to_cpu(rx_bd_cons->addr.hi),
 			   le32_to_cpu(rx_bd_cons->addr.lo));
 
-	tpa_info->start_buf_mapping = mapping;
+	tpa_info->buffer_mapping = mapping;
 	rxq->sw_rx_cons++;
 
 	/* set tpa state to start only if we are able to allocate skb
@@ -1137,23 +1137,25 @@ static void qede_tpa_start(struct qede_dev *edev,
 					 le16_to_cpu(cqe->len_on_first_bd));
 	if (unlikely(!tpa_info->skb)) {
 		DP_NOTICE(edev, "Failed to allocate SKB for gro\n");
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		tpa_info->state = QEDE_AGG_STATE_ERROR;
 		goto cons_buf;
 	}
 
-	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
-	memcpy(&tpa_info->start_cqe, cqe, sizeof(tpa_info->start_cqe));
-
 	/* Start filling in the aggregation info */
+	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
 	tpa_info->frag_id = 0;
-	tpa_info->agg_state = QEDE_AGG_STATE_START;
+	tpa_info->state = QEDE_AGG_STATE_START;
 
 	rxhash = qede_get_rxhash(edev, cqe->bitfields,
 				 cqe->rss_hash, &rxhash_type);
 	skb_set_hash(tpa_info->skb, rxhash, rxhash_type);
+
+	/* Store some information from first CQE */
+	tpa_info->start_cqe_placement_offset = cqe->placement_offset;
+	tpa_info->start_cqe_bd_len = le16_to_cpu(cqe->len_on_first_bd);
 	if ((le16_to_cpu(cqe->pars_flags.flags) >>
 	     PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT) &
-		    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
+	    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
 		tpa_info->vlan_tag = le16_to_cpu(cqe->vlan_tag);
 	else
 		tpa_info->vlan_tag = 0;
@@ -1169,7 +1171,7 @@ static void qede_tpa_start(struct qede_dev *edev,
 	if (unlikely(cqe->ext_bd_len_list[1])) {
 		DP_ERR(edev,
 		       "Unlikely - got a TPA aggregation with more than one ext_bd_len_list entry in the TPA start\n");
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		tpa_info->state = QEDE_AGG_STATE_ERROR;
 	}
 }
 
@@ -1276,7 +1278,7 @@ static void qede_tpa_end(struct qede_dev *edev,
 		DP_ERR(edev,
 		       "Strange - TPA emd with more than a single len_list entry\n");
 
-	if (unlikely(tpa_info->agg_state != QEDE_AGG_STATE_START))
+	if (unlikely(tpa_info->state != QEDE_AGG_STATE_START))
 		goto err;
 
 	/* Sanity */
@@ -1290,14 +1292,9 @@ static void qede_tpa_end(struct qede_dev *edev,
 		       le16_to_cpu(cqe->total_packet_len), skb->len);
 
 	memcpy(skb->data,
-	       page_address(tpa_info->start_buf.data) +
-		tpa_info->start_cqe.placement_offset +
-		tpa_info->start_buf.page_offset,
-	       le16_to_cpu(tpa_info->start_cqe.len_on_first_bd));
-
-	/* Recycle [mapped] start buffer for the next replacement */
-	tpa_info->replace_buf = tpa_info->start_buf;
-	tpa_info->replace_buf_mapping = tpa_info->start_buf_mapping;
+	       page_address(tpa_info->buffer.data) +
+	       tpa_info->start_cqe_placement_offset +
+	       tpa_info->buffer.page_offset, tpa_info->start_cqe_bd_len);
 
 	/* Finalize the SKB */
 	skb->protocol = eth_type_trans(skb, edev->ndev);
@@ -1310,18 +1307,11 @@ static void qede_tpa_end(struct qede_dev *edev,
 
 	qede_gro_receive(edev, fp, skb, tpa_info->vlan_tag);
 
-	tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+	tpa_info->state = QEDE_AGG_STATE_NONE;
 
 	return;
 err:
-	/* The BD starting the aggregation is still mapped; Re-use it for
-	 * future aggregations [as replacement buffer]
-	 */
-	memcpy(&tpa_info->replace_buf, &tpa_info->start_buf,
-	       sizeof(struct sw_rx_data));
-	tpa_info->replace_buf_mapping = tpa_info->start_buf_mapping;
-	tpa_info->start_buf.data = NULL;
-	tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+	tpa_info->state = QEDE_AGG_STATE_NONE;
 	dev_kfree_skb_any(tpa_info->skb);
 	tpa_info->skb = NULL;
 }
@@ -2823,7 +2813,7 @@ static void qede_free_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
+		struct sw_rx_data *replace_buf = &tpa_info->buffer;
 
 		if (replace_buf->data) {
 			dma_unmap_page(&edev->pdev->dev,
@@ -2905,7 +2895,7 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
+		struct sw_rx_data *replace_buf = &tpa_info->buffer;
 
 		replace_buf->data = alloc_pages(GFP_ATOMIC, 0);
 		if (unlikely(!replace_buf->data)) {
@@ -2923,10 +2913,9 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 		}
 
 		replace_buf->mapping = mapping;
-		tpa_info->replace_buf.page_offset = 0;
-
-		tpa_info->replace_buf_mapping = mapping;
-		tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+		tpa_info->buffer.page_offset = 0;
+		tpa_info->buffer_mapping = mapping;
+		tpa_info->state = QEDE_AGG_STATE_NONE;
 	}
 
 	return 0;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 00/11] qed*: Add XDP support
From: Yuval Mintz @ 2016-11-27 14:51 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz

This patch series is intended to add XDP to the qede driver, although
it contains quite a bit of cleanups, refactorings and infrastructure
changes as well.

The content of this series can be roughly divided into:

 - Datapath improvements - mostly focused on having the datapath utilize
parameters which can be more tightly contained in cachelines.
Patches #1, #2, #8, #9 belong to this group.

 - Refactoring - done mostly in favour of XDP. Patches #3, #4, #5, #9.

 - Infrastructure changes - done in favour of XDP. Paches #6 and #7 belong
to this category [#7 being by far the biggest patch in the series].

 - Actual XDP support - last two patches [#10, #11].

Hi Dave,

Please consider applying this to `net-next'.
[Probably optimistic for V1 :-P]

Thanks,
Yuval


Yuval Mintz (11):
  qede: Optimize aggregation information size
  qed: Optimize qed_chain datapath usage
  qede: Remove 'num_tc'.
  qede: Refactor statistics gathering
  qede: Refactor data-path Rx flow
  qede: Revise state locking scheme
  qed*: Handle-based L2-queues.
  qede: Don't check netdevice for receive-hashing
  qede: Better utilize the qede_[rt]x_queue
  qede: Add basic XDP support
  qede: Add support for XDP_TX

 drivers/net/ethernet/qlogic/qed/qed.h             |   12 -
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   33 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  595 +++++----
 drivers/net/ethernet/qlogic/qed/qed_l2.h          |  133 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |    4 +-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       |  275 ++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.h       |   21 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c          |   90 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.h          |   40 +-
 drivers/net/ethernet/qlogic/qede/qede.h           |  163 ++-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c   |  210 +--
 drivers/net/ethernet/qlogic/qede/qede_main.c      | 1409 +++++++++++++--------
 include/linux/qed/qed_chain.h                     |  144 ++-
 include/linux/qed/qed_eth_if.h                    |   56 +-
 14 files changed, 1894 insertions(+), 1291 deletions(-)

-- 
1.9.3

^ permalink raw reply

* Re: [PATCH net-next 00/10] Mellanox 100G mlx5 DCBX and ethtool updates
From: Saeed Mahameed @ 2016-11-27 13:18 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, Linux Netdev List
In-Reply-To: <1480169914-22347-1-git-send-email-saeedm@mellanox.com>

On Sat, Nov 26, 2016 at 4:18 PM, Saeed Mahameed <saeedm@mellanox.com> wrote:
> Hi Dave,
>
> This series provides the following mlx5 updates:
>
> From Huy:
> DCBX CEE API and DCBX firmware/host modes support.
>     - 1st patch ensures the dcbnl_rtnl_ops is published only when the qos
>       capability bits is on.
>     - 2nd patch adds the support for CEE interfaces into mlx5 dcbnl_rtnl_ops
>     - 3rd patch refactors ETS query to read ETS configuration directly from
>       firmware rather than having a software shadow to it. The existing IEEE
>       interfaces stays the same.
>     - 4th patch adds the support for MLX5_REG_DCBX_PARAM and MLX5_REG_DCBX_APP
>       firmware commands to manipulate mlx5 DCBX mode.
>     - 5th patch adds the driver support for the new DCBX firmware.  This ensures
>       the backward compatibility versus the old and new firmware. With the new DCBX
>       firmware, qos settings can be controlled by either firmware or software
>       depending on the DCBX mode.
>
> From Kamal and Saeed:
>     - mlx5 self-test support.
>
> From Shaker:
>     - Private flag to give the user the ability to enable/disable mlx5 CQE
>       compression.
>

Hi Dave,

Please hold on with this series, we need a respin to fix some static
checkers warnings.

Thanks
Saeed.

^ permalink raw reply

* Re: [PATCH net-next] net/sched: Export tc_tunnel_key so its UAPI accessible
From: Roi Dayan @ 2016-11-27 10:14 UTC (permalink / raw)
  To: David S. Miller; +Cc: roid, netdev, Amir Vadai
In-Reply-To: <1480241233-50097-1-git-send-email-roid@mellanox.com>

Hi,

My mistake. this should be targeted for net. I'm sending a patch over 
net branch.

Thanks,
Roi


On 27/11/2016 12:07, Roi Dayan wrote:
> Export tc_tunnel_key so it can be used from user space.
>
> Signed-off-by: Roi Dayan <roid@mellanox.com>
> Reviewed-by: Amir Vadai <amir@vadai.me>
> ---
>   include/uapi/linux/tc_act/Kbuild | 1 +
>   1 file changed, 1 insertion(+)
>
> diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
> index e3969bd..9611c7b 100644
> --- a/include/uapi/linux/tc_act/Kbuild
> +++ b/include/uapi/linux/tc_act/Kbuild
> @@ -11,3 +11,4 @@ header-y += tc_vlan.h
>   header-y += tc_bpf.h
>   header-y += tc_connmark.h
>   header-y += tc_ife.h
> +header-y += tc_tunnel_key.h

^ permalink raw reply

* Did you get my message this time?
From:  Friedrich Mayrhofer @ 2016-11-27  7:42 UTC (permalink / raw)





This is the second time i am sending you this mail.

I, Friedrich Mayrhofer Donate $ 1,000,000.00 to You, Email  Me personally for more details.

Regards.
Friedrich Mayrhofer

^ permalink raw reply

* [PATCH iproute2 0/2] Adding help and inline mode control to eswitch
From: Roi Dayan @ 2016-11-27 11:21 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Or Gerlitz, Roi Dayan

Hi,

The first patch is adding missing help message for eswitch.
The second patch is adding functionality to show/modify eswitch inline mode.

Thanks,
Roi


Roi Dayan (2):
  devlink: Add usage help for eswitch subcommand
  devlink: Add option to set and show eswitch inline mode

 devlink/devlink.c       |   89 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/devlink.h |    8 ++++
 man/man8/devlink-dev.8  |   21 ++++++++++-
 3 files changed, 115 insertions(+), 3 deletions(-)

^ permalink raw reply

* [PATCH iproute2 2/2] devlink: Add option to set and show eswitch inline mode
From: Roi Dayan @ 2016-11-27 11:21 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Or Gerlitz, Roi Dayan
In-Reply-To: <1480245663-814-1-git-send-email-roid@mellanox.com>

This is needed for some HWs to do proper macthing and steering.
Possible values are none, link, network, transport.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 devlink/devlink.c       |   82 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/devlink.h |    8 ++++
 man/man8/devlink-dev.8  |   19 +++++++++++
 3 files changed, 108 insertions(+), 1 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 673234f..23db9e7 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -28,6 +28,10 @@
 
 #define ESWITCH_MODE_LEGACY "legacy"
 #define ESWITCH_MODE_SWITCHDEV "switchdev"
+#define ESWITCH_INLINE_MODE_NONE "none"
+#define ESWITCH_INLINE_MODE_LINK "link"
+#define ESWITCH_INLINE_MODE_NETWORK "network"
+#define ESWITCH_INLINE_MODE_TRANSPORT "transport"
 
 #define pr_err(args...) fprintf(stderr, ##args)
 #define pr_out(args...) fprintf(stdout, ##args)
@@ -132,6 +136,7 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_SB_TH		BIT(9)
 #define DL_OPT_SB_TC		BIT(10)
 #define DL_OPT_ESWITCH_MODE	BIT(11)
+#define DL_OPT_ESWITCH_INLINE_MODE	BIT(12)
 
 struct dl_opts {
 	uint32_t present; /* flags of present items */
@@ -148,6 +153,7 @@ struct dl_opts {
 	uint32_t sb_threshold;
 	uint16_t sb_tc_index;
 	enum devlink_eswitch_mode eswitch_mode;
+	enum devlink_eswitch_inline_mode eswitch_inline_mode;
 };
 
 struct dl {
@@ -305,6 +311,9 @@ static int attr_cb(const struct nlattr *attr, void *data)
 	if (type == DEVLINK_ATTR_ESWITCH_MODE &&
 	    mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
 		return MNL_CB_ERROR;
+	if (type == DEVLINK_ATTR_ESWITCH_INLINE_MODE &&
+	    mnl_attr_validate(attr, MNL_TYPE_U8) < 0)
+		return MNL_CB_ERROR;
 	tb[type] = attr;
 	return MNL_CB_OK;
 }
@@ -682,6 +691,24 @@ static int eswitch_mode_get(const char *typestr,
 	return 0;
 }
 
+static int eswitch_inline_mode_get(const char *typestr,
+				   enum devlink_eswitch_inline_mode *p_mode)
+{
+	if (strcmp(typestr, ESWITCH_INLINE_MODE_NONE) == 0) {
+		*p_mode = DEVLINK_ESWITCH_INLINE_MODE_NONE;
+	} else if (strcmp(typestr, ESWITCH_INLINE_MODE_LINK) == 0) {
+		*p_mode = DEVLINK_ESWITCH_INLINE_MODE_LINK;
+	} else if (strcmp(typestr, ESWITCH_INLINE_MODE_NETWORK) == 0) {
+		*p_mode = DEVLINK_ESWITCH_INLINE_MODE_NETWORK;
+	} else if (strcmp(typestr, ESWITCH_INLINE_MODE_TRANSPORT) == 0) {
+		*p_mode = DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT;
+	} else {
+		pr_err("Unknown eswitch inline mode \"%s\"\n", typestr);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 			 uint32_t o_optional)
 {
@@ -803,6 +830,19 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 			if (err)
 				return err;
 			o_found |= DL_OPT_ESWITCH_MODE;
+		} else if (dl_argv_match(dl, "inline-mode") &&
+			   (o_all & DL_OPT_ESWITCH_INLINE_MODE)) {
+			const char *typestr;
+
+			dl_arg_inc(dl);
+			err = dl_argv_str(dl, &typestr);
+			if (err)
+				return err;
+			err = eswitch_inline_mode_get(
+				typestr, &opts->eswitch_inline_mode);
+			if (err)
+				return err;
+			o_found |= DL_OPT_ESWITCH_INLINE_MODE;
 		} else {
 			pr_err("Unknown option \"%s\"\n", dl_argv(dl));
 			return -EINVAL;
@@ -863,6 +903,12 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 		return -EINVAL;
 	}
 
+	if ((o_required & DL_OPT_ESWITCH_INLINE_MODE) &&
+	    !(o_found & DL_OPT_ESWITCH_INLINE_MODE)) {
+		pr_err("E-Switch inline-mode option expected.\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -909,6 +955,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
 	if (opts->present & DL_OPT_ESWITCH_MODE)
 		mnl_attr_put_u16(nlh, DEVLINK_ATTR_ESWITCH_MODE,
 				 opts->eswitch_mode);
+	if (opts->present & DL_OPT_ESWITCH_INLINE_MODE)
+		mnl_attr_put_u8(nlh, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+				opts->eswitch_inline_mode);
 }
 
 static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
@@ -964,6 +1013,7 @@ static void cmd_dev_help(void)
 {
 	pr_err("Usage: devlink dev show [ DEV ]\n");
 	pr_err("       devlink dev eswitch set DEV [ mode { legacy | switchdev } ]\n");
+	pr_err("                               [ inline-mode { none | link | network | transport } ]\n");
 	pr_err("       devlink dev eswitch show DEV\n");
 }
 
@@ -1203,6 +1253,22 @@ static const char *eswitch_mode_name(uint32_t mode)
 	}
 }
 
+static const char *eswitch_inline_mode_name(uint32_t mode)
+{
+	switch (mode) {
+	case DEVLINK_ESWITCH_INLINE_MODE_NONE:
+		return ESWITCH_INLINE_MODE_NONE;
+	case DEVLINK_ESWITCH_INLINE_MODE_LINK:
+		return ESWITCH_INLINE_MODE_LINK;
+	case DEVLINK_ESWITCH_INLINE_MODE_NETWORK:
+		return ESWITCH_INLINE_MODE_NETWORK;
+	case DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT:
+		return ESWITCH_INLINE_MODE_TRANSPORT;
+	default:
+		return "<unknown mode>";
+	}
+}
+
 static void pr_out_eswitch(struct dl *dl, struct nlattr **tb)
 {
 	__pr_out_handle_start(dl, tb, true, false);
@@ -1210,6 +1276,12 @@ static void pr_out_eswitch(struct dl *dl, struct nlattr **tb)
 	if (tb[DEVLINK_ATTR_ESWITCH_MODE])
 		pr_out_str(dl, "mode",
 			   eswitch_mode_name(mnl_attr_get_u16(tb[DEVLINK_ATTR_ESWITCH_MODE])));
+
+	if (tb[DEVLINK_ATTR_ESWITCH_INLINE_MODE])
+		pr_out_str(dl, "inline-mode",
+			   eswitch_inline_mode_name(mnl_attr_get_u8(
+				   tb[DEVLINK_ATTR_ESWITCH_INLINE_MODE])));
+
 	pr_out_handle_end(dl);
 }
 
@@ -1252,10 +1324,18 @@ static int cmd_dev_eswitch_set(struct dl *dl)
 	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_ESWITCH_MODE_SET,
 			       NLM_F_REQUEST | NLM_F_ACK);
 
-	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | DL_OPT_ESWITCH_MODE, 0);
+	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE,
+				DL_OPT_ESWITCH_MODE |
+				DL_OPT_ESWITCH_INLINE_MODE);
+
 	if (err)
 		return err;
 
+	if (dl->opts.present == 1) {
+		pr_err("Need to set at least one option\n");
+		return -ENOENT;
+	}
+
 	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
 }
 
diff --git a/include/linux/devlink.h b/include/linux/devlink.h
index b7c1a06..7c14d77 100644
--- a/include/linux/devlink.h
+++ b/include/linux/devlink.h
@@ -102,6 +102,13 @@ enum devlink_eswitch_mode {
 	DEVLINK_ESWITCH_MODE_SWITCHDEV,
 };
 
+enum devlink_eswitch_inline_mode {
+	DEVLINK_ESWITCH_INLINE_MODE_NONE,
+	DEVLINK_ESWITCH_INLINE_MODE_LINK,
+	DEVLINK_ESWITCH_INLINE_MODE_NETWORK,
+	DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT,
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -133,6 +140,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_SB_OCC_CUR,		/* u32 */
 	DEVLINK_ATTR_SB_OCC_MAX,		/* u32 */
 	DEVLINK_ATTR_ESWITCH_MODE,		/* u16 */
+	DEVLINK_ATTR_ESWITCH_INLINE_MODE,	/* u8 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/man/man8/devlink-dev.8 b/man/man8/devlink-dev.8
index 931e334..6bfe66f 100644
--- a/man/man8/devlink-dev.8
+++ b/man/man8/devlink-dev.8
@@ -31,6 +31,9 @@ devlink-dev \- devlink device configuration
 .RI "[ "
 .BR mode " { " legacy " | " switchdev " } "
 .RI "]"
+.RI "[ "
+.BR inline-mode " { " none " | " link " | " network " | " transport " } "
+.RI "]"
 
 .ti -8
 .BR "devlink dev eswitch show"
@@ -62,6 +65,22 @@ Set eswitch mode
 .I switchdev
 - SRIOV switchdev offloads
 
+.TP
+.BR inline-mode " { " none " | " link " | " network " | " transport " } "
+Some HWs need the VF driver to put part of the packet headers on the TX descriptor so the e-switch can do proper matching and steering.
+
+.I none
+- None
+
+.I link
+- L2 mode
+
+.I network
+- L3 mode
+
+.I transport
+- L4 mode
+
 .SH "EXAMPLES"
 .PP
 devlink dev show
-- 
1.7.1

^ permalink raw reply related

* [PATCH iproute2 1/2] devlink: Add usage help for eswitch subcommand
From: Roi Dayan @ 2016-11-27 11:21 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Or Gerlitz, Roi Dayan
In-Reply-To: <1480245663-814-1-git-send-email-roid@mellanox.com>

Add missing usage help for devlink dev eswitch subcommand.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 devlink/devlink.c      |    7 ++++++-
 man/man8/devlink-dev.8 |    2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index ccca0fb..673234f 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -963,6 +963,8 @@ static bool dl_dump_filter(struct dl *dl, struct nlattr **tb)
 static void cmd_dev_help(void)
 {
 	pr_err("Usage: devlink dev show [ DEV ]\n");
+	pr_err("       devlink dev eswitch set DEV [ mode { legacy | switchdev } ]\n");
+	pr_err("       devlink dev eswitch show DEV\n");
 }
 
 static bool cmp_arr_last_handle(struct dl *dl, const char *bus_name,
@@ -1259,7 +1261,10 @@ static int cmd_dev_eswitch_set(struct dl *dl)
 
 static int cmd_dev_eswitch(struct dl *dl)
 {
-	if (dl_argv_match(dl, "set")) {
+	if (dl_argv_match(dl, "help") || dl_no_arg(dl)) {
+		cmd_dev_help();
+		return 0;
+	} else if (dl_argv_match(dl, "set")) {
 		dl_arg_inc(dl);
 		return cmd_dev_eswitch_set(dl);
 	} else if (dl_argv_match(dl, "show")) {
diff --git a/man/man8/devlink-dev.8 b/man/man8/devlink-dev.8
index 9ce3193..931e334 100644
--- a/man/man8/devlink-dev.8
+++ b/man/man8/devlink-dev.8
@@ -54,7 +54,7 @@ BUS_NAME/BUS_ADDRESS
 
 .TP
 .BR mode " { " legacy " | " switchdev " } "
-set eswitch mode
+Set eswitch mode
 
 .I legacy
 - Legacy SRIOV
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next 1/1] driver: ipvlan: Use NF_IP_PRI_LAST as hook priority instead of INT_MAX
From: Gao Feng @ 2016-11-27 11:20 UTC (permalink / raw)
  To: kbuild test robot
  Cc: kbuild-all, David S. Miller, maheshb, Eric Dumazet,
	Linux Kernel Network Developers
In-Reply-To: <201611271927.VjauNPub%fengguang.wu@intel.com>

On Sun, Nov 27, 2016 at 7:17 PM, kbuild test robot <lkp@intel.com> wrote:
> Hi Gao,
>
> [auto build test ERROR on net-next/master]
>
> url:    https://github.com/0day-ci/linux/commits/fgao-ikuai8-com/driver-ipvlan-Use-NF_IP_PRI_LAST-as-hook-priority-instead-of-INT_MAX/20161127-182724
> config: x86_64-allmodconfig (attached as .config)
> compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64
>
> All errors (new ones prefixed by >>):
>
>>> drivers/net/ipvlan/ipvlan_main.c:19:15: error: 'NF_IP_PRI_LAST' undeclared here (not in a function)
>       .priority = NF_IP_PRI_LAST,
>                   ^~~~~~~~~~~~~~
>
> vim +/NF_IP_PRI_LAST +19 drivers/net/ipvlan/ipvlan_main.c
>
>     13
>     14  static struct nf_hook_ops ipvl_nfops[] __read_mostly = {
>     15          {
>     16                  .hook     = ipvlan_nf_input,
>     17                  .pf       = NFPROTO_IPV4,
>     18                  .hooknum  = NF_INET_LOCAL_IN,
>   > 19                  .priority = NF_IP_PRI_LAST,
>     20          },
>     21          {
>     22                  .hook     = ipvlan_nf_input,
>
> ---
> 0-DAY kernel test infrastructure                Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Sorry, it broke the build.
I add the header file, but forget to commit, so that this patch does
not contain header file

Now I have sent the v2 patch.

Regards
Feng

^ permalink raw reply

* [PATCH net-next v2 1/1] driver: ipvlan: Use NF_IP_PRI_LAST as hook priority instead of INT_MAX
From: fgao @ 2016-11-27 11:18 UTC (permalink / raw)
  To: davem, maheshb, edumazet, netdev, gfree.wind; +Cc: Gao Feng

From: Gao Feng <fgao@ikuai8.com>

It is better to use NF_IP_PRI_LAST instead of INT_MAX as hook priority.
The former is good at readability and easier to maintain.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 v2: Add the lost header file. It is added in local but not in v1 patch
 v1: Inital patch

 drivers/net/ipvlan/ipvlan_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ab90b22..01c7446 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -7,6 +7,7 @@
  *
  */
 
+#include "linux/netfilter_ipv4.h"
 #include "ipvlan.h"
 
 static u32 ipvl_nf_hook_refcnt = 0;
@@ -16,13 +17,13 @@
 		.hook     = ipvlan_nf_input,
 		.pf       = NFPROTO_IPV4,
 		.hooknum  = NF_INET_LOCAL_IN,
-		.priority = INT_MAX,
+		.priority = NF_IP_PRI_LAST,
 	},
 	{
 		.hook     = ipvlan_nf_input,
 		.pf       = NFPROTO_IPV6,
 		.hooknum  = NF_INET_LOCAL_IN,
-		.priority = INT_MAX,
+		.priority = NF_IP_PRI_LAST,
 	},
 };
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net-next 1/1] driver: ipvlan: Use NF_IP_PRI_LAST as hook priority instead of INT_MAX
From: kbuild test robot @ 2016-11-27 11:17 UTC (permalink / raw)
  To: fgao; +Cc: kbuild-all, davem, maheshb, edumazet, netdev, gfree.wind,
	Gao Feng
In-Reply-To: <1480242245-7516-1-git-send-email-fgao@ikuai8.com>

[-- Attachment #1: Type: text/plain, Size: 1115 bytes --]

Hi Gao,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/fgao-ikuai8-com/driver-ipvlan-Use-NF_IP_PRI_LAST-as-hook-priority-instead-of-INT_MAX/20161127-182724
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

>> drivers/net/ipvlan/ipvlan_main.c:19:15: error: 'NF_IP_PRI_LAST' undeclared here (not in a function)
      .priority = NF_IP_PRI_LAST,
                  ^~~~~~~~~~~~~~

vim +/NF_IP_PRI_LAST +19 drivers/net/ipvlan/ipvlan_main.c

    13	
    14	static struct nf_hook_ops ipvl_nfops[] __read_mostly = {
    15		{
    16			.hook     = ipvlan_nf_input,
    17			.pf       = NFPROTO_IPV4,
    18			.hooknum  = NF_INET_LOCAL_IN,
  > 19			.priority = NF_IP_PRI_LAST,
    20		},
    21		{
    22			.hook     = ipvlan_nf_input,

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 56839 bytes --]

^ permalink raw reply

* Re: [PATCH iproute2 0/2] tc/cls_flower: Support for ip tunnel metadata set/release/classify
From: Amir Vadai @ 2016-11-27 10:35 UTC (permalink / raw)
  To: Jiri Benc
  Cc: Stephen Hemminger, David S. Miller, netdev, Or Gerlitz,
	Hadar Har-Zion, Roi Dayan
In-Reply-To: <20161124163355.1c4d686f@griffin>

On Thu, Nov 24, 2016 at 04:33:55PM +0100, Jiri Benc wrote:
> On Thu, 24 Nov 2016 17:06:33 +0200, Amir Vadai wrote:
> > So you mean to just unconditionally call skb_dst_drop() from
> > act_mirred()?
> 
> That's one option. Or just leave the dst there, it shouldn't matter?
> (Except for forwarding to a different tunnel but as I said, it's a
> corner case and we may have a "tunnel_key unset" action for that.)
Ok, so I will write in the docs that it is optional to use the "unset"
operation (and will rename it from "release" to "unset")

> 
> > The use case we already have that uses the release action is the
> > hardware offload support, which is already in the kernel.
> > It is using the "tunnel_key release" action to signal the hardware to
> > strip off the ip tunnel headers.
> 
> The tunnel headers must be removed upon reception on the tunnel
> interface without specifying anything, because that's how the Linux
> kernel behaves currently. If this is offloaded, this behavior must be
> preserved. I don't see how "tunnel_key release" might be used for
> stripping the headers.
Maybe I didn't express myself right: I need to tell the hardware
explicitly during the filter initialization to redirect the packets
arriving from one interface to another and to strip off the tunnel
headers. This is what happens when a "tunnel_key unset" action is
created and offloaded - it configures the hardware respectively.
So this is one usecase where this operation is needed - and yes, in this
use case the actual skb_dst_drop() is not important or needed, but I
don't think it makes any harm.
In the tunnel dev to tunnel dev use case, the operation could be
meaningful, if the user don't want to reuse the metadata created by the
origin tunnel dev.

> 
>  Jiri

^ permalink raw reply

* [PATCH net-next 1/1] driver: ipvlan: Use NF_IP_PRI_LAST as hook priority instead of INT_MAX
From: fgao @ 2016-11-27 10:24 UTC (permalink / raw)
  To: davem, maheshb, edumazet, netdev, gfree.wind; +Cc: Gao Feng

From: Gao Feng <fgao@ikuai8.com>

It is better to use NF_IP_PRI_LAST instead of INT_MAX as hook priority.
The former is good at readability and easier to maintain.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ab90b22..d70d245 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -16,13 +16,13 @@
 		.hook     = ipvlan_nf_input,
 		.pf       = NFPROTO_IPV4,
 		.hooknum  = NF_INET_LOCAL_IN,
-		.priority = INT_MAX,
+		.priority = NF_IP_PRI_LAST,
 	},
 	{
 		.hook     = ipvlan_nf_input,
 		.pf       = NFPROTO_IPV6,
 		.hooknum  = NF_INET_LOCAL_IN,
-		.priority = INT_MAX,
+		.priority = NF_IP_PRI_LAST,
 	},
 };
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net] net/sched: Export tc_tunnel_key so its UAPI accessible
From: Roi Dayan @ 2016-11-27 10:14 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Amir Vadai, Roi Dayan

Export tc_tunnel_key so it can be used from user space.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Amir Vadai <amir@vadai.me>
---

Hi,

I previously sent the patch targeted for net-next but should be for net.

Thanks,
Roi


 include/uapi/linux/tc_act/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index e3969bd..9611c7b 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -11,3 +11,4 @@ header-y += tc_vlan.h
 header-y += tc_bpf.h
 header-y += tc_connmark.h
 header-y += tc_ife.h
+header-y += tc_tunnel_key.h
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next] net/sched: Export tc_tunnel_key so its UAPI accessible
From: Roi Dayan @ 2016-11-27 10:07 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Amir Vadai, Roi Dayan

Export tc_tunnel_key so it can be used from user space.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Amir Vadai <amir@vadai.me>
---
 include/uapi/linux/tc_act/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index e3969bd..9611c7b 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -11,3 +11,4 @@ header-y += tc_vlan.h
 header-y += tc_bpf.h
 header-y += tc_connmark.h
 header-y += tc_ife.h
+header-y += tc_tunnel_key.h
-- 
2.7.4

^ permalink raw reply related

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
From: Roi Dayan @ 2016-11-27  6:29 UTC (permalink / raw)
  To: Daniel Borkmann, Cong Wang
  Cc: roid, Linux Kernel Network Developers, Jiri Pirko, John Fastabend
In-Reply-To: <583A6567.30003@mellanox.com>



On 27/11/2016 06:47, Roi Dayan wrote:
>
>
> On 27/11/2016 02:33, Daniel Borkmann wrote:
>> On 11/26/2016 12:09 PM, Daniel Borkmann wrote:
>>> On 11/26/2016 07:46 AM, Cong Wang wrote:
>>>> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann 
>>>> <daniel@iogearbox.net> wrote:
>> [...]
>>>>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>>>>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>>>>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>>>>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>>>>> Outstanding readers should either bail out due to if (!cl) or can 
>>>>> still
>>>>> process the chain until read section ends, but during that time, 
>>>>> cl->q
>>>>> resp. bstats should be good. Do you happen to know what's at address
>>>>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), 
>>>>> but
>>>>> at least on ingress (netif_receive_skb_internal()) we hold 
>>>>> rcu_read_lock()
>>>>> here. The KASAN report is reliably happening at this location, right?
>>>>
>>>> I am confused as well, I don't see how it could be related to my 
>>>> patch yet.
>>>> I will take a deep look in the weekend.
>
>
>
> Hi Cong,
>
> When reported the new trace I didn't mean it's related to your patch, 
> I just wanted to point it out it exposed something. I should have been 
> clear about it.
>
>
>>>
>>> Ok, I'm currently on the run. Got too late yesterday night, but I'll
>>> write what I found in the evening today, not related to ingress though.
>>
>> Just pushed out my analysis to netdev under "[PATCH net] net, sched: 
>> respect
>> rcu grace period on cls destruction". My conclusion is that both 
>> issues are
>> actually separate, and that one is small enough where we could route 
>> it via
>> net actually. Perhaps this at the same time shrinks your "[PATCH 
>> net-next]
>> net_sched: move the empty tp check from ->destroy() to ->delete()" to a
>> reasonable size that it's suitable to net as well. Your 
>> ->delete()/->destroy()
>> one is definitely needed, too. The tp->root one is independant of 
>> ->delete()/
>> ->destroy() as they are different races and tp->root could also 
>> happen when
>> you just destroy the whole tp directly. I think that seems like a 
>> good path
>> forward to me.
>>
>> Thanks,
>> Daniel
>
>
>
> Hi Daniel,
>
> As for the tainted kernel. I was in old (week or two) net-next tree 
> and only cherry-picked from latest net-next related patches to 
> Mellanox HCA, cls_api, cls_flower, devlink. so those are the tainted 
> modules.
> I have the issue reproducing in that tree so wanted it to check it 
> with Cong's patch instead of latest net-next.
> I'll try running reproducing the issue with your new patch and later 
> try latest net-next as well.
>
> Thanks,
> Roi
>

Hi,

I tested "[PATCH net] net, sched: respect rcu grace period on cls 
destruction" and could not reproduce my original issue.
I rebased "[Patch net-next] net_sched: move the empty tp check from 
->destroy() to ->delete()" over to test it in the same tree and got into 
a new trace in fl_delete.

[35659.012123] BUG: KASAN: wild-memory-access on address 1ffffffff803ca31
[35659.020042] Write of size 1 by task ovs-vswitchd/20135
[35659.025878] CPU: 19 PID: 20135 Comm: ovs-vswitchd Tainted: 
G           O    4.9.0-rc3+ #18
[35659.035948] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
[35659.043730] Call Trace:
[35659.046619]  [<ffffffff95b6dc42>] dump_stack+0x63/0x81
[35659.052456]  [<ffffffff955fbbf8>] kasan_report_error+0x408/0x4e0
[35659.059402]  [<ffffffff955fc2e8>] kasan_report+0x58/0x60
[35659.065428]  [<ffffffff952d5e8d>] ? call_rcu_sched+0x1d/0x20
[35659.072119]  [<ffffffffc01e0701>] ? fl_destroy_filter+0x21/0x30 
[cls_flower]
[35659.080217]  [<ffffffffc01e1ccf>] ? fl_delete+0x1df/0x2e0 [cls_flower]
[35659.087580]  [<ffffffff955fa4ca>] __asan_store1+0x4a/0x50
[35659.093697]  [<ffffffffc01e1ccf>] fl_delete+0x1df/0x2e0 [cls_flower]
[35659.100870]  [<ffffffff9653ecba>] tc_ctl_tfilter+0x10da/0x1b90


0x1d02 is in fl_delete (net/sched/cls_flower.c:805).
800             struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
801
802             rhashtable_remove_fast(&head->ht, &f->ht_node,
803                                    head->ht_params);
804             __fl_delete(tp, f);
805             *last = list_empty(&head->filters);
806             return 0;
807     }


Thanks,
Roi

^ permalink raw reply

* Re: Crash due to mutex genl_lock called from RCU context
From: Cong Wang @ 2016-11-27  6:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Subash Abhinov Kasiviswanathan, Thomas Graf,
	Linux Kernel Network Developers
In-Reply-To: <1480213570.18162.31.camel@edumazet-glaptop3.roam.corp.google.com>

On Sat, Nov 26, 2016 at 6:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Are you telling me inet_release() is called when we close() the first
> file descriptor ?
>
> fd1 = socket()
> fd2 = dup(fd1);
> close(fd2) -> release() ???

Sorry, I didn't express myself clearly, I meant your change,
if exclude the SOCK_RCU_FREE part, basically reverts this commit:

commit 3f660d66dfbc13ea4b61d3865851b348444c24b4
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Thu May 3 03:17:14 2007 -0700

    [NETLINK]: Kill CB only when socket is unused

IOW, ->release() is called when the last sock fd ref is gone, but ->destructor()
is called with the last sock ref is gone. They are very different.


>> I don't see why we need to get genl_lock in ->done() here, because we are
>> already the last sock using it and module ref protects the ops from being
>> removed via module, seems we are pretty safe without any lock.
>
> Well, at least this exposes a real bug in Thomas patch.
>
> Removing the lock might be done for net-next, not stable branches.

I am confused, what Subash reported is a kernel warning which can
surely be fixed by removing genl lock (if it is correct, I need to double
check), so why for net-next?

^ permalink raw reply

* RE: [PATCH] net: fec: turn on device when extracting statistics
From: Andy Duan @ 2016-11-27  6:25 UTC (permalink / raw)
  To: Nikita Yushchenko, David S. Miller, Troy Kisky, Andrew Lunn,
	Eric Nelson, Philippe Reynes, Johannes Berg,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
  Cc: Chris Healy
In-Reply-To: <1480068120-22137-1-git-send-email-nikita.yoush@cogentembedded.com>

From: Nikita Yushchenko <nikita.yoush@cogentembedded.com> Sent: Friday, November 25, 2016 6:02 PM
 >To: Andy Duan <fugang.duan@nxp.com>; David S. Miller
 ><davem@davemloft.net>; Troy Kisky <troy.kisky@boundarydevices.com>;
 >Andrew Lunn <andrew@lunn.ch>; Eric Nelson <eric@nelint.com>; Philippe
 >Reynes <tremyfr@gmail.com>; Johannes Berg <johannes@sipsolutions.net>;
 >netdev@vger.kernel.org; linux-kernel@vger.kernel.org
 >Cc: Chris Healy <cphealy@gmail.com>; Nikita Yushchenko
 ><nikita.yoush@cogentembedded.com>
 >Subject: [PATCH] net: fec: turn on device when extracting statistics
 >
 >Execution 'ethtool -S' on fec device that is down causes OOPS on Vybrid
 >board:
 >
 >Unhandled fault: external abort on non-linefetch (0x1008) at 0xe0898200 pgd
 >= ddecc000 [e0898200] *pgd=9e406811, *pte=400d1653, *ppte=400d1453
 >Internal error: : 1008 [#1] SMP ARM ...
 >
 >Reason of OOPS is that fec_enet_get_ethtool_stats() accesses fec registers
 >while IPG clock is stopped by PM.
 >
 >Fix that by wrapping statistics extraction into pm_runtime_get_sync() ...
 >pm_runtime_put_autosuspend() braces.
 >
 >Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
 >---

Acked-by: Fugang Duan <fugang.duan@nxp.com>

 > drivers/net/ethernet/freescale/fec_main.c | 11 ++++++++++-
 > 1 file changed, 10 insertions(+), 1 deletion(-)
 >
 >diff --git a/drivers/net/ethernet/freescale/fec_main.c
 >b/drivers/net/ethernet/freescale/fec_main.c
 >index 5aa9d4ded214..9c7592b80ce8 100644
 >--- a/drivers/net/ethernet/freescale/fec_main.c
 >+++ b/drivers/net/ethernet/freescale/fec_main.c
 >@@ -2317,10 +2317,19 @@ static void fec_enet_get_ethtool_stats(struct
 >net_device *dev,
 > 	struct ethtool_stats *stats, u64 *data)  {
 > 	struct fec_enet_private *fep = netdev_priv(dev);
 >-	int i;
 >+	int i, ret;
 >+
 >+	ret = pm_runtime_get_sync(&fep->pdev->dev);
 >+	if (IS_ERR_VALUE(ret)) {
 >+		memset(data, 0, sizeof(*data) * ARRAY_SIZE(fec_stats));
 >+		return;
 >+	}
 >
 > 	for (i = 0; i < ARRAY_SIZE(fec_stats); i++)
 > 		data[i] = readl(fep->hwp + fec_stats[i].offset);
 >+
 >+	pm_runtime_mark_last_busy(&fep->pdev->dev);
 >+	pm_runtime_put_autosuspend(&fep->pdev->dev);
 > }
 >
 > static void fec_enet_get_strings(struct net_device *netdev,
 >--
 >2.1.4

^ permalink raw reply

* Xmas Offer
From: Mrs Julie Leach @ 2016-11-27  6:24 UTC (permalink / raw)
  To: Recipients

You are a recipient to Mrs Julie Leach Donation of $3 million USD. Contact ( julieleach93@gmail.com ) for claims.

^ permalink raw reply

* [PATCH net-next 3/4] Documentation: net: phy: Add blurb about RGMII
From: Florian Fainelli @ 2016-11-27  6:01 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, sf84, martin.blumenstingl, mans, alexandre.torgue,
	peppe.cavallaro, timur, jbrunet, Florian Fainelli
In-Reply-To: <20161127060133.10357-1-f.fainelli@gmail.com>

RGMII is a recurring source of pain for people with Gigabit Ethernet
hardware since it may require PHY driver and MAC driver level
configuration hints. Document what are the expectations from PHYLIB and
what options exist.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 Documentation/networking/phy.txt | 56 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index 9a42a9414cea..18e9f518b6f9 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -65,6 +65,62 @@ The MDIO bus
  drivers/net/ethernet/freescale/fsl_pq_mdio.c and an associated DTS file
  for one of the users. (e.g. "git grep fsl,.*-mdio arch/powerpc/boot/dts/")
 
+(RG)MII/electrical interface considerations
+
+ The Reduced Gigabit Medium Independent Interface (RGMII) is a 12 pins
+ electrical signal using a synchronous 125Mhz clock signal and several data
+ lines. Due to this design decision, a 1.5ns to 2ns delay must be added between
+ the clock line (RXC or TXC) and the data lines to let the sink (PHY or MAC)
+ have enough setup and hold times to sample the data lines correctly. The PHY
+ library offers different types of PHY_INTERFACE_MODE_RGMII* values to let the
+ PHY driver and optionaly the MAC driver implement the required delay. The
+ values of phy_interface_t must be understood from the perspective of the PHY
+ device itself, leading to the following:
+
+ * PHY_INTERFACE_MODE_RGMII: the PHY is not responsible for inserting any
+   internal delay by itself, it assumes that either the Ethernet MAC (if capable
+   or the PCB traces) insert the correct 1.5-2ns delay
+
+ * PHY_INTERFACE_MODE_RGMII_TXID: the PHY should be inserting a delay for the
+   transmit data lines (TXD[3:0]) processed by the PHY device
+
+ * PHY_INTERFACE_MODE_RGMII_RXID: the PHY should be inserting a delay for the
+   receive data lines (RXD[3:0]) processed by the PHY device
+
+ * PHY_INTERFACE_MODE_RGMII_ID: the PHY should be inserting a delay for both
+   transmit AND receive data lines from/to the PHY device
+
+ Whenever it is possible, it is preferrable to utilize the PHY side RGMII delay
+ for several reasons:
+
+ * PHY devices may offer sub-nanosecond granularity in how they allow a
+   receiver/transmitter side delay (e.g: 0.5, 1.0, 1.5ns) etc.
+
+ * PHY devices are typically qualified for a large range of temperatures, and
+   they provide a constant and reliable delay across
+   temperature/pressure/voltage ranges
+
+ * PHY device drivers in PHYLIB being reusable by nature, being able to
+   configure correctly a specified delay enables more designs with similar delay
+   requirements to be enabled
+
+ For cases where the PHY is not capable of providing this delay, but the
+ Ethernet MAC driver is capable of doing it, the correct phy_interface_t value
+ should be PHY_INTERFACE_MODE_RGMII, and the Ethernet MAC driver should be
+ configured correctly in order to provide the required transmit and/or receive
+ side delay from the perspective of the PHY device.
+
+ In case neither the Ethernet MAC, nor the PHY are capable of providing the
+ required delays, as defined per the RGMII standard, several options may be
+ available:
+
+ * Some SoCs may offer a pin pad/mux/controller capable of configuring a given
+   set of pins's drive strength, delays and voltage, and it may be a suitable
+   option to insert the expected 2ns RGMII delay
+
+ * Modifying the PCB design to include a fixed delay (e.g: using a specifically
+   designed serpentine), which may not require software configuration at all
+
 Connecting to a PHY
 
  Sometime during startup, the network driver needs to establish a connection
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 0/4] Documentation: net: phy: Improve documentation
From: Florian Fainelli @ 2016-11-27  6:01 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, sf84, martin.blumenstingl, mans, alexandre.torgue,
	peppe.cavallaro, timur, jbrunet, Florian Fainelli

Hi all,

This patch series addresses discussions and feedback that was recently received
on the mailing-list in the area of: flow control/pause frames, interpretation of
phy_interface_t and finally add some links to useful standards documents.

Florian Fainelli (4):
  Documentation: net: phy: remove description of function pointers
  Documentation: net: phy: Add a paragraph about pause frames/flow
    control
  Documentation: net: phy: Add blurb about RGMII
  Documentation: net: phy: Add links to several standards documents

 Documentation/networking/phy.txt | 119 +++++++++++++++++++++++++++------------
 1 file changed, 84 insertions(+), 35 deletions(-)

-- 
2.9.3

^ permalink raw reply

* [PATCH net-next 2/4] Documentation: net: phy: Add a paragraph about pause frames/flow control
From: Florian Fainelli @ 2016-11-27  6:01 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, sf84, martin.blumenstingl, mans, alexandre.torgue,
	peppe.cavallaro, timur, jbrunet, Florian Fainelli
In-Reply-To: <20161127060133.10357-1-f.fainelli@gmail.com>

Describe that the Ethernet MAC controller is ultimately responsible for
dealing with proper pause frames/flow control advertisement and
enabling, and that it is therefore allowed to have it change
phydev->supported/advertising with SUPPORTED_Pause and
SUPPORTED_AsymPause.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 Documentation/networking/phy.txt | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index 4b25c0f24201..9a42a9414cea 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -127,8 +127,9 @@ Letting the PHY Abstraction Layer do Everything
  values pruned from them which don't make sense for your controller (a 10/100
  controller may be connected to a gigabit capable PHY, so you would need to
  mask off SUPPORTED_1000baseT*).  See include/linux/ethtool.h for definitions
- for these bitfields. Note that you should not SET any bits, or the PHY may
- get put into an unsupported state.
+ for these bitfields. Note that you should not SET any bits, except the
+ SUPPORTED_Pause and SUPPORTED_AsymPause bits (see below), or the PHY may get
+ put into an unsupported state.
 
  Lastly, once the controller is ready to handle network traffic, you call
  phy_start(phydev).  This tells the PAL that you are ready, and configures the
@@ -139,6 +140,19 @@ Letting the PHY Abstraction Layer do Everything
  When you want to disconnect from the network (even if just briefly), you call
  phy_stop(phydev).
 
+Pause frames / flow control
+
+ The PHY does not participate directly in flow control/pause frames except by
+ making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in
+ MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC
+ controller supports such a thing. Since flow control/pause frames generation
+ involves the Ethernet MAC driver, it is recommended that this driver takes care
+ of properly indicating advertisement and support for such features by setting
+ the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done
+ either before or after phy_connect() and/or as a result of implementing the
+ ethtool::set_pauseparam feature.
+
+
 Keeping Close Tabs on the PAL
 
  It is possible that the PAL's built-in state machine needs a little help to
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 1/4] Documentation: net: phy: remove description of function pointers
From: Florian Fainelli @ 2016-11-27  6:01 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, sf84, martin.blumenstingl, mans, alexandre.torgue,
	peppe.cavallaro, timur, jbrunet, Florian Fainelli
In-Reply-To: <20161127060133.10357-1-f.fainelli@gmail.com>

Remove the function pointers documentation which duplicates information
found in include/linux/phy.h. Maintaining documentation about two
different locations just does not work, but the code is less likely to
be outdated.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 Documentation/networking/phy.txt | 35 ++---------------------------------
 1 file changed, 2 insertions(+), 33 deletions(-)

diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index 7ab9404a8412..4b25c0f24201 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -251,39 +251,8 @@ Writing a PHY driver
  PHY_BASIC_FEATURES, but you can look in include/mii.h for other
  features.
 
- Each driver consists of a number of function pointers:
-
-   soft_reset: perform a PHY software reset
-   config_init: configures PHY into a sane state after a reset.
-     For instance, a Davicom PHY requires descrambling disabled.
-   probe: Allocate phy->priv, optionally refuse to bind.
-   PHY may not have been reset or had fixups run yet.
-   suspend/resume: power management
-   config_aneg: Changes the speed/duplex/negotiation settings
-   aneg_done: Determines the auto-negotiation result
-   read_status: Reads the current speed/duplex/negotiation settings
-   ack_interrupt: Clear a pending interrupt
-   did_interrupt: Checks if the PHY generated an interrupt
-   config_intr: Enable or disable interrupts
-   remove: Does any driver take-down
-   ts_info: Queries about the HW timestamping status
-   match_phy_device: used for Clause 45 capable PHYs to match devices
-   in package and ensure they are compatible
-   hwtstamp: Set the PHY HW timestamping configuration
-   rxtstamp: Requests a receive timestamp at the PHY level for a 'skb'
-   txtsamp: Requests a transmit timestamp at the PHY level for a 'skb'
-   set_wol: Enable Wake-on-LAN at the PHY level
-   get_wol: Get the Wake-on-LAN status at the PHY level
-   link_change_notify: called to inform the core is about to change the
-   link state, can be used to work around bogus PHY between state changes
-   read_mmd_indirect: Read PHY MMD indirect register
-   write_mmd_indirect: Write PHY MMD indirect register
-   module_info: Get the size and type of an EEPROM contained in an plug-in
-   module
-   module_eeprom: Get EEPROM information of a plug-in module
-   get_sset_count: Get number of strings sets that get_strings will count
-   get_strings: Get strings from requested objects (statistics)
-   get_stats: Get the extended statistics from the PHY device
+ Each driver consists of a number of function pointers, documented
+ in include/linux/phy.h under the phy_driver structure.
 
  Of these, only config_aneg and read_status are required to be
  assigned by the driver code.  The rest are optional.  Also, it is
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 4/4] Documentation: net: phy: Add links to several standards documents
From: Florian Fainelli @ 2016-11-27  6:01 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, sf84, martin.blumenstingl, mans, alexandre.torgue,
	peppe.cavallaro, timur, jbrunet, Florian Fainelli
In-Reply-To: <20161127060133.10357-1-f.fainelli@gmail.com>

Add links to the IEEE 802.3-2008 document, and the RGMII v1.3 and v2.0
revisions of the standard.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 Documentation/networking/phy.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index 18e9f518b6f9..9908490363d6 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -386,3 +386,13 @@ Board Fixups
  The stubs set one of the two matching criteria, and set the other one to
  match anything.
 
+Standards
+
+ IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, Section Two:
+ http://standards.ieee.org/getieee802/download/802.3-2008_section2.pdf
+
+ RGMII v1.3:
+ http://web.archive.org/web/20160303212629/http://www.hp.com/rnd/pdfs/RGMIIv1_3.pdf
+
+ RGMII v2.0:
+ http://web.archive.org/web/20160303171328/http://www.hp.com/rnd/pdfs/RGMIIv2_0_final_hp.pdf
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH 2/2] net: phy: realtek: fix enabling of the TX-delay for RTL8211F
From: Florian Fainelli @ 2016-11-27  5:55 UTC (permalink / raw)
  To: Martin Blumenstingl, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	mark.rutland-5wv7dgnIgG8, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	sean.wang-NuS5LvNUpcJWk0Htik3J/w, netdev-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-amlogic-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	jbrunet-rdvid1DuHRBWk0Htik3J/w
In-Reply-To: <20161125131201.19994-3-martin.blumenstingl-gM/Ye1E23mwN+BqQ9rBEUg@public.gmane.org>



On 11/25/2016 05:12 AM, Martin Blumenstingl wrote:
> The old logic always enabled the TX-delay when the phy-mode was set to
> PHY_INTERFACE_MODE_RGMII. There are dedicated phy-modes which tell the
> PHY driver to enable the RX and/or TX delays:
> - PHY_INTERFACE_MODE_RGMII should disable the RX and TX delay in the
>   PHY (if required, the MAC should add the delays in this case)
> - PHY_INTERFACE_MODE_RGMII_ID should enable RX and TX delay in the PHY
> - PHY_INTERFACE_MODE_RGMII_TXID should enable the TX delay in the PHY
> - PHY_INTERFACE_MODE_RGMII_RXID should enable the RX delay in the PHY
>   (currently not supported by RTL8211F)
> 
> With this patch we enable the TX delay for PHY_INTERFACE_MODE_RGMII_ID
> and PHY_INTERFACE_MODE_RGMII_TXID.
> Additionally we now explicity disable the TX-delay, which seems to be
> enabled automatically after a hard-reset of the PHY (by triggering it's
> reset pin) to get a consistent state (as defined by the phy-mode).
> 
> This fixes a compatibility problem with some SoCs where the TX-delay was
> also added by the MAC. With the TX-delay being applied twice the TX
> clock was off and TX traffic was broken or very slow (<10Mbit/s) on
> 1000Mbit/s links.
> 
> Signed-off-by: Martin Blumenstingl <martin.blumenstingl-gM/Ye1E23mwN+BqQ9rBEUg@public.gmane.org>

Reviewed-by: Florian Fainelli <f.fainelli-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
-- 
Florian
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox