Netdev List
 help / color / mirror / Atom feed
* [PATCH iproute2-next v1 8/9] rdma: Add QP resource tracking information
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

From: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

This patch adds an ss-like interface to view various resource-tracked
objects. At this stage, only QPs are presented.

1. Get all QPs for the specific device:
$ rdma res show qp link mlx5_4
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/-  8          UD    RESET  0          [ipoib-verbs]
mlx5_4/1  7          UD    RTS    0          [mlx5-gsi]
mlx5_4/1  1          GSI   RTS    0          [rdma-mad]
mlx5_4/1  0          SMI   RTS    0          [rdma-mad]

$ rdma res show qp link mlx5_4/
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/-  8          UD    RESET  0          [ipoib-verbs]
mlx5_4/1  7          UD    RTS    0          [mlx5-gsi]
mlx5_4/1  1          GSI   RTS    0          [rdma-mad]
mlx5_4/1  0          SMI   RTS    0          [rdma-mad]

2. Provide illegal port number (0 is illegal):
$ rdma res show qp link mlx5_4/0
Wrong device name

3. Get QPs of specific port:
$ rdma res show qp link mlx5_4/1
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/1  7          UD    RTS    0          [mlx5-gsi]
mlx5_4/1  1          GSI   RTS    0          [rdma-mad]
mlx5_4/1  0          SMI   RTS    0          [rdma-mad]

4. Get QPs which have no port assigned yet:
$ rdma res show qp link mlx5_4/-
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/-  8          UD    RESET  0          [ipoib-verbs]

5. Detailed view:
$ rdma res show qp link mlx5_4/- -d
DEV/PORT  LQPN       RQPN       TYPE  STATE  PID        COMM            SQ-PSN     RQ-PSN     PATH-MIG
mlx5_4/-  8          ---        UD    RESET  0          [ipoib-verbs]   0          ---        ---

6. Limit to specific columns (dev/port is always available):
$ rdma res show qp link mlx5_4/1 display pid,lqpn,comm
DEV/PORT  LQPN       PID        COMM
mlx5_4/1  7          0          [mlx5-gsi]
mlx5_4/1  1          0          [rdma-mad]
mlx5_4/1  0          0          [rdma-mad]

7. Detailed view (no change, due to "display" option):
$ rdma res show qp link mlx5_4/1 display pid,lqpn,comm -d
DEV/PORT  LQPN       PID        COMM
mlx5_4/1  7          0          [mlx5-gsi]
mlx5_4/1  1          0          [rdma-mad]
mlx5_4/1  0          0          [rdma-mad]

8. Limit to specific Local QPNs:
$ rdma res show qp link mlx5_4/1 display pid,lqpn,comm lqpn 0-6
DEV/PORT  LQPN       PID        COMM
mlx5_4/1  1          0          [rdma-mad]
mlx5_4/1  0          0          [rdma-mad]

Signed-off-by: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 rdma/res.c   | 367 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 rdma/utils.c |  11 ++
 2 files changed, 378 insertions(+)

diff --git a/rdma/res.c b/rdma/res.c
index a70e87dd..ecd6b392 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -16,6 +16,9 @@ static int res_help(struct rd *rd)
 {
 	pr_out("Usage: %s resource\n", rd->filename);
 	pr_out("          resource show [DEV]\n");
+	pr_out("          resource show [qp]\n");
+	pr_out("          resource show qp link [DEV/PORT]\n");
+	pr_out("          resource show qp link [DEV/PORT] [FILTER-NAME FILTER-VALUE]\n");
 	return 0;
 }
 
@@ -136,12 +139,376 @@ static int _res_send_msg(struct rd *rd, uint32_t command, mnl_cb_t callback)
 		return rd_exec_link(rd, _##name, strict_port); \
 	}
 
+struct column {
+	char filter_name[32];
+	char column_name[32];
+	bool in_simple_view;
+};
+
+static bool show_column(struct rd *rd, struct column *c)
+{
+	if (rd->json_output)
+		return true;
+
+	if (rd_check_is_key_exist(rd, "display"))
+		return rd_check_is_string_filtered(rd, "display", c->filter_name);
+
+	if (!rd->show_details)
+		return c->in_simple_view;
+	return true;
+}
+
+static const char *path_mig_to_str(uint8_t idx)
+{
+	static const char * const path_mig_str[] = { "MIGRATED",
+						     "REARM", "ARMED" };
+
+	if (idx < ARRAY_SIZE(path_mig_str))
+		return path_mig_str[idx];
+	return "UNKNOWN";
+}
+
+static const char *qp_states_to_str(uint8_t idx)
+{
+	static const char * const qp_states_str[] = { "RESET", "INIT",
+						      "RTR", "RTS", "SQD",
+						      "SQE", "ERR" };
+
+	if (idx < ARRAY_SIZE(qp_states_str))
+		return qp_states_str[idx];
+	return "UNKNOWN";
+}
+
+static const char *qp_types_to_str(uint8_t idx)
+{
+	static const char * const qp_types_str[] = { "SMI", "GSI", "RC",
+						     "UC", "UD", "RAW_IPV6",
+						     "RAW_ETHERTYPE",
+						     "UNKNOWN", "RAW_PACKET",
+						     "XRC_INI", "XRC_TGT" };
+
+	if (idx < ARRAY_SIZE(qp_types_str))
+		return qp_types_str[idx];
+	return "UNKNOWN";
+}
+
+static void print_lqpn(struct rd *rd, struct column *c, uint32_t val)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "lqpn")) {
+		if (rd->json_output) {
+			jsonw_uint_field(rd->jw, "lqpn", val);
+			return;
+		}
+		pr_out("%-11u", val);
+	}
+}
+
+static void print_rqpn(struct rd *rd, struct column *c,
+		       uint32_t val, struct nlattr **nla_line)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "rqpn")) {
+		if (rd->json_output) {
+			if (nla_line[RDMA_NLDEV_ATTR_RES_RQPN])
+				jsonw_uint_field(rd->jw, "rqpn", val);
+			return;
+		}
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_RQPN])
+			pr_out("%-11u", val);
+		else
+			pr_out("%-11s", "---");
+	}
+}
+
+static void print_type(struct rd *rd, struct column *c, uint32_t val)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "type")) {
+		if (rd->json_output) {
+			jsonw_string_field(rd->jw, "type",
+					   qp_types_to_str(val));
+			return;
+		}
+		pr_out("%-6s", qp_types_to_str(val));
+	}
+}
+
+static void print_state(struct rd *rd, struct column *c, uint32_t val)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "state")) {
+		if (rd->json_output) {
+			jsonw_string_field(rd->jw, "state",
+					   qp_states_to_str(val));
+			return;
+		}
+		pr_out("%-7s", qp_states_to_str(val));
+	}
+}
+
+static void print_rqpsn(struct rd *rd, struct column *c,
+			uint32_t val, struct nlattr **nla_line)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "rq-psn")) {
+		if (rd->json_output) {
+			if (nla_line[RDMA_NLDEV_ATTR_RES_RQ_PSN])
+				jsonw_uint_field(rd->jw, "rq-psn", val);
+			return;
+		}
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_RQ_PSN])
+			pr_out("%-11u", val);
+		else
+			pr_out("%-11s", "---");
+	}
+}
+
+static void print_sqpsn(struct rd *rd, struct column *c, uint32_t val)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "sq-psn")) {
+		if (rd->json_output) {
+			jsonw_uint_field(rd->jw, "sq-psn", val);
+			return;
+		}
+		pr_out("%-11d", val);
+	}
+}
+
+static void print_pathmig(struct rd *rd, struct column *c,
+			  uint32_t val, struct nlattr **nla_line)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "path-mig")) {
+		if (rd->json_output) {
+			if (nla_line[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE])
+				jsonw_string_field(rd->jw,
+						   "path-mig-state",
+						   path_mig_to_str(val));
+			return;
+		}
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE])
+			pr_out("%-16s", path_mig_to_str(val));
+		else
+			pr_out("%-16s", "---");
+	}
+}
+
+static void print_pid(struct rd *rd, struct column *c, uint32_t val)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "pid")) {
+		if (rd->json_output) {
+			jsonw_uint_field(rd->jw, "pid", val);
+			return;
+		}
+		pr_out("%-11d", val);
+	}
+}
+
+static void print_comm(struct rd *rd, struct column *c,
+		       const char *str, struct nlattr **nla_line)
+{
+	if (!show_column(rd, c))
+		return;
+
+	if (!strcmpx(c->filter_name, "comm")) {
+		if (rd->json_output) {
+			/* Don't beautify output in JSON format */
+			jsonw_string_field(rd->jw, "comm", str);
+			return;
+		}
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) {
+			pr_out("%-16s ", str);
+		} else {
+			char tmp[18];
+
+			snprintf(tmp, sizeof(tmp), "[%s]", str);
+			pr_out("%-16s", tmp);
+		}
+	}
+}
+
+static int res_qp_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+	static struct column c[] = { { .filter_name = "lqpn",
+				       .column_name = "LQPN       ",
+				       .in_simple_view = true },
+				     { .filter_name = "rqpn",
+				       .column_name = "RQPN       " },
+				     { .filter_name = "type",
+				       .column_name = "TYPE  ",
+					.in_simple_view = true },
+				     { .filter_name = "state",
+				       .column_name = "STATE  ",
+				       .in_simple_view = true },
+				     { .filter_name = "pid",
+				       .column_name = "PID        ",
+				       .in_simple_view = true },
+				     { .filter_name = "comm",
+				       .column_name = "COMM            ",
+				       .in_simple_view = true },
+				     { .filter_name = "sq-psn",
+				       .column_name = "SQ-PSN     " },
+				     { .filter_name = "rq-psn",
+				       .column_name = "RQ-PSN     " },
+				     { .filter_name = "path-mig",
+				       .column_name = "PATH-MIG        " } };
+
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct nlattr *nla_table, *nla_entry;
+	static bool print_header = true;
+	struct rd *rd = data;
+	uint32_t cidx, idx;
+	const char *name;
+
+	mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_RES_QP])
+		return MNL_CB_ERROR;
+
+	name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+	idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	nla_table = tb[RDMA_NLDEV_ATTR_RES_QP];
+	if (!rd->json_output && print_header) {
+		pr_out("DEV/PORT  ");
+		for (cidx = 0; cidx < ARRAY_SIZE(c); cidx++)
+			if (show_column(rd, &c[cidx]))
+				pr_out("%s", c[cidx].column_name);
+		pr_out("\n");
+		print_header = false;
+	}
+
+	mnl_attr_for_each_nested(nla_entry, nla_table) {
+		struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+		uint32_t lqpn, rqpn = 0, rq_psn = 0, sq_psn;
+		uint8_t type, state, path_mig_state = 0;
+		uint32_t port = 0, pid = 0;
+		bool ignore_value = false;
+		char port_name[32];
+		const char *comm;
+		int err;
+
+		err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+		if (err != MNL_CB_OK)
+			return -EINVAL;
+
+		if (!nla_line[RDMA_NLDEV_ATTR_RES_LQPN] ||
+		    !nla_line[RDMA_NLDEV_ATTR_RES_SQ_PSN] ||
+		    !nla_line[RDMA_NLDEV_ATTR_RES_TYPE] ||
+		    !nla_line[RDMA_NLDEV_ATTR_RES_STATE] ||
+		    !nla_line[RDMA_NLDEV_ATTR_RES_PID_COMM]) {
+			return -EINVAL;
+		}
+
+		if (nla_line[RDMA_NLDEV_ATTR_PORT_INDEX])
+			port = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_PORT_INDEX]);
+
+		if (port != rd->port_idx)
+			continue;
+
+		if (nla_line[RDMA_NLDEV_ATTR_PORT_INDEX])
+			snprintf(port_name, 32, "%s/%u", name, port);
+		else
+			snprintf(port_name, 32, "%s/-", name);
+
+		lqpn = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_LQPN]);
+		if (rd_check_is_filtered(rd, "lqpn", lqpn, false))
+			continue;
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_RQPN])
+			rqpn = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_RQPN]);
+		else
+			ignore_value = true;
+
+		if (rd_check_is_filtered(rd, "rqpn", rqpn, ignore_value))
+			continue;
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_RQ_PSN])
+			rq_psn = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_RQ_PSN]);
+
+		sq_psn = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_SQ_PSN]);
+		if (nla_line[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE])
+			path_mig_state = mnl_attr_get_u8(nla_line[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE]);
+		type = mnl_attr_get_u8(nla_line[RDMA_NLDEV_ATTR_RES_TYPE]);
+		state = mnl_attr_get_u8(nla_line[RDMA_NLDEV_ATTR_RES_STATE]);
+
+		if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
+			pid = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_PID]);
+
+		if (rd_check_is_filtered(rd, "pid", pid, false))
+			continue;
+
+		comm = mnl_attr_get_str(nla_line[RDMA_NLDEV_ATTR_RES_PID_COMM]);
+
+		if (rd->json_output) {
+			jsonw_start_array(rd->jw);
+			jsonw_uint_field(rd->jw, "ifindex", idx);
+
+			if (nla_line[RDMA_NLDEV_ATTR_PORT_INDEX])
+				jsonw_uint_field(rd->jw, "port", port);
+
+			jsonw_string_field(rd->jw, "ifname", port_name);
+		} else {
+			pr_out("%-10s", port_name);
+		}
+
+		for (cidx = 0; cidx < ARRAY_SIZE(c); cidx++) {
+			print_lqpn(rd, &c[cidx], lqpn);
+			print_rqpn(rd, &c[cidx], rqpn, nla_line);
+
+			print_type(rd, &c[cidx], type);
+			print_state(rd, &c[cidx], state);
+
+			print_rqpsn(rd, &c[cidx], rq_psn, nla_line);
+			print_sqpsn(rd, &c[cidx], sq_psn);
+
+			print_pathmig(rd, &c[cidx], path_mig_state, nla_line);
+			print_pid(rd, &c[cidx], pid);
+			print_comm(rd, &c[cidx], comm, nla_line);
+		}
+
+		if (rd->json_output)
+			jsonw_end_array(rd->jw);
+		else
+			pr_out("\n");
+	}
+	return MNL_CB_OK;
+}
+
 RES_FUNC(res_no_args,	RDMA_NLDEV_CMD_RES_GET,	NULL, true);
 
+static const char * const qp_valid_filters[] = { "link", "lqpn", "rqpn",
+						 "pid", "display", NULL };
+RES_FUNC(res_qp,	RDMA_NLDEV_CMD_RES_QP_GET, qp_valid_filters, false);
+
 static int res_show(struct rd *rd)
 {
 	const struct rd_cmd cmds[] = {
 		{ NULL,		res_no_args	},
+		{ "qp",		res_qp		},
 		{ 0 }
 	};
 
diff --git a/rdma/utils.c b/rdma/utils.c
index d39e926e..288c6b9b 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -352,6 +352,17 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = MNL_TYPE_NUL_STRING,
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = MNL_TYPE_U64,
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_MAX]	= MNL_TYPE_U64,
+	[RDMA_NLDEV_ATTR_RES_QP]		= MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_RES_QP_ENTRY]		= MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_RES_LQPN]	= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_RES_RQPN]	= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_RES_RQ_PSN]		= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_RES_SQ_PSN]		= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE]	= MNL_TYPE_U8,
+	[RDMA_NLDEV_ATTR_RES_TYPE]		= MNL_TYPE_U8,
+	[RDMA_NLDEV_ATTR_RES_STATE]		= MNL_TYPE_U8,
+	[RDMA_NLDEV_ATTR_RES_PID]		= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_RES_PID_COMM]	= MNL_TYPE_NUL_STRING,
 };
 
 int rd_attr_cb(const struct nlattr *attr, void *data)
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH iproute2-next v1 0/9] RDMA resource tracking
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger

Changelog:
 v0 -> v1:
   * Fixed subject title in patch #1: rdam -> rdma.
   * Added newline between variable declaration and the code.
   * Added a check for failure of the strdup() call in rd_check_is_string_filtered().
   * Rewrote res_qp_parse_cb() to avoid long lines and extra indentation.

------------------------------------------------------------------------
Hi,

This is the supplementary (user-space) part of the RDMA resource tracking
series posted to the RDMA mailing list for review [1].

The main goal of this new functionality in rdmatool is to provide debug visibility
into applications running on the RDMA stack.

The current series adds a new command object (resource), which provides
a short summary when called without arguments, or more detailed
information when called with a request to present QPs.

1) Summary information:
$ rdma res
1: mlx5_0: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144
2: mlx5_1: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144
3: mlx5_2: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144
4: mlx5_3: curr/max: pd 2/16777216 cq 3/16777216 qp 2/262144
5: mlx5_4: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144

2) Summary information of specific device:
$ rdma res show mlx5_4
5: mlx5_4: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144

3) Detailed information of specific device:
$ rdma res show qp link mlx5_4
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/-  8          UD    RESET  0          [ipoib-verbs]
mlx5_4/1  7          UD    RTS    0          [mlx5-gsi]
mlx5_4/1  1          GSI   RTS    0          [rdma-mad]
mlx5_4/1  0          SMI   RTS    0          [rdma-mad]

$ rdma res show qp link mlx5_4/
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/-  8          UD    RESET  0          [ipoib-verbs]
mlx5_4/1  7          UD    RTS    0          [mlx5-gsi]
mlx5_4/1  1          GSI   RTS    0          [rdma-mad]
mlx5_4/1  0          SMI   RTS    0          [rdma-mad]

4) Wrong port (it can be 1 or 2):
$ rdma res show qp link mlx5_4/0
Wrong device name

5) Detailed information of specific device and port:
$ rdma res show qp link mlx5_4/1
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/1  7          UD    RTS    0          [mlx5-gsi]
mlx5_4/1  1          GSI   RTS    0          [rdma-mad]
mlx5_4/1  0          SMI   RTS    0          [rdma-mad]

6) Detailed information of QPs not yet connected to a port:
$ rdma res show qp link mlx5_4/-
DEV/PORT  LQPN       TYPE  STATE  PID        COMM
mlx5_4/-  8          UD    RESET  0          [ipoib-verbs]

7) Very detailed view:
$ rdma res show qp link mlx5_4/- -d
DEV/PORT  LQPN       RQPN       TYPE  STATE  PID        COMM            SQ-PSN     RQ-PSN     PATH-MIG
mlx5_4/-  8          ---        UD    RESET  0          [ipoib-verbs]   0          ---        ---

8) Limit display to specific columns:
$ rdma res show qp link mlx5_4/1 display pid,lqpn,comm
DEV/PORT  LQPN       PID        COMM
mlx5_4/1  7          0          [mlx5-gsi]
mlx5_4/1  1          0          [rdma-mad]
mlx5_4/1  0          0          [rdma-mad]

9) Filter specific LQPNs:
$ rdma res show qp link mlx5_4/1 display pid,lqpn,comm lqpn 0,4-7
DEV/PORT  LQPN       PID        COMM
mlx5_4/1  7          0          [mlx5-gsi]
mlx5_4/1  0          0          [rdma-mad]

Thanks

Leon Romanovsky (9):
  rdma: Add option to provide "-" sign for the port number
  rdma: Make visible the number of arguments
  rdma: Add filtering infrastructure
  rdma: Set pointer to device name position
  rdma: Allow external usage of compare string routine
  rdma: Update kernel header file
  rdma: Add resource tracking summary
  rdma: Add QP resource tracking information
  rdma: Document resource tracking

 include/uapi/rdma/rdma_netlink.h |  58 ++++-
 man/man8/rdma-resource.8         |  91 +++++++
 rdma/Makefile                    |   2 +-
 rdma/link.c                      |   2 +-
 rdma/rdma.c                      |   4 +-
 rdma/rdma.h                      |  23 +-
 rdma/res.c                       | 535 +++++++++++++++++++++++++++++++++++++++
 rdma/utils.c                     | 309 +++++++++++++++++++++-
 8 files changed, 1007 insertions(+), 17 deletions(-)
 create mode 100644 man/man8/rdma-resource.8
 create mode 100644 rdma/res.c

^ permalink raw reply

* [PATCH iproute2-next v1 2/9] rdma: Make visible the number of arguments
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/rdma.h  | 1 +
 rdma/utils.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/rdma/rdma.h b/rdma/rdma.h
index cbd9aa89..1b66ae04 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -74,6 +74,7 @@ int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str);
 int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd));
 int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd), bool strict_port);
 void rd_free(struct rd *rd);
+int rd_argc(struct rd *rd);
 
 /*
  * Device manipulation
diff --git a/rdma/utils.c b/rdma/utils.c
index b9c668a3..af2b374d 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -12,7 +12,7 @@
 #include "rdma.h"
 #include <ctype.h>
 
-static int rd_argc(struct rd *rd)
+int rd_argc(struct rd *rd)
 {
 	return rd->argc;
 }
-- 
2.15.1

^ permalink raw reply related

* [PATCH iproute2-next v1 5/9] rdma: Allow external usage of compare string routine
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/rdma.h  | 2 ++
 rdma/utils.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/rdma/rdma.h b/rdma/rdma.h
index e842d076..816c8ddd 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -84,6 +84,8 @@ void rd_free(struct rd *rd);
 int rd_set_arg_to_devname(struct rd *rd);
 int rd_argc(struct rd *rd);
 
+int strcmpx(const char *str1, const char *str2);
+
 /*
  * Device manipulation
  */
diff --git a/rdma/utils.c b/rdma/utils.c
index 73f0d04e..e6a727e0 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -24,7 +24,7 @@ char *rd_argv(struct rd *rd)
 	return *rd->argv;
 }
 
-static int strcmpx(const char *str1, const char *str2)
+int strcmpx(const char *str1, const char *str2)
 {
 	if (strlen(str1) > strlen(str2))
 		return -1;
-- 
2.15.1

^ permalink raw reply related

* [PATCH iproute2-next v1 6/9] rdma: Update kernel header file
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Synchronize the iproute2 package with the latest kernel
RDMA netlink header file.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/uapi/rdma/rdma_netlink.h | 58 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)

diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 48fbf3c3..a6f60c22 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _RDMA_NETLINK_H
-#define _RDMA_NETLINK_H
+#ifndef _UAPI_RDMA_NETLINK_H
+#define _UAPI_RDMA_NETLINK_H
 
 #include <linux/types.h>
 
@@ -236,6 +236,16 @@ enum rdma_nldev_command {
 	RDMA_NLDEV_CMD_PORT_NEW,
 	RDMA_NLDEV_CMD_PORT_DEL,
 
+	RDMA_NLDEV_CMD_RES_GET, /* can dump */
+	RDMA_NLDEV_CMD_RES_SET,
+	RDMA_NLDEV_CMD_RES_NEW,
+	RDMA_NLDEV_CMD_RES_DEL,
+
+	RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */
+	RDMA_NLDEV_CMD_RES_QP_SET,
+	RDMA_NLDEV_CMD_RES_QP_NEW,
+	RDMA_NLDEV_CMD_RES_QP_DEL,
+
 	RDMA_NLDEV_NUM_OPS
 };
 
@@ -303,6 +313,50 @@ enum rdma_nldev_attr {
 
 	RDMA_NLDEV_ATTR_DEV_NODE_TYPE,		/* u8 */
 
+	RDMA_NLDEV_ATTR_RES_SUMMARY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY,	/* nested table */
+	RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME,	/* string */
+	RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR,	/* u64 */
+	RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_MAX,	/* u64 */
+
+	RDMA_NLDEV_ATTR_RES_QP,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_QP_ENTRY,		/* nested table */
+	/*
+	 * Local QPN
+	 */
+	RDMA_NLDEV_ATTR_RES_LQPN,		/* u32 */
+	/*
+	 * Remote QPN,
+	 * Applicable for RC and UC only IBTA 11.2.5.3 QUERY QUEUE PAIR
+	 */
+	RDMA_NLDEV_ATTR_RES_RQPN,		/* u32 */
+	/*
+	 * Receive Queue PSN,
+	 * Applicable for RC and UC only 11.2.5.3 QUERY QUEUE PAIR
+	 */
+	RDMA_NLDEV_ATTR_RES_RQ_PSN,		/* u32 */
+	/*
+	 * Send Queue PSN
+	 */
+	RDMA_NLDEV_ATTR_RES_SQ_PSN,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE,	/* u8 */
+	/*
+	 * QP types as visible to RDMA/core, the reserved QPT
+	 * are not exported through this interface.
+	 */
+	RDMA_NLDEV_ATTR_RES_TYPE,		/* u8 */
+	RDMA_NLDEV_ATTR_RES_STATE,		/* u8 */
+	/*
+	 * Process ID created QP, in case of kernel PID is equal to 0
+	 * and this field won't be set, so user will distinguish user/kernel
+	 * processes without relying on PID number.
+	 */
+	RDMA_NLDEV_ATTR_RES_PID,		/* u32 */
+	/*
+	 * The name of process created following resource.
+	 */
+	RDMA_NLDEV_ATTR_RES_PID_COMM,		/* string */
+
 	RDMA_NLDEV_ATTR_MAX
 };
 #endif /* _RDMA_NETLINK_H */
-- 
2.15.1

^ permalink raw reply related

* [PATCH iproute2-next v1 7/9] rdma: Add resource tracking summary
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Add the global resource summary information. The object names, current utilization,
and maximum numbers are presented as received from the kernel.

$ rdma res
1: mlx5_0: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144
2: mlx5_1: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144
3: mlx5_2: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144
4: mlx5_3: curr/max: pd 2/16777216 cq 3/16777216 qp 2/262144
5: mlx5_4: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144

$ rdma res show mlx5_4
5: mlx5_4: curr/max: pd 3/16777216 cq 5/16777216 qp 4/262144

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/Makefile |   2 +-
 rdma/rdma.c   |   3 +-
 rdma/rdma.h   |   1 +
 rdma/res.c    | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 rdma/utils.c  |   5 ++
 5 files changed, 177 insertions(+), 2 deletions(-)
 create mode 100644 rdma/res.c

diff --git a/rdma/Makefile b/rdma/Makefile
index c8966bfd..875fe53c 100644
--- a/rdma/Makefile
+++ b/rdma/Makefile
@@ -3,7 +3,7 @@ include ../config.mk
 
 ifeq ($(HAVE_MNL),y)
 
-RDMA_OBJ = rdma.o utils.o dev.o link.o
+RDMA_OBJ = rdma.o utils.o dev.o link.o res.o
 
 TARGETS=rdma
 endif
diff --git a/rdma/rdma.c b/rdma/rdma.c
index a21ba440..19608f41 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -15,7 +15,7 @@
 static void help(char *name)
 {
 	pr_out("Usage: %s [ OPTIONS ] OBJECT { COMMAND | help }\n"
-	       "where  OBJECT := { dev | link | help }\n"
+	       "where  OBJECT := { dev | link | resource | help }\n"
 	       "       OPTIONS := { -V[ersion] | -d[etails] | -j[son] | -p[retty]}\n", name);
 }
 
@@ -32,6 +32,7 @@ static int rd_cmd(struct rd *rd)
 		{ "help",	cmd_help },
 		{ "dev",	cmd_dev },
 		{ "link",	cmd_link },
+		{ "resource",	cmd_res },
 		{ 0 }
 	};
 
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 816c8ddd..f1ddedd2 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -77,6 +77,7 @@ char *rd_argv(struct rd *rd);
  */
 int cmd_dev(struct rd *rd);
 int cmd_link(struct rd *rd);
+int cmd_res(struct rd *rd);
 int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str);
 int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd));
 int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd), bool strict_port);
diff --git a/rdma/res.c b/rdma/res.c
new file mode 100644
index 00000000..a70e87dd
--- /dev/null
+++ b/rdma/res.c
@@ -0,0 +1,168 @@
+/*
+ * res.c	RDMA tool
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Leon Romanovsky <leonro@mellanox.com>
+ */
+
+#include "rdma.h"
+#include <inttypes.h>
+
+static int res_help(struct rd *rd)
+{
+	pr_out("Usage: %s resource\n", rd->filename);
+	pr_out("          resource show [DEV]\n");
+	return 0;
+}
+
+static int res_print_summary(struct rd *rd, struct nlattr **tb)
+{
+	struct nlattr *nla_table = tb[RDMA_NLDEV_ATTR_RES_SUMMARY];
+	struct nlattr *nla_entry;
+	uint64_t max, curr;
+	const char *name;
+	int err;
+
+	if (!rd->json_output)
+		pr_out("curr/max: ");
+
+	mnl_attr_for_each_nested(nla_entry, nla_table) {
+		struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+		char json_name[32];
+
+		err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+		if (err != MNL_CB_OK)
+			return -EINVAL;
+
+		if (!nla_line[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] ||
+		    !nla_line[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] ||
+		    !nla_line[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_MAX]) {
+			return -EINVAL;
+		}
+
+		name = mnl_attr_get_str(nla_line[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]);
+		curr = mnl_attr_get_u64(nla_line[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]);
+		if (rd->json_output) {
+			snprintf(json_name, 32, "curr_%s", name);
+			jsonw_lluint_field(rd->jw, json_name, curr);
+		}
+
+		max = mnl_attr_get_u64(nla_line[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_MAX]);
+		if (rd->json_output) {
+			snprintf(json_name, 32, "max_%s", name);
+			jsonw_lluint_field(rd->jw, json_name, max);
+		} else {
+			pr_out("%s %"PRId64 "/%"PRId64 " ", name, curr, max);
+		}
+	}
+	return 0;
+}
+
+static int res_no_args_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct rd *rd = data;
+	const char *name;
+	uint32_t idx;
+
+	mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_RES_SUMMARY])
+		return MNL_CB_ERROR;
+
+	idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+	if (rd->json_output) {
+		jsonw_uint_field(rd->jw, "ifindex", idx);
+		jsonw_string_field(rd->jw, "ifname", name);
+	} else {
+		pr_out("%u: %s: ", idx, name);
+	}
+
+	res_print_summary(rd, tb);
+
+	if (!rd->json_output)
+		pr_out("\n");
+	return MNL_CB_OK;
+}
+
+static int _res_send_msg(struct rd *rd, uint32_t command, mnl_cb_t callback)
+{
+	uint32_t flags = NLM_F_REQUEST | NLM_F_ACK;
+	uint32_t seq;
+	int ret;
+
+	if (command != RDMA_NLDEV_CMD_RES_GET)
+		flags |= NLM_F_DUMP;
+
+	rd_prepare_msg(rd, command, &seq, flags);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	if (rd->port_idx)
+		mnl_attr_put_u32(rd->nlh,
+				 RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+
+	ret = rd_send_msg(rd);
+	if (ret)
+		return ret;
+
+	if (rd->json_output)
+		jsonw_start_object(rd->jw);
+	ret = rd_recv_msg(rd, callback, rd, seq);
+	if (rd->json_output)
+		jsonw_end_object(rd->jw);
+	return ret;
+}
+
+#define RES_FUNC(name, command, valid_filters, strict_port) \
+	static int _##name(struct rd *rd)\
+	{ \
+		return _res_send_msg(rd, command, name##_parse_cb); \
+	} \
+	static int name(struct rd *rd) \
+	{\
+		int ret = rd_build_filter(rd, valid_filters); \
+		if (ret) \
+			return ret; \
+		if ((uintptr_t)valid_filters != (uintptr_t)NULL) { \
+			ret = rd_set_arg_to_devname(rd); \
+			if (ret) \
+				return ret;\
+		} \
+		return rd_exec_link(rd, _##name, strict_port); \
+	}
+
+RES_FUNC(res_no_args,	RDMA_NLDEV_CMD_RES_GET,	NULL, true);
+
+static int res_show(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		res_no_args	},
+		{ 0 }
+	};
+
+	/*
+	 * Special case to support "rdma res show DEV_NAME"
+	 */
+	if (rd_argc(rd) == 1 && dev_map_lookup(rd, false))
+		return rd_exec_dev(rd, _res_no_args);
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+int cmd_res(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		res_show },
+		{ "show",	res_show },
+		{ "list",	res_show },
+		{ "help",	res_help },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "resource command");
+}
diff --git a/rdma/utils.c b/rdma/utils.c
index e6a727e0..d39e926e 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -347,6 +347,11 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_PORT_STATE] = MNL_TYPE_U8,
 	[RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = MNL_TYPE_U8,
 	[RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = MNL_TYPE_U8,
+	[RDMA_NLDEV_ATTR_RES_SUMMARY]	= MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]	= MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = MNL_TYPE_NUL_STRING,
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = MNL_TYPE_U64,
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_MAX]	= MNL_TYPE_U64,
 };
 
 int rd_attr_cb(const struct nlattr *attr, void *data)
-- 
2.15.1

^ permalink raw reply related

* [PATCH iproute2-next v1 1/9] rdma: Add option to provide "-" sign for the port number
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

According to the IBTA spec [1], the physical connected port is provided
for the QP in RTR-to-INIT stage performed by modify_qp(). As a result,
newly created QPs do not have a port number yet.

The following patch adds "-" sign to present absence of port, because
QPs are going to be associated with rdmatool link object, which needs
port number as an index.

[1] InfiniBand Architecture Release 1.3 -
	"Table 96 QP State Transition Properties"

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/link.c  |  2 +-
 rdma/rdma.h  |  3 +--
 rdma/utils.c | 50 ++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/rdma/link.c b/rdma/link.c
index 676cb21d..66bcd50e 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -285,7 +285,7 @@ static int link_one_show(struct rd *rd)
 
 static int link_show(struct rd *rd)
 {
-	return rd_exec_link(rd, link_one_show);
+	return rd_exec_link(rd, link_one_show, true);
 }
 
 int cmd_link(struct rd *rd)
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 8d53d3a0..cbd9aa89 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -64,7 +64,6 @@ bool rd_no_arg(struct rd *rd);
 void rd_arg_inc(struct rd *rd);
 
 char *rd_argv(struct rd *rd);
-uint32_t get_port_from_argv(struct rd *rd);
 
 /*
  * Commands interface
@@ -73,7 +72,7 @@ int cmd_dev(struct rd *rd);
 int cmd_link(struct rd *rd);
 int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str);
 int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd));
-int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd));
+int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd), bool strict_port);
 void rd_free(struct rd *rd);
 
 /*
diff --git a/rdma/utils.c b/rdma/utils.c
index 7b2001e2..b9c668a3 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -10,6 +10,7 @@
  */
 
 #include "rdma.h"
+#include <ctype.h>
 
 static int rd_argc(struct rd *rd)
 {
@@ -50,13 +51,43 @@ bool rd_no_arg(struct rd *rd)
 	return rd_argc(rd) == 0;
 }
 
-uint32_t get_port_from_argv(struct rd *rd)
+/*
+ * Possible input:output
+ * dev/port    | first port | is_dump_all
+ * mlx5_1      | 0          | true
+ * mlx5_1/     | 0          | true
+ * mlx5_1/0    | 0          | false
+ * mlx5_1/1    | 1          | false
+ * mlx5_1/-    | 0          | false
+ *
+ * In strict mode, /- will return error.
+ */
+static int get_port_from_argv(struct rd *rd, uint32_t *port,
+			      bool *is_dump_all, bool strict_port)
 {
 	char *slash;
 
+	*port = 0;
+	*is_dump_all = true;
+
 	slash = strchr(rd_argv(rd), '/');
 	/* if no port found, return 0 */
-	return slash ? atoi(slash + 1) : 0;
+	if (slash++) {
+		if (*slash == '-') {
+			if (strict_port)
+				return -EINVAL;
+			*is_dump_all = false;
+			return 0;
+		}
+
+		if (isdigit(*slash)) {
+			*is_dump_all = false;
+			*port = atoi(slash);
+		}
+		if (!*port && strlen(slash))
+			return -EINVAL;
+	}
+	return 0;
 }
 
 static struct dev_map *dev_map_alloc(const char *dev_name)
@@ -152,7 +183,7 @@ void rd_free(struct rd *rd)
 	dev_map_cleanup(rd);
 }
 
-int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd))
+int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd), bool strict_port)
 {
 	struct dev_map *dev_map;
 	uint32_t port;
@@ -163,7 +194,8 @@ int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd))
 	if (rd_no_arg(rd)) {
 		list_for_each_entry(dev_map, &rd->dev_map_list, list) {
 			rd->dev_idx = dev_map->idx;
-			for (port = 1; port < dev_map->num_ports + 1; port++) {
+			port = (strict_port) ? 1 : 0;
+			for (; port < dev_map->num_ports + 1; port++) {
 				rd->port_idx = port;
 				ret = cb(rd);
 				if (ret)
@@ -172,21 +204,23 @@ int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd))
 		}
 
 	} else {
+		bool is_dump_all;
+
 		dev_map = dev_map_lookup(rd, true);
-		port = get_port_from_argv(rd);
-		if (!dev_map || port > dev_map->num_ports) {
+		ret = get_port_from_argv(rd, &port, &is_dump_all, strict_port);
+		if (!dev_map || port > dev_map->num_ports || (!port && ret)) {
 			pr_err("Wrong device name\n");
 			ret = -ENOENT;
 			goto out;
 		}
 		rd_arg_inc(rd);
 		rd->dev_idx = dev_map->idx;
-		rd->port_idx = port ? : 1;
+		rd->port_idx = port;
 		for (; rd->port_idx < dev_map->num_ports + 1; rd->port_idx++) {
 			ret = cb(rd);
 			if (ret)
 				goto out;
-			if (port)
+			if (!is_dump_all)
 				/*
 				 * We got request to show link for devname
 				 * with port index.
-- 
2.15.1

^ permalink raw reply related

* [PATCH iproute2-next v1 9/9] rdma: Document resource tracking
From: Leon Romanovsky @ 2018-01-04  7:01 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, David Ahern
  Cc: RDMA mailing list, Leon Romanovsky, netdev, Stephen Hemminger,
	Leon Romanovsky
In-Reply-To: <20180104070150.15625-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Spartan version of resource tracking documentation.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 man/man8/rdma-resource.8 | 91 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 man/man8/rdma-resource.8

diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8
new file mode 100644
index 00000000..e3c83b94
--- /dev/null
+++ b/man/man8/rdma-resource.8
@@ -0,0 +1,91 @@
+.TH RDMA\-RESOURCE 8 "26 Dec 2017" "iproute2" "Linux"
+.SH NAME
+rdma-resource \- rdma resource configuration
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B rdma
+.RI "[ " OPTIONS " ]"
+.B resource
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.IR OPTIONS " := { "
+\fB\-j\fR[\fIson\fR] |
+\fB\-d\fR[\fIetails\fR] }
+
+.ti -8
+.B rdma resource show
+.RI "[ " DEV/PORT_INDEX " ]"
+
+.ti -8
+.B rdma resource help
+
+.SH "DESCRIPTION"
+.SS rdma resource show - display rdma resource tracking information
+
+.PP
+.I "DEV/PORT_INDEX"
+- specifies the RDMA link to show.
+If this argument is omitted all links are listed.
+
+.SH "EXAMPLES"
+.PP
+rdma resource show
+.RS 4
+Shows summary for all devices on the system.
+.RE
+.PP
+rdma resource show mlx5_2
+.RS 4
+Shows the state of specified rdma device.
+.RE
+.PP
+rdma res show qp link mlx5_4
+.RS 4
+Get all QPs for the specific device.
+.RE
+.PP
+rdma res show qp link mlx5_4/1
+.RS 4
+Get QPs of specific port.
+.RE
+.PP
+rdma res show qp link mlx5_4/0
+.RS 4
+Provide illegal port number (0 is illegal).
+.RE
+.PP
+rdma res show qp link mlx5_4/-
+.RS 4
+Get QPs which have no port assigned yet.
+.RE
+.PP
+rdma res show qp link mlx5_4/- -d
+.RS 4
+Detailed view.
+.RE
+.PP
+rdma res show qp link mlx5_4/1 display pid,lqpn,comm
+.RS 4
+Limit to specific columns (dev/port is always available)
+.RE
+.PP
+rdma res show qp link mlx5_4/1 display pid,lqpn,comm lqpn 0-6
+.RS 4
+Limit to specific Local QPNs.
+.RE
+.PP
+
+.SH SEE ALSO
+.BR rdma (8),
+.BR rdma-dev (8),
+.BR rdma-link (8),
+.br
+
+.SH AUTHOR
+Leon Romanovsky <leonro@mellanox.com>
-- 
2.15.1

^ permalink raw reply related

* Re: [patch net-next v4 00/10] net: sched: allow qdiscs to share filter block instances
From: Jakub Kicinski @ 2018-01-04  7:06 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Ahern, netdev, davem, jhs, xiyou.wangcong, mlxsw, andrew,
	vivien.didelot, f.fainelli, michael.chan, ganeshgr, saeedm,
	matanb, leonro, idosch, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel
In-Reply-To: <20180104065702.GH2067@nanopsycho.orion>

On Thu, 4 Jan 2018 07:57:02 +0100, Jiri Pirko wrote:
> Thu, Jan 04, 2018 at 12:51:52AM CET, kubakici@wp.pl wrote:
> >On Wed, 3 Jan 2018 18:22:09 +0100, Jiri Pirko wrote:  
> >> However I don't agree about breaking the existing filter add and show
> >> and also imposibility to make not-shared block shared in the runtime
> >> before defining it first.  
> >
> >FWIW I would agree with David that allowing add on a shared block
> >modify filters on another interface can break existing users.  (No
> >opinion on dump and lifetime).  
> 
> I don't think that David is saying that, but why do you think it would
> break existing users?

Perhaps I worded it too strongly as "breaking existing users", but it
certainly introduces surprising side effects.  David put it into words
very well:

On Tue, 2 Jan 2018 19:07:36 -0700, David Ahern wrote:
> The disagreement is in how they should be managed. I think my last
> response concisely captures my concerns -- the principle of least surprise.
> 
> So with the initial commands above, all is fine. Then someone is
> debugging a problem or wants to add another filter to ens8, so they run:
> 
> $ tc filter add dev ens8 ingress protocol ip pref 25 flower dst_ip
> 192.168.1.0/16 action drop
> 
> Then traffic flows through ens7 break and some other user is struggling
> to understand what just happened. That the new filter magically appears
> on ens7 when the user operated on ens8 is a surprise. Nothing about that
> last command acknowledges that it is changing a shared resource.
> 
> Consider the commands being run by different people, and a time span
> between. Allowing the shared block to be configured by any device using
> the block is just setting up users for errors and confusion.
> 
> > forcing user to explicitly create some block entity and then to attach
> > it to qdisc instances. I don't really see good reason for it. Could you
> > please clear this up for me?  
> 
> It forces the user to acknowledge it is changing a resource that may be
> shared by more than one device.
> 
> $ tc filter add dev ens8 ingress protocol ip pref 25 flower dst_ip
> 192.168.1.0/16 action drop
> Error: This qdisc is a shared block. Use the block API to configure.

^ permalink raw reply

* [PATCH net-next] net: dsa: lan9303: Fix error return code in lan9303_check_device()
From: Wei Yongjun @ 2018-01-04  7:30 UTC (permalink / raw)
  To: Andrew Lunn, Vivien Didelot, Florian Fainelli; +Cc: Wei Yongjun, netdev

Fix to return error code -ENODEV from the chip not found error handling
case instead of 0 (ret has been overwritten to 0 by lan9303_read()), as
done elsewhere in this function.

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
---
 drivers/net/dsa/lan9303-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index 944901f..d4a5b45 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -867,7 +867,7 @@ static int lan9303_check_device(struct lan9303 *chip)
 	if ((reg >> 16) != LAN9303_CHIP_ID) {
 		dev_err(chip->dev, "expecting LAN9303 chip, but found: %X\n",
 			reg >> 16);
-		return ret;
+		return -ENODEV;
 	}
 
 	/* The default state of the LAN9303 device is to forward packets between

^ permalink raw reply related

* [PATCH bpf-next 0/2] bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for stacktrace map
From: Yonghong Song @ 2018-01-04  7:27 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

The patch set implements bpf syscall command BPF_MAP_GET_NEXT_KEY
for stacktrace map. Patch #1 is the core implementation
and Patch #2 implements a bpf test at tools/testing/selftests/bpf
directory. Please see individual patch comments for details.

Yonghong Song (2):
  bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for stacktrace map
  tools/bpf: add a bpf selftest for stacktrace

 kernel/bpf/stackmap.c                             |  23 +++-
 tools/testing/selftests/bpf/Makefile              |   2 +-
 tools/testing/selftests/bpf/test_progs.c          | 127 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_stacktrace_map.c |  62 +++++++++++
 4 files changed, 211 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_map.c

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next 2/2] tools/bpf: add a bpf selftest for stacktrace
From: Yonghong Song @ 2018-01-04  7:27 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180104072746.1569033-1-yhs@fb.com>

Added a bpf selftest in test_progs at tools directory for stacktrace.
The test will populate a hashtable map and a stacktrace map
at the same time with the same key, stackid.
The user space will compare both maps, using BPF_MAP_LOOKUP_ELEM
command and BPF_MAP_GET_NEXT_KEY command, to ensure that both have
the same set of keys.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/Makefile              |   2 +-
 tools/testing/selftests/bpf/test_progs.c          | 127 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_stacktrace_map.c |  62 +++++++++++
 3 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_map.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1304753..a8aa7e2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -19,7 +19,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
-	test_l4lb_noinline.o test_xdp_noinline.o
+	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 09087ab..b549308 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -837,6 +837,132 @@ static void test_tp_attach_query(void)
 	free(query);
 }
 
+/* Walk all keys of map1 and look each one up in map2; 0 on success. */
+static int compare_map_keys(int map1_fd, int map2_fd)
+{
+	char stack_buf[PERF_MAX_STACK_DEPTH * sizeof(__u64)];
+	__u32 cur, nxt;
+	int rc;
+
+	/* A NULL start key asks for the first key of map1. */
+	rc = bpf_map_get_next_key(map1_fd, NULL, &cur);
+	if (rc)
+		return rc;
+
+	for (;;) {
+		rc = bpf_map_lookup_elem(map2_fd, &cur, stack_buf);
+		if (rc)
+			return rc;
+
+		if (bpf_map_get_next_key(map1_fd, &cur, &nxt) != 0)
+			break;
+		cur = nxt;
+	}
+
+	/* Only ENOENT marks a clean end of the walk. */
+	return errno == ENOENT ? 0 : -1;
+}
+
+/* Attach test_stacktrace_map.o to the sched_switch tracepoint, let it run
+ * for a second, then verify stackid_hmap and stackmap hold the same keys.
+ */
+static void test_stacktrace_map(void)
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	const char *file = "./test_stacktrace_map.o";
+	int bytes, efd, err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {};
+	__u32 key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		return;
+
+	/* Get the ID for the sched/sched_switch tracepoint */
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/sched/sched_switch/id");
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
+		  "read", "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	/* Open the perf event and attach the bpf program */
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds; on failure report the fd, err is stale (0) here */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", control_map_fd, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", stackid_hmap_fd, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  stackmap_fd, errno))
+		goto disable_pmu;
+
+	/* give the bpf program some time to run */
+	sleep(1);
+
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vice versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		; /* fall through */
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
@@ -852,6 +978,7 @@ int main(void)
 	test_pkt_md_access();
 	test_obj_name();
 	test_tp_attach_query();
+	test_stacktrace_map();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
new file mode 100644
index 0000000..76d85c5d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH         127
+#endif
+
+struct bpf_map_def SEC("maps") control_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") stackid_hmap = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 10000,
+};
+
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args {
+	unsigned long long pad;
+	char prev_comm[16];
+	int prev_pid;
+	int prev_prio;
+	long long prev_state;
+	char next_comm[16];
+	int next_pid;
+	int next_prio;
+};
+
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+	__u32 key = 0, val = 0, *value_p;
+
+	value_p = bpf_map_lookup_elem(&control_map, &key);
+	if (value_p && *value_p)
+		return 0; /* control_map[0] is non-zero: skip collection */
+
+	/* Mirror each stack id into stackid_hmap (same max_entries as stackmap) */
+	key = bpf_get_stackid(ctx, &stackmap, 0);
+	if ((int)key >= 0)
+		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 1/2] bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for stacktrace map
From: Yonghong Song @ 2018-01-04  7:27 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180104072746.1569033-1-yhs@fb.com>

Currently, bpf syscall command BPF_MAP_GET_NEXT_KEY is not
supported for stacktrace map. However, there are use cases where
user space wants to enumerate all stacktrace map entries where
BPF_MAP_GET_NEXT_KEY command will be really helpful.
In addition, if user space wants to delete all map entries
in order to save memory and does not want to close the
map file descriptor, BPF_MAP_GET_NEXT_KEY may help improve
performance if map entries are sparsely populated.

The implementation follows the API specification of existing
BPF_MAP_GET_NEXT_KEY implementation. If the user provides
a NULL key pointer, the first key is returned. Otherwise,
the first valid key after the input parameter "key"
is returned, or -ENOENT if no valid key can be found.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a15bc63..207b21c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -226,9 +226,28 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	return 0;
 }
 
-static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int stack_map_get_next_key(struct bpf_map *map, void *key,
+				  void *next_key)
 {
-	return -EINVAL;
+	struct bpf_stack_map *smap = container_of(map,
+						  struct bpf_stack_map, map);
+	u32 id;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (!key)
+		id = 0;
+	else
+		id = *(u32 *)key + 1;
+
+	while (id < smap->n_buckets && !smap->buckets[id])
+		id++;
+
+	if (id >= smap->n_buckets)
+		return -ENOENT;
+
+	*(u32 *)next_key = id;
+	return 0;
 }
 
 static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
-- 
2.9.5

^ permalink raw reply related

* Re: [patch iproute2 v5 1/3] lib/libnetlink: Add a function rtnl_talk_msg
From: Chris Mi @ 2018-01-04  7:27 UTC (permalink / raw)
  To: David Ahern, netdev; +Cc: gerlitz.or, stephen, marcelo.leitner
In-Reply-To: <81255ea8-2f27-891e-4fe9-6a97d4de0e64@gmail.com>

2018/1/3 12:08, David Ahern:
> On 1/2/18 7:55 PM, Chris Mi wrote:
>> diff --git a/lib/libnetlink.c b/lib/libnetlink.c
>> index 00e6ce0c..cc02a139 100644
>> --- a/lib/libnetlink.c
>> +++ b/lib/libnetlink.c
>> @@ -581,32 +581,34 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
>>   		strerror(-err->error));
>>   }
>>   
>> -static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
>> -		       struct nlmsghdr **answer,
>> -		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
>> +static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
>> +			   struct nlmsghdr **answer,
>> +			   bool show_rtnl_err, nl_ext_ack_fn_t errfn)
>>   {
>> -	int status;
>> -	unsigned int seq;
>> -	struct nlmsghdr *h;
>> +	int iovlen = m->msg_iovlen;
>> +	unsigned int seq = 0;
>> +	int i, status;
>> +	char *buf;
>> +
>>   	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
>> -	struct iovec iov = {
>> -		.iov_base = n,
>> -		.iov_len = n->nlmsg_len
>> -	};
>> +	struct iovec iov, *v;
>> +	struct nlmsghdr *h;
>>   	struct msghdr msg = {
>>   		.msg_name = &nladdr,
>>   		.msg_namelen = sizeof(nladdr),
>>   		.msg_iov = &iov,
>>   		.msg_iovlen = 1,
>>   	};
>> -	char *buf;
> Reverse xmas tree is the coding standard for net code. Please adhere to
> it. Only dependencies between variables are an acceptable exception.
OK, got it.
>
> Some of those (struct nlmsghdr *h and struct iovec *v) can be moved to
> the for loop which aligns with your intentions of grouping variables.
Done.
>
>>   
>> -	n->nlmsg_seq = seq = ++rtnl->seq;
>> -
>> -	if (answer == NULL)
>> -		n->nlmsg_flags |= NLM_F_ACK;
>> +	for (i = 0; i < iovlen; i++) {
>> +		v = &m->msg_iov[i];
>> +		h = v->iov_base;
>> +		h->nlmsg_seq = seq = ++rtnl->seq;
> doesn't seq need to track the recvmsg loop? I think for batching you
> want it to start at the first seq number and then in the recvmsg loop
> increment it.
Yes, it is a bug. Thanks for your test case.
>
> As it stands this file:
> $ cat tc.batch
> filter add dev eth2 ingress protocol ip pref 21 flower dst_ip
> 192.168.1.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 22 flower dst_ip
> 192.168.2.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 22 flower dst_ip
> 192.168.3.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 24 flower dst_ip
> 192.168.4.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 25 flower dst_ip
> 192.168.5.0/16 action drop
>
> does not give me an error message:
> $ tc -b tc.batch -bs 5
> <no output>
>
> Yet it failed to insert all filters:
> $ tc filter show dev eth2 ingress
> filter protocol ip pref 21 flower chain 0
> filter protocol ip pref 21 flower chain 0 handle 0x1
>    eth_type ipv4
>    dst_ip 192.168.1.0/16
>    not_in_hw
> 	action order 1: gact action drop
> 	 random type none pass val 0
> 	 index 1 ref 1 bind 1
>
> filter protocol ip pref 22 flower chain 0
> filter protocol ip pref 22 flower chain 0 handle 0x1
>    eth_type ipv4
>    dst_ip 192.168.2.0/16
>    not_in_hw
> 	action order 1: gact action drop
> 	 random type none pass val 0
> 	 index 2 ref 1 bind 1
>
> filter protocol ip pref 24 flower chain 0
> filter protocol ip pref 24 flower chain 0 handle 0x1
>    eth_type ipv4
>    dst_ip 192.168.4.0/16
>    not_in_hw
> 	action order 1: gact action drop
> 	 random type none pass val 0
> 	 index 3 ref 1 bind 1
>
> filter protocol ip pref 25 flower chain 0
> filter protocol ip pref 25 flower chain 0 handle 0x1
>    eth_type ipv4
>    dst_ip 192.168.5.0/16
>    not_in_hw
> 	action order 1: gact action drop
> 	 random type none pass val 0
> 	 index 4 ref 1 bind 1
>
After fixing it, the test result is:

# tc -b tc.batch -bs 5
RTNETLINK answers: File exists
We have an error talking to the kernel, -1
Command failed 1.txt:0-4

We can't tell exactly which command causes this error, so we give a 
range which is less than the batch size.

^ permalink raw reply

* Re: [PATCH net-next 2/2] tun: allow to attach ebpf socket filter
From: Jason Wang @ 2018-01-04  7:28 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Network Development, LKML, Michael S. Tsirkin, Willem de Bruijn
In-Reply-To: <CAF=yD-KydM326DErG5XaafCN=R=p2V48MgUDH0LRpPrXCUeOdA@mail.gmail.com>



On 2018年01月02日 17:19, Willem de Bruijn wrote:
>>> More importantly, should this program just return a boolean pass or
>>> drop. Taking a length and trimming may introduce bugs later on if the
>>> stack parses the packet unconditionally, expecting a minimum size
>>> to be present.
>>>
>>> This was the reason for introducing sk_filter_trim_cap and using that
>>> in other sk_filter sites.
>>>
>>> A quick scan shows that tun_put_user expects a full vlan tag to exist
>>> if skb_vlan_tag_present(skb), for instance. If trimmed to below this
>>> length the final call to skb_copy_datagram_iter may have negative
>>> length.
>>>
>>> This is an issue with the existing sk_filter call as much as with the
>>> new run_ebpf_filter call.
>> Good point, so consider it was used by sk_filter too, we need to fix it
>> anyway. Actually, I've considered the boolean return value but finally I
>> decide to obey the style of sk filter. Maybe the trimming has real user. e.g
>> high speed header recoding/analysis? Consider it's not hard to fix, how
>> about just keep that?
> I don't see an obvious use case, but sure. We'll just need to look
> at what the minimum trim length needs to be.

Try to reproduce the possible issue, but looks like we are safe since we 
may hit -EFAULT which is returned by skb_copy_datagram_iter() before. So 
in V2, I will keep the code as is except trim 4 more bytes if vlan tag 
is present.

Thanks

^ permalink raw reply

* Re: [PATCH net] tipc: fix missing rtnl lock protection during setting link properties
From: Ying Xue @ 2018-01-04  7:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, syzkaller-bugs, tipc-discussion
In-Reply-To: <20180103.104850.406238717375090795.davem@davemloft.net>

On 01/03/2018 11:48 PM, David Miller wrote:
> As soon as you drop the RTNL lock, the media or bearer entry can be
> removed from the tables.
> 

Thanks for the review. Yes, you are right. But even if we temporarily
release RTNL lock, it's still safe for us because when we set
media/bearer properties in __tipc_nl_compat_doit(), tipc_nl_media_set()
and tipc_nl_bearer_set() will probe media or bearer again within RTNL
lock protection.

> This invalidates what you do next, whether it's
> tipc_nl_compat_media_set(), tipc_nl_compat_bearer_set(), etc.

In fact tipc_nl_compat_media_set() and tipc_nl_compat_bearer_set() don't
really change media or bearer's properties, instead they only format the
contents pointed by their "msg" parameter.

> 
> Therefore, you have to lock down the tipc configuration state around
> this entire operation, from media/bearer probe to the building of the
> netlink message(s).
> 

Sorry, we cannot hold RTNL lock in the entire operation path because
TIPC now supports two different sets of netlink APIs:

One set of API's execution path:

genl_family_rcv_msg()
  tipc_nl_media_set()
    rtnl_lock()
    tipc_media_find()
    //set media properties

genl_family_rcv_msg()
  tipc_nl_bearer_set()
    rtnl_lock()
    tipc_bearer_find()
    //set bearer properties

Another set of API's execution path:

genl_family_rcv_msg()
  tipc_nl_compat_recv()
    tipc_nl_compat_handle net()
      __tipc_nl_compat_doit net()
        tipc_nl_compat_link_set()
        tipc_nl_media_set()

genl_family_rcv_msg()
  tipc_nl_compat_recv()
    tipc_nl_compat_handle net()
      __tipc_nl_compat_doit net()
        tipc_nl_compat_link_set()
        tipc_nl_bearer_set()

As we see in above call chains, tipc_nl_media_set() and
tipc_nl_bearer_set() are shared by the two sets of netlink APIs. If we
hold RTNL lock from tipc_nl_compat_recv(), it means we cannot directly
call tipc_nl_media_set() or tipc_nl_bearer_set() in
__tipc_nl_compat_doit net().

> Either this entire code path must execute with the bearer/media entry
> present, or without.  If you drop the RTNL mutex in the middle, this
> invariant is not held.

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply

* Re: [patch iproute2 v5 2/3] tc: Add -bs option to batch mode
From: Chris Mi @ 2018-01-04  7:32 UTC (permalink / raw)
  To: David Ahern, netdev; +Cc: gerlitz.or, stephen, marcelo.leitner
In-Reply-To: <9326994a-de20-1eb6-71bc-fb757ad05872@gmail.com>

  2018/1/3 12:25, David Ahern:
> You need a patch description here ...
Done.
>
> On 1/2/18 7:55 PM, Chris Mi wrote:
>>   static int tc_action_modify(int cmd, unsigned int flags,
>> -			    int *argc_p, char ***argv_p)
>> +			    int *argc_p, char ***argv_p,
>> +			    int batch_size, int index, bool send)
>>   {
>>   	int argc = *argc_p;
>>   	char **argv = *argv_p;
>>   	int ret = 0;
>> -	struct {
>> -		struct nlmsghdr         n;
>> -		struct tcamsg           t;
>> -		char                    buf[MAX_MSG];
>> -	} req = {
>> -		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
>> -		.n.nlmsg_flags = NLM_F_REQUEST | flags,
>> -		.n.nlmsg_type = cmd,
>> -		.t.tca_family = AF_UNSPEC,
>> +	tc_action_req *req;
>> +	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
>> +	struct iovec *iov = &msg_iov[index];
> Reverse xmas tree is the coding standard for net code. Please check all
> new code to conform to this standard.
Done.
>
> I have not reviewed all of this patch, but I firmly believe the batching
> size option needs to be able handle a file with mixed commands. Your use
> case is filter and action adds and deletes, but you should allow users
> (e.g., test suites) to benefit from this performance speed up with test
> cases that have single files with all of the commands.
Done. There is a small performance penalty, but I think it is better
than a segfault.
>
> For example,
> $ cat tc.batch
> qdisc add dev eth2 ingress
> filter add dev eth2 ingress protocol ip pref 21 flower dst_ip
> 192.168.1.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 22 flower dst_ip
> 192.168.2.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 23 flower dst_ip
> 192.168.3.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 24 flower dst_ip
> 192.168.4.0/16 action drop
> filter add dev eth2 ingress protocol ip pref 25 flower dst_ip
> 192.168.5.0/16 action drop
> qdisc del dev eth2 ingress
>
> (and consider this to be a huge file to really stress tc code paths for
> example). Right now, the above file fails:
>
> $ tc -b tc.batch -bs 5
> Segmentation fault
>
>
> Also, your changes fail to break out on an error:
>
> $ tc -b tc.batch -bs 1
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1
>
> where as the existing command does this:
> $ tc -b tc.batch
> RTNETLINK answers: File exists
> We have an error talking to the kernel
> Command failed tc.batch:1

^ permalink raw reply

* [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
From: Chris Mi @ 2018-01-04  7:34 UTC (permalink / raw)
  To: netdev; +Cc: gerlitz.or, stephen, dsahern, marcelo.leitner

Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, client should allocate an array of
struct iovec. If batchsize is bigger than 1, only after the client
has accumulated enough commands, can the client call rtnl_talk_msg
to send the message that includes the iov array. The one exception is
when there are no more commands left in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
	tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.555s
user    0m7.211s
sys     0m8.284s

With this patchset, 'tc -b $file -bs 10' execution time is:

real    0m13.043s
user    0m6.479s
sys     0m6.504s

The insertion rate is improved more than 10%.

In this patchset, we still ack for every rule. If we don't ack at all,
'tc -b $file' execution time is:

real    0m14.748s
user    0m6.944s
sys     0m7.740s

'tc -b $file -bs 10' execution time is:

real    0m12.535s
user    0m6.587s
sys     0m5.888s

We can see that the performance win comes from sending multiple messages
at once rather than from skipping the acks. I think that's because tc
doesn't spend much time processing the ack messages.


v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls
__rtnl_talk_msg
   directly.
3. if (batch_size < 1)
        batch_size = 1;

v5
==
1. Fix a bug that can't deal with batch file with blank line.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug that not all messages are acked if batch size > 1.


Chris Mi (3):
  lib/libnetlink: Add a function rtnl_talk_msg
  tc: Add -bs option to batch mode
  man: Add -bs option to tc manpage


 include/libnetlink.h |   3 ++
 lib/libnetlink.c     |  66 +++++++++++++++++++-------
 man/man8/tc.8        |   7 +++
 tc/m_action.c        |  93 +++++++++++++++++++++++++++---------
 tc/tc.c              |  96 ++++++++++++++++++++++++++++++++-----
 tc/tc_common.h       |   8 +++-
 tc/tc_filter.c       | 132 +++++++++++++++++++++++++++++++++++----------------
 7 files changed, 310 insertions(+), 95 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [patch iproute2 v6 1/3] lib/libnetlink: Add a function rtnl_talk_msg
From: Chris Mi @ 2018-01-04  7:34 UTC (permalink / raw)
  To: netdev; +Cc: gerlitz.or, stephen, dsahern, marcelo.leitner
In-Reply-To: <20180104073454.11867-1-chrism@mellanox.com>

rtnl_talk can only send a single message to kernel. Add a new function
rtnl_talk_msg that can send multiple messages to kernel.

Signed-off-by: Chris Mi <chrism@mellanox.com>
---
 include/libnetlink.h |  3 +++
 lib/libnetlink.c     | 66 ++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..01d98b16 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr **answer)
 	__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+		  struct nlmsghdr **answer)
+	__attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
 	__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..49ee1208 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,38 +581,40 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
 		strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-		       struct nlmsghdr **answer,
-		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+			   struct nlmsghdr **answer,
+			   bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-	int status;
-	unsigned int seq;
-	struct nlmsghdr *h;
 	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-	struct iovec iov = {
-		.iov_base = n,
-		.iov_len = n->nlmsg_len
-	};
+	int i, status, iovlen = m->msg_iovlen;
+	unsigned int seq = 0;
+	struct nlmsghdr *h;
+	struct iovec iov;
 	struct msghdr msg = {
 		.msg_name = &nladdr,
 		.msg_namelen = sizeof(nladdr),
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 	};
-	char *buf;
-
-	n->nlmsg_seq = seq = ++rtnl->seq;
 
-	if (answer == NULL)
-		n->nlmsg_flags |= NLM_F_ACK;
+	for (i = 0; i < iovlen; i++) {
+		struct iovec *v;
+		v = &m->msg_iov[i];
+		h = v->iov_base;
+		h->nlmsg_seq = seq = ++rtnl->seq;
+		if (answer == NULL)
+			h->nlmsg_flags |= NLM_F_ACK;
+	}
 
-	status = sendmsg(rtnl->fd, &msg, 0);
+	status = sendmsg(rtnl->fd, m, 0);
 	if (status < 0) {
 		perror("Cannot talk to rtnetlink");
 		return -1;
 	}
 
 	while (1) {
+		char *buf;
+next:
 		status = rtnl_recvmsg(rtnl->fd, &msg, &buf);
 
 		if (status < 0)
@@ -642,7 +644,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 
 			if (nladdr.nl_pid != 0 ||
 			    h->nlmsg_pid != rtnl->local.nl_pid ||
-			    h->nlmsg_seq != seq) {
+			    h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
 				/* Don't forget to skip that message. */
 				status -= NLMSG_ALIGN(len);
 				h = (struct nlmsghdr *)((char *)h + NLMSG_ALIGN(len));
@@ -662,7 +664,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 						*answer = (struct nlmsghdr *)buf;
 					else
 						free(buf);
-					return 0;
+					if (h->nlmsg_seq == seq)
+						return 0;
+					else
+						goto next;
 				}
 
 				if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -698,12 +703,37 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	}
 }
 
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+		       struct nlmsghdr **answer,
+		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+	struct iovec iov = {
+		.iov_base = n,
+		.iov_len = n->nlmsg_len
+	};
+	struct msghdr msg = {
+		.msg_name = &nladdr,
+		.msg_namelen = sizeof(nladdr),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	return __rtnl_talk_msg(rtnl, &msg, answer, show_rtnl_err, errfn);
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr **answer)
 {
 	return __rtnl_talk(rtnl, n, answer, true, NULL);
 }
 
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+	      struct nlmsghdr **answer)
+{
+	return __rtnl_talk_msg(rtnl, m, answer, true, NULL);
+}
+
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		     struct nlmsghdr **answer,
 		     nl_ext_ack_fn_t errfn)
-- 
2.14.3

^ permalink raw reply related

* [patch iproute2 v6 2/3] tc: Add -bs option to batch mode
From: Chris Mi @ 2018-01-04  7:34 UTC (permalink / raw)
  To: netdev; +Cc: gerlitz.or, stephen, dsahern, marcelo.leitner
In-Reply-To: <20180104073454.11867-1-chrism@mellanox.com>

Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, we can accumulate
several commands before sending to kernel.

Now it only works for the following successive rules,
1. filter add
2. filter delete
3. actions add
4. actions delete

Otherwise, the batch size is still 1.

Signed-off-by: Chris Mi <chrism@mellanox.com>
---
 tc/m_action.c  |  93 ++++++++++++++++++++++++++++++----------
 tc/tc.c        |  96 +++++++++++++++++++++++++++++++++++------
 tc/tc_common.h |   8 +++-
 tc/tc_filter.c | 132 ++++++++++++++++++++++++++++++++++++++++-----------------
 4 files changed, 252 insertions(+), 77 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..cf5cc95d 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include <arpa/inet.h>
 #include <string.h>
 #include <dlfcn.h>
+#include <errno.h>
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,40 +547,86 @@ bad_val:
 	return ret;
 }
 
+typedef struct {
+	struct nlmsghdr		n;
+	struct tcamsg		t;
+	char			buf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+	free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+	tc_action_req *req;
+
+	if (action_reqs == NULL) {
+		action_reqs = malloc(batch_size * sizeof (tc_action_req));
+		if (action_reqs == NULL)
+			return NULL;
+	}
+	req = &action_reqs[index];
+	memset(req, 0, sizeof (*req));
+
+	return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
-			    int *argc_p, char ***argv_p)
+			    int *argc_p, char ***argv_p,
+			    int batch_size, int index, bool send)
 {
-	int argc = *argc_p;
+	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+	struct iovec *iov = &msg_iov[index];
 	char **argv = *argv_p;
-	int ret = 0;
-	struct {
-		struct nlmsghdr         n;
-		struct tcamsg           t;
-		char                    buf[MAX_MSG];
-	} req = {
-		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-		.n.nlmsg_flags = NLM_F_REQUEST | flags,
-		.n.nlmsg_type = cmd,
-		.t.tca_family = AF_UNSPEC,
+	struct msghdr msg = {
+		.msg_name = &nladdr,
+		.msg_namelen = sizeof(nladdr),
+		.msg_iov = msg_iov,
+		.msg_iovlen = index + 1,
 	};
-	struct rtattr *tail = NLMSG_TAIL(&req.n);
+	struct rtattr *tail;
+	tc_action_req *req;
+	int argc = *argc_p;
+	int ret = 0;
+
+	req = get_action_req(batch_size, index);
+	if (req == NULL) {
+		fprintf(stderr, "get_action_req error: not enough buffer\n");
+		return -ENOMEM;
+	}
+	req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+	req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+	req->n.nlmsg_type = cmd;
+	req->t.tca_family = AF_UNSPEC;
+	tail = NLMSG_TAIL(&req->n);
 
 	argc -= 1;
 	argv += 1;
-	if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
+	if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) {
 		fprintf(stderr, "Illegal \"action\"\n");
 		return -1;
 	}
-	tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
+	tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail;
 
-	if (rtnl_talk(&rth, &req.n, NULL) < 0) {
+	*argc_p = argc;
+	*argv_p = argv;
+
+	iov->iov_base = &req->n;
+	iov->iov_len = req->n.nlmsg_len;
+
+	if (!send)
+		return 0;
+
+	if (rtnl_talk_msg(&rth, &msg, NULL) < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		ret = -1;
 	}
 
-	*argc_p = argc;
-	*argv_p = argv;
-
 	return ret;
 }
 
@@ -679,7 +726,7 @@ bad_val:
 	return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, int batch_size, int index, bool send)
 {
 
 	int ret = 0;
@@ -689,12 +736,14 @@ int do_action(int argc, char **argv)
 		if (matches(*argv, "add") == 0) {
 			ret =  tc_action_modify(RTM_NEWACTION,
 						NLM_F_EXCL | NLM_F_CREATE,
-						&argc, &argv);
+						&argc, &argv, batch_size,
+						index, send);
 		} else if (matches(*argv, "change") == 0 ||
 			  matches(*argv, "replace") == 0) {
 			ret = tc_action_modify(RTM_NEWACTION,
 					       NLM_F_CREATE | NLM_F_REPLACE,
-					       &argc, &argv);
+					       &argc, &argv, batch_size,
+					       index, send);
 		} else if (matches(*argv, "delete") == 0) {
 			argc -= 1;
 			argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..67c6bfb4 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -189,20 +189,20 @@ static void usage(void)
 	fprintf(stderr, "Usage: tc [ OPTIONS ] OBJECT { COMMAND | help }\n"
 			"       tc [-force] -batch filename\n"
 			"where  OBJECT := { qdisc | class | filter | action | monitor | exec }\n"
-	                "       OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -p[retty] | -b[atch] [filename] | -n[etns] name |\n"
+	                "       OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -p[retty] | -b[atch] [filename] | -bs | -batchsize [size] | -n[etns] name |\n"
 			"                    -nm | -nam[es] | { -cf | -conf } path } | -j[son]\n");
 }
 
-static int do_cmd(int argc, char **argv)
+static int do_cmd(int argc, char **argv, int batch_size, int index, bool send)
 {
 	if (matches(*argv, "qdisc") == 0)
 		return do_qdisc(argc-1, argv+1);
 	if (matches(*argv, "class") == 0)
 		return do_class(argc-1, argv+1);
 	if (matches(*argv, "filter") == 0)
-		return do_filter(argc-1, argv+1);
+		return do_filter(argc-1, argv+1, batch_size, index, send);
 	if (matches(*argv, "actions") == 0)
-		return do_action(argc-1, argv+1);
+		return do_action(argc-1, argv+1, batch_size, index, send);
 	if (matches(*argv, "monitor") == 0)
 		return do_tcmonitor(argc-1, argv+1);
 	if (matches(*argv, "exec") == 0)
@@ -217,11 +217,25 @@ static int do_cmd(int argc, char **argv)
 	return -1;
 }
 
-static int batch(const char *name)
+static bool batchsize_enabled(int argc, char *argv[])
 {
+	if (argc < 2)
+		return false;
+	if (((strcmp(argv[0], "filter") != 0) && strcmp(argv[0], "action") != 0)
+	    || ((strcmp(argv[1], "add") != 0) && strcmp(argv[1], "delete") != 0))
+		return false;
+	return true;
+}
+
+static int batch(const char *name, int batch_size)
+{
+	bool lastline = false;
+	int msg_iov_index = 0;
+	char *line2 = NULL;
 	char *line = NULL;
 	size_t len = 0;
 	int ret = 0;
+	bool send;
 
 	batch_mode = 1;
 	if (name && strcmp(name, "-") != 0) {
@@ -240,23 +254,66 @@ static int batch(const char *name)
 	}
 
 	cmdlineno = 0;
-	while (getcmdline(&line, &len, stdin) != -1) {
+	if (getcmdline(&line, &len, stdin) == -1)
+		goto Exit;
+	do {
+		char *largv2[100];
 		char *largv[100];
+		int largc2;
 		int largc;
 
+		if (getcmdline(&line2, &len, stdin) == -1)
+			lastline = true;
+
+		if (batch_size > 1)
+			largc2 = makeargs(line2, largv2, 100);
 		largc = makeargs(line, largv, 100);
+
+		/*
+		 * In batch mode, if we haven't accumulated enough commands
+		 * and this is not the last command and this command & next
+		 * command both support the batchsize feature, don't send the
+		 * message immediately.
+		 */
+		if (batch_size > 1 && msg_iov_index + 1 != batch_size
+		    && !lastline && batchsize_enabled(largc, largv)
+		    && batchsize_enabled(largc2, largv2))
+			send = false;
+		else
+			send = true;
+
+		line = line2;
+		line2 = NULL;
+		len = 0;
+
 		if (largc == 0)
 			continue;	/* blank line */
 
-		if (do_cmd(largc, largv)) {
-			fprintf(stderr, "Command failed %s:%d\n", name, cmdlineno);
+		ret = do_cmd(largc, largv, batch_size, msg_iov_index, send);
+		if (ret != 0) {
+			if (batch_size == 1)
+				fprintf(stderr, "Command failed %s:%d\n",
+					name, cmdlineno - 1);
+			else
+				fprintf(stderr, "Command failed %s:%d-%d\n",
+					name, cmdlineno - msg_iov_index - 1,
+					cmdlineno - 1);
 			ret = 1;
 			if (!force)
 				break;
 		}
-	}
-	if (line)
-		free(line);
+		if (batch_size > 1) {
+			++msg_iov_index;
+			msg_iov_index %= batch_size;
+		}
+		if (send)
+			msg_iov_index = 0;
+	} while (!lastline);
+
+	free_filter_reqs();
+	free_action_reqs();
+Exit:
+	free(line);
 
 	rtnl_close(&rth);
 	return ret;
@@ -267,6 +324,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 	char *batch_file = NULL;
+	int batch_size = 1;
 
 	while (argc > 1) {
 		if (argv[1][0] != '-')
@@ -297,6 +355,16 @@ int main(int argc, char **argv)
 			if (argc <= 1)
 				usage();
 			batch_file = argv[1];
+		} else if (matches(argv[1], "-batchsize") == 0 ||
+				matches(argv[1], "-bs") == 0) {
+			argc--;	argv++;
+			if (argc <= 1)
+				usage();
+			batch_size = atoi(argv[1]);
+			if (batch_size > MSG_IOV_MAX)
+				batch_size = MSG_IOV_MAX;
+			else if (batch_size < 0)
+				batch_size = 1;
 		} else if (matches(argv[1], "-netns") == 0) {
 			NEXT_ARG();
 			if (netns_switch(argv[1]))
@@ -323,7 +391,7 @@ int main(int argc, char **argv)
 	}
 
 	if (batch_file)
-		return batch(batch_file);
+		return batch(batch_file, batch_size);
 
 	if (argc <= 1) {
 		usage();
@@ -341,7 +409,9 @@ int main(int argc, char **argv)
 		goto Exit;
 	}
 
-	ret = do_cmd(argc-1, argv+1);
+	ret = do_cmd(argc-1, argv+1, 1, 0, true);
+	free_filter_reqs();
+	free_action_reqs();
 Exit:
 	rtnl_close(&rth);
 
diff --git a/tc/tc_common.h b/tc/tc_common.h
index 264fbdac..8a82439f 100644
--- a/tc/tc_common.h
+++ b/tc/tc_common.h
@@ -1,13 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 #define TCA_BUF_MAX	(64*1024)
+#define MSG_IOV_MAX	256
 
 extern struct rtnl_handle rth;
 
 extern int do_qdisc(int argc, char **argv);
 extern int do_class(int argc, char **argv);
-extern int do_filter(int argc, char **argv);
-extern int do_action(int argc, char **argv);
+extern int do_filter(int argc, char **argv, int batch_size, int index, bool send);
+extern int do_action(int argc, char **argv, int batch_size, int index, bool send);
 extern int do_tcmonitor(int argc, char **argv);
 extern int do_exec(int argc, char **argv);
 
@@ -24,5 +25,8 @@ struct tc_sizespec;
 extern int parse_size_table(int *p_argc, char ***p_argv, struct tc_sizespec *s);
 extern int check_size_table_opts(struct tc_sizespec *s);
 
+extern void free_filter_reqs(void);
+extern void free_action_reqs(void);
+
 extern int show_graph;
 extern bool use_names;
diff --git a/tc/tc_filter.c b/tc/tc_filter.c
index 545cc3a1..6e80ed2c 100644
--- a/tc/tc_filter.c
+++ b/tc/tc_filter.c
@@ -19,6 +19,7 @@
 #include <arpa/inet.h>
 #include <string.h>
 #include <linux/if_ether.h>
+#include <errno.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -42,28 +43,69 @@ static void usage(void)
 		"OPTIONS := ... try tc filter add <desired FILTER_KIND> help\n");
 }
 
-static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
+typedef struct {
+	struct nlmsghdr		n;
+	struct tcmsg		t;
+	char			buf[MAX_MSG];
+} tc_filter_req;
+
+static tc_filter_req *filter_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_filter_reqs(void)
 {
-	struct {
-		struct nlmsghdr	n;
-		struct tcmsg		t;
-		char			buf[MAX_MSG];
-	} req = {
-		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
-		.n.nlmsg_flags = NLM_F_REQUEST | flags,
-		.n.nlmsg_type = cmd,
-		.t.tcm_family = AF_UNSPEC,
-	};
+	free(filter_reqs);
+}
+
+static tc_filter_req *get_filter_req(int batch_size, int index)
+{
+	tc_filter_req *req;
+
+	if (filter_reqs == NULL) {
+		filter_reqs = malloc(batch_size * sizeof (tc_filter_req));
+		if (filter_reqs == NULL)
+			return NULL;
+	}
+	req = &filter_reqs[index];
+	memset(req, 0, sizeof (*req));
+
+	return req;
+}
+
+static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv,
+			    int batch_size, int index, bool send)
+{
+	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+	struct iovec *iov = &msg_iov[index];
 	struct filter_util *q = NULL;
-	__u32 prio = 0;
-	__u32 protocol = 0;
-	int protocol_set = 0;
-	__u32 chain_index;
+	struct tc_estimator est = {};
+	char  k[FILTER_NAMESZ] = {};
 	int chain_index_set = 0;
-	char *fhandle = NULL;
 	char  d[IFNAMSIZ] = {};
-	char  k[FILTER_NAMESZ] = {};
-	struct tc_estimator est = {};
+	struct msghdr msg = {
+		.msg_name = &nladdr,
+		.msg_namelen = sizeof(nladdr),
+		.msg_iov = msg_iov,
+		.msg_iovlen = index + 1,
+	};
+	int protocol_set = 0;
+	char *fhandle = NULL;
+	tc_filter_req *req;
+	__u32 protocol = 0;
+	__u32 chain_index;
+	__u32 prio = 0;
+	int ret;
+
+	req = get_filter_req(batch_size, index);
+	if (req == NULL) {
+		fprintf(stderr, "get_filter_req error: not enough buffer\n");
+		return -ENOMEM;
+	}
+
+	req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+	req->n.nlmsg_type = cmd;
+	req->t.tcm_family = AF_UNSPEC;
 
 	if (cmd == RTM_NEWTFILTER && flags & NLM_F_CREATE)
 		protocol = htons(ETH_P_ALL);
@@ -75,37 +117,37 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 				duparg("dev", *argv);
 			strncpy(d, *argv, sizeof(d)-1);
 		} else if (strcmp(*argv, "root") == 0) {
-			if (req.t.tcm_parent) {
+			if (req->t.tcm_parent) {
 				fprintf(stderr,
 					"Error: \"root\" is duplicate parent ID\n");
 				return -1;
 			}
-			req.t.tcm_parent = TC_H_ROOT;
+			req->t.tcm_parent = TC_H_ROOT;
 		} else if (strcmp(*argv, "ingress") == 0) {
-			if (req.t.tcm_parent) {
+			if (req->t.tcm_parent) {
 				fprintf(stderr,
 					"Error: \"ingress\" is duplicate parent ID\n");
 				return -1;
 			}
-			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
+			req->t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
 						     TC_H_MIN_INGRESS);
 		} else if (strcmp(*argv, "egress") == 0) {
-			if (req.t.tcm_parent) {
+			if (req->t.tcm_parent) {
 				fprintf(stderr,
 					"Error: \"egress\" is duplicate parent ID\n");
 				return -1;
 			}
-			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
+			req->t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
 						     TC_H_MIN_EGRESS);
 		} else if (strcmp(*argv, "parent") == 0) {
 			__u32 handle;
 
 			NEXT_ARG();
-			if (req.t.tcm_parent)
+			if (req->t.tcm_parent)
 				duparg("parent", *argv);
 			if (get_tc_classid(&handle, *argv))
 				invarg("Invalid parent ID", *argv);
-			req.t.tcm_parent = handle;
+			req->t.tcm_parent = handle;
 		} else if (strcmp(*argv, "handle") == 0) {
 			NEXT_ARG();
 			if (fhandle)
@@ -152,26 +194,26 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 		argc--; argv++;
 	}
 
-	req.t.tcm_info = TC_H_MAKE(prio<<16, protocol);
+	req->t.tcm_info = TC_H_MAKE(prio<<16, protocol);
 
 	if (chain_index_set)
-		addattr32(&req.n, sizeof(req), TCA_CHAIN, chain_index);
+		addattr32(&req->n, sizeof(*req), TCA_CHAIN, chain_index);
 
 	if (k[0])
-		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+		addattr_l(&req->n, sizeof(*req), TCA_KIND, k, strlen(k)+1);
 
 	if (d[0])  {
 		ll_init_map(&rth);
 
-		req.t.tcm_ifindex = ll_name_to_index(d);
-		if (req.t.tcm_ifindex == 0) {
+		req->t.tcm_ifindex = ll_name_to_index(d);
+		if (req->t.tcm_ifindex == 0) {
 			fprintf(stderr, "Cannot find device \"%s\"\n", d);
 			return 1;
 		}
 	}
 
 	if (q) {
-		if (q->parse_fopt(q, fhandle, argc, argv, &req.n))
+		if (q->parse_fopt(q, fhandle, argc, argv, &req->n))
 			return 1;
 	} else {
 		if (fhandle) {
@@ -190,10 +232,17 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 	}
 
 	if (est.ewma_log)
-		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
+		addattr_l(&req->n, sizeof(*req), TCA_RATE, &est, sizeof(est));
 
-	if (rtnl_talk(&rth, &req.n, NULL) < 0) {
-		fprintf(stderr, "We have an error talking to the kernel\n");
+	iov->iov_base = &req->n;
+	iov->iov_len = req->n.nlmsg_len;
+
+	if (!send)
+		return 0;
+
+	ret = rtnl_talk_msg(&rth, &msg, NULL);
+	if (ret < 0) {
+		fprintf(stderr, "We have an error talking to the kernel, %d\n", ret);
 		return 2;
 	}
 
@@ -636,20 +685,23 @@ static int tc_filter_list(int argc, char **argv)
 	return 0;
 }
 
-int do_filter(int argc, char **argv)
+int do_filter(int argc, char **argv, int batch_size, int index, bool send)
 {
 	if (argc < 1)
 		return tc_filter_list(0, NULL);
 	if (matches(*argv, "add") == 0)
 		return tc_filter_modify(RTM_NEWTFILTER, NLM_F_EXCL|NLM_F_CREATE,
-					argc-1, argv+1);
+					argc-1, argv+1,
+					batch_size, index, send);
 	if (matches(*argv, "change") == 0)
-		return tc_filter_modify(RTM_NEWTFILTER, 0, argc-1, argv+1);
+		return tc_filter_modify(RTM_NEWTFILTER, 0, argc-1, argv+1,
+					batch_size, index, send);
 	if (matches(*argv, "replace") == 0)
 		return tc_filter_modify(RTM_NEWTFILTER, NLM_F_CREATE, argc-1,
-					argv+1);
+					argv+1, batch_size, index, send);
 	if (matches(*argv, "delete") == 0)
-		return tc_filter_modify(RTM_DELTFILTER, 0,  argc-1, argv+1);
+		return tc_filter_modify(RTM_DELTFILTER, 0, argc-1, argv+1,
+					batch_size, index, send);
 	if (matches(*argv, "get") == 0)
 		return tc_filter_get(RTM_GETTFILTER, 0,  argc-1, argv+1);
 	if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0
-- 
2.14.3

^ permalink raw reply related

* [patch iproute2 v6 3/3] man: Add -bs option to tc manpage
From: Chris Mi @ 2018-01-04  7:34 UTC (permalink / raw)
  To: netdev; +Cc: gerlitz.or, stephen, dsahern, marcelo.leitner
In-Reply-To: <20180104073454.11867-1-chrism@mellanox.com>

Signed-off-by: Chris Mi <chrism@mellanox.com>
---
 man/man8/tc.8 | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..23db730c 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,13 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+Only successive rules of filter add and delete are supported.
+Otherwise, batch size is still 1.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH net-next v2 05/10] net: qualcomm: rmnet: Set pacing rate
From: Eric Dumazet @ 2018-01-04  7:44 UTC (permalink / raw)
  To: Subash Abhinov Kasiviswanathan; +Cc: davem, netdev, lkp
In-Reply-To: <8bd76556945c563980a72009f9c86a13@codeaurora.org>

On Wed, 2018-01-03 at 15:45 -0700, Subash Abhinov Kasiviswanathan
wrote:
> > > +	sk_pacing_shift_update(skb->sk, 8);
> > 
> > Well... Please tell us why this is needed in this driver.
> > 
> > This interface is meant for wifi aggregation, not to work around some
> > strange ethernet drivers designs.
> 
> Hi Eric
> 
> The real device over which the rmnet devices are installed also
> aggregate multiple IP packets and sends them as a single large aggregate
> frame to the hardware.

It would be nice to give some details about this in the changelog.

Also what results you get with different values for the shift (10, 9,
8)

My fear is that people might be tempted to blindly use the
sk_pacing_shift_update() just because a single TCP flow gets 'better'
results.

bufferbloat is a serious issue, we do not want to allow a single TCP
flow to fill a fifo.

Otherwise, we could remove TCP Small queues overhead from the kernel
and be happy.

Thanks.

^ permalink raw reply

* Re: [PATCH v5 26/39] nds32: Device tree support
From: Greentime Hu @ 2018-01-04  7:57 UTC (permalink / raw)
  To: Rob Herring
  Cc: Greentime, linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Arnd Bergmann, linux-arch, Thomas Gleixner, Jason Cooper,
	Marc Zyngier, netdev, Vincent Chen,
	open list:OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS,
	Al Viro, David Howells, Will Deacon, Daniel Lezcano,
	linux-serial-u79uwXL29TY76Z2rM5mHXA, Geert Uytterhoeven,
	Linus Walleij, Mark Rutland, Greg
In-Reply-To: <CAL_Jsq+CC-3w8BVcUP77__ZR8aYMhxiXDYJ--HZwA=ezHG548g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

2018-01-04 3:14 GMT+08:00 Rob Herring <robh+dt-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>:
> On Tue, Jan 2, 2018 at 2:24 AM, Greentime Hu <green.hu-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> From: Greentime Hu <greentime-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
>>
>> This patch adds support for device tree.
>>
>> Signed-off-by: Vincent Chen <vincentc-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
>> Signed-off-by: Greentime Hu <greentime-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
>> ---
>>  arch/nds32/boot/dts/Makefile  |    8 +++++
>>  arch/nds32/boot/dts/ae3xx.dts |   73 +++++++++++++++++++++++++++++++++++++++++
>>  arch/nds32/kernel/devtree.c   |   19 +++++++++++
>>  3 files changed, 100 insertions(+)
>>  create mode 100644 arch/nds32/boot/dts/Makefile
>>  create mode 100644 arch/nds32/boot/dts/ae3xx.dts
>>  create mode 100644 arch/nds32/kernel/devtree.c
>>
>> diff --git a/arch/nds32/boot/dts/Makefile b/arch/nds32/boot/dts/Makefile
>> new file mode 100644
>> index 0000000..d31faa8
>> --- /dev/null
>> +++ b/arch/nds32/boot/dts/Makefile
>> @@ -0,0 +1,8 @@
>> +ifneq '$(CONFIG_NDS32_BUILTIN_DTB)' '""'
>> +BUILTIN_DTB := $(patsubst "%",%,$(CONFIG_NDS32_BUILTIN_DTB)).dtb.o
>> +else
>> +BUILTIN_DTB :=
>> +endif
>> +obj-$(CONFIG_OF) += $(BUILTIN_DTB)
>> +
>> +clean-files := *.dtb *.dtb.S
>> diff --git a/arch/nds32/boot/dts/ae3xx.dts b/arch/nds32/boot/dts/ae3xx.dts
>> new file mode 100644
>> index 0000000..6b23d60
>> --- /dev/null
>> +++ b/arch/nds32/boot/dts/ae3xx.dts
>> @@ -0,0 +1,73 @@
>> +/dts-v1/;
>> +/ {
>> +       compatible = "andestech,ae3xx";
>> +       #address-cells = <1>;
>> +       #size-cells = <1>;
>> +       interrupt-parent = <&intc>;
>> +
>> +       chosen {
>> +               stdout-path = &serial0;
>> +       };
>> +
>> +       memory@0 {
>> +               device_type = "memory";
>> +               reg = <0x00000000 0x40000000>;
>> +       };
>> +
>> +       cpus {
>> +               #address-cells = <1>;
>> +               #size-cells = <0>;
>> +               cpu@0 {
>> +                       device_type = "cpu";
>> +                       compatible = "andestech,n13", "andestech,nds32v3";
>> +                       reg = <0>;
>> +                       clock-frequency = <60000000>;
>> +                       next-level-cache = <&L2>;
>> +               };
>> +       };
>> +
>> +       L2: l2-cache@e0500000 {
>> +               compatible = "andestech,atl2c";
>> +               reg = <0xe0500000 0x1000>;
>> +               cache-unified;
>> +               cache-level = <2>;
>> +       };
>> +
>> +       apb: clk@0 {
>
> unit address without reg is not valid. Drop the "@0".
>
>> +               #clock-cells = <0>;
>> +               compatible = "fixed-clock";
>> +               clock-frequency = <30000000>;
>> +       };
>> +
>> +
>> +       intc: interrupt-controller {
>> +               compatible = "andestech,ativic32";
>> +               #interrupt-cells = <1>;
>> +               interrupt-controller;
>> +       };
>> +
>> +       serial0: serial@f0300000 {
>
> All the memory mapped peripherals should be under at least one simple-bus node.
>
>> +               compatible = "andestech,uart16550", "ns16550a";
>> +               reg = <0xf0300000 0x1000>;
>> +               interrupts = <8>;
>> +               clock-frequency = <14745600>;
>> +               reg-shift = <2>;
>> +               reg-offset = <32>;
>> +               no-loopback-test = <1>;
>> +       };
>> +
>> +       timer0: timer@f0400000 {
>> +               compatible = "andestech,atcpit100";
>> +               reg = <0xf0400000 0x1000>;
>> +               interrupts = <2>;
>> +               clocks = <&apb>;
>> +               clock-names = "PCLK";
>> +       };
>> +
>> +       mac0: mac@e0100000 {
>
> ethernet@...
>

Hi, Rob:

I'd like to modify it like this in the next version patch.

         clock: clk {
                 #clock-cells = <0>;
                 compatible = "fixed-clock";
                 clock-frequency = <30000000>;
         };

         apb {
                 compatible = "simple-bus";
                 #address-cells = <1>;
                 #size-cells = <1>;
                 ranges;

                 serial0: serial@f0300000 {
                         compatible = "andestech,uart16550", "ns16550a";
                         reg = <0xf0300000 0x1000>;
                         interrupts = <8>;
                         clock-frequency = <14745600>;
                         reg-shift = <2>;
                         reg-offset = <32>;
                         no-loopback-test = <1>;
                 };

                 timer0: timer@f0400000 {
                         compatible = "andestech,atcpit100";
                         reg = <0xf0400000 0x1000>;
                         interrupts = <2>;
                         clocks = <&clock>;
                         clock-names = "PCLK";
                 };
         };

         ahb {
                 compatible = "simple-bus";
                 #address-cells = <1>;
                 #size-cells = <1>;
                 ranges;

                 L2: cache-controller@e0500000 {
                         compatible = "andestech,atl2c";
                         reg = <0xe0500000 0x1000>;
                         cache-unified;
                         cache-level = <2>;
                 };

                 mac0: ethernet@e0100000 {
                         compatible = "andestech,atmac100";
                         reg = <0xe0100000 0x1000>;
                         interrupts = <18>;
                };
        };
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* aio poll, io_pgetevents and a new in-kernel poll API
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel

Hi all,

this series adds support for the IOCB_CMD_POLL operation to poll for the
readiness of file descriptors using the aio subsystem.  The API is based
on patches that existed in RHAS2.1 and RHEL3, which means it already is
supported by libaio.  To implement the poll support efficiently new
methods to poll are introduced in struct file_operations:  get_poll_head
and poll_mask.  The first one returns a wait_queue_head to wait on
(lifetime is bound by the file), and the second does a non-blocking
check for the POLL* events.  This allows aio poll to work without
any additional context switches, unlike epoll.

To make the interface fully useful a new io_pgetevents system call is
added, which atomically saves and restores the signal mask over the
io_pgetevents system call.  It is the logical equivalent of pselect and
ppoll for io_pgetevents.

The corresponding libaio changes for io_pgetevents support and
documentation, as well as a test case will be posted in a separate
series.

The changes were sponsored by Scylladb, and improve performance
of the seastar framework up to 10%, while also removing the need
for a privileged SCHED_FIFO epoll listener thread.

The patches are on top of Al's __poll_t annotations, so I've also
prepared a git branch on top of those here:

    git://git.infradead.org/users/hch/vfs.git aio-poll

Gitweb:

    http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/aio-poll

Libaio changes:

    http://git.infradead.org/users/hch/libaio.git/shortlog/refs/heads/aio-poll

Seastar changes:

    https://github.com/avikivity/seastar/commits/aio

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* [PATCH 01/31] fs: update documentation for __poll_t
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/filesystems/Locking | 2 +-
 Documentation/filesystems/vfs.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75d2d57e2c44..220bba28f72b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -439,7 +439,7 @@ prototypes:
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5fd325df59e2..f608180ad59d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,7 @@ struct file_operations {
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox