Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/3] xen: modify xenstore watch event interface
From: Juergen Gross @ 2017-01-06 15:05 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, wei.liu2, netdev, paul.durrant, boris.ostrovsky,
	roger.pau
In-Reply-To: <20170106150544.10836-1-jgross@suse.com>

Today a Xenstore watch event is delivered via a callback function
declared as:

void (*callback)(struct xenbus_watch *,
                 const char **vec, unsigned int len);

As all watch events only ever come with two parameters (path and token)
changing the prototype to:

void (*callback)(struct xenbus_watch *,
                 const char *path, const char *token);

is the natural thing to do.

Apply this change and adapt all users.

Cc: konrad.wilk@oracle.com
Cc: roger.pau@citrix.com
Cc: wei.liu2@citrix.com
Cc: paul.durrant@citrix.com
Cc: netdev@vger.kernel.org

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/block/xen-blkback/xenbus.c         |  6 +++---
 drivers/net/xen-netback/xenbus.c           |  8 ++++----
 drivers/xen/cpu_hotplug.c                  |  5 ++---
 drivers/xen/manage.c                       |  6 +++---
 drivers/xen/xen-balloon.c                  |  2 +-
 drivers/xen/xen-pciback/xenbus.c           |  2 +-
 drivers/xen/xenbus/xenbus.h                |  6 +++---
 drivers/xen/xenbus/xenbus_client.c         |  4 ++--
 drivers/xen/xenbus/xenbus_dev_frontend.c   | 21 ++++++++-------------
 drivers/xen/xenbus/xenbus_probe.c          | 11 ++++-------
 drivers/xen/xenbus/xenbus_probe_backend.c  |  8 ++++----
 drivers/xen/xenbus/xenbus_probe_frontend.c | 14 +++++++-------
 drivers/xen/xenbus/xenbus_xs.c             | 29 ++++++++++++++---------------
 include/xen/xenbus.h                       |  6 +++---
 14 files changed, 59 insertions(+), 69 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 415e79b..8fe61b5 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -38,8 +38,8 @@ struct backend_info {
 static struct kmem_cache *xen_blkif_cachep;
 static void connect(struct backend_info *);
 static int connect_ring(struct backend_info *);
-static void backend_changed(struct xenbus_watch *, const char **,
-			    unsigned int);
+static void backend_changed(struct xenbus_watch *, const char *,
+			    const char *);
 static void xen_blkif_free(struct xen_blkif *blkif);
 static void xen_vbd_free(struct xen_vbd *vbd);
 
@@ -661,7 +661,7 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
  * ready, connect.
  */
 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
 	int err;
 	unsigned major;
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 3124eae..d8a40fa 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -723,7 +723,7 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
 }
 
 static void xen_net_rate_changed(struct xenbus_watch *watch,
-				const char **vec, unsigned int len)
+				 const char *path, const char *token)
 {
 	struct xenvif *vif = container_of(watch, struct xenvif, credit_watch);
 	struct xenbus_device *dev = xenvif_to_xenbus_device(vif);
@@ -780,7 +780,7 @@ static void xen_unregister_credit_watch(struct xenvif *vif)
 }
 
 static void xen_mcast_ctrl_changed(struct xenbus_watch *watch,
-				   const char **vec, unsigned int len)
+				   const char *path, const char *token)
 {
 	struct xenvif *vif = container_of(watch, struct xenvif,
 					  mcast_ctrl_watch);
@@ -855,8 +855,8 @@ static void unregister_hotplug_status_watch(struct backend_info *be)
 }
 
 static void hotplug_status_changed(struct xenbus_watch *watch,
-				   const char **vec,
-				   unsigned int vec_size)
+				   const char *path,
+				   const char *token)
 {
 	struct backend_info *be = container_of(watch,
 					       struct backend_info,
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index 5676aef..7a4daa2 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -68,13 +68,12 @@ static void vcpu_hotplug(unsigned int cpu)
 }
 
 static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
-					const char **vec, unsigned int len)
+				      const char *path, const char *token)
 {
 	unsigned int cpu;
 	char *cpustr;
-	const char *node = vec[XS_WATCH_PATH];
 
-	cpustr = strstr(node, "cpu/");
+	cpustr = strstr(path, "cpu/");
 	if (cpustr != NULL) {
 		sscanf(cpustr, "cpu/%u", &cpu);
 		vcpu_hotplug(cpu);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 26e5e85..ca62c09 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -218,7 +218,7 @@ static struct shutdown_handler shutdown_handlers[] = {
 };
 
 static void shutdown_handler(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
 	char *str;
 	struct xenbus_transaction xbt;
@@ -266,8 +266,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
 }
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
-			  unsigned int len)
+static void sysrq_handler(struct xenbus_watch *watch, const char *path,
+			  const char *token)
 {
 	char sysrq_key = '\0';
 	struct xenbus_transaction xbt;
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index 79865b8..e7715cb 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -55,7 +55,7 @@ static int register_balloon(struct device *dev);
 
 /* React to a change in the target key */
 static void watch_target(struct xenbus_watch *watch,
-			 const char **vec, unsigned int len)
+			 const char *path, const char *token)
 {
 	unsigned long long new_target;
 	int err;
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 3f0aee0..3814b44 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -652,7 +652,7 @@ static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
 }
 
 static void xen_pcibk_be_watch(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			       const char *path, const char *token)
 {
 	struct xen_pcibk_device *pdev =
 	    container_of(watch, struct xen_pcibk_device, be_watch);
diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
index 6a80c1e..bd95c21 100644
--- a/drivers/xen/xenbus/xenbus.h
+++ b/drivers/xen/xenbus/xenbus.h
@@ -39,8 +39,8 @@ struct xen_bus_type {
 	int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
 	int (*probe)(struct xen_bus_type *bus, const char *type,
 		     const char *dir);
-	void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
-				 unsigned int len);
+	void (*otherend_changed)(struct xenbus_watch *watch, const char *path,
+				 const char *token);
 	struct bus_type bus;
 };
 
@@ -83,7 +83,7 @@ int xenbus_dev_resume(struct device *dev);
 int xenbus_dev_cancel(struct device *dev);
 
 void xenbus_otherend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len,
+			     const char *path, const char *token,
 			     int ignore_on_shutdown);
 
 int xenbus_read_otherend_details(struct xenbus_device *xendev,
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 23edf53..9586c24 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(xenbus_strstate);
 int xenbus_watch_path(struct xenbus_device *dev, const char *path,
 		      struct xenbus_watch *watch,
 		      void (*callback)(struct xenbus_watch *,
-				       const char **, unsigned int))
+				       const char *, const char *))
 {
 	int err;
 
@@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path);
 int xenbus_watch_pathfmt(struct xenbus_device *dev,
 			 struct xenbus_watch *watch,
 			 void (*callback)(struct xenbus_watch *,
-					const char **, unsigned int),
+					  const char *, const char *),
 			 const char *pathfmt, ...)
 {
 	int err;
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index e2bc9b3..e4b9847 100644
--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -258,26 +258,23 @@ static struct watch_adapter *alloc_watch_adapter(const char *path,
 }
 
 static void watch_fired(struct xenbus_watch *watch,
-			const char **vec,
-			unsigned int len)
+			const char *path,
+			const char *token)
 {
 	struct watch_adapter *adap;
 	struct xsd_sockmsg hdr;
-	const char *path, *token;
-	int path_len, tok_len, body_len, data_len = 0;
+	const char *token_caller;
+	int path_len, tok_len, body_len;
 	int ret;
 	LIST_HEAD(staging_q);
 
 	adap = container_of(watch, struct watch_adapter, watch);
 
-	path = vec[XS_WATCH_PATH];
-	token = adap->token;
+	token_caller = adap->token;
 
 	path_len = strlen(path) + 1;
-	tok_len = strlen(token) + 1;
-	if (len > 2)
-		data_len = vec[len] - vec[2] + 1;
-	body_len = path_len + tok_len + data_len;
+	tok_len = strlen(token_caller) + 1;
+	body_len = path_len + tok_len;
 
 	hdr.type = XS_WATCH_EVENT;
 	hdr.len = body_len;
@@ -288,9 +285,7 @@ static void watch_fired(struct xenbus_watch *watch,
 	if (!ret)
 		ret = queue_reply(&staging_q, path, path_len);
 	if (!ret)
-		ret = queue_reply(&staging_q, token, tok_len);
-	if (!ret && len > 2)
-		ret = queue_reply(&staging_q, vec[2], data_len);
+		ret = queue_reply(&staging_q, token_caller, tok_len);
 
 	if (!ret) {
 		/* success: pass reply list onto watcher */
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 6baffbb..74888ca 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -169,7 +169,7 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev,
 EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
 
 void xenbus_otherend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len,
+			     const char *path, const char *token,
 			     int ignore_on_shutdown)
 {
 	struct xenbus_device *dev =
@@ -180,18 +180,15 @@ void xenbus_otherend_changed(struct xenbus_watch *watch,
 	/* Protect us against watches firing on old details when the otherend
 	   details change, say immediately after a resume. */
 	if (!dev->otherend ||
-	    strncmp(dev->otherend, vec[XS_WATCH_PATH],
-		    strlen(dev->otherend))) {
-		dev_dbg(&dev->dev, "Ignoring watch at %s\n",
-			vec[XS_WATCH_PATH]);
+	    strncmp(dev->otherend, path, strlen(dev->otherend))) {
+		dev_dbg(&dev->dev, "Ignoring watch at %s\n", path);
 		return;
 	}
 
 	state = xenbus_read_driver_state(dev->otherend);
 
 	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
-		state, xenbus_strstate(state), dev->otherend_watch.node,
-		vec[XS_WATCH_PATH]);
+		state, xenbus_strstate(state), dev->otherend_watch.node, path);
 
 	/*
 	 * Ignore xenbus transitions during shutdown. This prevents us doing
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
index f46b4dc..b0bed4f 100644
--- a/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -181,9 +181,9 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
 }
 
 static void frontend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
-	xenbus_otherend_changed(watch, vec, len, 0);
+	xenbus_otherend_changed(watch, path, token, 0);
 }
 
 static struct xen_bus_type xenbus_backend = {
@@ -204,11 +204,11 @@ static struct xen_bus_type xenbus_backend = {
 };
 
 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
 	DPRINTK("");
 
-	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+	xenbus_dev_changed(path, &xenbus_backend);
 }
 
 static struct xenbus_watch be_watch = {
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
index d7b77a6..19e45ce 100644
--- a/drivers/xen/xenbus/xenbus_probe_frontend.c
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -86,9 +86,9 @@ static int xenbus_uevent_frontend(struct device *_dev,
 
 
 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
-	xenbus_otherend_changed(watch, vec, len, 1);
+	xenbus_otherend_changed(watch, path, token, 1);
 }
 
 static void xenbus_frontend_delayed_resume(struct work_struct *w)
@@ -153,11 +153,11 @@ static struct xen_bus_type xenbus_frontend = {
 };
 
 static void frontend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
 	DPRINTK("");
 
-	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+	xenbus_dev_changed(path, &xenbus_frontend);
 }
 
 
@@ -332,13 +332,13 @@ static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
 static int backend_state;
 
 static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
-					const char **v, unsigned int l)
+					const char *path, const char *token)
 {
-	if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i",
+	if (xenbus_scanf(XBT_NIL, path, "", "%i",
 			 &backend_state) != 1)
 		backend_state = XenbusStateUnknown;
 	printk(KERN_DEBUG "XENBUS: backend %s %s\n",
-			v[XS_WATCH_PATH], xenbus_strstate(backend_state));
+	       path, xenbus_strstate(backend_state));
 	wake_up(&backend_state_wq);
 }
 
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 4c49d87..ebc768f 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -64,8 +64,8 @@ struct xs_stored_msg {
 		/* Queued watch events. */
 		struct {
 			struct xenbus_watch *handle;
-			char **vec;
-			unsigned int vec_size;
+			const char *path;
+			const char *token;
 		} watch;
 	} u;
 };
@@ -765,7 +765,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
 		if (msg->u.watch.handle != watch)
 			continue;
 		list_del(&msg->list);
-		kfree(msg->u.watch.vec);
+		kfree(msg->u.watch.path);
 		kfree(msg);
 	}
 	spin_unlock(&watch_events_lock);
@@ -833,11 +833,10 @@ static int xenwatch_thread(void *unused)
 
 		if (ent != &watch_events) {
 			msg = list_entry(ent, struct xs_stored_msg, list);
-			msg->u.watch.handle->callback(
-				msg->u.watch.handle,
-				(const char **)msg->u.watch.vec,
-				msg->u.watch.vec_size);
-			kfree(msg->u.watch.vec);
+			msg->u.watch.handle->callback(msg->u.watch.handle,
+						      msg->u.watch.path,
+						      msg->u.watch.token);
+			kfree(msg->u.watch.path);
 			kfree(msg);
 		}
 
@@ -903,24 +902,24 @@ static int process_msg(void)
 	body[msg->hdr.len] = '\0';
 
 	if (msg->hdr.type == XS_WATCH_EVENT) {
-		msg->u.watch.vec = split(body, msg->hdr.len,
-					 &msg->u.watch.vec_size);
-		if (IS_ERR(msg->u.watch.vec)) {
-			err = PTR_ERR(msg->u.watch.vec);
+		if (count_strings(body, msg->hdr.len) != 2) {
+			err = -EINVAL;
 			kfree(msg);
+			kfree(body);
 			goto out;
 		}
+		msg->u.watch.path = (const char *)body;
+		msg->u.watch.token = (const char *)strchr(body, '\0') + 1;
 
 		spin_lock(&watches_lock);
-		msg->u.watch.handle = find_watch(
-			msg->u.watch.vec[XS_WATCH_TOKEN]);
+		msg->u.watch.handle = find_watch(msg->u.watch.token);
 		if (msg->u.watch.handle != NULL) {
 			spin_lock(&watch_events_lock);
 			list_add_tail(&msg->list, &watch_events);
 			wake_up(&watch_events_waitq);
 			spin_unlock(&watch_events_lock);
 		} else {
-			kfree(msg->u.watch.vec);
+			kfree(body);
 			kfree(msg);
 		}
 		spin_unlock(&watches_lock);
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 98f73a2..869c816 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -61,7 +61,7 @@ struct xenbus_watch
 
 	/* Callback (executed in a process context with no locks held). */
 	void (*callback)(struct xenbus_watch *,
-			 const char **vec, unsigned int len);
+			 const char *path, const char *token);
 };
 
 
@@ -193,11 +193,11 @@ void xenbus_probe(struct work_struct *);
 int xenbus_watch_path(struct xenbus_device *dev, const char *path,
 		      struct xenbus_watch *watch,
 		      void (*callback)(struct xenbus_watch *,
-				       const char **, unsigned int));
+				       const char *, const char *));
 __printf(4, 5)
 int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
 			 void (*callback)(struct xenbus_watch *,
-					  const char **, unsigned int),
+					  const char *, const char *),
 			 const char *pathfmt, ...);
 
 int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
-- 
2.10.2


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply related

* Re: [PATCH iproute2 2/3] ip vrf: Improve cgroup2 error messages
From: David Ahern @ 2017-01-06 15:05 UTC (permalink / raw)
  To: Sergei Shtylyov, netdev, stephen
In-Reply-To: <f9858cd0-277b-02cd-35dc-ffc24862f736@cogentembedded.com>

>> @@ -80,13 +80,21 @@ char *find_cgroup2_mount(void)
>>
>>      if (mount("none", mnt, CGROUP2_FS_NAME, 0, NULL)) {
>>          /* EBUSY means already mounted */
>> -        if (errno != EBUSY) {
>> +        if (errno == EBUSY)
>> +            goto out;
>> +
>> +        if (errno == ENODEV) {
>>              fprintf(stderr,
>>                  "Failed to mount cgroup2. Are CGROUPS enabled in your kernel?\n");
>> -            free(mnt);
>> -            return NULL;
>> +        } else {
>> +            fprintf(stderr,
>> +                "Failed to mount cgroup2: %s\n",
>> +                strerror(errno));
>>          }
> 
>    How about a *switch* instead?

I did consider it. Did not make the code simpler or easier to read.

^ permalink raw reply

* [PATCH net-next] net: ipv6: put autoconf routes into per-interface tables
From: Lorenzo Colitti @ 2017-01-06 15:30 UTC (permalink / raw)
  To: netdev
  Cc: zenczykowski, hannes, ek, hideaki.yoshifuji, davem, dsa, drosen,
	Lorenzo Colitti

Currently, IPv6 router discovery always puts routes into
RT6_TABLE_MAIN. This makes it difficult to maintain and switch
between multiple simultaneous network connections (e.g., wifi
and wired).

To work around this connection managers typically either move
autoconfiguration to userspace entirely (e.g., dhcpcd) or take
the routes they want and re-add them to the main table as static
routes with low metrics (e.g., NetworkManager). This puts the
burden on the connection manager to watch netlink or listen to
RAs to see if the routes have changed, delete the routes when
their lifetime expires, etc. This is complex and often not
implemented correctly.

This patch adds a per-interface sysctl to have the kernel put
autoconf routes into different tables. This allows each interface
to have its own routing table if desired.  Choosing the default
interface, or using different interfaces at the same time on a
per-socket or per-packet basis) can be done using policy routing
mechanisms that use as SO_BINDTODEVICE / IPV6_PKTINFO, mark-based
routing, or UID-based routing to select specific routing tables.

The sysctl behaves as follows:

- = 0: default. Put routes into RT6_TABLE_MAIN if the interface
       is not in a VRF, or into the VRF table if it is.
- > 0: manual. Put routes into the specified table.
- < 0: automatic. Add the absolute value of the sysctl to the
       device's ifindex, and use that table.

The automatic mode is most useful in conjunction with
net.ipv6.conf.default.accept_ra_rt_table. A connection manager
or distribution can set this to, say, -1000 on boot, and
thereafter know that routes received on every interface will
always be in that interface's routing table, and that the mapping
between interfaces and routing tables is deterministic. It also
ensures that if an interface is created and immediately receives
an RA, the route will go into the correct routing table without
needing any intervention from userspace.

The automatic mode (with conf.default.accept_ra_rt_table = -1000)
has been used in Android since 5.0.

Tested: compiles allnoconfig, allyesconfig, allmodconfig
Tested: passes existing Android kernel unit tests
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
---
 Documentation/networking/ip-sysctl.txt | 13 +++++++++++
 include/linux/ipv6.h                   |  1 +
 include/net/addrconf.h                 |  2 ++
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 40 +++++++++++++++++++++++++++++++---
 net/ipv6/route.c                       | 11 +++++-----
 6 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7dd65c9cf7..d1311d8f33 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1471,6 +1471,19 @@ accept_ra_rt_info_max_plen - INTEGER
 	Functional default: 0 if accept_ra_rtr_pref is enabled.
 			    -1 if accept_ra_rtr_pref is disabled.
 
+accept_ra_rt_table - INTEGER
+	Which table to put routes created by Router Advertisements into.
+
+	= 0: Use the main table if the device is not in a VRF, and the
+	     VRF table if it is.
+	> 0: Use the specified table.
+	< 0: Add the absolute value to the receiving interface index,
+	     and use that table. For example, if set to -1000, an RA
+	     received on interface index 4 will create routes in
+	     table 1004.
+
+	Default: 0
+
 accept_ra_rtr_pref - BOOLEAN
 	Accept Router Preference in RA.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 671d014e64..55d75074aa 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -69,6 +69,7 @@ struct ipv6_devconf {
 	__s32		seg6_require_hmac;
 #endif
 	__u32		enhanced_dad;
+	__s32		accept_ra_rt_table;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8f998afc13..e1bd2bc027 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -242,6 +242,8 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
 void addrconf_prefix_rcv(struct net_device *dev,
 			 u8 *opt, int len, bool sllao);
 
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table);
+
 /*
  *	anycast prototypes (anycast.c)
  */
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index eaf65dc82e..95c3553242 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -182,6 +182,7 @@ enum {
 	DEVCONF_SEG6_ENABLED,
 	DEVCONF_SEG6_REQUIRE_HMAC,
 	DEVCONF_ENHANCED_DAD,
+	DEVCONF_ACCEPT_RA_RT_TABLE,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c1e124bc8e..d4a6b877f8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,6 +243,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.accept_ra_rt_table	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -294,6 +295,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.accept_ra_rt_table	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -2210,6 +2212,30 @@ static void  ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
 		ipv6_regen_rndid(idev);
 }
 
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table)
+{
+	struct inet6_dev *idev = in6_dev_get(dev);
+	u32 table;
+	int sysctl = idev->cnf.accept_ra_rt_table;
+
+	if (sysctl == 0)
+		table = l3mdev_fib_table(dev) ? : default_table;
+	else if (sysctl > 0)
+		table = (u32)sysctl;
+	else
+		table = (unsigned int)dev->ifindex + (-sysctl);
+
+	in6_dev_put(idev);
+	return table;
+}
+#else
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table)
+{
+	return RT6_TABLE_DFLT;
+}
+#endif
+
 /*
  *	Add prefix route.
  */
@@ -2219,7 +2245,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 		      unsigned long expires, u32 flags)
 {
 	struct fib6_config cfg = {
-		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
+		.fc_table = addrconf_rt_table(dev, RT6_TABLE_PREFIX),
 		.fc_metric = IP6_RT_PRIO_ADDRCONF,
 		.fc_ifindex = dev->ifindex,
 		.fc_expires = expires,
@@ -2252,9 +2278,9 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 	struct fib6_node *fn;
 	struct rt6_info *rt = NULL;
 	struct fib6_table *table;
-	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
 
-	table = fib6_get_table(dev_net(dev), tb_id);
+	table = fib6_get_table(dev_net(dev),
+			       addrconf_rt_table(dev, RT6_TABLE_PREFIX));
 	if (!table)
 		return NULL;
 
@@ -4975,6 +5001,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
 #endif
 	array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
+	array[DEVCONF_ACCEPT_RA_RT_TABLE] = cnf->accept_ra_rt_table;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -6090,6 +6117,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.proc_handler   = proc_dointvec,
 	},
 	{
+		.procname	= "accept_ra_rt_table",
+		.data		= &ipv6_devconf.accept_ra_rt_table,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		/* sentinel */
 	}
 };
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8417c41d8e..86469ec27f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2345,13 +2345,12 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *gwaddr,
 					   struct net_device *dev)
 {
-	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
 	int ifindex = dev->ifindex;
 	struct fib6_node *fn;
 	struct rt6_info *rt = NULL;
 	struct fib6_table *table;
 
-	table = fib6_get_table(net, tb_id);
+	table = fib6_get_table(net, addrconf_rt_table(dev, RT6_TABLE_INFO));
 	if (!table)
 		return NULL;
 
@@ -2392,7 +2391,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 		.fc_nlinfo.nl_net = net,
 	};
 
-	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
+	cfg.fc_table = addrconf_rt_table(dev, RT6_TABLE_INFO);
 	cfg.fc_dst = *prefix;
 	cfg.fc_gateway = *gwaddr;
 
@@ -2408,11 +2407,11 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 
 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
 {
-	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
 	struct rt6_info *rt;
 	struct fib6_table *table;
 
-	table = fib6_get_table(dev_net(dev), tb_id);
+	table = fib6_get_table(dev_net(dev),
+			       addrconf_rt_table(dev, RT6_TABLE_DFLT));
 	if (!table)
 		return NULL;
 
@@ -2434,7 +2433,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
 				     unsigned int pref)
 {
 	struct fib6_config cfg = {
-		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
+		.fc_table	= addrconf_rt_table(dev, RT6_TABLE_DFLT),
 		.fc_metric	= IP6_RT_PRIO_USER,
 		.fc_ifindex	= dev->ifindex,
 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
-- 
2.11.0.390.gc69c2f50cf-goog

^ permalink raw reply related

* RE: [PATCH 2/3] xen: modify xenstore watch event interface
From: Paul Durrant @ 2017-01-06 15:38 UTC (permalink / raw)
  To: Juergen Gross, linux-kernel@vger.kernel.org,
	xen-devel@lists.xenproject.org
  Cc: boris.ostrovsky@oracle.com, konrad.wilk@oracle.com,
	Roger Pau Monne, Wei Liu, netdev@vger.kernel.org
In-Reply-To: <20170106150544.10836-3-jgross@suse.com>

> -----Original Message-----
> From: Juergen Gross [mailto:jgross@suse.com]
> Sent: 06 January 2017 15:06
> To: linux-kernel@vger.kernel.org; xen-devel@lists.xenproject.org
> Cc: boris.ostrovsky@oracle.com; Juergen Gross <jgross@suse.com>;
> konrad.wilk@oracle.com; Roger Pau Monne <roger.pau@citrix.com>; Wei Liu
> <wei.liu2@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com>;
> netdev@vger.kernel.org
> Subject: [PATCH 2/3] xen: modify xenstore watch event interface
> 
> Today a Xenstore watch event is delivered via a callback function
> declared as:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char **vec, unsigned int len);
> 
> As all watch events only ever come with two parameters (path and token)
> changing the prototype to:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char *path, const char *token);
> 
> is the natural thing to do.
> 
> Apply this change and adapt all users.
> 
> Cc: konrad.wilk@oracle.com
> Cc: roger.pau@citrix.com
> Cc: wei.liu2@citrix.com
> Cc: paul.durrant@citrix.com
> Cc: netdev@vger.kernel.org
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

xen-netback changes...

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
>  drivers/block/xen-blkback/xenbus.c         |  6 +++---
>  drivers/net/xen-netback/xenbus.c           |  8 ++++----
>  drivers/xen/cpu_hotplug.c                  |  5 ++---
>  drivers/xen/manage.c                       |  6 +++---
>  drivers/xen/xen-balloon.c                  |  2 +-
>  drivers/xen/xen-pciback/xenbus.c           |  2 +-
>  drivers/xen/xenbus/xenbus.h                |  6 +++---
>  drivers/xen/xenbus/xenbus_client.c         |  4 ++--
>  drivers/xen/xenbus/xenbus_dev_frontend.c   | 21 ++++++++-------------
>  drivers/xen/xenbus/xenbus_probe.c          | 11 ++++-------
>  drivers/xen/xenbus/xenbus_probe_backend.c  |  8 ++++----
>  drivers/xen/xenbus/xenbus_probe_frontend.c | 14 +++++++-------
>  drivers/xen/xenbus/xenbus_xs.c             | 29 ++++++++++++++---------------
>  include/xen/xenbus.h                       |  6 +++---
>  14 files changed, 59 insertions(+), 69 deletions(-)
> 
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-
> blkback/xenbus.c
> index 415e79b..8fe61b5 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -38,8 +38,8 @@ struct backend_info {
>  static struct kmem_cache *xen_blkif_cachep;
>  static void connect(struct backend_info *);
>  static int connect_ring(struct backend_info *);
> -static void backend_changed(struct xenbus_watch *, const char **,
> -			    unsigned int);
> +static void backend_changed(struct xenbus_watch *, const char *,
> +			    const char *);
>  static void xen_blkif_free(struct xen_blkif *blkif);
>  static void xen_vbd_free(struct xen_vbd *vbd);
> 
> @@ -661,7 +661,7 @@ static int xen_blkbk_probe(struct xenbus_device
> *dev,
>   * ready, connect.
>   */
>  static void backend_changed(struct xenbus_watch *watch,
> -			    const char **vec, unsigned int len)
> +			    const char *path, const char *token)
>  {
>  	int err;
>  	unsigned major;
> diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-
> netback/xenbus.c
> index 3124eae..d8a40fa 100644
> --- a/drivers/net/xen-netback/xenbus.c
> +++ b/drivers/net/xen-netback/xenbus.c
> @@ -723,7 +723,7 @@ static int xen_net_read_mac(struct xenbus_device
> *dev, u8 mac[])
>  }
> 
>  static void xen_net_rate_changed(struct xenbus_watch *watch,
> -				const char **vec, unsigned int len)
> +				 const char *path, const char *token)
>  {
>  	struct xenvif *vif = container_of(watch, struct xenvif, credit_watch);
>  	struct xenbus_device *dev = xenvif_to_xenbus_device(vif);
> @@ -780,7 +780,7 @@ static void xen_unregister_credit_watch(struct xenvif
> *vif)
>  }
> 
>  static void xen_mcast_ctrl_changed(struct xenbus_watch *watch,
> -				   const char **vec, unsigned int len)
> +				   const char *path, const char *token)
>  {
>  	struct xenvif *vif = container_of(watch, struct xenvif,
>  					  mcast_ctrl_watch);
> @@ -855,8 +855,8 @@ static void unregister_hotplug_status_watch(struct
> backend_info *be)
>  }
> 
>  static void hotplug_status_changed(struct xenbus_watch *watch,
> -				   const char **vec,
> -				   unsigned int vec_size)
> +				   const char *path,
> +				   const char *token)
>  {
>  	struct backend_info *be = container_of(watch,
>  					       struct backend_info,
> diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
> index 5676aef..7a4daa2 100644
> --- a/drivers/xen/cpu_hotplug.c
> +++ b/drivers/xen/cpu_hotplug.c
> @@ -68,13 +68,12 @@ static void vcpu_hotplug(unsigned int cpu)
>  }
> 
>  static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
> -					const char **vec, unsigned int len)
> +				      const char *path, const char *token)
>  {
>  	unsigned int cpu;
>  	char *cpustr;
> -	const char *node = vec[XS_WATCH_PATH];
> 
> -	cpustr = strstr(node, "cpu/");
> +	cpustr = strstr(path, "cpu/");
>  	if (cpustr != NULL) {
>  		sscanf(cpustr, "cpu/%u", &cpu);
>  		vcpu_hotplug(cpu);
> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
> index 26e5e85..ca62c09 100644
> --- a/drivers/xen/manage.c
> +++ b/drivers/xen/manage.c
> @@ -218,7 +218,7 @@ static struct shutdown_handler shutdown_handlers[]
> = {
>  };
> 
>  static void shutdown_handler(struct xenbus_watch *watch,
> -			     const char **vec, unsigned int len)
> +			     const char *path, const char *token)
>  {
>  	char *str;
>  	struct xenbus_transaction xbt;
> @@ -266,8 +266,8 @@ static void shutdown_handler(struct xenbus_watch
> *watch,
>  }
> 
>  #ifdef CONFIG_MAGIC_SYSRQ
> -static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
> -			  unsigned int len)
> +static void sysrq_handler(struct xenbus_watch *watch, const char *path,
> +			  const char *token)
>  {
>  	char sysrq_key = '\0';
>  	struct xenbus_transaction xbt;
> diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
> index 79865b8..e7715cb 100644
> --- a/drivers/xen/xen-balloon.c
> +++ b/drivers/xen/xen-balloon.c
> @@ -55,7 +55,7 @@ static int register_balloon(struct device *dev);
> 
>  /* React to a change in the target key */
>  static void watch_target(struct xenbus_watch *watch,
> -			 const char **vec, unsigned int len)
> +			 const char *path, const char *token)
>  {
>  	unsigned long long new_target;
>  	int err;
> diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-
> pciback/xenbus.c
> index 3f0aee0..3814b44 100644
> --- a/drivers/xen/xen-pciback/xenbus.c
> +++ b/drivers/xen/xen-pciback/xenbus.c
> @@ -652,7 +652,7 @@ static int xen_pcibk_setup_backend(struct
> xen_pcibk_device *pdev)
>  }
> 
>  static void xen_pcibk_be_watch(struct xenbus_watch *watch,
> -			     const char **vec, unsigned int len)
> +			       const char *path, const char *token)
>  {
>  	struct xen_pcibk_device *pdev =
>  	    container_of(watch, struct xen_pcibk_device, be_watch);
> diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
> index 6a80c1e..bd95c21 100644
> --- a/drivers/xen/xenbus/xenbus.h
> +++ b/drivers/xen/xenbus/xenbus.h
> @@ -39,8 +39,8 @@ struct xen_bus_type {
>  	int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char
> *nodename);
>  	int (*probe)(struct xen_bus_type *bus, const char *type,
>  		     const char *dir);
> -	void (*otherend_changed)(struct xenbus_watch *watch, const char
> **vec,
> -				 unsigned int len);
> +	void (*otherend_changed)(struct xenbus_watch *watch, const char
> *path,
> +				 const char *token);
>  	struct bus_type bus;
>  };
> 
> @@ -83,7 +83,7 @@ int xenbus_dev_resume(struct device *dev);
>  int xenbus_dev_cancel(struct device *dev);
> 
>  void xenbus_otherend_changed(struct xenbus_watch *watch,
> -			     const char **vec, unsigned int len,
> +			     const char *path, const char *token,
>  			     int ignore_on_shutdown);
> 
>  int xenbus_read_otherend_details(struct xenbus_device *xendev,
> diff --git a/drivers/xen/xenbus/xenbus_client.c
> b/drivers/xen/xenbus/xenbus_client.c
> index 23edf53..9586c24 100644
> --- a/drivers/xen/xenbus/xenbus_client.c
> +++ b/drivers/xen/xenbus/xenbus_client.c
> @@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(xenbus_strstate);
>  int xenbus_watch_path(struct xenbus_device *dev, const char *path,
>  		      struct xenbus_watch *watch,
>  		      void (*callback)(struct xenbus_watch *,
> -				       const char **, unsigned int))
> +				       const char *, const char *))
>  {
>  	int err;
> 
> @@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path);
>  int xenbus_watch_pathfmt(struct xenbus_device *dev,
>  			 struct xenbus_watch *watch,
>  			 void (*callback)(struct xenbus_watch *,
> -					const char **, unsigned int),
> +					  const char *, const char *),
>  			 const char *pathfmt, ...)
>  {
>  	int err;
> diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c
> b/drivers/xen/xenbus/xenbus_dev_frontend.c
> index e2bc9b3..e4b9847 100644
> --- a/drivers/xen/xenbus/xenbus_dev_frontend.c
> +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
> @@ -258,26 +258,23 @@ static struct watch_adapter
> *alloc_watch_adapter(const char *path,
>  }
> 
>  static void watch_fired(struct xenbus_watch *watch,
> -			const char **vec,
> -			unsigned int len)
> +			const char *path,
> +			const char *token)
>  {
>  	struct watch_adapter *adap;
>  	struct xsd_sockmsg hdr;
> -	const char *path, *token;
> -	int path_len, tok_len, body_len, data_len = 0;
> +	const char *token_caller;
> +	int path_len, tok_len, body_len;
>  	int ret;
>  	LIST_HEAD(staging_q);
> 
>  	adap = container_of(watch, struct watch_adapter, watch);
> 
> -	path = vec[XS_WATCH_PATH];
> -	token = adap->token;
> +	token_caller = adap->token;
> 
>  	path_len = strlen(path) + 1;
> -	tok_len = strlen(token) + 1;
> -	if (len > 2)
> -		data_len = vec[len] - vec[2] + 1;
> -	body_len = path_len + tok_len + data_len;
> +	tok_len = strlen(token_caller) + 1;
> +	body_len = path_len + tok_len;
> 
>  	hdr.type = XS_WATCH_EVENT;
>  	hdr.len = body_len;
> @@ -288,9 +285,7 @@ static void watch_fired(struct xenbus_watch *watch,
>  	if (!ret)
>  		ret = queue_reply(&staging_q, path, path_len);
>  	if (!ret)
> -		ret = queue_reply(&staging_q, token, tok_len);
> -	if (!ret && len > 2)
> -		ret = queue_reply(&staging_q, vec[2], data_len);
> +		ret = queue_reply(&staging_q, token_caller, tok_len);
> 
>  	if (!ret) {
>  		/* success: pass reply list onto watcher */
> diff --git a/drivers/xen/xenbus/xenbus_probe.c
> b/drivers/xen/xenbus/xenbus_probe.c
> index 6baffbb..74888ca 100644
> --- a/drivers/xen/xenbus/xenbus_probe.c
> +++ b/drivers/xen/xenbus/xenbus_probe.c
> @@ -169,7 +169,7 @@ int xenbus_read_otherend_details(struct
> xenbus_device *xendev,
>  EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
> 
>  void xenbus_otherend_changed(struct xenbus_watch *watch,
> -			     const char **vec, unsigned int len,
> +			     const char *path, const char *token,
>  			     int ignore_on_shutdown)
>  {
>  	struct xenbus_device *dev =
> @@ -180,18 +180,15 @@ void xenbus_otherend_changed(struct
> xenbus_watch *watch,
>  	/* Protect us against watches firing on old details when the otherend
>  	   details change, say immediately after a resume. */
>  	if (!dev->otherend ||
> -	    strncmp(dev->otherend, vec[XS_WATCH_PATH],
> -		    strlen(dev->otherend))) {
> -		dev_dbg(&dev->dev, "Ignoring watch at %s\n",
> -			vec[XS_WATCH_PATH]);
> +	    strncmp(dev->otherend, path, strlen(dev->otherend))) {
> +		dev_dbg(&dev->dev, "Ignoring watch at %s\n", path);
>  		return;
>  	}
> 
>  	state = xenbus_read_driver_state(dev->otherend);
> 
>  	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
> -		state, xenbus_strstate(state), dev->otherend_watch.node,
> -		vec[XS_WATCH_PATH]);
> +		state, xenbus_strstate(state), dev->otherend_watch.node,
> path);
> 
>  	/*
>  	 * Ignore xenbus transitions during shutdown. This prevents us doing
> diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c
> b/drivers/xen/xenbus/xenbus_probe_backend.c
> index f46b4dc..b0bed4f 100644
> --- a/drivers/xen/xenbus/xenbus_probe_backend.c
> +++ b/drivers/xen/xenbus/xenbus_probe_backend.c
> @@ -181,9 +181,9 @@ static int xenbus_probe_backend(struct
> xen_bus_type *bus, const char *type,
>  }
> 
>  static void frontend_changed(struct xenbus_watch *watch,
> -			    const char **vec, unsigned int len)
> +			     const char *path, const char *token)
>  {
> -	xenbus_otherend_changed(watch, vec, len, 0);
> +	xenbus_otherend_changed(watch, path, token, 0);
>  }
> 
>  static struct xen_bus_type xenbus_backend = {
> @@ -204,11 +204,11 @@ static struct xen_bus_type xenbus_backend = {
>  };
> 
>  static void backend_changed(struct xenbus_watch *watch,
> -			    const char **vec, unsigned int len)
> +			    const char *path, const char *token)
>  {
>  	DPRINTK("");
> 
> -	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
> +	xenbus_dev_changed(path, &xenbus_backend);
>  }
> 
>  static struct xenbus_watch be_watch = {
> diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c
> b/drivers/xen/xenbus/xenbus_probe_frontend.c
> index d7b77a6..19e45ce 100644
> --- a/drivers/xen/xenbus/xenbus_probe_frontend.c
> +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
> @@ -86,9 +86,9 @@ static int xenbus_uevent_frontend(struct device *_dev,
> 
> 
>  static void backend_changed(struct xenbus_watch *watch,
> -			    const char **vec, unsigned int len)
> +			    const char *path, const char *token)
>  {
> -	xenbus_otherend_changed(watch, vec, len, 1);
> +	xenbus_otherend_changed(watch, path, token, 1);
>  }
> 
>  static void xenbus_frontend_delayed_resume(struct work_struct *w)
> @@ -153,11 +153,11 @@ static struct xen_bus_type xenbus_frontend = {
>  };
> 
>  static void frontend_changed(struct xenbus_watch *watch,
> -			     const char **vec, unsigned int len)
> +			     const char *path, const char *token)
>  {
>  	DPRINTK("");
> 
> -	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
> +	xenbus_dev_changed(path, &xenbus_frontend);
>  }
> 
> 
> @@ -332,13 +332,13 @@ static
> DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
>  static int backend_state;
> 
>  static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
> -					const char **v, unsigned int l)
> +					const char *path, const char *token)
>  {
> -	if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i",
> +	if (xenbus_scanf(XBT_NIL, path, "", "%i",
>  			 &backend_state) != 1)
>  		backend_state = XenbusStateUnknown;
>  	printk(KERN_DEBUG "XENBUS: backend %s %s\n",
> -			v[XS_WATCH_PATH],
> xenbus_strstate(backend_state));
> +	       path, xenbus_strstate(backend_state));
>  	wake_up(&backend_state_wq);
>  }
> 
> diff --git a/drivers/xen/xenbus/xenbus_xs.c
> b/drivers/xen/xenbus/xenbus_xs.c
> index 4c49d87..ebc768f 100644
> --- a/drivers/xen/xenbus/xenbus_xs.c
> +++ b/drivers/xen/xenbus/xenbus_xs.c
> @@ -64,8 +64,8 @@ struct xs_stored_msg {
>  		/* Queued watch events. */
>  		struct {
>  			struct xenbus_watch *handle;
> -			char **vec;
> -			unsigned int vec_size;
> +			const char *path;
> +			const char *token;
>  		} watch;
>  	} u;
>  };
> @@ -765,7 +765,7 @@ void unregister_xenbus_watch(struct xenbus_watch
> *watch)
>  		if (msg->u.watch.handle != watch)
>  			continue;
>  		list_del(&msg->list);
> -		kfree(msg->u.watch.vec);
> +		kfree(msg->u.watch.path);
>  		kfree(msg);
>  	}
>  	spin_unlock(&watch_events_lock);
> @@ -833,11 +833,10 @@ static int xenwatch_thread(void *unused)
> 
>  		if (ent != &watch_events) {
>  			msg = list_entry(ent, struct xs_stored_msg, list);
> -			msg->u.watch.handle->callback(
> -				msg->u.watch.handle,
> -				(const char **)msg->u.watch.vec,
> -				msg->u.watch.vec_size);
> -			kfree(msg->u.watch.vec);
> +			msg->u.watch.handle->callback(msg-
> >u.watch.handle,
> +						      msg->u.watch.path,
> +						      msg->u.watch.token);
> +			kfree(msg->u.watch.path);
>  			kfree(msg);
>  		}
> 
> @@ -903,24 +902,24 @@ static int process_msg(void)
>  	body[msg->hdr.len] = '\0';
> 
>  	if (msg->hdr.type == XS_WATCH_EVENT) {
> -		msg->u.watch.vec = split(body, msg->hdr.len,
> -					 &msg->u.watch.vec_size);
> -		if (IS_ERR(msg->u.watch.vec)) {
> -			err = PTR_ERR(msg->u.watch.vec);
> +		if (count_strings(body, msg->hdr.len) != 2) {
> +			err = -EINVAL;
>  			kfree(msg);
> +			kfree(body);
>  			goto out;
>  		}
> +		msg->u.watch.path = (const char *)body;
> +		msg->u.watch.token = (const char *)strchr(body, '\0') + 1;
> 
>  		spin_lock(&watches_lock);
> -		msg->u.watch.handle = find_watch(
> -			msg->u.watch.vec[XS_WATCH_TOKEN]);
> +		msg->u.watch.handle = find_watch(msg->u.watch.token);
>  		if (msg->u.watch.handle != NULL) {
>  			spin_lock(&watch_events_lock);
>  			list_add_tail(&msg->list, &watch_events);
>  			wake_up(&watch_events_waitq);
>  			spin_unlock(&watch_events_lock);
>  		} else {
> -			kfree(msg->u.watch.vec);
> +			kfree(body);
>  			kfree(msg);
>  		}
>  		spin_unlock(&watches_lock);
> diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
> index 98f73a2..869c816 100644
> --- a/include/xen/xenbus.h
> +++ b/include/xen/xenbus.h
> @@ -61,7 +61,7 @@ struct xenbus_watch
> 
>  	/* Callback (executed in a process context with no locks held). */
>  	void (*callback)(struct xenbus_watch *,
> -			 const char **vec, unsigned int len);
> +			 const char *path, const char *token);
>  };
> 
> 
> @@ -193,11 +193,11 @@ void xenbus_probe(struct work_struct *);
>  int xenbus_watch_path(struct xenbus_device *dev, const char *path,
>  		      struct xenbus_watch *watch,
>  		      void (*callback)(struct xenbus_watch *,
> -				       const char **, unsigned int));
> +				       const char *, const char *));
>  __printf(4, 5)
>  int xenbus_watch_pathfmt(struct xenbus_device *dev, struct
> xenbus_watch *watch,
>  			 void (*callback)(struct xenbus_watch *,
> -					  const char **, unsigned int),
> +					  const char *, const char *),
>  			 const char *pathfmt, ...);
> 
>  int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state
> new_state);
> --
> 2.10.2

^ permalink raw reply

* Re: [PATCH net-next] net:add one common config ARCH_WANT_RELAX_ORDER to support relax ordering.
From: Alexander Duyck @ 2017-01-06 15:41 UTC (permalink / raw)
  To: Mao Wenan; +Cc: Netdev, Jeff Kirsher
In-Reply-To: <1483696364-8680-1-git-send-email-maowenan@huawei.com>

On Fri, Jan 6, 2017 at 1:52 AM, Mao Wenan <maowenan@huawei.com> wrote:
> Relax ordering(RO) is one feature of 82599 NIC, to enable this feature can
> enhance the performance for some cpu architecure, such as SPARC and so on.
> Currently it only supports one special cpu architecture(SPARC) in 82599
> driver to enable RO feature, this is not very common for other cpu architecture
> which really needs RO feature.
> This patch add one common config CONFIG_ARCH_WANT_RELAX_ORDER to set RO feature,
> and should define CONFIG_ARCH_WANT_RELAX_ORDER in sparc Kconfig firstly.
>
> Signed-off-by: Mao Wenan <maowenan@huawei.com>
> ---
>  arch/sparc/Kconfig                              | 1 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
>  2 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index cf4034c..68ac5c7 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -44,6 +44,7 @@ config SPARC
>         select CPU_NO_EFFICIENT_FFS
>         select HAVE_ARCH_HARDENED_USERCOPY
>         select PROVE_LOCKING_SMALL if PROVE_LOCKING
> +       select ARCH_WANT_RELAX_ORDER
>
>  config SPARC32
>         def_bool !64BIT


I'm pretty sure this is incomplete.  I think you need to add a couple
lines to arch/Kconfig so that the config option itself is listed
somewhere.  You might look at using something like HAVE_CMPXCHG_DOUBLE
as an example.

- Alex

^ permalink raw reply

* RE: [PATCHv3 net-next] sctp: prepare asoc stream for stream reconf
From: David Laight @ 2017-01-06 15:50 UTC (permalink / raw)
  To: 'Xin Long', network dev, linux-sctp@vger.kernel.org
  Cc: Marcelo Ricardo Leitner, Neil Horman, Vlad Yasevich,
	davem@davemloft.net
In-Reply-To: <efd6462731ca0b18a3039f9537dda61e0ed72430.1483712313.git.lucien.xin@gmail.com>

From: Xin Long
> Sent: 06 January 2017 14:19
> sctp stream reconf, described in RFC 6525, needs a structure to
> save per stream information in assoc, like stream state.
> 
> In the future, sctp stream scheduler also needs it to save some
> stream scheduler params and queues.
> 
> This patchset is to prepare the stream array in assoc for stream
> reconf. It defines sctp_stream that includes stream arrays inside
> to replace ssnmap.
> 
> Note that we use different structures for IN and OUT streams, as
> the members in per OUT stream will get more and more different
> from per IN stream.
...
>  /* What is the current SSN number for this stream? */
> -static inline __u16 sctp_ssn_peek(struct sctp_stream *stream, __u16 id)
> -{
> -	return stream->ssn[id];
> -}
> +#define sctp_ssn_peek(stream, type, sid) \
> +	((stream)->type[sid].ssn)
> 
>  /* Return the next SSN number for this stream.	*/
> -static inline __u16 sctp_ssn_next(struct sctp_stream *stream, __u16 id)
> -{
> -	return stream->ssn[id]++;
> -}
> +#define sctp_ssn_next(stream, type, sid) \
> +	((stream)->type[sid].ssn++)
> 
>  /* Skip over this ssn and all below. */
> -static inline void sctp_ssn_skip(struct sctp_stream *stream, __u16 id,
> -				 __u16 ssn)
> -{
> -	stream->ssn[id] = ssn+1;
> -}
> -
> +#define sctp_ssn_skip(stream, type, sid, ssn) \
> +	((stream)->type[sid].ssn = ssn + 1)
...

Is there any reason to convert these from inline functions to #defines?
Inline functions give better type checking and are usually preferred.

	David

^ permalink raw reply

* RE: [PATCH v2] net: stmmac: fix maxmtu assignment to be within valid range
From: Kweh, Hock Leong @ 2017-01-06 15:55 UTC (permalink / raw)
  To: David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe@st.com, Jarod Wilson, Andy Shevchenko
  Cc: Alexandre TORGUE, Joachim Eastwood, Niklas Cassel, Johan Hovold,
	pavel@ucw.cz, lars.persson@axis.com, netdev, LKML
In-Reply-To: <1483697306-10063-1-git-send-email-hock.leong.kweh@intel.com>

> -----Original Message-----
> From: Kweh, Hock Leong
> Sent: Friday, January 06, 2017 6:08 PM
> To: David S. Miller <davem@davemloft.net>; Joao Pinto
> <Joao.Pinto@synopsys.com>; Giuseppe CAVALLARO <peppe.cavallaro@st.com>;
> seraphin.bonnaffe@st.com; Jarod Wilson <jarod@redhat.com>; Andy
> Shevchenko <andy.shevchenko@gmail.com>
> Cc: Alexandre TORGUE <alexandre.torgue@gmail.com>; Joachim Eastwood
> <manabian@gmail.com>; Niklas Cassel <niklas.cassel@axis.com>; Johan Hovold
> <johan@kernel.org>; pavel@ucw.cz; Kweh, Hock Leong
> <hock.leong.kweh@intel.com>; lars.persson@axis.com; netdev
> <netdev@vger.kernel.org>; LKML <linux-kernel@vger.kernel.org>
> Subject: [PATCH v2] net: stmmac: fix maxmtu assignment to be within valid
> range
> 
> From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>
> 
> There is no checking valid value of maxmtu when getting it from device tree.
> This resolution added the checking condition to ensure the assignment is made
> within a valid range.
> 
> Signed-off-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>

I am going to submit V3.

> ---
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 92ac006..4df555e 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
>  		ndev->max_mtu = JUMBO_LEN;
>  	else
>  		ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD +
> NET_IP_ALIGN);
> -	if (priv->plat->maxmtu < ndev->max_mtu)
> +
> +	if ((priv->plat->maxmtu < ndev->max_mtu) &&
> +	    (priv->plat->maxmtu >= ndev->min_mtu))
>  		ndev->max_mtu = priv->plat->maxmtu;
> +	else if (priv->plat->maxmtu != 0)
> +		netdev_warn(priv->dev,
> +			    "%s: warning: maxmtu having invalid value (%d)\n",
> +			    __func__, priv->plat->maxmtu);
> 
>  	if (flow_ctrl)
>  		priv->flow_ctrl = FLOW_AUTO;	/* RX/TX pause on */
> --
> 1.7.9.5

^ permalink raw reply

* Re: [PATCHv3 net-next] sctp: prepare asoc stream for stream reconf
From: Marcelo Ricardo Leitner @ 2017-01-06 15:56 UTC (permalink / raw)
  To: David Laight
  Cc: 'Xin Long', network dev, linux-sctp@vger.kernel.org,
	Neil Horman, Vlad Yasevich, davem@davemloft.net
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB025926D@AcuExch.aculab.com>

On Fri, Jan 06, 2017 at 03:50:36PM +0000, David Laight wrote:
> From: Xin Long
> > Sent: 06 January 2017 14:19
> > sctp stream reconf, described in RFC 6525, needs a structure to
> > save per stream information in assoc, like stream state.
> > 
> > In the future, sctp stream scheduler also needs it to save some
> > stream scheduler params and queues.
> > 
> > This patchset is to prepare the stream array in assoc for stream
> > reconf. It defines sctp_stream that includes stream arrays inside
> > to replace ssnmap.
> > 
> > Note that we use different structures for IN and OUT streams, as
> > the members in per OUT stream will get more and more different
> > from per IN stream.
> ...
> >  /* What is the current SSN number for this stream? */
> > -static inline __u16 sctp_ssn_peek(struct sctp_stream *stream, __u16 id)
> > -{
> > -	return stream->ssn[id];
> > -}
> > +#define sctp_ssn_peek(stream, type, sid) \
> > +	((stream)->type[sid].ssn)
> > 
> >  /* Return the next SSN number for this stream.	*/
> > -static inline __u16 sctp_ssn_next(struct sctp_stream *stream, __u16 id)
> > -{
> > -	return stream->ssn[id]++;
> > -}
> > +#define sctp_ssn_next(stream, type, sid) \
> > +	((stream)->type[sid].ssn++)
> > 
> >  /* Skip over this ssn and all below. */
> > -static inline void sctp_ssn_skip(struct sctp_stream *stream, __u16 id,
> > -				 __u16 ssn)
> > -{
> > -	stream->ssn[id] = ssn+1;
> > -}
> > -
> > +#define sctp_ssn_skip(stream, type, sid, ssn) \
> > +	((stream)->type[sid].ssn = ssn + 1)
> ...
> 
> Is there any reason to convert these from inline functions to #defines?
> Inline functions give better type checking and are usually preferred.

Yes, it's to avoid specializing these and also avoid a condition in
them. Now inbound and outbound streams are handled by different structs.
Please see the new struct sctp_stream definition.

  Marcelo

^ permalink raw reply

* [next PATCH 00/11] ixgbe: Add support for writable pages and build_skb
From: Alexander Duyck @ 2017-01-06 16:06 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev

This patch set enables support for using the recent changes that allow for
unmapping pages without invalidating their contents via
DMA_ATTR_SKIP_CPU_SYNC.  With this change DMA pages should be writable and
as a result we should be able to make use of build_skb which can be used to
drop the skb->head memory allocation, header parsing, and memcpy from the
receive path which can greatly help to improve performance.

My main concern at this point is that there might be an architecture where
I didn't get DMA_ATTR_SKIP_CPU_SYNC implemented that might still need it.
For that reason I have also added a ethtool private flag called out as
"legacy-rx".  If a platform encounters an issue where the Rx can possibly
corrupt data it can be enbled by running:
	ethtool --set-priv-flags DEVNAME legacy-rx on

The testing matrix for all of these patches is going to be pretty
extensive.  Basically we want to test these patches on as many platforms
and architectures as possible with as many features being toggled as
possible including RSC, FCoE, SR-IOV, and Jumbo Frames all while receiving
traffic.

Within the patches there is also some intialization changes.  Specifically
I have updated the code paths to defer clearing the rings until we are
about to initialize them and give them to hardware.  By doing this we are
able to avoid having to dirty memory we don't need to which should help to
improve suspend/resume times for when we start looking at possibly using
the suspend/resume approach for migration of interface in VMs.

---

Alexander Duyck (11):
      ixgbe: Add function for checking to see if we can reuse page
      ixgbe: Only DMA sync frame length
      ixgbe: Update driver to make use of DMA attributes in Rx path
      ixgbe: Update code to better handle incrementing page count
      ixgbe: Make use of order 1 pages and 3K buffers independent of FCoE
      ixgbe: Use length to determine if descriptor is done
      ixgbe: Break out Rx buffer page management
      ixgbe: Add support for padding packet
      ixgbe: Add private flag to control buffer mode
      ixgbe: Add support for build_skb
      ixgbe: Don't bother clearing buffer memory for descriptor rings


 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |   45 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   58 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  584 ++++++++++++++--------
 3 files changed, 465 insertions(+), 222 deletions(-)

^ permalink raw reply

* [next PATCH 02/11] ixgbe: Only DMA sync frame length
From: Alexander Duyck @ 2017-01-06 16:06 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

On some platforms, syncing a buffer for DMA is expensive. Rather than
sync the whole 2K receive buffer, only synchronise the length of the
frame, which will typically be the MTU, or a much smaller TCP ACK.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index e80d885af4d3..dbbf5223ace2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1858,7 +1858,7 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 		dma_sync_single_range_for_cpu(rx_ring->dev,
 					      IXGBE_CB(skb)->dma,
 					      frag->page_offset,
-					      ixgbe_rx_bufsz(rx_ring),
+					      skb_frag_size(frag),
 					      DMA_FROM_DEVICE);
 	}
 	IXGBE_CB(skb)->dma = 0;
@@ -1999,12 +1999,11 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
  **/
 static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 			      struct ixgbe_rx_buffer *rx_buffer,
-			      union ixgbe_adv_rx_desc *rx_desc,
+			      unsigned int size,
 			      struct sk_buff *skb)
 {
 	struct page *page = rx_buffer->page;
 	unsigned char *va = page_address(page) + rx_buffer->page_offset;
-	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_bufsz(rx_ring);
 #else
@@ -2036,6 +2035,7 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 					     union ixgbe_adv_rx_desc *rx_desc)
 {
+	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 	struct ixgbe_rx_buffer *rx_buffer;
 	struct sk_buff *skb;
 	struct page *page;
@@ -2090,14 +2090,14 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 		dma_sync_single_range_for_cpu(rx_ring->dev,
 					      rx_buffer->dma,
 					      rx_buffer->page_offset,
-					      ixgbe_rx_bufsz(rx_ring),
+					      size,
 					      DMA_FROM_DEVICE);
 
 		rx_buffer->skb = NULL;
 	}
 
 	/* pull page into skb */
-	if (ixgbe_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
+	if (ixgbe_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
 		/* hand second half of page back to the ring */
 		ixgbe_reuse_rx_page(rx_ring, rx_buffer);
 	} else if (IXGBE_CB(skb)->dma == rx_buffer->dma) {

^ permalink raw reply related

* [next PATCH 01/11] ixgbe: Add function for checking to see if we can reuse page
From: Alexander Duyck @ 2017-01-06 16:06 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch consolidates the code for the ixgbe driver so that it is more
inline with what is already in igb.  The general idea is to just
consolidate functions that represent logical steps in the Rx process so we
can later update them more easily.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   70 +++++++++++++++----------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3beadc8c7a0a..e80d885af4d3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1947,6 +1947,41 @@ static inline bool ixgbe_page_is_reserved(struct page *page)
 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
+static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
+				    struct page *page,
+				    const unsigned int truesize)
+{
+#if (PAGE_SIZE >= 8192)
+	unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) -
+				   ixgbe_rx_bufsz(rx_ring);
+#endif
+	/* avoid re-using remote pages */
+	if (unlikely(ixgbe_page_is_reserved(page)))
+		return false;
+
+#if (PAGE_SIZE < 8192)
+	/* if we are only owner of page we can reuse it */
+	if (unlikely(page_count(page) != 1))
+		return false;
+
+	/* flip page offset to other buffer */
+	rx_buffer->page_offset ^= truesize;
+#else
+	/* move offset up to the next cache line */
+	rx_buffer->page_offset += truesize;
+
+	if (rx_buffer->page_offset > last_offset)
+		return false;
+#endif
+
+	/* Even if we own the page, we are not allowed to use atomic_set()
+	 * This would break get_page_unless_zero() users.
+	 */
+	page_ref_inc(page);
+
+	return true;
+}
+
 /**
  * ixgbe_add_rx_frag - Add contents of Rx buffer to sk_buff
  * @rx_ring: rx descriptor ring to transact packets on
@@ -1968,18 +2003,18 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 			      struct sk_buff *skb)
 {
 	struct page *page = rx_buffer->page;
+	unsigned char *va = page_address(page) + rx_buffer->page_offset;
 	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_bufsz(rx_ring);
 #else
 	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
-	unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) -
-				   ixgbe_rx_bufsz(rx_ring);
 #endif
 
-	if ((size <= IXGBE_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
-		unsigned char *va = page_address(page) + rx_buffer->page_offset;
+	if (unlikely(skb_is_nonlinear(skb)))
+		goto add_tail_frag;
 
+	if (size <= IXGBE_RX_HDR_SIZE) {
 		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
 
 		/* page is not reserved, we can reuse buffer as-is */
@@ -1991,34 +2026,11 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 		return false;
 	}
 
+add_tail_frag:
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
 			rx_buffer->page_offset, size, truesize);
 
-	/* avoid re-using remote pages */
-	if (unlikely(ixgbe_page_is_reserved(page)))
-		return false;
-
-#if (PAGE_SIZE < 8192)
-	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
-		return false;
-
-	/* flip page offset to other buffer */
-	rx_buffer->page_offset ^= truesize;
-#else
-	/* move offset up to the next cache line */
-	rx_buffer->page_offset += truesize;
-
-	if (rx_buffer->page_offset > last_offset)
-		return false;
-#endif
-
-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
-	 */
-	page_ref_inc(page);
-
-	return true;
+	return ixgbe_can_reuse_rx_page(rx_buffer, page, truesize);
 }
 
 static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,

^ permalink raw reply related

* [next PATCH 03/11] ixgbe: Update driver to make use of DMA attributes in Rx path
From: Alexander Duyck @ 2017-01-06 16:06 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds support for DMA_ATTR_SKIP_CPU_SYNC and
DMA_ATTR_WEAK_ORDERING.  By enabling both of these for the Rx path we are
able to see performance improvements on architectures that implement either
one due to the fact that page mapping and unmapping only has to sync what
is actually being used instead of the entire buffer.  In addition by
enabling the weak ordering attribute enables a performance improvement for
architectures that can associate a memory ordering with a DMA buffer such
as Sparc.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    3 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   56 +++++++++++++++++--------
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 9c6ccfc34177..97e74deecae2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -107,6 +107,9 @@
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
 #define IXGBE_RX_BUFFER_WRITE	16	/* Must be power of 2 */
 
+#define IXGBE_RX_DMA_ATTR \
+	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
 enum ixgbe_tx_flags {
 	/* cmd_type flags */
 	IXGBE_TX_FLAGS_HW_VLAN	= 0x01,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index dbbf5223ace2..062b984ffdf4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1583,8 +1583,10 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 	}
 
 	/* map page for use */
-	dma = dma_map_page(rx_ring->dev, page, 0,
-			   ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
+	dma = dma_map_page_attrs(rx_ring->dev, page, 0,
+				 ixgbe_rx_pg_size(rx_ring),
+				 DMA_FROM_DEVICE,
+				 IXGBE_RX_DMA_ATTR);
 
 	/*
 	 * if mapping failed free memory back to system since
@@ -1627,6 +1629,12 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 		if (!ixgbe_alloc_mapped_page(rx_ring, bi))
 			break;
 
+		/* sync the buffer for use by the device */
+		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
+						 bi->page_offset,
+						 ixgbe_rx_bufsz(rx_ring),
+						 DMA_FROM_DEVICE);
+
 		/*
 		 * Refresh the desc even if buffer_addrs didn't change
 		 * because each write-back erases this info.
@@ -1849,8 +1857,10 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 {
 	/* if the page was released unmap it, else just sync our portion */
 	if (unlikely(IXGBE_CB(skb)->page_released)) {
-		dma_unmap_page(rx_ring->dev, IXGBE_CB(skb)->dma,
-			       ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
+		dma_unmap_page_attrs(rx_ring->dev, IXGBE_CB(skb)->dma,
+				     ixgbe_rx_pg_size(rx_ring),
+				     DMA_FROM_DEVICE,
+				     IXGBE_RX_DMA_ATTR);
 		IXGBE_CB(skb)->page_released = false;
 	} else {
 		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
@@ -1934,12 +1944,6 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
 
 	/* transfer page from old buffer to new buffer */
 	*new_buff = *old_buff;
-
-	/* sync the buffer for use by the device */
-	dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
-					 new_buff->page_offset,
-					 ixgbe_rx_bufsz(rx_ring),
-					 DMA_FROM_DEVICE);
 }
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
@@ -2105,9 +2109,10 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 		IXGBE_CB(skb)->page_released = true;
 	} else {
 		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page(rx_ring->dev, rx_buffer->dma,
-			       ixgbe_rx_pg_size(rx_ring),
-			       DMA_FROM_DEVICE);
+		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+				     ixgbe_rx_pg_size(rx_ring),
+				     DMA_FROM_DEVICE,
+				     IXGBE_RX_DMA_ATTR);
 	}
 
 	/* clear contents of buffer_info */
@@ -4941,10 +4946,11 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 		if (rx_buffer->skb) {
 			struct sk_buff *skb = rx_buffer->skb;
 			if (IXGBE_CB(skb)->page_released)
-				dma_unmap_page(dev,
-					       IXGBE_CB(skb)->dma,
-					       ixgbe_rx_bufsz(rx_ring),
-					       DMA_FROM_DEVICE);
+				dma_unmap_page_attrs(dev,
+						     IXGBE_CB(skb)->dma,
+						     ixgbe_rx_pg_size(rx_ring),
+						     DMA_FROM_DEVICE,
+						     IXGBE_RX_DMA_ATTR);
 			dev_kfree_skb(skb);
 			rx_buffer->skb = NULL;
 		}
@@ -4952,8 +4958,20 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 		if (!rx_buffer->page)
 			continue;
 
-		dma_unmap_page(dev, rx_buffer->dma,
-			       ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
+		/* Invalidate cache lines that may have been written to by
+		 * device so that we avoid corrupting memory.
+		 */
+		dma_sync_single_range_for_cpu(rx_ring->dev,
+					      rx_buffer->dma,
+					      rx_buffer->page_offset,
+					      ixgbe_rx_bufsz(rx_ring),
+					      DMA_FROM_DEVICE);
+
+		/* free resources associated with mapping */
+		dma_unmap_page_attrs(dev, rx_buffer->dma,
+				     ixgbe_rx_pg_size(rx_ring),
+				     DMA_FROM_DEVICE,
+				     IXGBE_RX_DMA_ATTR);
 		__free_pages(rx_buffer->page, ixgbe_rx_pg_order(rx_ring));
 
 		rx_buffer->page = NULL;

^ permalink raw reply related

* [next PATCH 04/11] ixgbe: Update code to better handle incrementing page count
From: Alexander Duyck @ 2017-01-06 16:06 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Batch the page count updates instead of doing them one at a time.  By doing
this we can improve the overall performance as the atomic increment
operations can be expensive due to the fact that on x86 they are locked
operations which can cause stalls.  By doing bulk updates we can
consolidate the stall which should help to improve the overall receive
performance.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    7 ++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   39 ++++++++++++++++---------
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 97e74deecae2..717c65b0deb2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -198,7 +198,12 @@ struct ixgbe_rx_buffer {
 	struct sk_buff *skb;
 	dma_addr_t dma;
 	struct page *page;
-	unsigned int page_offset;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+	__u32 page_offset;
+#else
+	__u16 page_offset;
+#endif
+	__u16 pagecnt_bias;
 };
 
 struct ixgbe_queue_stats {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 062b984ffdf4..519b6a2b65c1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1602,6 +1602,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 	bi->dma = dma;
 	bi->page = page;
 	bi->page_offset = 0;
+	bi->pagecnt_bias = 1;
 
 	return true;
 }
@@ -1959,13 +1960,15 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 	unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) -
 				   ixgbe_rx_bufsz(rx_ring);
 #endif
+	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+
 	/* avoid re-using remote pages */
 	if (unlikely(ixgbe_page_is_reserved(page)))
 		return false;
 
 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
+	if (unlikely(page_count(page) != pagecnt_bias))
 		return false;
 
 	/* flip page offset to other buffer */
@@ -1978,10 +1981,14 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 		return false;
 #endif
 
-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
+	/* If we have drained the page fragment pool we need to update
+	 * the pagecnt_bias and page count so that we fully restock the
+	 * number of references the driver holds.
 	 */
-	page_ref_inc(page);
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX);
+		rx_buffer->pagecnt_bias = USHRT_MAX;
+	}
 
 	return true;
 }
@@ -2025,7 +2032,6 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 			return true;
 
 		/* this page cannot be reused so discard it */
-		__free_pages(page, ixgbe_rx_pg_order(rx_ring));
 		return false;
 	}
 
@@ -2104,15 +2110,19 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 	if (ixgbe_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
 		/* hand second half of page back to the ring */
 		ixgbe_reuse_rx_page(rx_ring, rx_buffer);
-	} else if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
-		/* the page has been released from the ring */
-		IXGBE_CB(skb)->page_released = true;
 	} else {
-		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
-				     ixgbe_rx_pg_size(rx_ring),
-				     DMA_FROM_DEVICE,
-				     IXGBE_RX_DMA_ATTR);
+		if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
+			/* the page has been released from the ring */
+			IXGBE_CB(skb)->page_released = true;
+		} else {
+			/* we are not reusing the buffer so unmap it */
+			dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+					     ixgbe_rx_pg_size(rx_ring),
+					     DMA_FROM_DEVICE,
+					     IXGBE_RX_DMA_ATTR);
+		}
+		__page_frag_cache_drain(page,
+					rx_buffer->pagecnt_bias);
 	}
 
 	/* clear contents of buffer_info */
@@ -4972,7 +4982,8 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 				     ixgbe_rx_pg_size(rx_ring),
 				     DMA_FROM_DEVICE,
 				     IXGBE_RX_DMA_ATTR);
-		__free_pages(rx_buffer->page, ixgbe_rx_pg_order(rx_ring));
+		__page_frag_cache_drain(rx_buffer->page,
+					rx_buffer->pagecnt_bias);
 
 		rx_buffer->page = NULL;
 	}

^ permalink raw reply related

* [next PATCH 05/11] ixgbe: Make use of order 1 pages and 3K buffers independent of FCoE
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

In order to support build_skb with jumbo frames it will be necessary to use
3K buffers for the Rx path with 8K pages backing them.  This is needed on
architectures that implement 4K pages because we can't support 2K buffers
plus padding in a 4K page.

In the case of systems that support page sizes larger than 4K the 3K
attribute will only be applied to FCoE as we can fall back to using just 2K
buffers and adding the padding.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |   20 +++++++++-----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   19 +++++++++++++------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 717c65b0deb2..80328e657d6a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -234,13 +234,14 @@ struct ixgbe_rx_queue_stats {
 #define IXGBE_TS_HDR_LEN 8
 
 enum ixgbe_ring_state_t {
+	__IXGBE_RX_3K_BUFFER,
+	__IXGBE_RX_RSC_ENABLED,
+	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
+	__IXGBE_RX_FCOE,
 	__IXGBE_TX_FDIR_INIT_DONE,
 	__IXGBE_TX_XPS_INIT_DONE,
 	__IXGBE_TX_DETECT_HANG,
 	__IXGBE_HANG_CHECK_ARMED,
-	__IXGBE_RX_RSC_ENABLED,
-	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
-	__IXGBE_RX_FCOE,
 };
 
 struct ixgbe_fwd_adapter {
@@ -352,19 +353,16 @@ struct ixgbe_ring_feature {
  */
 static inline unsigned int ixgbe_rx_bufsz(struct ixgbe_ring *ring)
 {
-#ifdef IXGBE_FCOE
-	if (test_bit(__IXGBE_RX_FCOE, &ring->state))
-		return (PAGE_SIZE < 8192) ? IXGBE_RXBUFFER_4K :
-					    IXGBE_RXBUFFER_3K;
-#endif
+	if (test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
+		return IXGBE_RXBUFFER_3K;
 	return IXGBE_RXBUFFER_2K;
 }
 
 static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring)
 {
-#ifdef IXGBE_FCOE
-	if (test_bit(__IXGBE_RX_FCOE, &ring->state))
-		return (PAGE_SIZE < 8192) ? 1 : 0;
+#if (PAGE_SIZE < 8192)
+	if (test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
+		return 1;
 #endif
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 519b6a2b65c1..1b6f1f584044 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1617,6 +1617,7 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 	union ixgbe_adv_rx_desc *rx_desc;
 	struct ixgbe_rx_buffer *bi;
 	u16 i = rx_ring->next_to_use;
+	u16 bufsz;
 
 	/* nothing to do */
 	if (!cleaned_count)
@@ -1626,14 +1627,15 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 	bi = &rx_ring->rx_buffer_info[i];
 	i -= rx_ring->count;
 
+	bufsz = ixgbe_rx_bufsz(rx_ring);
+
 	do {
 		if (!ixgbe_alloc_mapped_page(rx_ring, bi))
 			break;
 
 		/* sync the buffer for use by the device */
 		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
-						 bi->page_offset,
-						 ixgbe_rx_bufsz(rx_ring),
+						 bi->page_offset, bufsz,
 						 DMA_FROM_DEVICE);
 
 		/*
@@ -2016,9 +2018,9 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 	struct page *page = rx_buffer->page;
 	unsigned char *va = page_address(page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
-	unsigned int truesize = ixgbe_rx_bufsz(rx_ring);
+	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+	unsigned int truesize = SKB_DATA_ALIGN(size);
 #endif
 
 	if (unlikely(skb_is_nonlinear(skb)))
@@ -3917,10 +3919,15 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 	 */
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		rx_ring = adapter->rx_ring[i];
+
+		clear_ring_rsc_enabled(rx_ring);
+		clear_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+
 		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
 			set_ring_rsc_enabled(rx_ring);
-		else
-			clear_ring_rsc_enabled(rx_ring);
+
+		if (test_bit(__IXGBE_RX_FCOE, &rx_ring->state))
+			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
 	}
 }
 

^ permalink raw reply related

* [next PATCH 06/11] ixgbe: Use length to determine if descriptor is done
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change makes it so that we use the length of the packet instead of the
DD status bit to determine if a new descriptor is ready to be processed.
The obvious advantage is that it cuts down on reads as we don't really even
need the DD bit if going from a 0 to a non-zero value on size is enough to
inform us that the packet has been completed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1b6f1f584044..6ce596816954 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1653,8 +1653,8 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 			i -= rx_ring->count;
 		}
 
-		/* clear the status bits for the next_to_use descriptor */
-		rx_desc->wb.upper.status_error = 0;
+		/* clear the length for the next_to_use descriptor */
+		rx_desc->wb.upper.length = 0;
 
 		cleaned_count--;
 	} while (cleaned_count);
@@ -2170,7 +2170,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 		rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-		if (!rx_desc->wb.upper.status_error)
+		if (!rx_desc->wb.upper.length)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
@@ -3749,6 +3749,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 			     struct ixgbe_ring *ring)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
+	union ixgbe_adv_rx_desc *rx_desc;
 	u64 rdba = ring->dma;
 	u32 rxdctl;
 	u8 reg_idx = ring->reg_idx;
@@ -3783,6 +3784,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 		rxdctl |=  0x080420;
 	}
 
+	/* initialize Rx descriptor 0 */
+	rx_desc = IXGBE_RX_DESC(ring, 0);
+	rx_desc->wb.upper.length = 0;
+
 	/* enable receive descriptor ring */
 	rxdctl |= IXGBE_RXDCTL_ENABLE;
 	IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(reg_idx), rxdctl);
@@ -4998,9 +5003,6 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 	memset(rx_ring->rx_buffer_info, 0, size);
 
-	/* Zero out the descriptor ring */
-	memset(rx_ring->desc, 0, rx_ring->size);
-
 	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;

^ permalink raw reply related

* [next PATCH 07/11] ixgbe: Break out Rx buffer page management
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

We are going to be expanding the number of Rx paths in the driver.  Instead
of duplicating all that code I am pulling it apart into separate functions
so that we don't have so much code duplication.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  230 +++++++++++++------------
 1 file changed, 122 insertions(+), 108 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6ce596816954..79495cc990c2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1864,7 +1864,6 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 				     ixgbe_rx_pg_size(rx_ring),
 				     DMA_FROM_DEVICE,
 				     IXGBE_RX_DMA_ATTR);
-		IXGBE_CB(skb)->page_released = false;
 	} else {
 		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
 
@@ -1874,7 +1873,6 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 					      skb_frag_size(frag),
 					      DMA_FROM_DEVICE);
 	}
-	IXGBE_CB(skb)->dma = 0;
 }
 
 /**
@@ -1945,8 +1943,14 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
 	nta++;
 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 
-	/* transfer page from old buffer to new buffer */
-	*new_buff = *old_buff;
+	/* Transfer page from old buffer to new buffer.
+	 * Move each member individually to avoid possible store
+	 * forwarding stalls and unnecessary copy of skb.
+	 */
+	new_buff->dma		= old_buff->dma;
+	new_buff->page		= old_buff->page;
+	new_buff->page_offset	= old_buff->page_offset;
+	new_buff->pagecnt_bias	= old_buff->pagecnt_bias;
 }
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
@@ -1954,15 +1958,10 @@ static inline bool ixgbe_page_is_reserved(struct page *page)
 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
-static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
-				    struct page *page,
-				    const unsigned int truesize)
+static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer)
 {
-#if (PAGE_SIZE >= 8192)
-	unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) -
-				   ixgbe_rx_bufsz(rx_ring);
-#endif
-	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
+	struct page *page = rx_buffer->page;
 
 	/* avoid re-using remote pages */
 	if (unlikely(ixgbe_page_is_reserved(page)))
@@ -1970,16 +1969,17 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 
 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != pagecnt_bias))
+	if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
 		return false;
-
-	/* flip page offset to other buffer */
-	rx_buffer->page_offset ^= truesize;
 #else
-	/* move offset up to the next cache line */
-	rx_buffer->page_offset += truesize;
-
-	if (rx_buffer->page_offset > last_offset)
+	/* The last offset is a bit aggressive in that we assume the
+	 * worst case of FCoE being enabled and using a 3K buffer.
+	 * However this should have minimal impact as the 1K extra is
+	 * still less than one buffer in size.
+	 */
+#define IXGBE_LAST_OFFSET \
+	(SKB_WITH_OVERHEAD(PAGE_SIZE) - IXGBE_RXBUFFER_3K)
+	if (rx_buffer->page_offset > IXGBE_LAST_OFFSET)
 		return false;
 #endif
 
@@ -1987,7 +1987,7 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
 	 * the pagecnt_bias and page count so that we fully restock the
 	 * number of references the driver holds.
 	 */
-	if (unlikely(pagecnt_bias == 1)) {
+	if (unlikely(!pagecnt_bias)) {
 		page_ref_add(page, USHRT_MAX);
 		rx_buffer->pagecnt_bias = USHRT_MAX;
 	}
@@ -2010,106 +2010,65 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer,
  * The function will then update the page offset if necessary and return
  * true if the buffer can be reused by the adapter.
  **/
-static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
+static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 			      struct ixgbe_rx_buffer *rx_buffer,
-			      unsigned int size,
-			      struct sk_buff *skb)
+			      struct sk_buff *skb,
+			      unsigned int size)
 {
-	struct page *page = rx_buffer->page;
-	unsigned char *va = page_address(page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(size);
 #endif
-
-	if (unlikely(skb_is_nonlinear(skb)))
-		goto add_tail_frag;
-
-	if (size <= IXGBE_RX_HDR_SIZE) {
-		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
-
-		/* page is not reserved, we can reuse buffer as-is */
-		if (likely(!ixgbe_page_is_reserved(page)))
-			return true;
-
-		/* this page cannot be reused so discard it */
-		return false;
-	}
-
-add_tail_frag:
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
 			rx_buffer->page_offset, size, truesize);
-
-	return ixgbe_can_reuse_rx_page(rx_buffer, page, truesize);
+#if (PAGE_SIZE < 8192)
+	rx_buffer->page_offset ^= truesize;
+#else
+	rx_buffer->page_offset += truesize;
+#endif
 }
 
-static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
-					     union ixgbe_adv_rx_desc *rx_desc)
+static struct ixgbe_rx_buffer *ixgbe_get_rx_buffer(struct ixgbe_ring *rx_ring,
+						   union ixgbe_adv_rx_desc *rx_desc,
+						   struct sk_buff **skb,
+						   const unsigned int size)
 {
-	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 	struct ixgbe_rx_buffer *rx_buffer;
-	struct sk_buff *skb;
-	struct page *page;
 
 	rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
-	page = rx_buffer->page;
-	prefetchw(page);
-
-	skb = rx_buffer->skb;
-
-	if (likely(!skb)) {
-		void *page_addr = page_address(page) +
-				  rx_buffer->page_offset;
+	prefetchw(rx_buffer->page);
+	*skb = rx_buffer->skb;
 
-		/* prefetch first cache line of first page */
-		prefetch(page_addr);
-#if L1_CACHE_BYTES < 128
-		prefetch(page_addr + L1_CACHE_BYTES);
-#endif
-
-		/* allocate a skb to store the frags */
-		skb = napi_alloc_skb(&rx_ring->q_vector->napi,
-				     IXGBE_RX_HDR_SIZE);
-		if (unlikely(!skb)) {
-			rx_ring->rx_stats.alloc_rx_buff_failed++;
-			return NULL;
-		}
-
-		/*
-		 * we will be copying header into skb->data in
-		 * pskb_may_pull so it is in our interest to prefetch
-		 * it now to avoid a possible cache miss
-		 */
-		prefetchw(skb->data);
-
-		/*
-		 * Delay unmapping of the first packet. It carries the
-		 * header information, HW may still access the header
-		 * after the writeback.  Only unmap it when EOP is
-		 * reached
-		 */
-		if (likely(ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)))
-			goto dma_sync;
-
-		IXGBE_CB(skb)->dma = rx_buffer->dma;
+	/* Delay unmapping of the first packet. It carries the header
+	 * information, HW may still access the header after the writeback.
+	 * Only unmap it when EOP is reached
+	 */
+	if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)) {
+		if (!*skb)
+			goto skip_sync;
 	} else {
-		if (ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
-			ixgbe_dma_sync_frag(rx_ring, skb);
+		if (*skb)
+			ixgbe_dma_sync_frag(rx_ring, *skb);
+	}
 
-dma_sync:
-		/* we are reusing so sync this buffer for CPU use */
-		dma_sync_single_range_for_cpu(rx_ring->dev,
-					      rx_buffer->dma,
-					      rx_buffer->page_offset,
-					      size,
-					      DMA_FROM_DEVICE);
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev,
+				      rx_buffer->dma,
+				      rx_buffer->page_offset,
+				      size,
+				      DMA_FROM_DEVICE);
+skip_sync:
+	rx_buffer->pagecnt_bias--;
 
-		rx_buffer->skb = NULL;
-	}
+	return rx_buffer;
+}
 
-	/* pull page into skb */
-	if (ixgbe_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
+static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
+				struct ixgbe_rx_buffer *rx_buffer,
+				struct sk_buff *skb)
+{
+	if (ixgbe_can_reuse_rx_page(rx_buffer)) {
 		/* hand second half of page back to the ring */
 		ixgbe_reuse_rx_page(rx_ring, rx_buffer);
 	} else {
@@ -2123,12 +2082,55 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 					     DMA_FROM_DEVICE,
 					     IXGBE_RX_DMA_ATTR);
 		}
-		__page_frag_cache_drain(page,
+		__page_frag_cache_drain(rx_buffer->page,
 					rx_buffer->pagecnt_bias);
 	}
 
-	/* clear contents of buffer_info */
+	/* clear contents of rx_buffer */
 	rx_buffer->page = NULL;
+	rx_buffer->skb = NULL;
+}
+
+static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
+					   struct ixgbe_rx_buffer *rx_buffer,
+					   union ixgbe_adv_rx_desc *rx_desc,
+					   unsigned int size)
+{
+	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+#if (PAGE_SIZE < 8192)
+	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
+#else
+	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+#endif
+	struct sk_buff *skb;
+
+	/* prefetch first cache line of first page */
+	prefetch(va);
+#if L1_CACHE_BYTES < 128
+	prefetch(va + L1_CACHE_BYTES);
+#endif
+
+	/* allocate a skb to store the frags */
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE);
+	if (unlikely(!skb))
+		return NULL;
+
+	if (size > IXGBE_RX_HDR_SIZE) {
+		if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
+			IXGBE_CB(skb)->dma = rx_buffer->dma;
+
+		skb_add_rx_frag(skb, 0, rx_buffer->page,
+				rx_buffer->page_offset,
+				size, truesize);
+#if (PAGE_SIZE < 8192)
+		rx_buffer->page_offset ^= truesize;
+#else
+		rx_buffer->page_offset += truesize;
+#endif
+	} else {
+		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
+		rx_buffer->pagecnt_bias++;
+	}
 
 	return skb;
 }
@@ -2160,7 +2162,9 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 	while (likely(total_rx_packets < budget)) {
 		union ixgbe_adv_rx_desc *rx_desc;
+		struct ixgbe_rx_buffer *rx_buffer;
 		struct sk_buff *skb;
+		unsigned int size;
 
 		/* return some buffers to hardware, one at a time is too slow */
 		if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
@@ -2169,8 +2173,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 		rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
-
-		if (!rx_desc->wb.upper.length)
+		size = le16_to_cpu(rx_desc->wb.upper.length);
+		if (!size)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
@@ -2179,13 +2183,23 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		 */
 		dma_rmb();
 
+		rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);
+
 		/* retrieve a buffer from the ring */
-		skb = ixgbe_fetch_rx_buffer(rx_ring, rx_desc);
+		if (skb)
+			ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
+		else
+			skb = ixgbe_construct_skb(rx_ring, rx_buffer,
+						  rx_desc, size);
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb)
+		if (!skb) {
+			rx_ring->rx_stats.alloc_rx_buff_failed++;
+			rx_buffer->pagecnt_bias++;
 			break;
+		}
 
+		ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb);
 		cleaned_count++;
 
 		/* place incomplete frames back on ring for completion */

^ permalink raw reply related

* [next PATCH 08/11] ixgbe: Add support for padding packet
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds support for providing a buffer with headroom and tailroom
to allow for shared info, NET_SKB_PAD, and NET_IP_ALIGN.  With this
combined with the DMA changes we can start using build_skb to build frames
around an incoming Rx buffer instead of having to memcpy the headers.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |   17 ++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   43 +++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 80328e657d6a..3537d07b4807 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -94,6 +94,14 @@
 #define IXGBE_RXBUFFER_4K    4096
 #define IXGBE_MAX_RXBUFFER  16384  /* largest size for a single descriptor */
 
+#define IXGBE_SKB_PAD		(NET_SKB_PAD + NET_IP_ALIGN)
+#if (PAGE_SIZE < 8192)
+#define IXGBE_MAX_FRAME_BUILD_SKB \
+	(SKB_WITH_OVERHEAD(IXGBE_RXBUFFER_2K) - IXGBE_SKB_PAD)
+#else
+#define IGB_MAX_FRAME_BUILD_SKB IXGBE_RXBUFFER_2K
+#endif
+
 /*
  * NOTE: netdev_alloc_skb reserves up to 64 bytes, NET_IP_ALIGN means we
  * reserve 64 more, and skb_shared_info adds an additional 320 bytes more,
@@ -235,6 +243,7 @@ struct ixgbe_rx_queue_stats {
 
 enum ixgbe_ring_state_t {
 	__IXGBE_RX_3K_BUFFER,
+	__IXGBE_RX_BUILD_SKB_ENABLED,
 	__IXGBE_RX_RSC_ENABLED,
 	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
 	__IXGBE_RX_FCOE,
@@ -244,6 +253,9 @@ enum ixgbe_ring_state_t {
 	__IXGBE_HANG_CHECK_ARMED,
 };
 
+#define ring_uses_build_skb(ring) \
+	test_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &(ring)->state)
+
 struct ixgbe_fwd_adapter {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 	struct net_device *netdev;
@@ -355,6 +367,10 @@ static inline unsigned int ixgbe_rx_bufsz(struct ixgbe_ring *ring)
 {
 	if (test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
 		return IXGBE_RXBUFFER_3K;
+#if (PAGE_SIZE < 8192)
+	if (ring_uses_build_skb(ring))
+		return IXGBE_MAX_FRAME_BUILD_SKB;
+#endif
 	return IXGBE_RXBUFFER_2K;
 }
 
@@ -670,6 +686,7 @@ struct ixgbe_adapter {
 #define IXGBE_FLAG2_VLAN_PROMISC		BIT(13)
 #define IXGBE_FLAG2_EEE_CAPABLE			BIT(14)
 #define IXGBE_FLAG2_EEE_ENABLED			BIT(15)
+#define IXGBE_FLAG2_RX_LEGACY			BIT(16)
 
 	/* Tx fast path data */
 	int num_tx_queues;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 79495cc990c2..8529eafb9717 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1565,6 +1565,11 @@ static inline void ixgbe_rx_checksum(struct ixgbe_ring *ring,
 	}
 }
 
+static inline unsigned int ixgbe_rx_offset(struct ixgbe_ring *rx_ring)
+{
+	return ring_uses_build_skb(rx_ring) ? IXGBE_SKB_PAD : 0;
+}
+
 static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 				    struct ixgbe_rx_buffer *bi)
 {
@@ -1601,7 +1606,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 
 	bi->dma = dma;
 	bi->page = page;
-	bi->page_offset = 0;
+	bi->page_offset = ixgbe_rx_offset(rx_ring);
 	bi->pagecnt_bias = 1;
 
 	return true;
@@ -2018,7 +2023,9 @@ static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = SKB_DATA_ALIGN(size);
+	unsigned int truesize = ring_uses_build_skb(rx_ring) ?
+				SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) :
+				SKB_DATA_ALIGN(size);
 #endif
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
 			rx_buffer->page_offset, size, truesize);
@@ -2100,7 +2107,7 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+	unsigned int truesize = SKB_DATA_ALIGN(size);
 #endif
 	struct sk_buff *skb;
 
@@ -3462,7 +3469,10 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter,
 	srrctl = IXGBE_RX_HDR_SIZE << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT;
 
 	/* configure the packet buffer length */
-	srrctl |= ixgbe_rx_bufsz(rx_ring) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
+	if (test_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state))
+		srrctl |= IXGBE_RXBUFFER_3K >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
+	else
+		srrctl |= IXGBE_RXBUFFER_2K >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
 
 	/* configure descriptor type */
 	srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
@@ -3796,6 +3806,17 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 		 */
 		rxdctl &= ~0x3FFFFF;
 		rxdctl |=  0x080420;
+#if (PAGE_SIZE < 8192)
+	} else {
+		rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
+			    IXGBE_RXDCTL_RLPML_EN);
+
+		/* Limit the maximum frame size so we don't overrun the skb */
+		if (ring_uses_build_skb(ring) &&
+		    !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
+			rxdctl |= IXGBE_MAX_FRAME_BUILD_SKB |
+				  IXGBE_RXDCTL_RLPML_EN;
+#endif
 	}
 
 	/* initialize Rx descriptor 0 */
@@ -3941,12 +3962,26 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 
 		clear_ring_rsc_enabled(rx_ring);
 		clear_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+		clear_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &rx_ring->state);
 
 		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
 			set_ring_rsc_enabled(rx_ring);
 
 		if (test_bit(__IXGBE_RX_FCOE, &rx_ring->state))
 			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+
+		if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
+			continue;
+
+		set_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &rx_ring->state);
+
+#if (PAGE_SIZE < 8192)
+		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
+			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+
+		if (max_frame > (ETH_FRAME_LEN + ETH_FCS_LEN))
+			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
+#endif
 	}
 }
 

^ permalink raw reply related

* [next PATCH 09/11] ixgbe: Add private flag to control buffer mode
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Since there are potential drawbacks to the new Rx allocation approach I
thought it best to add a "chicken bit" so that we can turn the feature off
if in the event that a problem is found.

It also provides a means of validating the legacy Rx path in the event that
we are forced to fall back.  At some point in the future when we are
convinced we don't need it anymore we might be able to drop the legacy-rx
flag.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   47 ++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index aa80ece9ea96..6466d828bc6f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -151,6 +151,13 @@ struct ixgbe_stats {
 };
 #define IXGBE_TEST_LEN sizeof(ixgbe_gstrings_test) / ETH_GSTRING_LEN
 
+static const char ixgbe_priv_flags_strings[][ETH_GSTRING_LEN] = {
+#define IXGBE_PRIV_FLAGS_LEGACY_RX	BIT(0)
+	"legacy-rx",
+};
+
+#define IXGBE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(ixgbe_priv_flags_strings)
+
 /* currently supported speeds for 10G */
 #define ADVRTSD_MSK_10G (SUPPORTED_10000baseT_Full | \
 			 SUPPORTED_10000baseKX4_Full | \
@@ -1002,6 +1009,8 @@ static void ixgbe_get_drvinfo(struct net_device *netdev,
 
 	strlcpy(drvinfo->bus_info, pci_name(adapter->pdev),
 		sizeof(drvinfo->bus_info));
+
+	drvinfo->n_priv_flags = IXGBE_PRIV_FLAGS_STR_LEN;
 }
 
 static void ixgbe_get_ringparam(struct net_device *netdev,
@@ -1141,6 +1150,8 @@ static int ixgbe_get_sset_count(struct net_device *netdev, int sset)
 		return IXGBE_TEST_LEN;
 	case ETH_SS_STATS:
 		return IXGBE_STATS_LEN;
+	case ETH_SS_PRIV_FLAGS:
+		return IXGBE_PRIV_FLAGS_STR_LEN;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -1305,6 +1316,9 @@ static void ixgbe_get_strings(struct net_device *netdev, u32 stringset,
 		}
 		/* BUG_ON(p - data != IXGBE_STATS_LEN * ETH_GSTRING_LEN); */
 		break;
+	case ETH_SS_PRIV_FLAGS:
+		memcpy(data, ixgbe_priv_flags_strings,
+		       IXGBE_PRIV_FLAGS_STR_LEN * ETH_GSTRING_LEN);
 	}
 }
 
@@ -3386,6 +3400,37 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
 	return 0;
 }
 
+static u32 ixgbe_get_priv_flags(struct net_device *netdev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	u32 priv_flags = 0;
+
+	if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
+		priv_flags |= IXGBE_PRIV_FLAGS_LEGACY_RX;
+
+	return priv_flags;
+}
+
+static int ixgbe_set_priv_flags(struct net_device *netdev, u32 priv_flags)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	unsigned int flags2 = adapter->flags2;
+
+	flags2 &= ~IXGBE_FLAG2_RX_LEGACY;
+	if (priv_flags & IXGBE_PRIV_FLAGS_LEGACY_RX)
+		flags2 |= IXGBE_FLAG2_RX_LEGACY;
+
+	if (flags2 != adapter->flags2) {
+		adapter->flags2 = flags2;
+
+		/* reset interface to repopulate queues */
+		if (netif_running(netdev))
+			ixgbe_reinit_locked(adapter);
+	}
+
+	return 0;
+}
+
 static const struct ethtool_ops ixgbe_ethtool_ops = {
 	.get_settings           = ixgbe_get_settings,
 	.set_settings           = ixgbe_set_settings,
@@ -3422,6 +3467,8 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
 	.set_eee		= ixgbe_set_eee,
 	.get_channels		= ixgbe_get_channels,
 	.set_channels		= ixgbe_set_channels,
+	.get_priv_flags		= ixgbe_get_priv_flags,
+	.set_priv_flags		= ixgbe_set_priv_flags,
 	.get_ts_info		= ixgbe_get_ts_info,
 	.get_module_info	= ixgbe_get_module_info,
 	.get_module_eeprom	= ixgbe_get_module_eeprom,

^ permalink raw reply related

* [next PATCH 10/11] ixgbe: Add support for build_skb
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds build_skb support to the Rx path.  There are several
advantages to this change.

1.  It avoids the memcpy and skb->head allocation for small packets which
    improves performance by about 5% in my tests.
2.  It avoids the memcpy, skb->head allocation, and eth_get_headlen
    for larger packets improving performance by about 10% in my tests.
3.  For VXLAN packets it allows the full header to be in skb->data which
    improves the performance by as much as 30% in some of my tests.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   49 ++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8529eafb9717..dd04b76ae0bd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1913,7 +1913,7 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
 	}
 
 	/* place header in linear portion of buffer */
-	if (skb_is_nonlinear(skb))
+	if (!skb_headlen(skb))
 		ixgbe_pull_tail(rx_ring, skb);
 
 #ifdef IXGBE_FCOE
@@ -2142,6 +2142,49 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 	return skb;
 }
 
+static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
+				       struct ixgbe_rx_buffer *rx_buffer,
+				       union ixgbe_adv_rx_desc *rx_desc,
+				       unsigned int size)
+{
+	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+#if (PAGE_SIZE < 8192)
+	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
+#else
+	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
+				SKB_DATA_ALIGN(IXGBE_SKB_PAD + size);
+#endif
+	struct sk_buff *skb;
+
+	/* prefetch first cache line of first page */
+	prefetch(va);
+#if L1_CACHE_BYTES < 128
+	prefetch(va + L1_CACHE_BYTES);
+#endif
+
+	/* build an skb to around the page buffer */
+	skb = build_skb(va - IXGBE_SKB_PAD, truesize);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* update pointers within the skb to store the data */
+	skb_reserve(skb, IXGBE_SKB_PAD);
+	__skb_put(skb, size);
+
+	/* record DMA address if this is the start of a chain of buffers */
+	if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
+		IXGBE_CB(skb)->dma = rx_buffer->dma;
+
+	/* update buffer offset */
+#if (PAGE_SIZE < 8192)
+	rx_buffer->page_offset ^= truesize;
+#else
+	rx_buffer->page_offset += truesize;
+#endif
+
+	return skb;
+}
+
 /**
  * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
  * @q_vector: structure containing interrupt and ring information
@@ -2195,6 +2238,9 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		/* retrieve a buffer from the ring */
 		if (skb)
 			ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
+		else if (ring_uses_build_skb(rx_ring))
+			skb = ixgbe_build_skb(rx_ring, rx_buffer,
+					      rx_desc, size);
 		else
 			skb = ixgbe_construct_skb(rx_ring, rx_buffer,
 						  rx_desc, size);
@@ -3970,6 +4016,7 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 		if (test_bit(__IXGBE_RX_FCOE, &rx_ring->state))
 			set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state);
 
+		clear_bit(__IXGBE_RX_BUILD_SKB_ENABLED, &rx_ring->state);
 		if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
 			continue;
 

^ permalink raw reply related

* [next PATCH 11/11] ixgbe: Don't bother clearing buffer memory for descriptor rings
From: Alexander Duyck @ 2017-01-06 16:07 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch makes it so that we don't need to bother with clearing the
memory out for the descriptor rings.  The general idea is to only free
buffers associated with buffers in use which are located between the
next_to_clean and next_to_use or next_to_alloc values.  Everything outside
of those regions can be safely ignored since they should have no buffers
associated with them.

The advantage to doing things this way is that is should speed up bring-up
and tear-down of the rings.  Specifically we can avoid the 512 or more
cycles required to memset the rings in tear-down.  In the bring-up phase we
then clear the memory as a part of initialization.  The general idea is
that the clearing in initialization can act as a prefetch of sorts for the
buffer info structures so they are in the local CPU when we go to populate
them.  This should help to improve overall time needed to perform a
suspend/resume.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   11 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  158 ++++++++++++----------
 2 files changed, 98 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 6466d828bc6f..8c6f40d4db6f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1945,7 +1945,16 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
 
 		/* unmap buffer on Tx side */
 		tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
-		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer);
+
+		/* Free all the Tx ring sk_buffs */
+		dev_kfree_skb_any(tx_buffer->skb);
+
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
+		dma_unmap_len_set(tx_buffer, len, 0);
 
 		/* increment Rx/Tx next to clean counters */
 		rx_ntc++;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index dd04b76ae0bd..6ec0ebf7f174 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -958,28 +958,6 @@ static inline void ixgbe_irq_rearm_queues(struct ixgbe_adapter *adapter,
 	}
 }
 
-void ixgbe_unmap_and_free_tx_resource(struct ixgbe_ring *ring,
-				      struct ixgbe_tx_buffer *tx_buffer)
-{
-	if (tx_buffer->skb) {
-		dev_kfree_skb_any(tx_buffer->skb);
-		if (dma_unmap_len(tx_buffer, len))
-			dma_unmap_single(ring->dev,
-					 dma_unmap_addr(tx_buffer, dma),
-					 dma_unmap_len(tx_buffer, len),
-					 DMA_TO_DEVICE);
-	} else if (dma_unmap_len(tx_buffer, len)) {
-		dma_unmap_page(ring->dev,
-			       dma_unmap_addr(tx_buffer, dma),
-			       dma_unmap_len(tx_buffer, len),
-			       DMA_TO_DEVICE);
-	}
-	tx_buffer->next_to_watch = NULL;
-	tx_buffer->skb = NULL;
-	dma_unmap_len_set(tx_buffer, len, 0);
-	/* tx_buffer must be completely set up in the transmit path */
-}
-
 static void ixgbe_update_xoff_rx_lfc(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
@@ -1211,7 +1189,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 				 DMA_TO_DEVICE);
 
 		/* clear tx_buffer data */
-		tx_buffer->skb = NULL;
 		dma_unmap_len_set(tx_buffer, len, 0);
 
 		/* unmap remaining buffers */
@@ -3345,6 +3322,10 @@ void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter,
 
 	clear_bit(__IXGBE_HANG_CHECK_ARMED, &ring->state);
 
+	/* reinitialize tx_buffer_info */
+	memset(ring->tx_buffer_info, 0,
+	       sizeof(struct ixgbe_tx_buffer) * ring->count);
+
 	/* enable queue */
 	IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(reg_idx), txdctl);
 
@@ -3865,6 +3846,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 #endif
 	}
 
+	/* initialize rx_buffer_info */
+	memset(ring->rx_buffer_info, 0,
+	       sizeof(struct ixgbe_rx_buffer) * ring->count);
+
 	/* initialize Rx descriptor 0 */
 	rx_desc = IXGBE_RX_DESC(ring, 0);
 	rx_desc->wb.upper.length = 0;
@@ -5049,33 +5034,22 @@ static void ixgbe_fwd_psrtype(struct ixgbe_fwd_adapter *vadapter)
  **/
 static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 {
-	struct device *dev = rx_ring->dev;
-	unsigned long size;
-	u16 i;
-
-	/* ring already cleared, nothing to do */
-	if (!rx_ring->rx_buffer_info)
-		return;
+	u16 i = rx_ring->next_to_clean;
+	struct ixgbe_rx_buffer *rx_buffer = &rx_ring->rx_buffer_info[i];
 
 	/* Free all the Rx ring sk_buffs */
-	for (i = 0; i < rx_ring->count; i++) {
-		struct ixgbe_rx_buffer *rx_buffer = &rx_ring->rx_buffer_info[i];
-
+	while (i != rx_ring->next_to_alloc) {
 		if (rx_buffer->skb) {
 			struct sk_buff *skb = rx_buffer->skb;
 			if (IXGBE_CB(skb)->page_released)
-				dma_unmap_page_attrs(dev,
+				dma_unmap_page_attrs(rx_ring->dev,
 						     IXGBE_CB(skb)->dma,
 						     ixgbe_rx_pg_size(rx_ring),
 						     DMA_FROM_DEVICE,
 						     IXGBE_RX_DMA_ATTR);
 			dev_kfree_skb(skb);
-			rx_buffer->skb = NULL;
 		}
 
-		if (!rx_buffer->page)
-			continue;
-
 		/* Invalidate cache lines that may have been written to by
 		 * device so that we avoid corrupting memory.
 		 */
@@ -5086,19 +5060,21 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 					      DMA_FROM_DEVICE);
 
 		/* free resources associated with mapping */
-		dma_unmap_page_attrs(dev, rx_buffer->dma,
+		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
 				     ixgbe_rx_pg_size(rx_ring),
 				     DMA_FROM_DEVICE,
 				     IXGBE_RX_DMA_ATTR);
 		__page_frag_cache_drain(rx_buffer->page,
 					rx_buffer->pagecnt_bias);
 
-		rx_buffer->page = NULL;
+		i++;
+		rx_buffer++;
+		if (i == rx_ring->count) {
+			i = 0;
+			rx_buffer = rx_ring->rx_buffer_info;
+		}
 	}
 
-	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
-	memset(rx_ring->rx_buffer_info, 0, size);
-
 	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
@@ -5567,28 +5543,57 @@ void ixgbe_reset(struct ixgbe_adapter *adapter)
  **/
 static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 {
-	struct ixgbe_tx_buffer *tx_buffer_info;
-	unsigned long size;
-	u16 i;
+	u16 i = tx_ring->next_to_clean;
+	struct ixgbe_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
 
-	/* ring already cleared, nothing to do */
-	if (!tx_ring->tx_buffer_info)
-		return;
+	while (i != tx_ring->next_to_use) {
+		union ixgbe_adv_tx_desc *eop_desc, *tx_desc;
 
-	/* Free all the Tx ring sk_buffs */
-	for (i = 0; i < tx_ring->count; i++) {
-		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
-	}
+		/* Free all the Tx ring sk_buffs */
+		dev_kfree_skb_any(tx_buffer->skb);
 
-	netdev_tx_reset_queue(txring_txq(tx_ring));
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
 
-	size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count;
-	memset(tx_ring->tx_buffer_info, 0, size);
+		/* check for eop_desc to determine the end of the packet */
+		eop_desc = tx_buffer->next_to_watch;
+		tx_desc = IXGBE_TX_DESC(tx_ring, i);
 
-	/* Zero out the descriptor ring */
-	memset(tx_ring->desc, 0, tx_ring->size);
+		/* unmap remaining buffers */
+		while (tx_desc != eop_desc) {
+			tx_buffer++;
+			tx_desc++;
+			i++;
+			if (unlikely(i == tx_ring->count)) {
+				i = 0;
+				tx_buffer = tx_ring->tx_buffer_info;
+				tx_desc = IXGBE_TX_DESC(tx_ring, 0);
+			}
+
+			/* unmap any remaining paged data */
+			if (dma_unmap_len(tx_buffer, len))
+				dma_unmap_page(tx_ring->dev,
+					       dma_unmap_addr(tx_buffer, dma),
+					       dma_unmap_len(tx_buffer, len),
+					       DMA_TO_DEVICE);
+		}
 
+		/* move us one more past the eop_desc for start of next pkt */
+		tx_buffer++;
+		i++;
+		if (unlikely(i == tx_ring->count)) {
+			i = 0;
+			tx_buffer = tx_ring->tx_buffer_info;
+		}
+	}
+
+	/* reset BQL for queue */
+	netdev_tx_reset_queue(txring_txq(tx_ring));
+
+	/* reset next_to_use and next_to_clean */
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
 }
@@ -6034,9 +6039,9 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
 	if (tx_ring->q_vector)
 		ring_node = tx_ring->q_vector->numa_node;
 
-	tx_ring->tx_buffer_info = vzalloc_node(size, ring_node);
+	tx_ring->tx_buffer_info = vmalloc_node(size, ring_node);
 	if (!tx_ring->tx_buffer_info)
-		tx_ring->tx_buffer_info = vzalloc(size);
+		tx_ring->tx_buffer_info = vmalloc(size);
 	if (!tx_ring->tx_buffer_info)
 		goto err;
 
@@ -6116,9 +6121,9 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
 	if (rx_ring->q_vector)
 		ring_node = rx_ring->q_vector->numa_node;
 
-	rx_ring->rx_buffer_info = vzalloc_node(size, ring_node);
+	rx_ring->rx_buffer_info = vmalloc_node(size, ring_node);
 	if (!rx_ring->rx_buffer_info)
-		rx_ring->rx_buffer_info = vzalloc(size);
+		rx_ring->rx_buffer_info = vmalloc(size);
 	if (!rx_ring->rx_buffer_info)
 		goto err;
 
@@ -7836,16 +7841,29 @@ static void ixgbe_tx_map(struct ixgbe_ring *tx_ring,
 	dev_err(tx_ring->dev, "TX DMA map failed\n");
 
 	/* clear dma mappings for failed tx_buffer_info map */
-	for (;;) {
+	while (tx_buffer != first) {
+		if (dma_unmap_len(tx_buffer, len))
+			dma_unmap_page(tx_ring->dev,
+				       dma_unmap_addr(tx_buffer, dma),
+				       dma_unmap_len(tx_buffer, len),
+				       DMA_TO_DEVICE);
+		dma_unmap_len_set(tx_buffer, len, 0);
+
+		if (i--)
+			i += tx_ring->count;
 		tx_buffer = &tx_ring->tx_buffer_info[i];
-		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer);
-		if (tx_buffer == first)
-			break;
-		if (i == 0)
-			i = tx_ring->count;
-		i--;
 	}
 
+	if (dma_unmap_len(tx_buffer, len))
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
+	dma_unmap_len_set(tx_buffer, len, 0);
+
+	dev_kfree_skb_any(first->skb);
+	first->skb = NULL;
+
 	tx_ring->next_to_use = i;
 }
 

^ permalink raw reply related

* Re: [for-next 07/10] IB/mlx5: Use blue flame register allocator in mlx5_ib
From: David Miller @ 2017-01-06 16:11 UTC (permalink / raw)
  To: leonro-VPRAkNaXOzVWk0Htik3J/w
  Cc: eli-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb,
	saeedm-VPRAkNaXOzVWk0Htik3J/w, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	eli-VPRAkNaXOzVWk0Htik3J/w, matanb-VPRAkNaXOzVWk0Htik3J/w
In-Reply-To: <20170106060608.GG15685-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>

From: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Date: Fri, 6 Jan 2017 08:06:09 +0200

> On Thu, Jan 05, 2017 at 03:07:31PM -0500, David Miller wrote:
>> From: Eli Cohen <eli-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
>> Date: Thu, 5 Jan 2017 14:03:18 -0600
>>
>> > If necessary I can make sure it builds on 32 bits as well.
>>
>> Please do.
> 
> Dave,
> 
> I'm failing to understand the benefits of building mlx5 on 32 bits, and
> see only disadvantages:
>  * It is actual dead code without test coverage.
>  * It misleads reviewers/customers by seeing code for 32 bits.
>  * It adds compilation time for 32 bits platforms and "punishes" them
>    for not relevant for them driver.
> 
> Why do you call removing all that as a "regression"?

We have this thing called "CONFIG_COMPILE_TEST", it has tons of value,
perhaps you've seen it before?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Fixed link for 10G
From: Andrew Lunn @ 2017-01-06 16:24 UTC (permalink / raw)
  To: Madalin-Cristian Bucur
  Cc: Florian Fainelli, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <AM2PR04MB11228A96940B1E409CEC6E04EC630@AM2PR04MB1122.eurprd04.prod.outlook.com>

On Fri, Jan 06, 2017 at 12:01:06PM +0000, Madalin-Cristian Bucur wrote:
> Hi Florian,
> 
> I'm trying to add a fixed-link property that declares 10G speed
> for a XGMII PHY and I'm encountering some issues as the fixed
> link infrastructure does not seem to support this speed.
> 
> I'm using this device tree snippet (using the legacy format, but it
> should not matter):
> 
>         ethernet@f2000 { /* 10GEC2 */
>                 fixed-link = <0 1 10000 0 0>;
>                 phy-connection-type = "xgmii";
>         };
> 
> and I get this error:
> 
> 	[    0.464238] swphy: unknown speed
> 	[    0.467464] fsl_mac: probe of 1af2000.ethernet failed with error -22
> 
> Looking at the code, fixed_phy_register() seems to check for speeds up
> to 1G and swphy only caters 1G and lower speeds, the swphy_decode_speed()
> returning -EINVAL for 10G, triggering the error printed above in
> swphy_validate_state().
> 
> What would be the proper way to add support for the 10G fixed link speed?

Hi Madalin

I came across the same issue a couple of months ago. But i found a
different way to solve my problem.

Anyway, fixed-link emulates a PHY. It has the common PHY registers,
and sets the register values to indicate the device tree
configuration.

As you have found out, it only emulates 10/100/1000. For 10G, you need
to extend the emulation to include the 10G registers. Assuming the 10G
registers are standardised.

	  Andrew

^ permalink raw reply

* Re: [PATCH 2/3] xen: modify xenstore watch event interface
From: Wei Liu @ 2017-01-06 16:29 UTC (permalink / raw)
  To: Juergen Gross
  Cc: linux-kernel, xen-devel, boris.ostrovsky, konrad.wilk, roger.pau,
	wei.liu2, paul.durrant, netdev
In-Reply-To: <20170106150544.10836-3-jgross@suse.com>

On Fri, Jan 06, 2017 at 04:05:43PM +0100, Juergen Gross wrote:
> Today a Xenstore watch event is delivered via a callback function
> declared as:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char **vec, unsigned int len);
> 
> As all watch events only ever come with two parameters (path and token)
> changing the prototype to:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char *path, const char *token);
> 
> is the natural thing to do.
> 
> Apply this change and adapt all users.
> 
> Cc: konrad.wilk@oracle.com
> Cc: roger.pau@citrix.com
> Cc: wei.liu2@citrix.com
> Cc: paul.durrant@citrix.com
> Cc: netdev@vger.kernel.org
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

Reviewed-by: Wei Liu <wei.liu2@citrix.com>

^ permalink raw reply

* Re: [PATCH 2/3] xen: modify xenstore watch event interface
From: Roger Pau Monné @ 2017-01-06 16:37 UTC (permalink / raw)
  To: Juergen Gross
  Cc: linux-kernel, xen-devel, boris.ostrovsky, konrad.wilk, wei.liu2,
	paul.durrant, netdev
In-Reply-To: <20170106150544.10836-3-jgross@suse.com>

On Fri, Jan 06, 2017 at 04:05:43PM +0100, Juergen Gross wrote:
> Today a Xenstore watch event is delivered via a callback function
> declared as:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char **vec, unsigned int len);
> 
> As all watch events only ever come with two parameters (path and token)
> changing the prototype to:
> 
> void (*callback)(struct xenbus_watch *,
>                  const char *path, const char *token);
> 
> is the natural thing to do.
> 
> Apply this change and adapt all users.
> 
> Cc: konrad.wilk@oracle.com
> Cc: roger.pau@citrix.com
> Cc: wei.liu2@citrix.com
> Cc: paul.durrant@citrix.com
> Cc: netdev@vger.kernel.org
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

blkback changes:

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.

^ permalink raw reply

* [PATCH net] dccp: fix option range in dccp_parse_options()
From: Eric Dumazet @ 2017-01-06 16:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Andrey Konovalov, Gerrit Renker

From: Eric Dumazet <edumazet@google.com>

dccp_parse_options() improperly parses 12 or 16 bytes in excess,
because it forgets to subtract DCCP header len.

This causes various issues, since these 12/16 bytes are part of the
payload and this might not even be present in skb->head, as
dccp_invalid_packet() only pulled everything but payload.

KASAN complains since we might access uninitialized data.

Strangely enough, net/netfilter/xt_dccp.c got this right.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/dccp/options.c b/net/dccp/options.c
index 74d29c56c36709fd4e31f0e63a1f8b1aa38a32cd..41bd4bc4026f97b155e12a3a37095653836ce7fc 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,10 +54,9 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	struct dccp_sock *dp = dccp_sk(sk);
 	const struct dccp_hdr *dh = dccp_hdr(skb);
 	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
-	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
-	unsigned char *opt_ptr = options;
-	const unsigned char *opt_end = (unsigned char *)dh +
-					(dh->dccph_doff * 4);
+	unsigned char *opt_ptr = (unsigned char *)dh + __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff * 4 - __dccp_hdr_len(dh);
+	const unsigned char *opt_end = opt_ptr + optlen;
 	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
 	unsigned char opt, len;
 	unsigned char *uninitialized_var(value);

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox