Netdev List
 help / color / mirror / Atom feed
* [PATCH 05/42] proc: add a proc_create_reg helper
From: Christoph Hellwig @ 2018-05-16  9:43 UTC (permalink / raw)
  To: Andrew Morton, Alexander Viro
  Cc: linux-rtc, Alessandro Zummo, Alexandre Belloni, devel,
	linux-kernel, linux-scsi, linux-ide, Greg Kroah-Hartman,
	jfs-discussion, linux-afs, linux-acpi, netdev, netfilter-devel,
	Jiri Slaby, linux-ext4, Alexey Dobriyan, megaraidlinux.pdl,
	drbd-dev
In-Reply-To: <20180516094346.20506-1-hch@lst.de>

Common code for creating a regular file.  Factor it out of proc_create_data,
to be reused by other functions soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/proc/generic.c  | 44 +++++++++++++++++++++++++-------------------
 fs/proc/internal.h |  2 ++
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index bd8480ff0d35..ab6a321076b8 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -511,33 +511,39 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
 }
 EXPORT_SYMBOL(proc_create_mount_point);
 
-struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
-					struct proc_dir_entry *parent,
-					const struct file_operations *proc_fops,
-					void *data)
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+		struct proc_dir_entry **parent, void *data)
 {
-	struct proc_dir_entry *pde;
+	struct proc_dir_entry *p;
+
 	if ((mode & S_IFMT) == 0)
 		mode |= S_IFREG;
-
-	if (!S_ISREG(mode)) {
-		WARN_ON(1);	/* use proc_mkdir() */
+	if ((mode & S_IALLUGO) == 0)
+		mode |= S_IRUGO;
+	if (WARN_ON_ONCE(!S_ISREG(mode)))
 		return NULL;
+
+	p = __proc_create(parent, name, mode, 1);
+	if (p) {
+		p->proc_iops = &proc_file_inode_operations;
+		p->data = data;
 	}
+	return p;
+}
+
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent,
+		const struct file_operations *proc_fops, void *data)
+{
+	struct proc_dir_entry *p;
 
 	BUG_ON(proc_fops == NULL);
 
-	if ((mode & S_IALLUGO) == 0)
-		mode |= S_IRUGO;
-	pde = __proc_create(&parent, name, mode, 1);
-	if (!pde)
-		goto out;
-	pde->proc_fops = proc_fops;
-	pde->data = data;
-	pde->proc_iops = &proc_file_inode_operations;
-	return proc_register(parent, pde);
-out:
-	return NULL;
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+	p->proc_fops = proc_fops;
+	return proc_register(parent, p);
 }
 EXPORT_SYMBOL(proc_create_data);
  
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 488e67490312..dd1e11400b97 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,6 +162,8 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
 /*
  * generic.c
  */
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+		struct proc_dir_entry **parent, void *data);
 struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
 		struct proc_dir_entry *dp);
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
-- 
2.17.0

^ permalink raw reply related

* [PATCH 04/42] proc: simplify proc_register calling conventions
From: Christoph Hellwig @ 2018-05-16  9:43 UTC (permalink / raw)
  To: Andrew Morton, Alexander Viro
  Cc: Alexey Dobriyan, Greg Kroah-Hartman, Jiri Slaby, Alessandro Zummo,
	Alexandre Belloni, linux-acpi, drbd-dev, linux-ide, netdev,
	linux-rtc, megaraidlinux.pdl, linux-scsi, devel, linux-afs,
	linux-ext4, jfs-discussion, netfilter-devel, linux-kernel
In-Reply-To: <20180516094346.20506-1-hch@lst.de>

Return registered entry on success, return NULL on failure and free the
passed in entry.  Also expose it in internal.h as we'll start using it
in proc_net.c soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/proc/generic.c  | 44 ++++++++++++++++++--------------------------
 fs/proc/internal.h |  2 ++
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2078e70e1595..bd8480ff0d35 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -346,13 +346,12 @@ static const struct inode_operations proc_dir_inode_operations = {
 	.setattr	= proc_notify_change,
 };
 
-static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
+/* returns the registered entry, or frees dp and returns NULL on failure */
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+		struct proc_dir_entry *dp)
 {
-	int ret;
-
-	ret = proc_alloc_inum(&dp->low_ino);
-	if (ret)
-		return ret;
+	if (proc_alloc_inum(&dp->low_ino))
+		goto out_free_entry;
 
 	write_lock(&proc_subdir_lock);
 	dp->parent = dir;
@@ -360,12 +359,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 		WARN(1, "proc_dir_entry '%s/%s' already registered\n",
 		     dir->name, dp->name);
 		write_unlock(&proc_subdir_lock);
-		proc_free_inum(dp->low_ino);
-		return -EEXIST;
+		goto out_free_inum;
 	}
 	write_unlock(&proc_subdir_lock);
 
-	return 0;
+	return dp;
+out_free_inum:
+	proc_free_inum(dp->low_ino);
+out_free_entry:
+	pde_free(dp);
+	return NULL;
 }
 
 static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
@@ -443,10 +446,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 		if (ent->data) {
 			strcpy((char*)ent->data,dest);
 			ent->proc_iops = &proc_link_inode_operations;
-			if (proc_register(parent, ent) < 0) {
-				pde_free(ent);
-				ent = NULL;
-			}
+			ent = proc_register(parent, ent);
 		} else {
 			pde_free(ent);
 			ent = NULL;
@@ -470,11 +470,9 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
 		ent->proc_fops = &proc_dir_operations;
 		ent->proc_iops = &proc_dir_inode_operations;
 		parent->nlink++;
-		if (proc_register(parent, ent) < 0) {
-			pde_free(ent);
+		ent = proc_register(parent, ent);
+		if (!ent)
 			parent->nlink--;
-			ent = NULL;
-		}
 	}
 	return ent;
 }
@@ -505,11 +503,9 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
 		ent->proc_fops = NULL;
 		ent->proc_iops = NULL;
 		parent->nlink++;
-		if (proc_register(parent, ent) < 0) {
-			pde_free(ent);
+		ent = proc_register(parent, ent);
+		if (!ent)
 			parent->nlink--;
-			ent = NULL;
-		}
 	}
 	return ent;
 }
@@ -539,11 +535,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 	pde->proc_fops = proc_fops;
 	pde->data = data;
 	pde->proc_iops = &proc_file_inode_operations;
-	if (proc_register(parent, pde) < 0)
-		goto out_free;
-	return pde;
-out_free:
-	pde_free(pde);
+	return proc_register(parent, pde);
 out:
 	return NULL;
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0f1692e63cb6..488e67490312 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,6 +162,8 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
 /*
  * generic.c
  */
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+		struct proc_dir_entry *dp);
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
 struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
 extern int proc_readdir(struct file *, struct dir_context *);
-- 
2.17.0

^ permalink raw reply related

* [PATCH 03/42] proc: don't detour through seq->private to get the inode
From: Christoph Hellwig @ 2018-05-16  9:43 UTC (permalink / raw)
  To: Andrew Morton, Alexander Viro
  Cc: linux-rtc, Alessandro Zummo, Alexandre Belloni, devel,
	linux-kernel, linux-scsi, linux-ide, Greg Kroah-Hartman,
	jfs-discussion, linux-afs, linux-acpi, netdev, netfilter-devel,
	Jiri Slaby, linux-ext4, Alexey Dobriyan, megaraidlinux.pdl,
	drbd-dev
In-Reply-To: <20180516094346.20506-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/proc/array.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 911f66924d81..4a8e413bf59b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -677,20 +677,22 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
 
 static int children_seq_show(struct seq_file *seq, void *v)
 {
-	seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(seq->private)));
+	struct inode *inode = file_inode(seq->file);
+
+	seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode)));
 	return 0;
 }
 
 static void *children_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	return get_children_pid(seq->private, NULL, *pos);
+	return get_children_pid(file_inode(seq->file), NULL, *pos);
 }
 
 static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct pid *pid;
 
-	pid = get_children_pid(seq->private, v, *pos + 1);
+	pid = get_children_pid(file_inode(seq->file), v, *pos + 1);
 	put_pid(v);
 
 	++*pos;
@@ -711,17 +713,7 @@ static const struct seq_operations children_seq_ops = {
 
 static int children_seq_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *m;
-	int ret;
-
-	ret = seq_open(file, &children_seq_ops);
-	if (ret)
-		return ret;
-
-	m = file->private_data;
-	m->private = inode;
-
-	return ret;
+	return seq_open(file, &children_seq_ops);
 }
 
 const struct file_operations proc_tid_children_operations = {
-- 
2.17.0

^ permalink raw reply related

* [PATCH 02/42] proc: introduce a proc_pid_ns helper
From: Christoph Hellwig @ 2018-05-16  9:43 UTC (permalink / raw)
  To: Andrew Morton, Alexander Viro
  Cc: linux-rtc, Alessandro Zummo, Alexandre Belloni, devel,
	linux-kernel, linux-scsi, linux-ide, Greg Kroah-Hartman,
	jfs-discussion, linux-afs, linux-acpi, netdev, netfilter-devel,
	Jiri Slaby, linux-ext4, Alexey Dobriyan, megaraidlinux.pdl,
	drbd-dev
In-Reply-To: <20180516094346.20506-1-hch@lst.de>

Factor out retrieving the per-sb pid namespaces from the sb private data
into an easier to understand helper.

Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/proc/array.c         |  7 +------
 fs/proc/base.c          | 18 ++++++++----------
 fs/proc/self.c          |  4 ++--
 fs/proc/thread_self.c   |  4 ++--
 include/linux/proc_fs.h |  6 ++++++
 5 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ae2c807fd719..911f66924d81 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -677,12 +677,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
 
 static int children_seq_show(struct seq_file *seq, void *v)
 {
-	struct inode *inode = seq->private;
-	pid_t pid;
-
-	pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
-	seq_printf(seq, "%d ", pid);
-
+	seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(seq->private)));
 	return 0;
 }
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b2ede6abcdf..29237cad19fd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -698,7 +698,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
 
 static int proc_pid_permission(struct inode *inode, int mask)
 {
-	struct pid_namespace *pid = inode->i_sb->s_fs_info;
+	struct pid_namespace *pid = proc_pid_ns(inode);
 	struct task_struct *task;
 	bool has_perms;
 
@@ -733,13 +733,11 @@ static const struct inode_operations proc_def_inode_operations = {
 static int proc_single_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
-	struct pid_namespace *ns;
-	struct pid *pid;
+	struct pid_namespace *ns = proc_pid_ns(inode);
+	struct pid *pid = proc_pid(inode);
 	struct task_struct *task;
 	int ret;
 
-	ns = inode->i_sb->s_fs_info;
-	pid = proc_pid(inode);
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task)
 		return -ESRCH;
@@ -1410,7 +1408,7 @@ static const struct file_operations proc_fail_nth_operations = {
 static int sched_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
-	struct pid_namespace *ns = inode->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(inode);
 	struct task_struct *p;
 
 	p = get_proc_task(inode);
@@ -1782,8 +1780,8 @@ int pid_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
+	struct pid_namespace *pid = proc_pid_ns(inode);
 	struct task_struct *task;
-	struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
 
@@ -2337,7 +2335,7 @@ static int proc_timers_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 
 	tp->pid = proc_pid(inode);
-	tp->ns = inode->i_sb->s_fs_info;
+	tp->ns = proc_pid_ns(inode);
 	return 0;
 }
 
@@ -3239,7 +3237,7 @@ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter ite
 int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct tgid_iter iter;
-	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(file_inode(file));
 	loff_t pos = ctx->pos;
 
 	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
@@ -3588,7 +3586,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	/* f_version caches the tgid value that the last readdir call couldn't
 	 * return. lseek aka telldir automagically resets f_version to 0.
 	 */
-	ns = inode->i_sb->s_fs_info;
+	ns = proc_pid_ns(inode);
 	tid = (int)file->f_version;
 	file->f_version = 0;
 	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 4d7d061696b3..127265e5c55f 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
 				      struct inode *inode,
 				      struct delayed_call *done)
 {
-	struct pid_namespace *ns = inode->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(inode);
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	char *name;
 
@@ -36,7 +36,7 @@ static unsigned self_inum __ro_after_init;
 int proc_setup_self(struct super_block *s)
 {
 	struct inode *root_inode = d_inode(s->s_root);
-	struct pid_namespace *ns = s->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *self;
 	
 	inode_lock(root_inode);
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 9d2efaca499f..b905010ca9eb 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
 					     struct inode *inode,
 					     struct delayed_call *done)
 {
-	struct pid_namespace *ns = inode->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(inode);
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	pid_t pid = task_pid_nr_ns(current, ns);
 	char *name;
@@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init;
 int proc_setup_thread_self(struct super_block *s)
 {
 	struct inode *root_inode = d_inode(s->s_root);
-	struct pid_namespace *ns = s->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *thread_self;
 
 	inode_lock(root_inode);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 928ef9e4d912..4edcde510631 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -83,4 +83,10 @@ struct ns_common;
 int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns));
 
+/* get the associated pid namespace for a file in procfs */
+static inline struct pid_namespace *proc_pid_ns(struct inode *inode)
+{
+	return inode->i_sb->s_fs_info;
+}
+
 #endif /* _LINUX_PROC_FS_H */
-- 
2.17.0

^ permalink raw reply related

* [PATCH 01/42] net/can: single_open_net needs to be paired with single_release_net
From: Christoph Hellwig @ 2018-05-16  9:43 UTC (permalink / raw)
  To: Andrew Morton, Alexander Viro
  Cc: linux-rtc-u79uwXL29TY76Z2rM5mHXA, Alessandro Zummo,
	Alexandre Belloni, devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-ide-u79uwXL29TY76Z2rM5mHXA, Greg Kroah-Hartman,
	jfs-discussion-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	linux-afs-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-acpi-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA, Jiri Slaby,
	linux-ext4-u79uwXL29TY76Z2rM5mHXA, Alexey Dobriyan,
	megaraidlinux.pdl-dY08KVG/lbpWk0Htik3J/w,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20180516094346.20506-1-hch-jcswGhMUV9g@public.gmane.org>

Otherwise we will leak a reference to the network namespace.

Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
 net/can/bcm.c  | 2 +-
 net/can/proc.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/can/bcm.c b/net/can/bcm.c
index ac5e5e34fee3..8073fa14e143 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -249,7 +249,7 @@ static const struct file_operations bcm_proc_fops = {
 	.open		= bcm_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= single_release_net,
 };
 #endif /* CONFIG_PROC_FS */
 
diff --git a/net/can/proc.c b/net/can/proc.c
index fdf704e9bb8c..fde2fd55b826 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -279,7 +279,7 @@ static const struct file_operations can_stats_proc_fops = {
 	.open		= can_stats_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= single_release_net,
 };
 
 static int can_reset_stats_proc_show(struct seq_file *m, void *v)
@@ -449,7 +449,7 @@ static const struct file_operations can_rcvlist_sff_proc_fops = {
 	.open		= can_rcvlist_sff_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= single_release_net,
 };
 
 
@@ -492,7 +492,7 @@ static const struct file_operations can_rcvlist_eff_proc_fops = {
 	.open		= can_rcvlist_eff_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= single_release_net,
 };
 
 /*
-- 
2.17.0

^ permalink raw reply related

* simplify procfs code for seq_file instances V3
From: Christoph Hellwig @ 2018-05-16  9:43 UTC (permalink / raw)
  To: Andrew Morton, Alexander Viro
  Cc: linux-rtc, Alessandro Zummo, Alexandre Belloni, devel,
	linux-kernel, linux-scsi, linux-ide, Greg Kroah-Hartman,
	jfs-discussion, linux-afs, linux-acpi, netdev, netfilter-devel,
	Jiri Slaby, linux-ext4, Alexey Dobriyan, megaraidlinux.pdl,
	drbd-dev

We currently have hundreds of proc files that implement plain, read-only
seq_file based interfaces.  This series consolidates them using new
procfs helpers that take the seq_operations or simple show callback
directly.

A git tree is available at:

    git://git.infradead.org/users/hch/misc.git proc_create.3

Gitweb:

    http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/proc_create.3

Changes since V2:
 - use unsigned int for state_size everywhere
 - move state_size around in proc_dir_entry to use a struct packing hole
 - update SIZEOF_PDE_INLINE_NAME
 - added a new proc_pid_ns helper
 - improved a few changelogs
 - added back a nubus comment
 - minor typo fix
 - collected various ACKs

Changes since V1:
 - open code proc_create_data to avoid setting not fully initialized
   entries live
 - use unsigned int for state_size
 - dropped the s390/cio/blacklist hunk as it has a write method
 - dropped the IPMI patch given that IPMI proc support is scheduled for
   removal.

^ permalink raw reply

* Re: KMSAN: uninit-value in __sctp_v6_cmp_addr
From: Alexander Potapenko @ 2018-05-16  9:42 UTC (permalink / raw)
  To: syzbot+85490c30c260afff22f2
  Cc: David Miller, LKML, linux-sctp, Networking, nhorman,
	syzkaller-bugs, Vladislav Yasevich
In-Reply-To: <0000000000004f4075056c410b96@google.com>

#syz fix: sctp: handle two v4 addrs comparison in sctp_inet6_cmp_addr

^ permalink raw reply

* Re: [PATCH 10/14] net: sched: extend act API for lockless actions
From: Vlad Buslov @ 2018-05-16  9:39 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <20180516085628.GE1972@nanopsycho>


On Wed 16 May 2018 at 08:56, Jiri Pirko <jiri@resnulli.us> wrote:
> Wed, May 16, 2018 at 10:16:13AM CEST, vladbu@mellanox.com wrote:
>>
>>On Wed 16 May 2018 at 07:50, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Mon, May 14, 2018 at 04:27:11PM CEST, vladbu@mellanox.com wrote:
>>>>Implement new action API function to atomically delete action with
>>>>specified index and to atomically insert unique action. These functions are
>>>>required to implement init and delete functions for specific actions that
>>>>do not rely on rtnl lock.
>>>>
>>>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>>>---
>>>> include/net/act_api.h |  2 ++
>>>> net/sched/act_api.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>>> 2 files changed, 47 insertions(+)
>>>>
>>>>diff --git a/include/net/act_api.h b/include/net/act_api.h
>>>>index a8c8570..bce0cf1 100644
>>>>--- a/include/net/act_api.h
>>>>+++ b/include/net/act_api.h
>>>>@@ -153,7 +153,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>>>> 		   struct tc_action **a, const struct tc_action_ops *ops,
>>>> 		   int bind, bool cpustats);
>>>> void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
>>>>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a);
>>>> 
>>>>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index);
>>>> int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
>>>> 
>>>> static inline int tcf_idr_release(struct tc_action *a, bool bind)
>>>>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>>>>index 2772276e..a5193dc 100644
>>>>--- a/net/sched/act_api.c
>>>>+++ b/net/sched/act_api.c
>>>>@@ -330,6 +330,41 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
>>>> }
>>>> EXPORT_SYMBOL(tcf_idr_check);
>>>> 
>>>>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index)
>>>>+{
>>>>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>>>>+	struct tc_action *p;
>>>>+	int ret = 0;
>>>>+
>>>>+	spin_lock_bh(&idrinfo->lock);
>>>
>>> Why "_bh" is needed here?
>>
>>Original idr remove function used _bh version so I used it here as well.
>>As I already replied to your previous question about idrinfo lock usage,
>>I don't see any particular reason for locking with _bh at this point.
>>I've contacted the author (Chris Mi) and he said that he just preserved
>>locking the same way as it was before he changed hash table to idr for
>>action lookup.
>>
>>Do you want me to do a standalone patch that cleans up idrinfo locking?
>
> Yes please. You can send it separately, not as a part of this
> patchset.

Okay.

>
>
>
>>
>>>
>>>
>>>>+	p = idr_find(&idrinfo->action_idr, index);
>>>>+	if (!p) {
>>>>+		spin_unlock(&idrinfo->lock);
>>>>+		return -ENOENT;
>>>>+	}
>>>>+
>>>>+	if (!atomic_read(&p->tcfa_bindcnt)) {
>>>>+		if (refcount_dec_and_test(&p->tcfa_refcnt)) {
>>>>+			struct module *owner = p->ops->owner;
>>>>+
>>>>+			WARN_ON(p != idr_remove(&idrinfo->action_idr,
>>>>+						p->tcfa_index));
>>>>+			spin_unlock_bh(&idrinfo->lock);
>>>>+
>>>>+			tcf_action_cleanup(p);
>>>>+			module_put(owner);
>>>>+			return 0;
>>>>+		}
>>>>+		ret = 0;
>>>>+	} else {
>>>>+		ret = -EPERM;
>>>
>>> I wonder if "-EPERM" is the best error code for this...
>>
>>This is what original code returned so I decided to preserve
>>compatibility.
>
> Okay.
>
>
>>
>>>
>>>
>>>>+	}
>>>>+
>>>>+	spin_unlock_bh(&idrinfo->lock);
>>>>+	return ret;
>>>>+}
>>>>+EXPORT_SYMBOL(tcf_idr_find_delete);
>>>>+
>>>> int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>>>> 		   struct tc_action **a, const struct tc_action_ops *ops,
>>>> 		   int bind, bool cpustats)
>>>>@@ -407,6 +442,16 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
>>>> }
>>>> EXPORT_SYMBOL(tcf_idr_insert);
>>>> 
>>>>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a)
>>>>+{
>>>>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>>>>+
>>>>+	spin_lock_bh(&idrinfo->lock);
>>>>+	WARN_ON(idr_replace(&idrinfo->action_idr, a, a->tcfa_index));
>>>
>>> Under which condition this WARN_ON is hit?
>>
>>When idr replace returns non-NULL pointer, which means that somehow
>>concurrent insertion of action with same index has happened and we are
>>leaking memory.
>
> Is that possible to happen? Meaning, can I as a user cause this by doing
> something in a wrong/unexpected way?

No, it shouldn't be possible unless there is a race condition.
Otherwise I would put some proper error handling code there.

>
>
>>
>>By the way I'm still not sure if having this insert unique function is
>>warranted or I should just add WARN to regular idr insert. What is your
>>opinion on this?
>
> I have to check where you use this.

Every action init function uses this.

>
>
>>
>>>
>>>
>>>>+	spin_unlock_bh(&idrinfo->lock);
>>>>+}
>>>>+EXPORT_SYMBOL(tcf_idr_insert_unique);
>>>>+
>>>> void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
>>>> 			 struct tcf_idrinfo *idrinfo)
>>>> {
>>>>-- 
>>>>2.7.5
>>>>
>>

^ permalink raw reply

* Xilinx axienet + DP83620 in fiber mode won't set netif_carrier_on
From: Alvaro G. M. @ 2018-05-16  9:16 UTC (permalink / raw)
  To: netdev

Hi,

I have a custom board with a Xilinx FPGA running Microblaze and fitting a
Xilinx Axi Ethernet IP core.  This core communicates through MII mode with a
DP83620 PHY from Texas that supports both cabled and fiber interfaces, of
which I'm using the latter.

Under these circumstances, I've noticed that the interface is pretty much
dead except for receiving broadcast packets, so I tried to dig into the
driver to find the cause. Please be aware that I'm not very familiar with the
netdev subsystem, so I may be mistaken about lots of things.

It seems that of_phy_connect ends up calling netif_carrier_off:

phy_device.c:1036
	/* Initial carrier state is off as the phy is about to be
	 * (re)initialized.
	 */
	netif_carrier_off(phydev->attached_dev);

	/* Do initial configuration here, now that
	 * we have certain key parameters
	 * (dev_flags and interface)
	 */
	err = phy_init_hw(phydev);
	if (err)
		goto error;

	phy_resume(phydev);

However, neither xilinx_axienet_main.c nor dp83848.c ever runs
netif_carrier_on. As a simple test, I tried this patch, and that was enough
to make the interface work.

diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index e74e1e897864..d8bbe4c51b8a 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -957,6 +957,8 @@ static int axienet_open(struct net_device *ndev)
 	if (ret)
 		goto err_rx_irq;
 
+	netif_carrier_on(ndev);
+
 	return 0;
 
 err_rx_irq:


I understand, however, that this is just a proof of concept that shows the
underlying issue. I'd like to contribute to making this a proper patch, or
maybe anyone who is familiar with the netdev subsystem knows at first sight
what is the solution for this.

My understanding is that this code works fine with other PHY chips, as
pretty much the same code has been in the kernel for a long time, but that
probably before ee06b1728b95643668e40fc58ae118aeb7c1753e (which I
instigated) this Xilinx core and driver had never been tested with any
interface other than GMII and RGMII, which were back then written
explicitly, with an unknown PHY chip.

I should also note that axienet_adjust_link is never called in this
configuration, which is the place where I think the call to netif_carrier_on
should be (based on what I've read on other ethernet drivers), but it
seems that the dp83620 doesn't notify of any autonegotiation (at least while
in fiber mode).

I'm open to reading and testing whatever is needed, and please, feel free to
correct me if I've said anything incorrect, which I most probably have.

Best regards

-- 
Alvaro G. M.

^ permalink raw reply related

* [PATCH] net: 8390: ne: Fix accidentally removed RBTX4927 support
From: Geert Uytterhoeven @ 2018-05-16  9:18 UTC (permalink / raw)
  To: David S . Miller
  Cc: Arnd Bergmann, Dominik Brodowski, Atsushi Nemoto, netdev,
	linux-mips, linux-kernel, Geert Uytterhoeven

The configuration settings for RBTX4927 were accidentally removed,
leading to a silently broken network interface.

Re-add the missing settings to fix this.

Fixes: 8eb97ff5a4ec941d ("net: 8390: remove m32r specific bits")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
Bisected between v4.9-rc2 (doh) and v4.17-rc5.

Note to myself: I should do more boot testing on RBTX4927.
Fortunately I caught it before it ends up in a point release ;-)
---
 drivers/net/ethernet/8390/ne.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/8390/ne.c b/drivers/net/ethernet/8390/ne.c
index ac99d089ac7266c3..1c97e39b478e9f89 100644
--- a/drivers/net/ethernet/8390/ne.c
+++ b/drivers/net/ethernet/8390/ne.c
@@ -164,7 +164,9 @@ bad_clone_list[] __initdata = {
 #define NESM_START_PG	0x40	/* First page of TX buffer */
 #define NESM_STOP_PG	0x80	/* Last page +1 of RX ring */
 
-#if defined(CONFIG_ATARI)	/* 8-bit mode on Atari, normal on Q40 */
+#if defined(CONFIG_MACH_TX49XX)
+#  define DCR_VAL 0x48		/* 8-bit mode */
+#elif defined(CONFIG_ATARI)	/* 8-bit mode on Atari, normal on Q40 */
 #  define DCR_VAL (MACH_IS_ATARI ? 0x48 : 0x49)
 #else
 #  define DCR_VAL 0x49
-- 
2.7.4

^ permalink raw reply related

* Re: KMSAN: uninit-value in __sctp_v6_cmp_addr
From: Alexander Potapenko @ 2018-05-16  9:08 UTC (permalink / raw)
  To: lucien xin
  Cc: syzbot+85490c30c260afff22f2, David Miller, LKML, linux-sctp,
	Networking, nhorman, syzkaller-bugs, Vladislav Yasevich
In-Reply-To: <CADvbK_cGes4gorUa2Y8PSkL4qF4WetC2eo3PFvEYd1xbRiSihQ@mail.gmail.com>

On Wed, May 16, 2018 at 9:17 AM Xin Long <lucien.xin@gmail.com> wrote:

> On Wed, May 16, 2018 at 12:25 AM, syzbot
> <syzbot+85490c30c260afff22f2@syzkaller.appspotmail.com> wrote:
> > Hello,
> >
> > syzbot found the following crash on:
> >
> > HEAD commit:    74ee2200b89f kmsan: bump .config.example to v4.17-rc3
> > git tree:       https://github.com/google/kmsan.git/master
> > console output: https://syzkaller.appspot.com/x/log.txt?x=169efb5b800000
> > kernel config:
> > https://syzkaller.appspot.com/x/.config?x=4ca1e57bafa8ab1f
> > dashboard link:
> > https://syzkaller.appspot.com/bug?extid=85490c30c260afff22f2
> > compiler:       clang version 7.0.0 (trunk 329391)
> > syzkaller repro:
> > https://syzkaller.appspot.com/x/repro.syz?x=157e9237800000
> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10fe5de7800000
> >
> > IMPORTANT: if you fix the bug, please add the following tag to the
> > commit:
> > Reported-by: syzbot+85490c30c260afff22f2@syzkaller.appspotmail.com
> >
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > ==================================================================
> > BUG: KMSAN: uninit-value in __sctp_v6_cmp_addr+0x49a/0x850
> > net/sctp/ipv6.c:580
> > CPU: 0 PID: 4453 Comm: syz-executor325 Not tainted 4.17.0-rc3+ #88
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > Google 01/01/2011
> > Call Trace:
> >  <IRQ>
> >  __dump_stack lib/dump_stack.c:77 [inline]
> >  dump_stack+0x185/0x1d0 lib/dump_stack.c:113
> >  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
> >  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
> >  __sctp_v6_cmp_addr+0x49a/0x850 net/sctp/ipv6.c:580
> Pls check if the testing kernel has this commit:
> commit d625329b06e46bd20baf9ee40847d11982569204
> Author: Xin Long <lucien.xin@gmail.com>
> Date:   Thu Apr 26 14:13:57 2018 +0800

>      sctp: handle two v4 addrs comparison in sctp_inet6_cmp_addr

It doesn't, because we were testing v4.17-rc3, and the patch is in
v4.17-rc4.
I'll update to -rc5 and test again.
> Thanks.
Thank you!
> >  sctp_inet6_cmp_addr+0x3dc/0x400 net/sctp/ipv6.c:898
> >  sctp_bind_addr_match+0x18b/0x2f0 net/sctp/bind_addr.c:330
> >  sctp_addrs_lookup_transport+0x904/0xa20 net/sctp/input.c:942
> >  __sctp_lookup_association net/sctp/input.c:985 [inline]
> >  __sctp_rcv_lookup net/sctp/input.c:1249 [inline]
> >  sctp_rcv+0x15e6/0x4d30 net/sctp/input.c:170
> >  ip_local_deliver_finish+0x874/0xec0 net/ipv4/ip_input.c:215
> >  NF_HOOK include/linux/netfilter.h:288 [inline]
> >  ip_local_deliver+0x43c/0x4e0 net/ipv4/ip_input.c:256
> >  dst_input include/net/dst.h:450 [inline]
> >  ip_rcv_finish+0xa36/0x1d00 net/ipv4/ip_input.c:396
> >  NF_HOOK include/linux/netfilter.h:288 [inline]
> >  ip_rcv+0x118f/0x16d0 net/ipv4/ip_input.c:492
> >  __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4592
> >  __netif_receive_skb net/core/dev.c:4657 [inline]
> >  process_backlog+0x62d/0xe20 net/core/dev.c:5337
> >  napi_poll net/core/dev.c:5735 [inline]
> >  net_rx_action+0x7c1/0x1a70 net/core/dev.c:5801
> >  __do_softirq+0x56d/0x93d kernel/softirq.c:285
> >  do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1046
> >  </IRQ>
> >  do_softirq kernel/softirq.c:329 [inline]
> >  __local_bh_enable_ip+0x114/0x140 kernel/softirq.c:182
> >  local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32
> >  rcu_read_unlock_bh include/linux/rcupdate.h:728 [inline]
> >  ip_finish_output2+0x135a/0x1470 net/ipv4/ip_output.c:231
> >  ip_finish_output+0xcb2/0xff0 net/ipv4/ip_output.c:317
> >  NF_HOOK_COND include/linux/netfilter.h:277 [inline]
> >  ip_output+0x505/0x5d0 net/ipv4/ip_output.c:405
> >  dst_output include/net/dst.h:444 [inline]
> >  ip_local_out net/ipv4/ip_output.c:124 [inline]
> >  ip_queue_xmit+0x1a1e/0x1d10 net/ipv4/ip_output.c:504
> >  sctp_v4_xmit+0x188/0x210 net/sctp/protocol.c:983
> >  sctp_packet_transmit+0x3eaa/0x4350 net/sctp/output.c:650
> >  sctp_outq_flush+0x1a7a/0x6320 net/sctp/outqueue.c:1197
> >  sctp_outq_uncork+0xd2/0xf0 net/sctp/outqueue.c:776
> >  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
> >  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
> >  sctp_do_sm+0x8707/0x8d20 net/sctp/sm_sideeffect.c:1191
> >  sctp_primitive_REQUESTHEARTBEAT+0x175/0x1a0 net/sctp/primitive.c:200
> >  sctp_apply_peer_addr_params+0x207/0x1670 net/sctp/socket.c:2487
> >  sctp_setsockopt_peer_addr_params net/sctp/socket.c:2683 [inline]
> >  sctp_setsockopt+0x10e5f/0x11600 net/sctp/socket.c:4258
> >  sock_common_setsockopt+0x136/0x170 net/core/sock.c:3039
> >  __sys_setsockopt+0x4af/0x560 net/socket.c:1903
> >  __do_sys_setsockopt net/socket.c:1914 [inline]
> >  __se_sys_setsockopt net/socket.c:1911 [inline]
> >  __x64_sys_setsockopt+0x15c/0x1c0 net/socket.c:1911
> >  do_syscall_64+0x154/0x220 arch/x86/entry/common.c:287
> >  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > RIP: 0033:0x43fef9
> > RSP: 002b:00007ffc00d9bfd8 EFLAGS: 00000207 ORIG_RAX: 0000000000000036
> > RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fef9
> > RDX: 0000000000000009 RSI: 0000000000000084 RDI: 0000000000000003
> > RBP: 00000000006ca018 R08: 0000000000000098 R09: 000000000000001c
> > R10: 0000000020000180 R11: 0000000000000207 R12: 0000000000401820
> > R13: 00000000004018b0 R14: 0000000000000000 R15: 0000000000000000
> >
> > Local variable description: ----dest@sctp_rcv
> > Variable was created at:
> >  sctp_rcv+0x13d/0x4d30 net/sctp/input.c:97
> >  ip_local_deliver_finish+0x874/0xec0 net/ipv4/ip_input.c:215
> > ==================================================================
> >
> >
> > ---
> > This bug is generated by a bot. It may contain errors.
> > See https://goo.gl/tpsmEJ for more information about syzbot.
> > syzbot engineers can be reached at syzkaller@googlegroups.com.
> >
> > syzbot will keep track of this bug report. See:
> > https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
> > syzbot.
> > syzbot can test patches for this bug, for details see:
> > https://goo.gl/tpsmEJ#testing-patches

> --
> You received this message because you are subscribed to the Google Groups
"syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an
email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit
https://groups.google.com/d/msgid/syzkaller-bugs/CADvbK_cGes4gorUa2Y8PSkL4qF4WetC2eo3PFvEYd1xbRiSihQ%40mail.gmail.com
.
> For more options, visit https://groups.google.com/d/optout.



-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply

* Re: [PATCH 10/14] net: sched: extend act API for lockless actions
From: Jiri Pirko @ 2018-05-16  8:56 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <vbfbmdg10du.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

Wed, May 16, 2018 at 10:16:13AM CEST, vladbu@mellanox.com wrote:
>
>On Wed 16 May 2018 at 07:50, Jiri Pirko <jiri@resnulli.us> wrote:
>> Mon, May 14, 2018 at 04:27:11PM CEST, vladbu@mellanox.com wrote:
>>>Implement new action API function to atomically delete action with
>>>specified index and to atomically insert unique action. These functions are
>>>required to implement init and delete functions for specific actions that
>>>do not rely on rtnl lock.
>>>
>>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>>---
>>> include/net/act_api.h |  2 ++
>>> net/sched/act_api.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>> 2 files changed, 47 insertions(+)
>>>
>>>diff --git a/include/net/act_api.h b/include/net/act_api.h
>>>index a8c8570..bce0cf1 100644
>>>--- a/include/net/act_api.h
>>>+++ b/include/net/act_api.h
>>>@@ -153,7 +153,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>>> 		   struct tc_action **a, const struct tc_action_ops *ops,
>>> 		   int bind, bool cpustats);
>>> void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
>>>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a);
>>> 
>>>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index);
>>> int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
>>> 
>>> static inline int tcf_idr_release(struct tc_action *a, bool bind)
>>>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>>>index 2772276e..a5193dc 100644
>>>--- a/net/sched/act_api.c
>>>+++ b/net/sched/act_api.c
>>>@@ -330,6 +330,41 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
>>> }
>>> EXPORT_SYMBOL(tcf_idr_check);
>>> 
>>>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index)
>>>+{
>>>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>>>+	struct tc_action *p;
>>>+	int ret = 0;
>>>+
>>>+	spin_lock_bh(&idrinfo->lock);
>>
>> Why "_bh" is needed here?
>
>The original idr remove function used the _bh version, so I used it here as
>well. As I already replied to your previous question about idrinfo lock
>usage, I don't see any particular reason for locking with _bh at this point.
>I've contacted the author (Chris Mi) and he said that he just preserved
>the locking the same way as it was before he changed the hash table to idr
>for action lookup.
>
>Do you want me to do a standalone patch that cleans up idrinfo locking?

Yes please. You can send it separately, not as a part of this patchset.



>
>>
>>
>>>+	p = idr_find(&idrinfo->action_idr, index);
>>>+	if (!p) {
>>>+		spin_unlock(&idrinfo->lock);
>>>+		return -ENOENT;
>>>+	}
>>>+
>>>+	if (!atomic_read(&p->tcfa_bindcnt)) {
>>>+		if (refcount_dec_and_test(&p->tcfa_refcnt)) {
>>>+			struct module *owner = p->ops->owner;
>>>+
>>>+			WARN_ON(p != idr_remove(&idrinfo->action_idr,
>>>+						p->tcfa_index));
>>>+			spin_unlock_bh(&idrinfo->lock);
>>>+
>>>+			tcf_action_cleanup(p);
>>>+			module_put(owner);
>>>+			return 0;
>>>+		}
>>>+		ret = 0;
>>>+	} else {
>>>+		ret = -EPERM;
>>
>> I wonder if "-EPERM" is the best error code for this...
>
>This is what original code returned so I decided to preserve
>compatibility.

Okay.


>
>>
>>
>>>+	}
>>>+
>>>+	spin_unlock_bh(&idrinfo->lock);
>>>+	return ret;
>>>+}
>>>+EXPORT_SYMBOL(tcf_idr_find_delete);
>>>+
>>> int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>>> 		   struct tc_action **a, const struct tc_action_ops *ops,
>>> 		   int bind, bool cpustats)
>>>@@ -407,6 +442,16 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
>>> }
>>> EXPORT_SYMBOL(tcf_idr_insert);
>>> 
>>>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a)
>>>+{
>>>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>>>+
>>>+	spin_lock_bh(&idrinfo->lock);
>>>+	WARN_ON(idr_replace(&idrinfo->action_idr, a, a->tcfa_index));
>>
>> Under which condition this WARN_ON is hit?
>
>When idr_replace() returns a non-NULL pointer, which means that a
>concurrent insertion of an action with the same index has somehow happened
>and we are leaking memory.

Can that actually happen? Meaning, can I as a user cause this by doing
something in a wrong/unexpected way?


>
>By the way, I'm still not sure whether having this insert-unique function
>is warranted or whether I should just add a WARN to the regular idr insert.
>What is your opinion on this?

I have to check where you use this.


>
>>
>>
>>>+	spin_unlock_bh(&idrinfo->lock);
>>>+}
>>>+EXPORT_SYMBOL(tcf_idr_insert_unique);
>>>+
>>> void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
>>> 			 struct tcf_idrinfo *idrinfo)
>>> {
>>>-- 
>>>2.7.5
>>>
>

^ permalink raw reply

* Re: [PATCH net-next v2 0/2] of: mdio: Fall back to mdiobus_register() with NULL device_node
From: Geert Uytterhoeven @ 2018-05-16  8:54 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, Andrew Lunn, Vivien Didelot, David S. Miller,
	Nicolas Ferre, Fugang Duan, Sergei Shtylyov, Giuseppe Cavallaro,
	Alexandre Torgue, Jose Abreu, Grygorii Strashko, Woojung Huh,
	Microchip Linux Driver Support, Rob Herring, Frank Rowand,
	Antoine Tenart, Tobias Jordan, Russell King
In-Reply-To: <20180515235619.27773-1-f.fainelli@gmail.com>

Hi Florian,

Thanks for your series!
I like the effect on simplifying drivers.

On Wed, May 16, 2018 at 1:56 AM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> This patch series updates of_mdiobus_register() such that when the device_node
> argument is NULL, it calls mdiobus_register() directly. This is consistent with
> the behavior of of_mdiobus_register() when CONFIG_OF=n.

IMHO the CONFIG_OF=n behavior of of_mdiobus_register() (which I wasn't
aware of) is inconsistent with the behavior of other of_*() functions,
which are just empty stubs.

So I'm wondering if you should do it the other way around, and let
mdiobus_register() call of_mdiobus_register() if dev->of_node exists?

This does mean mdiobus_register() should gain a struct device * parameter,
and thus changes to many more drivers are needed.

> I only converted the most obvious drivers, there are others that have a much
> less obvious behavior and specifically attempt to deal with CONFIG_ACPI.

I haven't looked at the ACPI handling, but perhaps this can be moved
inside mdiobus_register() as well?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [RFC v4 5/5] virtio_ring: enable packed ring
From: Tiwei Bie @ 2018-05-16  8:37 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie
In-Reply-To: <20180516083737.26504-1-tiwei.bie@intel.com>

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index de3839f3621a..b158692263b0 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
 			break;
 		case VIRTIO_F_IOMMU_PLATFORM:
 			break;
+		case VIRTIO_F_RING_PACKED:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
-- 
2.17.0

^ permalink raw reply related

* [RFC v4 4/5] virtio_ring: add event idx support in packed ring
From: Tiwei Bie @ 2018-05-16  8:37 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180516083737.26504-1-tiwei.bie@intel.com>

This commit introduces event idx support for the
packed ring.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 75 +++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index c6c5deb0e3ae..de3839f3621a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1006,7 +1006,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 flags;
+	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
 	bool needs_kick;
 	u32 snapshot;
 
@@ -1015,9 +1015,19 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 	 * suppressions. */
 	virtio_mb(vq->weak_barriers);
 
+	old = vq->next_avail_idx - vq->num_added;
+	new = vq->next_avail_idx;
+	vq->num_added = 0;
+
 	snapshot = *(u32 *)vq->vring_packed.device;
+	off_wrap = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot & 0xffff));
 	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
 
+	wrap_counter = off_wrap >> 15;
+	event_idx = off_wrap & ~(1<<15);
+	if (wrap_counter != vq->wrap_counter)
+		event_idx -= vq->vring_packed.num;
+
 #ifdef DEBUG
 	if (vq->last_add_time_valid) {
 		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
@@ -1026,7 +1036,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 	vq->last_add_time_valid = false;
 #endif
 
-	needs_kick = (flags != VRING_EVENT_F_DISABLE);
+	if (flags == VRING_EVENT_F_DESC)
+		needs_kick = vring_need_event(event_idx, new, old);
+	else
+		needs_kick = (flags != VRING_EVENT_F_DISABLE);
 	END_USE(vq);
 	return needs_kick;
 }
@@ -1098,7 +1111,7 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 					  void **ctx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 last_used, id;
+	u16 wrap_counter, last_used, id;
 	void *ret;
 
 	START_USE(vq);
@@ -1138,6 +1151,19 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 	ret = vq->desc_state[id].data;
 	detach_buf_packed(vq, last_used, id, ctx);
 
+	wrap_counter = vq->wrap_counter;
+	if (vq->last_used_idx > vq->next_avail_idx)
+		wrap_counter ^= 1;
+
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
+		virtio_store_mb(vq->weak_barriers,
+				&vq->vring_packed.driver->off_wrap,
+				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
+						(wrap_counter << 15)));
+
 #ifdef DEBUG
 	vq->last_add_time_valid = false;
 #endif
@@ -1160,15 +1186,27 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
 static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 wrap_counter;
 
 	START_USE(vq);
 
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+
+	wrap_counter = vq->wrap_counter;
+	if (vq->last_used_idx > vq->next_avail_idx)
+		wrap_counter ^= 1;
+
+	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
+			vq->last_used_idx | (wrap_counter << 15));
 
 	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
 		virtio_wmb(vq->weak_barriers);
-		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
+						     VRING_EVENT_F_ENABLE;
 		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
 							vq->event_flags_shadow);
 	}
@@ -1194,15 +1232,40 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
 static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs, used_idx, wrap_counter;
 
 	START_USE(vq);
 
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+
+	/* TODO: tune this threshold */
+	if (vq->next_avail_idx < vq->last_used_idx)
+		bufs = (vq->vring_packed.num + vq->next_avail_idx -
+				vq->last_used_idx) * 3 / 4;
+	else
+		bufs = (vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
+
+	wrap_counter = vq->wrap_counter;
+	if (vq->last_used_idx > vq->next_avail_idx)
+		wrap_counter ^= 1;
+
+	used_idx = vq->last_used_idx + bufs;
+	if (used_idx >= vq->vring_packed.num) {
+		used_idx -= vq->vring_packed.num;
+		wrap_counter ^= 1;
+	}
+
+	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
+			used_idx | (wrap_counter << 15));
 
 	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
 		virtio_wmb(vq->weak_barriers);
-		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
+						     VRING_EVENT_F_ENABLE;
 		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
 							vq->event_flags_shadow);
 	}
@@ -1869,8 +1932,10 @@ void vring_transport_features(struct virtio_device *vdev)
 		switch (i) {
 		case VIRTIO_RING_F_INDIRECT_DESC:
 			break;
+#if 0
 		case VIRTIO_RING_F_EVENT_IDX:
 			break;
+#endif
 		case VIRTIO_F_VERSION_1:
 			break;
 		case VIRTIO_F_IOMMU_PLATFORM:
-- 
2.17.0

^ permalink raw reply related

* [RFC v4 3/5] virtio_ring: add packed ring support
From: Tiwei Bie @ 2018-05-16  8:37 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie
In-Reply-To: <20180516083737.26504-1-tiwei.bie@intel.com>

This commit introduces basic support (without EVENT_IDX)
for the packed ring.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 491 ++++++++++++++++++++++++++++++++++-
 1 file changed, 481 insertions(+), 10 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 62d7c407841a..c6c5deb0e3ae 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -58,7 +58,8 @@
 
 struct vring_desc_state {
 	void *data;			/* Data for callback. */
-	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+	void *indir_desc;		/* Indirect descriptor, if any. */
+	int num;			/* Descriptor list length. */
 };
 
 struct vring_virtqueue {
@@ -116,6 +117,9 @@ struct vring_virtqueue {
 			/* Last written value to driver->flags in
 			 * guest byte order. */
 			u16 event_flags_shadow;
+
+			/* ID allocation. */
+			struct idr buffer_id;
 		};
 	};
 
@@ -142,6 +146,16 @@ struct vring_virtqueue {
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
+static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
+					  unsigned int total_sg)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* If the host supports indirect descriptor tables, and we have multiple
+	 * buffers, then go indirect. FIXME: tune this threshold */
+	return (vq->indirect && total_sg > 1 && vq->vq.num_free);
+}
+
 /*
  * Modern virtio devices have feature bits to specify whether they need a
  * quirk and bypass the IOMMU. If not there, just use the DMA API.
@@ -327,9 +341,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 
 	head = vq->free_head;
 
-	/* If the host supports indirect descriptor tables, and we have multiple
-	 * buffers, then go indirect. FIXME: tune this threshold */
-	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+	if (virtqueue_use_indirect(_vq, total_sg))
 		desc = alloc_indirect_split(_vq, total_sg, gfp);
 	else {
 		desc = NULL;
@@ -741,6 +753,63 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
 		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
 }
 
+static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
+				   struct vring_packed_desc *desc)
+{
+	u16 flags;
+
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return;
+
+	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		dma_unmap_single(vring_dma_dev(vq),
+				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
+				 virtio32_to_cpu(vq->vq.vdev, desc->len),
+				 (flags & VRING_DESC_F_WRITE) ?
+				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	} else {
+		dma_unmap_page(vring_dma_dev(vq),
+			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
+			       virtio32_to_cpu(vq->vq.vdev, desc->len),
+			       (flags & VRING_DESC_F_WRITE) ?
+			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+}
+
+static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
+						       unsigned int total_sg,
+						       gfp_t gfp)
+{
+	struct vring_packed_desc *desc;
+
+	/*
+	 * We require lowmem mappings for the descriptors because
+	 * otherwise virt_to_phys will give us bogus addresses in the
+	 * virtqueue.
+	 */
+	gfp &= ~__GFP_HIGHMEM;
+
+	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
+
+	return desc;
+}
+
+static u16 alloc_id_packed(struct vring_virtqueue *vq)
+{
+	u16 id;
+
+	id = idr_alloc(&vq->buffer_id, NULL, 0, vq->vring_packed.num,
+		       GFP_KERNEL);
+	return id;
+}
+
+static void free_id_packed(struct vring_virtqueue *vq, u16 id)
+{
+	idr_remove(&vq->buffer_id, id);
+}
+
 static inline int virtqueue_add_packed(struct virtqueue *_vq,
 				       struct scatterlist *sgs[],
 				       unsigned int total_sg,
@@ -750,47 +819,446 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 				       void *ctx,
 				       gfp_t gfp)
 {
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
+	__virtio16 uninitialized_var(head_flags), flags;
+	u16 head, wrap_counter, id;
+	bool indirect;
+
+	START_USE(vq);
+
+	BUG_ON(data == NULL);
+	BUG_ON(ctx && vq->indirect);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return -EIO;
+	}
+
+#ifdef DEBUG
+	{
+		ktime_t now = ktime_get();
+
+		/* No kick or get, with .1 second between?  Warn. */
+		if (vq->last_add_time_valid)
+			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
+					    > 100);
+		vq->last_add_time = now;
+		vq->last_add_time_valid = true;
+	}
+#endif
+
+	BUG_ON(total_sg == 0);
+
+	head = vq->next_avail_idx;
+	wrap_counter = vq->wrap_counter;
+
+	if (virtqueue_use_indirect(_vq, total_sg))
+		desc = alloc_indirect_packed(_vq, total_sg, gfp);
+	else {
+		desc = NULL;
+		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
+	}
+
+	if (desc) {
+		/* Use a single buffer which doesn't continue */
+		indirect = true;
+		/* Set up rest to use this indirect table. */
+		i = 0;
+		descs_used = 1;
+	} else {
+		indirect = false;
+		desc = vq->vring_packed.desc;
+		i = head;
+		descs_used = total_sg;
+	}
+
+	if (vq->vq.num_free < descs_used) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 descs_used, vq->vq.num_free);
+		/* FIXME: for historical reasons, we force a notify here if
+		 * there are outgoing parts to the buffer.  Presumably the
+		 * host should service the ring ASAP. */
+		if (out_sgs)
+			vq->notify(&vq->vq);
+		if (indirect)
+			kfree(desc);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	id = alloc_id_packed(vq);
+
+	for (n = 0; n < out_sgs + in_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
+					       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
+			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
+					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
+					VRING_DESC_F_AVAIL(vq->wrap_counter) |
+					VRING_DESC_F_USED(!vq->wrap_counter));
+			if (!indirect && i == head)
+				head_flags = flags;
+			else
+				desc[i].flags = flags;
+
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
+			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			i++;
+			if (!indirect && i >= vq->vring_packed.num) {
+				i = 0;
+				vq->wrap_counter ^= 1;
+			}
+		}
+	}
+
+	prev = (i > 0 ? i : vq->vring_packed.num) - 1;
+	desc[prev].id = cpu_to_virtio16(_vq->vdev, id);
+
+	/* Last one doesn't continue. */
+	if (total_sg == 1)
+		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+	else
+		desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
+						~VRING_DESC_F_NEXT);
+
+	if (indirect) {
+		/* Now that the indirect table is filled in, map it. */
+		dma_addr_t addr = vring_map_single(
+			vq, desc, total_sg * sizeof(struct vring_packed_desc),
+			DMA_TO_DEVICE);
+		if (vring_mapping_error(vq, addr))
+			goto unmap_release;
+
+		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
+					     VRING_DESC_F_AVAIL(wrap_counter) |
+					     VRING_DESC_F_USED(!wrap_counter));
+		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
+								   addr);
+		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
+				total_sg * sizeof(struct vring_packed_desc));
+		vq->vring_packed.desc[head].id = cpu_to_virtio16(_vq->vdev, id);
+	}
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
+	/* Update free pointer */
+	if (indirect) {
+		n = head + 1;
+		if (n >= vq->vring_packed.num) {
+			n = 0;
+			vq->wrap_counter ^= 1;
+		}
+		vq->next_avail_idx = n;
+	} else
+		vq->next_avail_idx = i;
+
+	/* Store token and indirect buffer state. */
+	vq->desc_state[id].num = descs_used;
+	vq->desc_state[id].data = data;
+	if (indirect)
+		vq->desc_state[id].indir_desc = desc;
+	else
+		vq->desc_state[id].indir_desc = ctx;
+
+	/* A driver MUST NOT make the first descriptor in the list
+	 * available before all subsequent descriptors comprising
+	 * the list are made available. */
+	virtio_wmb(vq->weak_barriers);
+	vq->vring_packed.desc[head].flags = head_flags;
+	vq->num_added += descs_used;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
+	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one_packed(vq, &desc[i]);
+		i++;
+		if (!indirect && i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->wrap_counter = wrap_counter;
+
+	if (indirect)
+		kfree(desc);
+
+	free_id_packed(vq, id);
+
+	END_USE(vq);
 	return -EIO;
 }
 
 static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 {
-	return false;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 flags;
+	bool needs_kick;
+	u32 snapshot;
+
+	START_USE(vq);
+	/* We need to expose the new flags value before checking notification
+	 * suppressions. */
+	virtio_mb(vq->weak_barriers);
+
+	snapshot = *(u32 *)vq->vring_packed.device;
+	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
+
+#ifdef DEBUG
+	if (vq->last_add_time_valid) {
+		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+					      vq->last_add_time)) > 100);
+	}
+	vq->last_add_time_valid = false;
+#endif
+
+	needs_kick = (flags != VRING_EVENT_F_DISABLE);
+	END_USE(vq);
+	return needs_kick;
+}
+
+static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
+			      unsigned int id, void **ctx)
+{
+	struct vring_packed_desc *desc;
+	unsigned int i, j;
+
+	/* Clear data ptr. */
+	vq->desc_state[id].data = NULL;
+
+	i = head;
+
+	for (j = 0; j < vq->desc_state[id].num; j++) {
+		desc = &vq->vring_packed.desc[i];
+		vring_unmap_one_packed(vq, desc);
+		i++;
+		if (i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->vq.num_free += vq->desc_state[id].num;
+
+	if (vq->indirect) {
+		u32 len;
+
+		/* Free the indirect table, if any, now that it's unmapped. */
+		desc = vq->desc_state[id].indir_desc;
+		if (!desc)
+			goto out;
+
+		len = virtio32_to_cpu(vq->vq.vdev,
+				      vq->vring_packed.desc[head].len);
+
+		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
+			vring_unmap_one_packed(vq, &desc[j]);
+
+		kfree(desc);
+		vq->desc_state[id].indir_desc = NULL;
+	} else if (ctx) {
+		*ctx = vq->desc_state[id].indir_desc;
+	}
+
+out:
+	free_id_packed(vq, id);
 }
 
 static inline bool more_used_packed(const struct vring_virtqueue *vq)
 {
-	return false;
+	u16 last_used, flags;
+	bool avail, used;
+
+	if (vq->vq.num_free == vq->vring_packed.num)
+		return false;
+
+	last_used = vq->last_used_idx;
+	flags = virtio16_to_cpu(vq->vq.vdev,
+				vq->vring_packed.desc[last_used].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+
+	return avail == used;
 }
 
 static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 					  unsigned int *len,
 					  void **ctx)
 {
-	return NULL;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used, id;
+	void *ret;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used_packed(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used elements after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = vq->last_used_idx;
+	id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
+
+	if (unlikely(id >= vq->vring_packed.num)) {
+		BAD_RING(vq, "id %u out of range\n", id);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[id].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", id);
+		return NULL;
+	}
+
+	vq->last_used_idx += vq->desc_state[id].num;
+	if (vq->last_used_idx >= vq->vring_packed.num)
+		vq->last_used_idx -= vq->vring_packed.num;
+
+	/* detach_buf_packed clears data, so grab it now. */
+	ret = vq->desc_state[id].data;
+	detach_buf_packed(vq, last_used, id, ctx);
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
 }
 
 static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
 {
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
+		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
 }
 
 static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
 {
-	return 0;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+
+	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
+		virtio_wmb(vq->weak_barriers);
+		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+
+	END_USE(vq);
+	return vq->last_used_idx;
 }
 
 static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
 {
-	return false;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	bool avail, used;
+	u16 flags;
+
+	virtio_mb(vq->weak_barriers);
+	flags = virtio16_to_cpu(vq->vq.vdev,
+			vq->vring_packed.desc[last_used_idx].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+	return avail == used;
 }
 
 static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
 {
-	return false;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+
+	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
+		virtio_wmb(vq->weak_barriers);
+		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+
+	if (more_used_packed(vq)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
 }
 
 static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
 {
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 flags, head, id, i;
+	unsigned int len;
+	void *buf;
+
+	START_USE(vq);
+
+	/* Detach the used descriptors. */
+	if (more_used_packed(vq)) {
+		buf = virtqueue_get_buf_ctx_packed(_vq, &len, NULL);
+		END_USE(vq);
+		return buf;
+	}
+
+	/* Detach the available descriptors. */
+	for (i = vq->last_used_idx; i != vq->next_avail_idx;
+			i = (i + 1) % vq->vring_packed.num) {
+		flags = virtio16_to_cpu(vq->vq.vdev,
+				vq->vring_packed.desc[i].flags);
+		while (flags & VRING_DESC_F_NEXT) {
+			i = (i + 1) % vq->vring_packed.num;
+			flags = virtio16_to_cpu(vq->vq.vdev,
+					vq->vring_packed.desc[i].flags);
+		}
+		id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[i].id);
+		if (!vq->desc_state[id].data)
+			continue;
+
+		len = vq->desc_state[id].num - 1;
+		head = (i < len ? i + vq->vring_packed.num : i) - len;
+
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->desc_state[id].data;
+		detach_buf_packed(vq, head, id, NULL);
+		END_USE(vq);
+		return buf;
+	}
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
+
+	END_USE(vq);
 	return NULL;
 }
 
@@ -1198,6 +1666,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 		vq->next_avail_idx = 0;
 		vq->wrap_counter = 1;
 		vq->event_flags_shadow = 0;
+		idr_init(&vq->buffer_id);
 	} else {
 		vq->vring = vring.vring_split;
 		vq->avail_flags_shadow = 0;
@@ -1384,6 +1853,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 					      (void *)vq->vring.desc,
 				 vq->queue_dma_addr);
 	}
+	if (vq->packed)
+		idr_destroy(&vq->buffer_id);
 	list_del(&_vq->list);
 	kfree(vq);
 }
-- 
2.17.0

^ permalink raw reply related

* [RFC v4 2/5] virtio_ring: support creating packed ring
From: Tiwei Bie @ 2018-05-16  8:37 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie
In-Reply-To: <20180516083737.26504-1-tiwei.bie@intel.com>

This commit introduces support for creating packed rings.
All split-ring-specific functions are given a _split suffix.
Some necessary stubs for the packed ring are also added.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 764 +++++++++++++++++++++++------------
 include/linux/virtio_ring.h  |   8 +-
 2 files changed, 513 insertions(+), 259 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..62d7c407841a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -64,8 +64,8 @@ struct vring_desc_state {
 struct vring_virtqueue {
 	struct virtqueue vq;
 
-	/* Actual memory layout for this queue */
-	struct vring vring;
+	/* Is this a packed ring? */
+	bool packed;
 
 	/* Can we use weak barriers? */
 	bool weak_barriers;
@@ -79,19 +79,45 @@ struct vring_virtqueue {
 	/* Host publishes avail event idx */
 	bool event;
 
-	/* Head of free buffer list. */
-	unsigned int free_head;
 	/* Number we've added since last sync. */
 	unsigned int num_added;
 
 	/* Last used index we've seen. */
 	u16 last_used_idx;
 
-	/* Last written value to avail->flags */
-	u16 avail_flags_shadow;
+	union {
+		/* Available for split ring */
+		struct {
+			/* Actual memory layout for this queue. */
+			struct vring vring;
 
-	/* Last written value to avail->idx in guest byte order */
-	u16 avail_idx_shadow;
+			/* Head of free buffer list. */
+			unsigned int free_head;
+
+			/* Last written value to avail->flags */
+			u16 avail_flags_shadow;
+
+			/* Last written value to avail->idx in
+			 * guest byte order. */
+			u16 avail_idx_shadow;
+		};
+
+		/* Available for packed ring */
+		struct {
+			/* Actual memory layout for this queue. */
+			struct vring_packed vring_packed;
+
+			/* Driver ring wrap counter. */
+			u8 wrap_counter;
+
+			/* Index of the next avail descriptor. */
+			u16 next_avail_idx;
+
+			/* Last written value to driver->flags in
+			 * guest byte order. */
+			u16 event_flags_shadow;
+		};
+	};
 
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	bool (*notify)(struct virtqueue *vq);
@@ -201,8 +227,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
 			      cpu_addr, size, direction);
 }
 
-static void vring_unmap_one(const struct vring_virtqueue *vq,
-			    struct vring_desc *desc)
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return 0;
+
+	return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
+static void vring_unmap_one_split(const struct vring_virtqueue *vq,
+				  struct vring_desc *desc)
 {
 	u16 flags;
 
@@ -226,17 +261,9 @@ static void vring_unmap_one(const struct vring_virtqueue *vq,
 	}
 }
 
-static int vring_mapping_error(const struct vring_virtqueue *vq,
-			       dma_addr_t addr)
-{
-	if (!vring_use_dma_api(vq->vq.vdev))
-		return 0;
-
-	return dma_mapping_error(vring_dma_dev(vq), addr);
-}
-
-static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
-					 unsigned int total_sg, gfp_t gfp)
+static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+					       unsigned int total_sg,
+					       gfp_t gfp)
 {
 	struct vring_desc *desc;
 	unsigned int i;
@@ -257,14 +284,14 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
 	return desc;
 }
 
-static inline int virtqueue_add(struct virtqueue *_vq,
-				struct scatterlist *sgs[],
-				unsigned int total_sg,
-				unsigned int out_sgs,
-				unsigned int in_sgs,
-				void *data,
-				void *ctx,
-				gfp_t gfp)
+static inline int virtqueue_add_split(struct virtqueue *_vq,
+				      struct scatterlist *sgs[],
+				      unsigned int total_sg,
+				      unsigned int out_sgs,
+				      unsigned int in_sgs,
+				      void *data,
+				      void *ctx,
+				      gfp_t gfp)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
@@ -303,7 +330,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* If the host supports indirect descriptor tables, and we have multiple
 	 * buffers, then go indirect. FIXME: tune this threshold */
 	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
-		desc = alloc_indirect(_vq, total_sg, gfp);
+		desc = alloc_indirect_split(_vq, total_sg, gfp);
 	else {
 		desc = NULL;
 		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
@@ -424,7 +451,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	for (n = 0; n < total_sg; n++) {
 		if (i == err_idx)
 			break;
-		vring_unmap_one(vq, &desc[i]);
+		vring_unmap_one_split(vq, &desc[i]);
 		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
 	}
 
@@ -435,6 +462,355 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	return -EIO;
 }
 
+static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 new, old;
+	bool needs_kick;
+
+	START_USE(vq);
+	/* We need to expose available array entries before checking avail
+	 * event. */
+	virtio_mb(vq->weak_barriers);
+
+	old = vq->avail_idx_shadow - vq->num_added;
+	new = vq->avail_idx_shadow;
+	vq->num_added = 0;
+
+#ifdef DEBUG
+	if (vq->last_add_time_valid) {
+		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+					      vq->last_add_time)) > 100);
+	}
+	vq->last_add_time_valid = false;
+#endif
+
+	if (vq->event) {
+		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
+					      new, old);
+	} else {
+		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
+	}
+	END_USE(vq);
+	return needs_kick;
+}
+
+static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
+			     void **ctx)
+{
+	unsigned int i, j;
+	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
+
+	/* Clear data ptr. */
+	vq->desc_state[head].data = NULL;
+
+	/* Put back on free list: unmap first-level descriptors and find end */
+	i = head;
+
+	while (vq->vring.desc[i].flags & nextflag) {
+		vring_unmap_one_split(vq, &vq->vring.desc[i]);
+		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
+		vq->vq.num_free++;
+	}
+
+	vring_unmap_one_split(vq, &vq->vring.desc[i]);
+	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
+	vq->free_head = head;
+
+	/* Plus final descriptor */
+	vq->vq.num_free++;
+
+	if (vq->indirect) {
+		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+		u32 len;
+
+		/* Free the indirect table, if any, now that it's unmapped. */
+		if (!indir_desc)
+			return;
+
+		len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
+
+		BUG_ON(!(vq->vring.desc[head].flags &
+			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+		for (j = 0; j < len / sizeof(struct vring_desc); j++)
+			vring_unmap_one_split(vq, &indir_desc[j]);
+
+		kfree(indir_desc);
+		vq->desc_state[head].indir_desc = NULL;
+	} else if (ctx) {
+		*ctx = vq->desc_state[head].indir_desc;
+	}
+}
+
+static inline bool more_used_split(const struct vring_virtqueue *vq)
+{
+	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
+}
+
+static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
+					 unsigned int *len,
+					 void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used_split(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used array entries after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = (vq->last_used_idx & (vq->vring.num - 1));
+	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
+
+	if (unlikely(i >= vq->vring.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[i].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf_split clears data, so grab it now. */
+	ret = vq->desc_state[i].data;
+	detach_buf_split(vq, i, ctx);
+	vq->last_used_idx++;
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+		virtio_store_mb(vq->weak_barriers,
+				&vring_used_event(&vq->vring),
+				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
+}
+
+static void virtqueue_disable_cb_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
+		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+}
+
+static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used_idx;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple. */
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
+	END_USE(vq);
+	return last_used_idx;
+}
+
+static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	virtio_mb(vq->weak_barriers);
+	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+}
+
+static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+	/* TODO: tune this threshold */
+	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
+
+	virtio_store_mb(vq->weak_barriers,
+			&vring_used_event(&vq->vring),
+			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
+
+	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
+}
+
+static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i;
+	void *buf;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring.num; i++) {
+		if (!vq->desc_state[i].data)
+			continue;
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->desc_state[i].data;
+		detach_buf_split(vq, i, NULL);
+		vq->avail_idx_shadow--;
+		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
+		END_USE(vq);
+		return buf;
+	}
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->vring.num);
+
+	END_USE(vq);
+	return NULL;
+}
+
+/*
+ * The layout for the packed ring is a continuous chunk of memory
+ * which looks like this.
+ *
+ * struct vring_packed {
+ *	// The actual descriptors (16 bytes each)
+ *	struct vring_packed_desc desc[num];
+ *
+ *	// Padding to the next align boundary.
+ *	char pad[];
+ *
+ *	// Driver Event Suppression
+ *	struct vring_packed_desc_event driver;
+ *
+ *	// Device Event Suppression
+ *	struct vring_packed_desc_event device;
+ * };
+ */
+static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
+				     void *p, unsigned long align)
+{
+	vr->num = num;
+	vr->desc = p;
+	vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
+		* num + align - 1) & ~(align - 1));
+	vr->device = vr->driver + 1;
+}
+
+static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
+{
+	return ((sizeof(struct vring_packed_desc) * num + align - 1)
+		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
+}
+
+static inline int virtqueue_add_packed(struct virtqueue *_vq,
+				       struct scatterlist *sgs[],
+				       unsigned int total_sg,
+				       unsigned int out_sgs,
+				       unsigned int in_sgs,
+				       void *data,
+				       void *ctx,
+				       gfp_t gfp)
+{
+	return -EIO;
+}
+
+static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
+{
+	return false;
+}
+
+static inline bool more_used_packed(const struct vring_virtqueue *vq)
+{
+	return false;
+}
+
+static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
+					  unsigned int *len,
+					  void **ctx)
+{
+	return NULL;
+}
+
+static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
+{
+}
+
+static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
+{
+	return 0;
+}
+
+static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
+{
+	return false;
+}
+
+static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
+{
+	return false;
+}
+
+static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
+{
+	return NULL;
+}
+
+static inline int virtqueue_add(struct virtqueue *_vq,
+				struct scatterlist *sgs[],
+				unsigned int total_sg,
+				unsigned int out_sgs,
+				unsigned int in_sgs,
+				void *data,
+				void *ctx,
+				gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
+						 in_sgs, data, ctx, gfp) :
+			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
+						in_sgs, data, ctx, gfp);
+}
+
 /**
  * virtqueue_add_sgs - expose buffers to other end
  * @vq: the struct virtqueue we're talking about.
@@ -551,34 +927,9 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
 bool virtqueue_kick_prepare(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 new, old;
-	bool needs_kick;
 
-	START_USE(vq);
-	/* We need to expose available array entries before checking avail
-	 * event. */
-	virtio_mb(vq->weak_barriers);
-
-	old = vq->avail_idx_shadow - vq->num_added;
-	new = vq->avail_idx_shadow;
-	vq->num_added = 0;
-
-#ifdef DEBUG
-	if (vq->last_add_time_valid) {
-		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
-					      vq->last_add_time)) > 100);
-	}
-	vq->last_add_time_valid = false;
-#endif
-
-	if (vq->event) {
-		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
-					      new, old);
-	} else {
-		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
-	}
-	END_USE(vq);
-	return needs_kick;
+	return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
+			    virtqueue_kick_prepare_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
 
@@ -626,58 +977,9 @@ bool virtqueue_kick(struct virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick);
 
-static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
-		       void **ctx)
-{
-	unsigned int i, j;
-	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
-
-	/* Clear data ptr. */
-	vq->desc_state[head].data = NULL;
-
-	/* Put back on free list: unmap first-level descriptors and find end */
-	i = head;
-
-	while (vq->vring.desc[i].flags & nextflag) {
-		vring_unmap_one(vq, &vq->vring.desc[i]);
-		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
-		vq->vq.num_free++;
-	}
-
-	vring_unmap_one(vq, &vq->vring.desc[i]);
-	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
-	vq->free_head = head;
-
-	/* Plus final descriptor */
-	vq->vq.num_free++;
-
-	if (vq->indirect) {
-		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
-		u32 len;
-
-		/* Free the indirect table, if any, now that it's unmapped. */
-		if (!indir_desc)
-			return;
-
-		len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
-
-		BUG_ON(!(vq->vring.desc[head].flags &
-			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
-		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
-
-		for (j = 0; j < len / sizeof(struct vring_desc); j++)
-			vring_unmap_one(vq, &indir_desc[j]);
-
-		kfree(indir_desc);
-		vq->desc_state[head].indir_desc = NULL;
-	} else if (ctx) {
-		*ctx = vq->desc_state[head].indir_desc;
-	}
-}
-
 static inline bool more_used(const struct vring_virtqueue *vq)
 {
-	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
+	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
 }
 
 /**
@@ -700,57 +1002,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
 			    void **ctx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	void *ret;
-	unsigned int i;
-	u16 last_used;
 
-	START_USE(vq);
-
-	if (unlikely(vq->broken)) {
-		END_USE(vq);
-		return NULL;
-	}
-
-	if (!more_used(vq)) {
-		pr_debug("No more buffers in queue\n");
-		END_USE(vq);
-		return NULL;
-	}
-
-	/* Only get used array entries after they have been exposed by host. */
-	virtio_rmb(vq->weak_barriers);
-
-	last_used = (vq->last_used_idx & (vq->vring.num - 1));
-	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
-	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
-
-	if (unlikely(i >= vq->vring.num)) {
-		BAD_RING(vq, "id %u out of range\n", i);
-		return NULL;
-	}
-	if (unlikely(!vq->desc_state[i].data)) {
-		BAD_RING(vq, "id %u is not a head!\n", i);
-		return NULL;
-	}
-
-	/* detach_buf clears data, so grab it now. */
-	ret = vq->desc_state[i].data;
-	detach_buf(vq, i, ctx);
-	vq->last_used_idx++;
-	/* If we expect an interrupt for the next entry, tell host
-	 * by writing event index and flush out the write before
-	 * the read in the next get_buf call. */
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
-		virtio_store_mb(vq->weak_barriers,
-				&vring_used_event(&vq->vring),
-				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
-
-#ifdef DEBUG
-	vq->last_add_time_valid = false;
-#endif
-
-	END_USE(vq);
-	return ret;
+	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
+			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
 
@@ -772,12 +1026,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-
+	if (vq->packed)
+		virtqueue_disable_cb_packed(_vq);
+	else
+		virtqueue_disable_cb_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 
@@ -796,23 +1048,9 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 last_used_idx;
 
-	START_USE(vq);
-
-	/* We optimistically turn back on interrupts, then check if there was
-	 * more to do. */
-	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
-	 * either clear the flags bit or point the event index at the next
-	 * entry. Always do both to keep code simple. */
-	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
-		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
-	END_USE(vq);
-	return last_used_idx;
+	return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
+			    virtqueue_enable_cb_prepare_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
 
@@ -829,8 +1067,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	virtio_mb(vq->weak_barriers);
-	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
+			    virtqueue_poll_split(_vq, last_used_idx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_poll);
 
@@ -868,34 +1106,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
 bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 bufs;
 
-	START_USE(vq);
-
-	/* We optimistically turn back on interrupts, then check if there was
-	 * more to do. */
-	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
-	 * either clear the flags bit or point the event index at the next
-	 * entry. Always update the event index to keep code simple. */
-	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
-		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-	/* TODO: tune this threshold */
-	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
-
-	virtio_store_mb(vq->weak_barriers,
-			&vring_used_event(&vq->vring),
-			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
-
-	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
-		END_USE(vq);
-		return false;
-	}
-
-	END_USE(vq);
-	return true;
+	return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
+			    virtqueue_enable_cb_delayed_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 
@@ -910,27 +1123,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	unsigned int i;
-	void *buf;
 
-	START_USE(vq);
-
-	for (i = 0; i < vq->vring.num; i++) {
-		if (!vq->desc_state[i].data)
-			continue;
-		/* detach_buf clears data, so grab it now. */
-		buf = vq->desc_state[i].data;
-		detach_buf(vq, i, NULL);
-		vq->avail_idx_shadow--;
-		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
-		END_USE(vq);
-		return buf;
-	}
-	/* That should have freed everything. */
-	BUG_ON(vq->vq.num_free != vq->vring.num);
-
-	END_USE(vq);
-	return NULL;
+	return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
+			    virtqueue_detach_unused_buf_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
 
@@ -955,7 +1150,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool context,
@@ -963,19 +1159,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 					void (*callback)(struct virtqueue *),
 					const char *name)
 {
-	unsigned int i;
+	unsigned int num, i;
 	struct vring_virtqueue *vq;
 
-	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
+	num = packed ? vring.vring_packed.num : vring.vring_split.num;
+
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
 		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
-	vq->vring = vring;
 	vq->vq.callback = callback;
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
-	vq->vq.num_free = vring.num;
+	vq->vq.num_free = num;
 	vq->vq.index = index;
 	vq->we_own_ring = false;
 	vq->queue_dma_addr = 0;
@@ -984,9 +1181,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->weak_barriers = weak_barriers;
 	vq->broken = false;
 	vq->last_used_idx = 0;
-	vq->avail_flags_shadow = 0;
-	vq->avail_idx_shadow = 0;
 	vq->num_added = 0;
+	vq->packed = packed;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
@@ -997,18 +1193,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
+	if (vq->packed) {
+		vq->vring_packed = vring.vring_packed;
+		vq->next_avail_idx = 0;
+		vq->wrap_counter = 1;
+		vq->event_flags_shadow = 0;
+	} else {
+		vq->vring = vring.vring_split;
+		vq->avail_flags_shadow = 0;
+		vq->avail_idx_shadow = 0;
+
+		/* Put everything in free lists. */
+		vq->free_head = 0;
+		for (i = 0; i < num-1; i++)
+			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
+	}
+
 	/* No callback?  Tell other side not to bother us. */
 	if (!callback) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
+		if (packed) {
+			vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+			vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
+						vq->event_flags_shadow);
+		} else {
+			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+			if (!vq->event)
+				vq->vring.avail->flags = cpu_to_virtio16(vdev,
+						vq->avail_flags_shadow);
+		}
 	}
 
-	/* Put everything in free lists. */
-	vq->free_head = 0;
-	for (i = 0; i < vring.num-1; i++)
-		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
-	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
@@ -1056,6 +1271,12 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
 	}
 }
 
+static inline int
+__vring_size(unsigned int num, unsigned long align, bool packed)
+{
+	return packed ? vring_size_packed(num, align) : vring_size(num, align);
+}
+
 struct virtqueue *vring_create_virtqueue(
 	unsigned int index,
 	unsigned int num,
@@ -1072,7 +1293,8 @@ struct virtqueue *vring_create_virtqueue(
 	void *queue = NULL;
 	dma_addr_t dma_addr;
 	size_t queue_size_in_bytes;
-	struct vring vring;
+	union vring_union vring;
+	bool packed;
 
 	/* We assume num is a power of 2. */
 	if (num & (num - 1)) {
@@ -1080,9 +1302,13 @@ struct virtqueue *vring_create_virtqueue(
 		return NULL;
 	}
 
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+
 	/* TODO: allocate each queue chunk individually */
-	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
+			num /= 2) {
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr,
 					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
 		if (queue)
@@ -1094,17 +1320,21 @@ struct virtqueue *vring_create_virtqueue(
 
 	if (!queue) {
 		/* Try to get a single page. You are my only hope! */
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
 	}
 	if (!queue)
 		return NULL;
 
-	queue_size_in_bytes = vring_size(num, vring_align);
-	vring_init(&vring, num, queue, vring_align);
+	queue_size_in_bytes = __vring_size(num, vring_align, packed);
+	if (packed)
+		vring_init_packed(&vring.vring_packed, num, queue, vring_align);
+	else
+		vring_init(&vring.vring_split, num, queue, vring_align);
 
-	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				   notify, callback, name);
+	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				   context, notify, callback, name);
 	if (!vq) {
 		vring_free_queue(vdev, queue_size_in_bytes, queue,
 				 dma_addr);
@@ -1130,10 +1360,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      void (*callback)(struct virtqueue *vq),
 				      const char *name)
 {
-	struct vring vring;
-	vring_init(&vring, num, pages, vring_align);
-	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				     notify, callback, name);
+	union vring_union vring;
+	bool packed;
+
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+	if (packed)
+		vring_init_packed(&vring.vring_packed, num, pages, vring_align);
+	else
+		vring_init(&vring.vring_split, num, pages, vring_align);
+
+	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				     context, notify, callback, name);
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
 
@@ -1143,7 +1380,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 
 	if (vq->we_own_ring) {
 		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
-				 vq->vring.desc, vq->queue_dma_addr);
+				 vq->packed ? (void *)vq->vring_packed.desc :
+					      (void *)vq->vring.desc,
+				 vq->queue_dma_addr);
 	}
 	list_del(&_vq->list);
 	kfree(vq);
@@ -1185,7 +1424,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	return vq->vring.num;
+	return vq->packed ? vq->vring_packed.num : vq->vring.num;
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
@@ -1228,6 +1467,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
 
 	BUG_ON(!vq->we_own_ring);
 
+	if (vq->packed)
+		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
+				(char *)vq->vring_packed.desc);
+
 	return vq->queue_dma_addr +
 		((char *)vq->vring.avail - (char *)vq->vring.desc);
 }
@@ -1239,11 +1482,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
 
 	BUG_ON(!vq->we_own_ring);
 
+	if (vq->packed)
+		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
+				(char *)vq->vring_packed.desc);
+
 	return vq->queue_dma_addr +
 		((char *)vq->vring.used - (char *)vq->vring.desc);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
 
+/* Only available for split ring */
 const struct vring *virtqueue_get_vring(struct virtqueue *vq)
 {
 	return &to_vvq(vq)->vring;
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index bbf32524ab27..a0075894ad16 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
 struct virtio_device;
 struct virtqueue;
 
+union vring_union {
+	struct vring vring_split;
+	struct vring_packed vring_packed;
+};
+
 /*
  * Creates a virtqueue and allocates the descriptor ring.  If
  * may_reduce_num is set, then this may allocate a smaller ring than
@@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 
 /* Creates a virtqueue with a custom layout. */
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool ctx,
-- 
2.17.0

^ permalink raw reply related

* [RFC v4 1/5] virtio: add packed ring definitions
From: Tiwei Bie @ 2018-05-16  8:37 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180516083737.26504-1-tiwei.bie@intel.com>

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 include/uapi/linux/virtio_config.h | 12 +++++++++-
 include/uapi/linux/virtio_ring.h   | 36 ++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e2096291f..a6e392325e3a 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -49,7 +49,7 @@
  * transport being used (eg. virtio_ring), the rest are per-device feature
  * bits. */
 #define VIRTIO_TRANSPORT_F_START	28
-#define VIRTIO_TRANSPORT_F_END		34
+#define VIRTIO_TRANSPORT_F_END		36
 
 #ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
@@ -71,4 +71,14 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
+
+/* This feature indicates support for the packed virtqueue layout. */
+#define VIRTIO_F_RING_PACKED		34
+
+/*
+ * This feature indicates that all buffers are used by the device
+ * in the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5faa989b..3932cb80c347 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -44,6 +44,9 @@
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
 
+#define VRING_DESC_F_AVAIL(b)	((b) << 7)
+#define VRING_DESC_F_USED(b)	((b) << 15)
+
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
  * will still kick if it's out of buffers. */
@@ -53,6 +56,10 @@
  * optimization.  */
 #define VRING_AVAIL_F_NO_INTERRUPT	1
 
+#define VRING_EVENT_F_ENABLE	0x0
+#define VRING_EVENT_F_DISABLE	0x1
+#define VRING_EVENT_F_DESC	0x2
+
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC	28
 
@@ -171,4 +178,33 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
 	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
 }
 
+struct vring_packed_desc_event {
+	/* __virtio16 off  : 15; // Descriptor Event Offset
+	 * __virtio16 wrap : 1;  // Descriptor Event Wrap Counter */
+	__virtio16 off_wrap;
+	/* __virtio16 flags : 2; // Descriptor Event Flags */
+	__virtio16 flags;
+};
+
+struct vring_packed_desc {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
+struct vring_packed {
+	unsigned int num;
+
+	struct vring_packed_desc *desc;
+
+	struct vring_packed_desc_event *driver;
+
+	struct vring_packed_desc_event *device;
+};
+
 #endif /* _UAPI_LINUX_VIRTIO_RING_H */
-- 
2.17.0

^ permalink raw reply related

* [RFC v4 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-05-16  8:37 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie

Hello everyone,

This RFC implements packed ring support in virtio driver.

Some simple functional tests have been done with Jason's
packed ring implementation in vhost:

https://lkml.org/lkml/2018/4/23/12

Both ping and netperf worked as expected (with EVENT_IDX
disabled).

TODO:
- Refinements (for code and commit log);
- More tests;
- Bug fixes;

RFC v3 -> RFC v4:
- Make ID allocation support out-of-order (Jason);
- Various fixes for EVENT_IDX support;

RFC v2 -> RFC v3:
- Split into small patches (Jason);
- Add helper virtqueue_use_indirect() (Jason);
- Just set id for the last descriptor of a list (Jason);
- Calculate the prev in virtqueue_add_packed() (Jason);
- Fix/improve desc suppression code (Jason/MST);
- Refine the code layout for XXX_split/packed and wrappers (MST);
- Fix the comments and API in uapi (MST);
- Remove the BUG_ON() for indirect (Jason);
- Some other refinements and bug fixes;

RFC v1 -> RFC v2:
- Add indirect descriptor support - compile test only;
- Add event suppression support - compile test only;
- Move vring_packed_init() out of uapi (Jason, MST);
- Merge two loops into one in virtqueue_add_packed() (Jason);
- Split vring_unmap_one() for packed ring and split ring (Jason);
- Avoid using '%' operator (Jason);
- Rename free_head -> next_avail_idx (Jason);
- Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
- Some other refinements and bug fixes;

Thanks!

Tiwei Bie (5):
  virtio: add packed ring definitions
  virtio_ring: support creating packed ring
  virtio_ring: add packed ring support
  virtio_ring: add event idx support in packed ring
  virtio_ring: enable packed ring

 drivers/virtio/virtio_ring.c       | 1338 ++++++++++++++++++++++------
 include/linux/virtio_ring.h        |    8 +-
 include/uapi/linux/virtio_config.h |   12 +-
 include/uapi/linux/virtio_ring.h   |   36 +
 4 files changed, 1116 insertions(+), 278 deletions(-)

-- 
2.17.0

^ permalink raw reply

* Re: Hangs in r8152 connected to power management in kernels at least up v4.17-rc4
From: Oliver Neukum @ 2018-05-16  8:26 UTC (permalink / raw)
  To: Hayes Wang; +Cc: netdev@vger.kernel.org
In-Reply-To: <0835B3720019904CB8F7AA43166CEEB2D2E47ABE@RTITMBSV06.realtek.com.tw>

Am Mittwoch, den 16.05.2018, 03:37 +0000 schrieb Hayes Wang:
> Oliver Neukum [mailto:oneukum@suse.com]
> > 
> > Hi,
> > 
> > I got reports about hangs with this trace:
> > 
> > May 13 01:36:55 neroon kernel: INFO: task kworker/0:0:4 blocked for more
> > than 60 seconds.
> > May 13 01:36:55 neroon kernel:       Tainted: G     U
> > 4.17.0-rc4-1.g8257a00-vanilla #1
> > May 13 01:36:55 neroon kernel: "echo 0 >
> > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > May 13 01:36:55 neroon kernel: kworker/0:0     D    0     4      2
> > 0x80000000
> > May 13 01:36:55 neroon kernel: Workqueue: events rtl_work_func_t [r8152]
> > May 13 01:36:55 neroon kernel: Call Trace:
> > May 13 01:36:55 neroon kernel:  ? __schedule+0x289/0x880
> > May 13 01:36:55 neroon kernel:  schedule+0x2f/0x90
> > May 13 01:36:55 neroon kernel:  rpm_resume+0xf9/0x7a0
> > May 13 01:36:55 neroon kernel:  ? wait_woken+0x80/0x80
> > May 13 01:36:55 neroon kernel:  rpm_resume+0x547/0x7a0
> > May 13 01:36:55 neroon kernel:  ? __switch_to_asm+0x40/0x70
> > May 13 01:36:55 neroon kernel:  ? __switch_to_asm+0x34/0x70
> > May 13 01:36:55 neroon kernel:  ? __switch_to_asm+0x40/0x70
> > May 13 01:36:55 neroon kernel:  ? __switch_to_asm+0x34/0x70
> > May 13 01:36:55 neroon kernel:  ? __switch_to_asm+0x40/0x70
> > May 13 01:36:55 neroon kernel:  __pm_runtime_resume+0x3a/0x50
> > May 13 01:36:55 neroon kernel:  usb_autopm_get_interface+0x1d/0x50 [usbcore]
> 
> Would usb_autopm_get_interface() take a long time?
> The driver would wake the device if it has suspended.
> I have no idea about how usb_autopm_get_interface() works, so I don't know how to help.

Hi,

it basically calls r8152_resume() and makes a control request to the
hub. I think we are spinning in rtl8152_runtime_resume(), but where?
It has a lot of NAPI stuff. Any suggestions on how to instrument or
trace this?

	Regards
		Oliver

^ permalink raw reply

* Re: [PATCH 10/14] net: sched: extend act API for lockless actions
From: Vlad Buslov @ 2018-05-16  8:16 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <20180516075000.GC1972@nanopsycho>


On Wed 16 May 2018 at 07:50, Jiri Pirko <jiri@resnulli.us> wrote:
> Mon, May 14, 2018 at 04:27:11PM CEST, vladbu@mellanox.com wrote:
>>Implement new action API function to atomically delete action with
>>specified index and to atomically insert unique action. These functions are
>>required to implement init and delete functions for specific actions that
>>do not rely on rtnl lock.
>>
>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>---
>> include/net/act_api.h |  2 ++
>> net/sched/act_api.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 47 insertions(+)
>>
>>diff --git a/include/net/act_api.h b/include/net/act_api.h
>>index a8c8570..bce0cf1 100644
>>--- a/include/net/act_api.h
>>+++ b/include/net/act_api.h
>>@@ -153,7 +153,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>> 		   struct tc_action **a, const struct tc_action_ops *ops,
>> 		   int bind, bool cpustats);
>> void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
>>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a);
>> 
>>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index);
>> int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
>> 
>> static inline int tcf_idr_release(struct tc_action *a, bool bind)
>>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>>index 2772276e..a5193dc 100644
>>--- a/net/sched/act_api.c
>>+++ b/net/sched/act_api.c
>>@@ -330,6 +330,41 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
>> }
>> EXPORT_SYMBOL(tcf_idr_check);
>> 
>>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index)
>>+{
>>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>>+	struct tc_action *p;
>>+	int ret = 0;
>>+
>>+	spin_lock_bh(&idrinfo->lock);
>
> Why "_bh" is needed here?

Original idr remove function used _bh version so I used it here as well.
As I already replied to your previous question about idrinfo lock usage,
I don't see any particular reason for locking with _bh at this point.
I've contacted the author (Chris Mi) and he said that he just preserved
locking the same way as it was before he changed hash table to idr for
action lookup.

Do you want me to do a standalone patch that cleans up idrinfo locking?

>
>
>>+	p = idr_find(&idrinfo->action_idr, index);
>>+	if (!p) {
>>+		spin_unlock(&idrinfo->lock);
>>+		return -ENOENT;
>>+	}
>>+
>>+	if (!atomic_read(&p->tcfa_bindcnt)) {
>>+		if (refcount_dec_and_test(&p->tcfa_refcnt)) {
>>+			struct module *owner = p->ops->owner;
>>+
>>+			WARN_ON(p != idr_remove(&idrinfo->action_idr,
>>+						p->tcfa_index));
>>+			spin_unlock_bh(&idrinfo->lock);
>>+
>>+			tcf_action_cleanup(p);
>>+			module_put(owner);
>>+			return 0;
>>+		}
>>+		ret = 0;
>>+	} else {
>>+		ret = -EPERM;
>
> I wonder if "-EPERM" is the best error code for this...

This is what original code returned so I decided to preserve
compatibility.

>
>
>>+	}
>>+
>>+	spin_unlock_bh(&idrinfo->lock);
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL(tcf_idr_find_delete);
>>+
>> int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>> 		   struct tc_action **a, const struct tc_action_ops *ops,
>> 		   int bind, bool cpustats)
>>@@ -407,6 +442,16 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
>> }
>> EXPORT_SYMBOL(tcf_idr_insert);
>> 
>>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a)
>>+{
>>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>>+
>>+	spin_lock_bh(&idrinfo->lock);
>>+	WARN_ON(idr_replace(&idrinfo->action_idr, a, a->tcfa_index));
>
> Under which condition this WARN_ON is hit?

When idr_replace() returns a non-NULL pointer, which means that somehow a
concurrent insertion of an action with the same index has happened and we
are leaking memory.

By the way I'm still not sure if having this insert unique function is
warranted or I should just add WARN to regular idr insert. What is your
opinion on this?

>
>
>>+	spin_unlock_bh(&idrinfo->lock);
>>+}
>>+EXPORT_SYMBOL(tcf_idr_insert_unique);
>>+
>> void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
>> 			 struct tcf_idrinfo *idrinfo)
>> {
>>-- 
>>2.7.5
>>

^ permalink raw reply

* INFO: rcu detected stall in sctp_packet_transmit
From: syzbot @ 2018-05-16  8:11 UTC (permalink / raw)
  To: davem, linux-kernel, linux-sctp, marcelo.leitner, netdev, nhorman,
	syzkaller-bugs, vyasevich

Hello,

syzbot found the following crash on:

HEAD commit:    961423f9fcbc Merge branch 'sctp-Introduce-sctp_flush_ctx'
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=1366aea7800000
kernel config:  https://syzkaller.appspot.com/x/.config?x=51fb0a6913f757db
dashboard link: https://syzkaller.appspot.com/bug?extid=ff0b569fb5111dcd1a36
compiler:       gcc (GCC) 8.0.1 20180413 (experimental)

Unfortunately, I don't have any reproducer for this crash yet.

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+ff0b569fb5111dcd1a36@syzkaller.appspotmail.com

INFO: rcu_sched self-detected stall on CPU
	0-....: (1 GPs behind) idle=dae/1/4611686018427387908 softirq=93090/93091  
fqs=30902
	 (t=125000 jiffies g=51107 c=51106 q=972)
NMI backtrace for cpu 0
CPU: 0 PID: 24668 Comm: syz-executor6 Not tainted 4.17.0-rc4+ #44
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
  trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
  rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
  print_cpu_stall kernel/rcu/tree.c:1525 [inline]
  check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
  __rcu_pending kernel/rcu/tree.c:3356 [inline]
  rcu_pending kernel/rcu/tree.c:3401 [inline]
  rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
  update_process_times+0x2d/0x70 kernel/time/timer.c:1636
  tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:164
  tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1274
  __run_hrtimer kernel/time/hrtimer.c:1398 [inline]
  __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1460
  hrtimer_interrupt+0x2f3/0x750 kernel/time/hrtimer.c:1518
  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
  smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
RIP: 0010:sctp_v6_xmit+0x259/0x6b0 net/sctp/ipv6.c:219
RSP: 0018:ffff8801dae068e8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
RAX: 0000000000000007 RBX: ffff8801bb7ec800 RCX: ffffffff86f1b345
RDX: 0000000000000000 RSI: ffffffff86f1b381 RDI: ffff8801b73d97c4
RBP: ffff8801dae06988 R08: ffff88019505c300 R09: ffffed003b5c46c2
R10: ffffed003b5c46c2 R11: ffff8801dae23613 R12: ffff88011fd57300
R13: ffff8801bb7ecec8 R14: 0000000000000029 R15: 0000000000000002
  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:642
  sctp_outq_flush_transports net/sctp/outqueue.c:1164 [inline]
  sctp_outq_flush+0x5f5/0x3430 net/sctp/outqueue.c:1212
  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x1d1/0x200 kernel/softirq.c:405
  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
  </IRQ>
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160  
[inline]
RIP: 0010:_raw_spin_unlock_irqrestore+0xa1/0xc0  
kernel/locking/spinlock.c:184
RSP: 0018:ffff880196227328 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
RAX: dffffc0000000000 RBX: 0000000000000286 RCX: 0000000000000000
RDX: 1ffffffff11a316d RSI: 0000000000000001 RDI: 0000000000000286
RBP: ffff880196227338 R08: ffffed003b5c4b81 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801dae25c00
R13: ffff8801dae25c80 R14: ffff880196227758 R15: ffff8801dae25c00
  unlock_hrtimer_base kernel/time/hrtimer.c:887 [inline]
  hrtimer_start_range_ns+0x692/0xd10 kernel/time/hrtimer.c:1118
  hrtimer_start_expires include/linux/hrtimer.h:412 [inline]
  futex_wait_queue_me+0x304/0x820 kernel/futex.c:2517
  futex_wait+0x450/0x9f0 kernel/futex.c:2645
  do_futex+0x336/0x27d0 kernel/futex.c:3527
  __do_sys_futex kernel/futex.c:3587 [inline]
  __se_sys_futex kernel/futex.c:3555 [inline]
  __x64_sys_futex+0x46a/0x680 kernel/futex.c:3555
  do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x455a09
RSP: 002b:0000000000a3e938 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: ffffffffffffffda RBX: 0000000000045a9b RCX: 0000000000455a09
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072becc
RBP: 000000000072becc R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000a3e940 R11: 0000000000000246 R12: 0000000000000019
R13: 0000000000000002 R14: 000000000072bea0 R15: 0000000000045a8f


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.

^ permalink raw reply

* Re: [PATCH net-next 2/2] pfifo_fast: drop unneeded additional lock on dequeue
From: Paolo Abeni @ 2018-05-16  7:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, David S. Miller, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	John Fastabend
In-Reply-To: <20180515221013-mutt-send-email-mst@kernel.org>

On Tue, 2018-05-15 at 23:17 +0300, Michael S. Tsirkin wrote:
> On Tue, May 15, 2018 at 04:24:37PM +0200, Paolo Abeni wrote:
> > After the previous patch, for NOLOCK qdiscs, q->seqlock is
> > always held when the dequeue() is invoked, we can drop
> > any additional locking to protect such operation.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> >  include/linux/skb_array.h | 5 +++++
> >  net/sched/sch_generic.c   | 4 ++--
> >  2 files changed, 7 insertions(+), 2 deletions(-)
> 
> Is the seqlock taken during qdisc_change_tx_queue_len?
> We need to prevent that racing with dequeue.

Thanks for the heads-up! I missed that code-path.

I'll add the lock in qdisc_change_tx_queue_len() in v2.

Thank you,

Paolo

^ permalink raw reply

* Re: linux-next: BUG: KASAN: use-after-free in tun_chr_close
From: Jason Wang @ 2018-05-16  7:52 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: netdev
In-Reply-To: <20180516074019.GA5601@outlook.office365.com>



On 2018年05月16日 15:40, Andrei Vagin wrote:
> On Wed, May 16, 2018 at 03:32:59PM +0800, Jason Wang wrote:
>> On 2018年05月16日 15:12, Andrei Vagin wrote:
>>> Hi Jason,
>>>
>>> I think the problem is in "tun: hold a tun socket during ptr_ring_cleanup".
>>>
>>> Pls take a look at the attached patch.
>> Yes.
>>
>> It looks to me it's not necessary to take extra refcnt during release, we
>> can just do the cleanup at __tun_detach().
>>
>> Could you help to test the attached patch?
> I've run my test on the kernel with this patch. It fixes the problem.
> The patch looks correct for me.
>
> Acked-by: Andrei Vagin<avagin@virtuozzo.com>
>

Cool, thanks a lot!

Let me post a formal patch.

^ permalink raw reply

* Re: [PATCH 09/14] net: sched: don't release reference on action overwrite
From: Jiri Pirko @ 2018-05-16  7:50 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <vbfd0xw11pn.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

Wed, May 16, 2018 at 09:47:32AM CEST, vladbu@mellanox.com wrote:
>
>On Wed 16 May 2018 at 07:43, Jiri Pirko <jiri@resnulli.us> wrote:
>> Mon, May 14, 2018 at 04:27:10PM CEST, vladbu@mellanox.com wrote:
>>>Return from action init function with reference to action taken,
>>>even when overwriting existing action.
>>>
>>>Action init API initializes its fourth argument (pointer to pointer to
>>>tc action) to either existing action with same index or newly created
>>>action. In case of existing index(and bind argument is zero), init
>>>function returns without incrementing action reference counter. Caller
>>>of action init then proceeds working with action without actually
>>>holding reference to it. This means that action could be deleted
>>>concurrently. To prevent such scenario this patch changes action init
>>
>> Be imperative to the codebase in the patch description.
>>
>>
>>>behavior to always take reference to action before returning
>>>successfully.
>>
>> Where's the balance? Who does the release instead? I'm probably missing
>> something.
>
>I've resplit these patches for V2 to always do take/release in same
>patch.

Good. Thanks.

>
>>
>>>
>>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>>---
>>> net/sched/act_bpf.c        |  8 ++++----
>>> net/sched/act_connmark.c   |  5 +++--
>>> net/sched/act_csum.c       |  8 ++++----
>>> net/sched/act_gact.c       |  5 +++--
>>> net/sched/act_ife.c        | 12 +++++-------
>>> net/sched/act_ipt.c        |  5 +++--
>>> net/sched/act_mirred.c     |  5 ++---
>>> net/sched/act_nat.c        |  5 +++--
>>> net/sched/act_pedit.c      |  5 +++--
>>> net/sched/act_police.c     |  8 +++-----
>>> net/sched/act_sample.c     |  8 +++-----
>>> net/sched/act_simple.c     |  5 +++--
>>> net/sched/act_skbedit.c    |  5 +++--
>>> net/sched/act_skbmod.c     |  8 +++-----
>>> net/sched/act_tunnel_key.c |  8 +++-----
>>> net/sched/act_vlan.c       |  8 +++-----
>>> 16 files changed, 51 insertions(+), 57 deletions(-)
>>>
>>>diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
>>>index 5d95c43..5554bf7 100644
>>>--- a/net/sched/act_bpf.c
>>>+++ b/net/sched/act_bpf.c
>>>@@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
>>> 		if (bind)
>>> 			return 0;
>>> 
>>>-		tcf_idr_release(*act, bind);
>>>-		if (!replace)
>>>+		if (!replace) {
>>>+			tcf_idr_release(*act, bind);
>>> 			return -EEXIST;
>>>+		}
>>> 	}
>>> 
>>> 	is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
>>
>> [...]
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox