From: Srinivas Eeda <srinivas.eeda@oracle.com>
To: linux-fsdevel@vger.kernel.org, fuse-devel@lists.sourceforge.net
Cc: mszeredi@suse.cz, srinivas.eeda@oracle.com
Subject: [PATCH 2/3] fuse: add fuse numa node struct
Date: Mon, 29 Apr 2013 23:17:32 -0700
Message-ID: <1367302653-10544-3-git-send-email-srinivas.eeda@oracle.com>
In-Reply-To: <1367302653-10544-1-git-send-email-srinivas.eeda@oracle.com>
This patch introduces a new structure, fuse_numa_node, which groups several
fields split out of the fuse_conn structure. One instance of fuse_numa_node is
created for each NUMA node present on the system. This reduces contention on a
single spinlock, which adds latency when it is accessed across NUMA regions.
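For illustration, here is a minimal sketch of how a request path could select
the per-node state (this fuse_get_numa_node() helper is hypothetical and not
part of this patch; the numa_on, nr_nodes and nn[] fields are the ones
introduced below):

	#include <linux/topology.h>	/* numa_node_id() */
	#include "fuse_i.h"		/* struct fuse_conn, struct fuse_numa_node */

	/*
	 * Hypothetical helper: return the fuse_numa_node local to the
	 * calling CPU, so that its spinlock and request lists are only
	 * touched from the same NUMA node.  With numa disabled there is
	 * a single node at index 0 (see fuse_conn_init below).
	 */
	static inline struct fuse_numa_node *fuse_get_numa_node(struct fuse_conn *fc)
	{
		int node = fc->numa_on ? numa_node_id() : 0;

		return fc->nn[node];
	}

Each request also records the node it was allocated from in the new
req->numaid field, so that later patches in the series can queue it on the
matching per-node lists.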
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
fs/fuse/control.c | 25 ++++++++----
fs/fuse/cuse.c | 11 ++++-
fs/fuse/fuse_i.h | 118 +++++++++++++++++++++++++++++++----------------------
fs/fuse/inode.c | 114 +++++++++++++++++++++++++++++++++++++++------------
4 files changed, 182 insertions(+), 86 deletions(-)
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855..9a9ca5c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -48,12 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
size_t size;
if (!*ppos) {
- long value;
+ long i, value;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- value = atomic_read(&fc->num_waiting);
+ for (i = 0, value = 0; i < fc->nr_nodes; i++)
+ value += atomic_read(&fc->nn[i]->num_waiting);
file->private_data = (void *)value;
fuse_conn_put(fc);
}
@@ -101,13 +102,14 @@ static ssize_t fuse_conn_max_background_read(struct file *file,
loff_t *ppos)
{
struct fuse_conn *fc;
- unsigned val;
+ unsigned i, val;
fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- val = fc->max_background;
+ for (i = 0, val = 0; i < fc->nr_nodes; i++)
+ val += fc->nn[i]->max_background;
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -123,9 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
max_user_bgreq);
if (ret > 0) {
+ int i;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fc->max_background = val;
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->max_background = val;
fuse_conn_put(fc);
}
}
@@ -138,13 +143,14 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
loff_t *ppos)
{
struct fuse_conn *fc;
- unsigned val;
+ unsigned i, val;
fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- val = fc->congestion_threshold;
+ for (i = 0, val = 0; i < fc->nr_nodes; i++)
+ val += fc->nn[i]->congestion_threshold;
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -160,9 +166,12 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
max_user_congthresh);
if (ret > 0) {
+ int i;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fc->congestion_threshold = val;
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->congestion_threshold = val;
fuse_conn_put(fc);
}
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de10bdf..90d99d4 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -498,13 +498,14 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
if (!cc)
return -ENOMEM;
- fuse_conn_init(&cc->fc, 0);
+ rc = fuse_conn_init(&cc->fc, 0);
+ if (rc < 0)
+ return rc;
INIT_LIST_HEAD(&cc->list);
cc->fc.release = cuse_fc_release;
cc->fc.connected = 1;
- cc->fc.blocked = 0;
rc = cuse_send_init(cc);
if (rc) {
fuse_conn_put(&cc->fc);
@@ -562,8 +563,12 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cuse_conn *cc = dev_get_drvdata(dev);
+ int i, val;
+
+ for (i = 0, val = 0; i < cc->fc.nr_nodes; i++)
+ val += atomic_read(&cc->fc.nn[i]->num_waiting);
- return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+ return sprintf(buf, "%d\n", val);
}
static ssize_t cuse_class_abort_store(struct device *dev,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index dd9a7ad..b44675b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -232,6 +232,9 @@ enum fuse_req_state {
* A request to the client
*/
struct fuse_req {
+ /* numa node number on which fuse_req is allocated from */
+ int numaid;
+
/** This can be on either pending processing or io lists in
fuse_conn */
struct list_head list;
@@ -342,6 +345,66 @@ struct fuse_req {
struct file *stolen_file;
};
+/* structure that tracks numa node specific fields */
+struct fuse_numa_node {
+ /* numa node id */
+ int numaid;
+
+ /* Lock protecting accesses to members of this structure */
+ spinlock_t lock;
+
+ /* pointer to main fuse_connection */
+ struct fuse_conn *fc;
+
+ /* Flag indicating if queue is blocked. This will be
+ the case before the INIT reply is received, and if there
+ are too many outstanding background requests */
+ int blocked;
+
+ /* Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /* Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
+ /* Number of requests currently in the background */
+ unsigned num_background;
+
+ /* Number of background requests currently queued for userspace */
+ unsigned active_background;
+
+ /* The number of requests waiting for completion */
+ atomic_t num_waiting;
+
+ /** Queue of pending forgets */
+ struct fuse_forget_link forget_list_head;
+ struct fuse_forget_link *forget_list_tail;
+
+ /** Batching of FORGET requests (positive indicates FORGET batch) */
+ int forget_batch;
+
+ /* waitq for blocked connection */
+ wait_queue_head_t blocked_waitq;
+
+ /* Readers of the connection are waiting on this */
+ wait_queue_head_t waitq;
+
+ /* The list of background requests set aside for later queuing */
+ struct list_head bg_queue;
+
+ /* Pending interrupts */
+ struct list_head interrupts;
+
+ /* The list of pending requests */
+ struct list_head pending;
+
+ /* The list of requests being processed */
+ struct list_head processing;
+
+ /* The list of requests under I/O */
+ struct list_head io;
+};
+
/**
* A Fuse connection.
*
@@ -356,6 +419,9 @@ struct fuse_conn {
/** tracks if numa enabled */
int numa_on;
+ /** Number of numa nodes */
+ int nr_nodes;
+
/** Mutex protecting against directory alias creation */
struct mutex inst_mutex;
@@ -377,57 +443,12 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
- /** Readers of the connection are waiting on this */
- wait_queue_head_t waitq;
-
- /** The list of pending requests */
- struct list_head pending;
-
- /** The list of requests being processed */
- struct list_head processing;
-
- /** The list of requests under I/O */
- struct list_head io;
-
/** The next unique kernel file handle */
u64 khctr;
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
- /** Maximum number of outstanding background requests */
- unsigned max_background;
-
- /** Number of background requests at which congestion starts */
- unsigned congestion_threshold;
-
- /** Number of requests currently in the background */
- unsigned num_background;
-
- /** Number of background requests currently queued for userspace */
- unsigned active_background;
-
- /** The list of background requests set aside for later queuing */
- struct list_head bg_queue;
-
- /** Pending interrupts */
- struct list_head interrupts;
-
- /** Queue of pending forgets */
- struct fuse_forget_link forget_list_head;
- struct fuse_forget_link *forget_list_tail;
-
- /** Batching of FORGET requests (positive indicates FORGET batch) */
- int forget_batch;
-
- /** Flag indicating if connection is blocked. This will be
- the case before the INIT reply is received, and if there
- are too many outstading backgrounds requests */
- int blocked;
-
- /** waitq for blocked connection */
- wait_queue_head_t blocked_waitq;
-
/** waitq for reserved requests */
wait_queue_head_t reserved_req_waitq;
@@ -523,9 +544,6 @@ struct fuse_conn {
/** Does the filesystem want adaptive readdirplus? */
unsigned readdirplus_auto:1;
- /** The number of requests waiting for completion */
- atomic_t num_waiting;
-
/** Negotiated minor version */
unsigned minor;
@@ -564,6 +582,8 @@ struct fuse_conn {
/** Read/write semaphore to hold when accessing sb. */
struct rw_semaphore killsb;
+
+ struct fuse_numa_node **nn;
};
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -766,7 +786,7 @@ void fuse_conn_kill(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, int numaon);
+int fuse_conn_init(struct fuse_conn *fc, int numaon);
/**
* Release reference to fuse_conn
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1837f74..250eb38 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -360,14 +360,21 @@ static void fuse_bdi_destroy(struct fuse_conn *fc)
void fuse_conn_kill(struct fuse_conn *fc)
{
+ int i;
+ struct fuse_numa_node *nn;
+
spin_lock(&fc->lock);
fc->connected = 0;
- fc->blocked = 0;
spin_unlock(&fc->lock);
/* Flush all readers on this fs */
kill_fasync(&fc->fasync, SIGIO, POLL_IN);
- wake_up_all(&fc->waitq);
- wake_up_all(&fc->blocked_waitq);
+ for (i = 0; i < fc->nr_nodes; i++) {
+ nn = fc->nn[i];
+ nn->blocked = 0;
+ wake_up_all(&nn->waitq);
+ wake_up_all(&nn->blocked_waitq);
+ }
+ wake_up_all(&fc->poll_waitq);
wake_up_all(&fc->reserved_req_waitq);
}
EXPORT_SYMBOL_GPL(fuse_conn_kill);
@@ -567,8 +574,11 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-void fuse_conn_init(struct fuse_conn *fc, int numaon)
+int fuse_conn_init(struct fuse_conn *fc, int numaon)
{
+ int i, sz, ret;
+ struct fuse_numa_node *nn;
+
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
mutex_init(&fc->inst_mutex);
@@ -576,25 +586,61 @@ void fuse_conn_init(struct fuse_conn *fc, int numaon)
atomic_set(&fc->count, 1);
if (numaon)
fc->numa_on = 1;
- init_waitqueue_head(&fc->waitq);
- init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
- INIT_LIST_HEAD(&fc->pending);
- INIT_LIST_HEAD(&fc->processing);
- INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->interrupts);
- INIT_LIST_HEAD(&fc->bg_queue);
+ init_waitqueue_head(&fc->poll_waitq);
INIT_LIST_HEAD(&fc->entry);
- fc->forget_list_tail = &fc->forget_list_head;
- atomic_set(&fc->num_waiting, 0);
- fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
- fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
- fc->blocked = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+
+ if (numaon) {
+ fc->numa_on = 1;
+ fc->nr_nodes = nr_node_ids;
+ } else
+ fc->nr_nodes = 1;
+
+ ret = -ENOMEM;
+ sz = sizeof(struct fuse_numa_node *) * fc->nr_nodes;
+ fc->nn = kmalloc(sz, GFP_KERNEL);
+ if (!fc->nn)
+ return ret;
+ memset(fc->nn, 0, sz);
+
+ sz = sizeof(struct fuse_numa_node);
+ for (i = 0; i < fc->nr_nodes; i++) {
+ nn = kmalloc_node(sz, GFP_KERNEL, i);
+ if (!nn)
+ goto out;
+ memset(nn, 0, sz);
+ fc->nn[i] = nn;
+ nn->fc = fc;
+ nn->numaid = i;
+ nn->blocked = 1;
+ spin_lock_init(&nn->lock);
+ init_waitqueue_head(&nn->waitq);
+ init_waitqueue_head(&nn->blocked_waitq);
+ INIT_LIST_HEAD(&nn->bg_queue);
+ INIT_LIST_HEAD(&nn->interrupts);
+ INIT_LIST_HEAD(&nn->pending);
+ INIT_LIST_HEAD(&nn->processing);
+ INIT_LIST_HEAD(&nn->io);
+ nn->forget_list_tail = &nn->forget_list_head;
+ atomic_set(&nn->num_waiting, 0);
+ nn->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ nn->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+ }
+ return 0;
+out:
+ while (i > 0) {
+ if (fc->nn[i - 1])
+ kfree(fc->nn[i - 1]);
+ i--;
+ };
+ if (fc->nn)
+ kfree(fc->nn);
+ return ret;
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -816,6 +862,7 @@ static int set_global_limit(const char *val, struct kernel_param *kp)
static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
{
int cap_sys_admin = capable(CAP_SYS_ADMIN);
+ int i, val;
if (arg->minor < 13)
return;
@@ -824,22 +871,29 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
sanitize_global_limit(&max_user_congthresh);
if (arg->max_background) {
- fc->max_background = arg->max_background;
+ val = arg->max_background;
+ if (!cap_sys_admin && (val > max_user_bgreq))
+ val = max_user_bgreq;
+
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->max_background = val;
- if (!cap_sys_admin && fc->max_background > max_user_bgreq)
- fc->max_background = max_user_bgreq;
}
if (arg->congestion_threshold) {
- fc->congestion_threshold = arg->congestion_threshold;
+ val = arg->congestion_threshold;
+ if (!cap_sys_admin && val > max_user_congthresh)
+ val = max_user_congthresh;
- if (!cap_sys_admin &&
- fc->congestion_threshold > max_user_congthresh)
- fc->congestion_threshold = max_user_congthresh;
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->congestion_threshold = val;
}
}
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
+ int i;
struct fuse_init_out *arg = &req->misc.init_out;
if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
@@ -891,8 +945,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
- fc->blocked = 0;
- wake_up_all(&fc->blocked_waitq);
+ for (i = 0; i < fc->nr_nodes; i++) {
+ fc->nn[i]->blocked = 0;
+ wake_up_all(&fc->nn[i]->blocked_waitq);
+ }
}
static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
@@ -924,6 +980,11 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_free_conn(struct fuse_conn *fc)
{
+ int i;
+
+ for (i = 0; i < fc->nr_nodes; i++)
+ if (fc->nn[i])
+ kfree(fc->nn[i]);
kfree(fc);
}
@@ -1019,7 +1080,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (!fc)
goto err_fput;
- fuse_conn_init(fc, d.numaon);
+ if (fuse_conn_init(fc, d.numaon) < 0)
+ goto err_fput;
fc->dev = sb->s_dev;
fc->sb = sb;
--
1.5.4.3
Thread overview: 8+ messages
2013-04-30 6:17 FUSE: fixes to improve scalability on NUMA systems Srinivas Eeda
2013-04-30 6:17 ` [PATCH 1/3] fuse: add numa mount option Srinivas Eeda
2013-04-30 6:17 ` Srinivas Eeda [this message]
2013-04-30 6:17 ` [PATCH 3/3] fuse: split fuse queues to help numa systems Srinivas Eeda
2013-04-30 16:29 ` [fuse-devel] FUSE: fixes to improve scalability on NUMA systems Miklos Szeredi
2013-04-30 18:28 ` Srinivas Eeda
2013-05-01 9:53 ` Miklos Szeredi
[not found] ` <CAJfpegvi=Npv1Da2gqDb50xWzO4GHusbrwZMn5tUp8hQ89AJjQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-08 9:11 ` Anand Avati