From: Srinivas Eeda <srinivas.eeda@oracle.com>
To: linux-fsdevel@vger.kernel.org, fuse-devel@lists.sourceforge.net
Cc: mszeredi@suse.cz, srinivas.eeda@oracle.com
Subject: [PATCH 2/3] fuse: add fuse numa node struct
Date: Mon, 29 Apr 2013 23:17:32 -0700
Message-ID: <1367302653-10544-3-git-send-email-srinivas.eeda@oracle.com>
In-Reply-To: <1367302653-10544-1-git-send-email-srinivas.eeda@oracle.com>
This patch introduces a new structure, fuse_numa_node, which groups several
fields split out of the fuse_conn structure. One instance of fuse_numa_node is
created for each NUMA node present on the system. This reduces contention on a
single spinlock, which adds latency when it is accessed across NUMA regions.
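For illustration, here is a minimal sketch of how a request path could select
the per-node state (this fuse_get_numa_node() helper is hypothetical and not
part of this patch; the numa_on, nr_nodes and nn[] fields are the ones
introduced below):

	#include <linux/topology.h>	/* numa_node_id() */
	#include "fuse_i.h"		/* struct fuse_conn, struct fuse_numa_node */

	/*
	 * Hypothetical helper: return the fuse_numa_node local to the
	 * calling CPU, so that its spinlock and request lists are only
	 * touched from the same NUMA node.  With numa disabled there is
	 * a single node at index 0 (see fuse_conn_init below).
	 */
	static inline struct fuse_numa_node *fuse_get_numa_node(struct fuse_conn *fc)
	{
		int node = fc->numa_on ? numa_node_id() : 0;

		return fc->nn[node];
	}

Each request also records the node it was allocated from in the new
req->numaid field, so that later patches in the series can queue it on the
matching per-node lists.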
Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
fs/fuse/control.c | 25 ++++++++----
fs/fuse/cuse.c | 11 ++++-
fs/fuse/fuse_i.h | 118 +++++++++++++++++++++++++++++++----------------------
fs/fuse/inode.c | 114 +++++++++++++++++++++++++++++++++++++++------------
4 files changed, 182 insertions(+), 86 deletions(-)
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855..9a9ca5c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -48,12 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
size_t size;
if (!*ppos) {
- long value;
+ long i, value;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- value = atomic_read(&fc->num_waiting);
+ for (i = 0, value = 0; i < fc->nr_nodes; i++)
+ value += atomic_read(&fc->nn[i]->num_waiting);
file->private_data = (void *)value;
fuse_conn_put(fc);
}
@@ -101,13 +102,14 @@ static ssize_t fuse_conn_max_background_read(struct file *file,
loff_t *ppos)
{
struct fuse_conn *fc;
- unsigned val;
+ unsigned i, val;
fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- val = fc->max_background;
+ for (i = 0, val = 0; i < fc->nr_nodes; i++)
+ val += fc->nn[i]->max_background;
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -123,9 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
max_user_bgreq);
if (ret > 0) {
+ int i;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fc->max_background = val;
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->max_background = val;
fuse_conn_put(fc);
}
}
@@ -138,13 +143,14 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
loff_t *ppos)
{
struct fuse_conn *fc;
- unsigned val;
+ unsigned i, val;
fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- val = fc->congestion_threshold;
+ for (i = 0, val = 0; i < fc->nr_nodes; i++)
+ val += fc->nn[i]->congestion_threshold;
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -160,9 +166,12 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
max_user_congthresh);
if (ret > 0) {
+ int i;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fc->congestion_threshold = val;
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->congestion_threshold = val;
fuse_conn_put(fc);
}
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de10bdf..90d99d4 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -498,13 +498,14 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
if (!cc)
return -ENOMEM;
- fuse_conn_init(&cc->fc, 0);
+ rc = fuse_conn_init(&cc->fc, 0);
+ if (rc < 0)
+ return rc;
INIT_LIST_HEAD(&cc->list);
cc->fc.release = cuse_fc_release;
cc->fc.connected = 1;
- cc->fc.blocked = 0;
rc = cuse_send_init(cc);
if (rc) {
fuse_conn_put(&cc->fc);
@@ -562,8 +563,12 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cuse_conn *cc = dev_get_drvdata(dev);
+ int i, val;
+
+ for (i = 0, val = 0; i < cc->fc.nr_nodes; i++)
+ val += atomic_read(&cc->fc.nn[i]->num_waiting);
- return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+ return sprintf(buf, "%d\n", val);
}
static ssize_t cuse_class_abort_store(struct device *dev,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index dd9a7ad..b44675b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -232,6 +232,9 @@ enum fuse_req_state {
* A request to the client
*/
struct fuse_req {
+ /* numa node number on which fuse_req is allocated from */
+ int numaid;
+
/** This can be on either pending processing or io lists in
fuse_conn */
struct list_head list;
@@ -342,6 +345,66 @@ struct fuse_req {
struct file *stolen_file;
};
+/* structure that tracks numa node specific fields */
+struct fuse_numa_node {
+ /* numa node id */
+ int numaid;
+
+ /* Lock protecting accesses to members of this structure */
+ spinlock_t lock;
+
+ /* pointer to main fuse_connection */
+ struct fuse_conn *fc;
+
+ /* Flag indicating if queue is blocked. This will be
+ the case before the INIT reply is received, and if there
+ are too many outstanding background requests */
+ int blocked;
+
+ /* Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /* Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
+ /* Number of requests currently in the background */
+ unsigned num_background;
+
+ /* Number of background requests currently queued for userspace */
+ unsigned active_background;
+
+ /* The number of requests waiting for completion */
+ atomic_t num_waiting;
+
+ /** Queue of pending forgets */
+ struct fuse_forget_link forget_list_head;
+ struct fuse_forget_link *forget_list_tail;
+
+ /** Batching of FORGET requests (positive indicates FORGET batch) */
+ int forget_batch;
+
+ /* waitq for blocked connection */
+ wait_queue_head_t blocked_waitq;
+
+ /* Readers of the connection are waiting on this */
+ wait_queue_head_t waitq;
+
+ /* The list of background requests set aside for later queuing */
+ struct list_head bg_queue;
+
+ /* Pending interrupts */
+ struct list_head interrupts;
+
+ /* The list of pending requests */
+ struct list_head pending;
+
+ /* The list of requests being processed */
+ struct list_head processing;
+
+ /* The list of requests under I/O */
+ struct list_head io;
+};
+
/**
* A Fuse connection.
*
@@ -356,6 +419,9 @@ struct fuse_conn {
/** tracks if numa enabled */
int numa_on;
+ /** Number of numa nodes */
+ int nr_nodes;
+
/** Mutex protecting against directory alias creation */
struct mutex inst_mutex;
@@ -377,57 +443,12 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
- /** Readers of the connection are waiting on this */
- wait_queue_head_t waitq;
-
- /** The list of pending requests */
- struct list_head pending;
-
- /** The list of requests being processed */
- struct list_head processing;
-
- /** The list of requests under I/O */
- struct list_head io;
-
/** The next unique kernel file handle */
u64 khctr;
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
- /** Maximum number of outstanding background requests */
- unsigned max_background;
-
- /** Number of background requests at which congestion starts */
- unsigned congestion_threshold;
-
- /** Number of requests currently in the background */
- unsigned num_background;
-
- /** Number of background requests currently queued for userspace */
- unsigned active_background;
-
- /** The list of background requests set aside for later queuing */
- struct list_head bg_queue;
-
- /** Pending interrupts */
- struct list_head interrupts;
-
- /** Queue of pending forgets */
- struct fuse_forget_link forget_list_head;
- struct fuse_forget_link *forget_list_tail;
-
- /** Batching of FORGET requests (positive indicates FORGET batch) */
- int forget_batch;
-
- /** Flag indicating if connection is blocked. This will be
- the case before the INIT reply is received, and if there
- are too many outstading backgrounds requests */
- int blocked;
-
- /** waitq for blocked connection */
- wait_queue_head_t blocked_waitq;
-
/** waitq for reserved requests */
wait_queue_head_t reserved_req_waitq;
@@ -523,9 +544,6 @@ struct fuse_conn {
/** Does the filesystem want adaptive readdirplus? */
unsigned readdirplus_auto:1;
- /** The number of requests waiting for completion */
- atomic_t num_waiting;
-
/** Negotiated minor version */
unsigned minor;
@@ -564,6 +582,8 @@ struct fuse_conn {
/** Read/write semaphore to hold when accessing sb. */
struct rw_semaphore killsb;
+
+ struct fuse_numa_node **nn;
};
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -766,7 +786,7 @@ void fuse_conn_kill(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, int numaon);
+int fuse_conn_init(struct fuse_conn *fc, int numaon);
/**
* Release reference to fuse_conn
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1837f74..250eb38 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -360,14 +360,21 @@ static void fuse_bdi_destroy(struct fuse_conn *fc)
void fuse_conn_kill(struct fuse_conn *fc)
{
+ int i;
+ struct fuse_numa_node *nn;
+
spin_lock(&fc->lock);
fc->connected = 0;
- fc->blocked = 0;
spin_unlock(&fc->lock);
/* Flush all readers on this fs */
kill_fasync(&fc->fasync, SIGIO, POLL_IN);
- wake_up_all(&fc->waitq);
- wake_up_all(&fc->blocked_waitq);
+ for (i = 0; i < fc->nr_nodes; i++) {
+ nn = fc->nn[i];
+ nn->blocked = 0;
+ wake_up_all(&nn->waitq);
+ wake_up_all(&nn->blocked_waitq);
+ }
+ wake_up_all(&fc->poll_waitq);
wake_up_all(&fc->reserved_req_waitq);
}
EXPORT_SYMBOL_GPL(fuse_conn_kill);
@@ -567,8 +574,11 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-void fuse_conn_init(struct fuse_conn *fc, int numaon)
+int fuse_conn_init(struct fuse_conn *fc, int numaon)
{
+ int i, sz, ret;
+ struct fuse_numa_node *nn;
+
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
mutex_init(&fc->inst_mutex);
@@ -576,25 +586,61 @@ void fuse_conn_init(struct fuse_conn *fc, int numaon)
atomic_set(&fc->count, 1);
if (numaon)
fc->numa_on = 1;
- init_waitqueue_head(&fc->waitq);
- init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
- INIT_LIST_HEAD(&fc->pending);
- INIT_LIST_HEAD(&fc->processing);
- INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->interrupts);
- INIT_LIST_HEAD(&fc->bg_queue);
+ init_waitqueue_head(&fc->poll_waitq);
INIT_LIST_HEAD(&fc->entry);
- fc->forget_list_tail = &fc->forget_list_head;
- atomic_set(&fc->num_waiting, 0);
- fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
- fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
- fc->blocked = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+
+ if (numaon) {
+ fc->numa_on = 1;
+ fc->nr_nodes = nr_node_ids;
+ } else
+ fc->nr_nodes = 1;
+
+ ret = -ENOMEM;
+ sz = sizeof(struct fuse_numa_node *) * fc->nr_nodes;
+ fc->nn = kmalloc(sz, GFP_KERNEL);
+ if (!fc->nn)
+ return ret;
+ memset(fc->nn, 0, sz);
+
+ sz = sizeof(struct fuse_numa_node);
+ for (i = 0; i < fc->nr_nodes; i++) {
+ nn = kmalloc_node(sz, GFP_KERNEL, i);
+ if (!nn)
+ goto out;
+ memset(nn, 0, sz);
+ fc->nn[i] = nn;
+ nn->fc = fc;
+ nn->numaid = i;
+ nn->blocked = 1;
+ spin_lock_init(&nn->lock);
+ init_waitqueue_head(&nn->waitq);
+ init_waitqueue_head(&nn->blocked_waitq);
+ INIT_LIST_HEAD(&nn->bg_queue);
+ INIT_LIST_HEAD(&nn->interrupts);
+ INIT_LIST_HEAD(&nn->pending);
+ INIT_LIST_HEAD(&nn->processing);
+ INIT_LIST_HEAD(&nn->io);
+ nn->forget_list_tail = &nn->forget_list_head;
+ atomic_set(&nn->num_waiting, 0);
+ nn->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ nn->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+ }
+ return 0;
+out:
+ while (i > 0) {
+ if (fc->nn[i - 1])
+ kfree(fc->nn[i - 1]);
+ i--;
+ };
+ if (fc->nn)
+ kfree(fc->nn);
+ return ret;
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -816,6 +862,7 @@ static int set_global_limit(const char *val, struct kernel_param *kp)
static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
{
int cap_sys_admin = capable(CAP_SYS_ADMIN);
+ int i, val;
if (arg->minor < 13)
return;
@@ -824,22 +871,29 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
sanitize_global_limit(&max_user_congthresh);
if (arg->max_background) {
- fc->max_background = arg->max_background;
+ val = arg->max_background;
+ if (!cap_sys_admin && (val > max_user_bgreq))
+ val = max_user_bgreq;
+
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->max_background = val;
- if (!cap_sys_admin && fc->max_background > max_user_bgreq)
- fc->max_background = max_user_bgreq;
}
if (arg->congestion_threshold) {
- fc->congestion_threshold = arg->congestion_threshold;
+ val = arg->congestion_threshold;
+ if (!cap_sys_admin && val > max_user_congthresh)
+ val = max_user_congthresh;
- if (!cap_sys_admin &&
- fc->congestion_threshold > max_user_congthresh)
- fc->congestion_threshold = max_user_congthresh;
+ val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+ for (i = 0; i < fc->nr_nodes; i++)
+ fc->nn[i]->congestion_threshold = val;
}
}
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
+ int i;
struct fuse_init_out *arg = &req->misc.init_out;
if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
@@ -891,8 +945,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
- fc->blocked = 0;
- wake_up_all(&fc->blocked_waitq);
+ for (i = 0; i < fc->nr_nodes; i++) {
+ fc->nn[i]->blocked = 0;
+ wake_up_all(&fc->nn[i]->blocked_waitq);
+ }
}
static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
@@ -924,6 +980,11 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_free_conn(struct fuse_conn *fc)
{
+ int i;
+
+ for (i = 0; i < fc->nr_nodes; i++)
+ if (fc->nn[i])
+ kfree(fc->nn[i]);
kfree(fc);
}
@@ -1019,7 +1080,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (!fc)
goto err_fput;
- fuse_conn_init(fc, d.numaon);
+ if (fuse_conn_init(fc, d.numaon) < 0)
+ goto err_fput;
fc->dev = sb->s_dev;
fc->sb = sb;
--
1.5.4.3
Thread overview: 8+ messages
2013-04-30 6:17 FUSE: fixes to improve scalability on NUMA systems Srinivas Eeda
2013-04-30 6:17 ` [PATCH 1/3] fuse: add numa mount option Srinivas Eeda
2013-04-30 6:17 ` Srinivas Eeda [this message]
2013-04-30 6:17 ` [PATCH 3/3] fuse: split fuse queues to help numa systems Srinivas Eeda
2013-04-30 16:29 ` [fuse-devel] FUSE: fixes to improve scalability on NUMA systems Miklos Szeredi
2013-04-30 18:28 ` Srinivas Eeda
2013-05-01 9:53 ` Miklos Szeredi
[not found] ` <CAJfpegvi=Npv1Da2gqDb50xWzO4GHusbrwZMn5tUp8hQ89AJjQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-08 9:11 ` Anand Avati