* [PATCH 06/12] mm: Add get_kernel_page[s] for pinning of kernel addresses for I/O
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
This patch adds two new APIs get_kernel_pages() and get_kernel_page()
that may be used to pin a vector of kernel addresses for IO. The initial
user is expected to be NFS for allowing pages to be written to swap
using aops->direct_IO(). Strictly speaking, swap-over-NFS only needs
to pin one page for IO but it makes sense to express the API in terms
of a vector and add a helper for pinning single pages.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
include/linux/blk_types.h | 2 ++
include/linux/fs.h | 2 ++
include/linux/mm.h | 4 ++++
mm/memory.c | 53 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 61 insertions(+)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0edb65d..7b7ac9c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -160,6 +160,7 @@ enum rq_flag_bits {
__REQ_FLUSH_SEQ, /* request for flush sequence */
__REQ_IO_STAT, /* account I/O stat */
__REQ_MIXED_MERGE, /* merge of different types, fail separately */
+ __REQ_KERNEL, /* direct IO to kernel pages */
__REQ_NR_BITS, /* stops here */
};
@@ -201,5 +202,6 @@ enum rq_flag_bits {
#define REQ_IO_STAT (1 << __REQ_IO_STAT)
#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
#define REQ_SECURE (1 << __REQ_SECURE)
+#define REQ_KERNEL (1 << __REQ_KERNEL)
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 006aa85..c2a4554 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -165,6 +165,8 @@ struct inodes_stat_t {
#define READ 0
#define WRITE RW_MASK
#define READA RWA_MASK
+#define KERNEL_READ (READ|REQ_KERNEL)
+#define KERNEL_WRITE (WRITE|REQ_KERNEL)
#define READ_SYNC (READ | REQ_SYNC)
#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b3d4cd9..bbb3167 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1019,6 +1019,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct page **pages, struct vm_area_struct **vmas);
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
+struct kvec;
+int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
+ struct page **pages);
+int get_kernel_page(unsigned long start, int write, struct page **pages);
struct page *get_dump_page(unsigned long addr);
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
diff --git a/mm/memory.c b/mm/memory.c
index 6322d36..c8153f5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1843,6 +1843,59 @@ next_page:
EXPORT_SYMBOL(__get_user_pages);
/*
+ * get_kernel_pages() - pin kernel pages in memory
+ * @kiov: An array of struct kvec structures
+ * @nr_segs: number of segments to pin
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_segs long.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_segs is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with.
+ */
+int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
+ struct page **pages)
+{
+ int seg;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
+ return seg;
+
+ /* virt_to_page sanity checks the PFN */
+ pages[seg] = virt_to_page(kiov[seg].iov_base);
+ page_cache_get(pages[seg]);
+ }
+
+ return seg;
+}
+EXPORT_SYMBOL_GPL(get_kernel_pages);
+
+/*
+ * get_kernel_page() - pin a kernel page in memory
+ * @start: starting kernel address
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointer to the page pinned.
+ * Must have room for at least one entry.
+ *
+ * Returns 1 if page is pinned. If the page was not pinned, returns
+ * -errno. The page returned must be released with a put_page() call
+ * when it is finished with.
+ */
+int get_kernel_page(unsigned long start, int write, struct page **pages)
+{
+ const struct kvec kiov = {
+ .iov_base = (void *)start,
+ .iov_len = PAGE_SIZE
+ };
+
+ return get_kernel_pages(&kiov, 1, write, pages);
+}
+EXPORT_SYMBOL_GPL(get_kernel_page);
+
+/*
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
--
1.7.9.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 07/12] mm: Add support for direct_IO to highmem pages
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
The patch "mm: Add support for a filesystem to activate swap files and
use direct_IO for writing swap pages" added support for using direct_IO
to write swap pages but it is insufficient for highmem pages.
To support highmem pages, this patch kmaps() the page before calling the
direct_IO() handler. As direct_IO deals with virtual addresses an
additional helper is necessary for get_kernel_pages() to lookup the
struct page for a kmap virtual address.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
include/linux/highmem.h | 7 +++++++
mm/highmem.c | 12 ++++++++++++
mm/memory.c | 3 +--
mm/page_io.c | 3 ++-
4 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 774fa47..ef788b5 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -39,10 +39,17 @@ extern unsigned long totalhigh_pages;
void kmap_flush_unused(void);
+struct page *kmap_to_page(void *addr);
+
#else /* CONFIG_HIGHMEM */
static inline unsigned int nr_free_highpages(void) { return 0; }
+static inline struct page *kmap_to_page(void *addr)
+{
+ return virt_to_page(addr);
+}
+
#define totalhigh_pages 0UL
#ifndef ARCH_HAS_KMAP
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6..d517cd1 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
+struct page *kmap_to_page(void *vaddr)
+{
+ unsigned long addr = (unsigned long)vaddr;
+
+ if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+ int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+ return pte_page(pkmap_page_table[i]);
+ }
+
+ return virt_to_page(addr);
+}
+
static void flush_all_zero_pkmaps(void)
{
int i;
diff --git a/mm/memory.c b/mm/memory.c
index c8153f5..94f916e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1864,8 +1864,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
return seg;
- /* virt_to_page sanity checks the PFN */
- pages[seg] = virt_to_page(kiov[seg].iov_base);
+ pages[seg] = kmap_to_page(kiov[seg].iov_base);
page_cache_get(pages[seg]);
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 4a37962..78eee32 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -205,7 +205,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
struct iovec iov = {
- .iov_base = page_address(page),
+ .iov_base = kmap(page),
.iov_len = PAGE_SIZE,
};
@@ -218,6 +218,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
&kiocb, &iov,
kiocb.ki_pos, 1);
+ kunmap(page);
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
--
1.7.9.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 08/12] nfs: teach the NFS client how to treat PG_swapcache pages
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
Replace all relevant occurrences of page->index and page->mapping in
the NFS client with the new page_file_index() and page_file_mapping()
functions.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
fs/nfs/file.c | 6 +++---
fs/nfs/internal.h | 7 ++++---
fs/nfs/pagelist.c | 2 +-
fs/nfs/read.c | 6 +++---
fs/nfs/write.c | 38 +++++++++++++++++++-------------------
5 files changed, 30 insertions(+), 29 deletions(-)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a6708e6b..406caac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -442,7 +442,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
if (offset != 0)
return;
/* Cancel any unstarted writes on this page */
- nfs_wb_page_cancel(page->mapping->host, page);
+ nfs_wb_page_cancel(page_file_mapping(page)->host, page);
nfs_fscache_invalidate_page(page, page->mapping->host);
}
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
*/
static int nfs_launder_page(struct page *page)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_inode *nfsi = NFS_I(inode);
dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
@@ -533,7 +533,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
lock_page(page);
- mapping = page->mapping;
+ mapping = page_file_mapping(page);
if (mapping != dentry->d_inode->i_mapping)
goto out_unlock;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 18f99ef..43ea79a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -463,13 +463,14 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
static inline
unsigned int nfs_page_length(struct page *page)
{
- loff_t i_size = i_size_read(page->mapping->host);
+ loff_t i_size = i_size_read(page_file_mapping(page)->host);
if (i_size > 0) {
+ pgoff_t page_index = page_file_index(page);
pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
- if (page->index < end_index)
+ if (page_index < end_index)
return PAGE_CACHE_SIZE;
- if (page->index == end_index)
+ if (page_index == end_index)
return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
}
return 0;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index aed913c..9ef8b3c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -117,7 +117,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
* long write-back delay. This will be adjusted in
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
- req->wb_index = page->index;
+ req->wb_index = page_file_index(page);
page_cache_get(page);
req->wb_offset = offset;
req->wb_pgbase = offset;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 86ced78..c5b83ce 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -532,11 +532,11 @@ static const struct rpc_call_ops nfs_read_common_ops = {
int nfs_readpage(struct file *file, struct page *page)
{
struct nfs_open_context *ctx;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_CACHE_SIZE, page->index);
+ page, PAGE_CACHE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
@@ -590,7 +590,7 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *new;
unsigned int len;
int error;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee929e5..f6a8ebc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -153,7 +153,7 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page)
static struct nfs_page *nfs_page_find_request(struct page *page)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *req = NULL;
spin_lock(&inode->i_lock);
@@ -165,16 +165,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
/* Adjust the file length if we're writing beyond the end */
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
loff_t end, i_size;
pgoff_t end_index;
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
- if (i_size > 0 && page->index < end_index)
+ if (i_size > 0 && page_file_index(page) < end_index)
goto out;
- end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
+ end = page_file_offset(page) + ((loff_t)offset+count);
if (i_size >= end)
goto out;
i_size_write(inode, end);
@@ -187,7 +187,7 @@ out:
static void nfs_set_pageerror(struct page *page)
{
SetPageError(page);
- nfs_zap_mapping(page->mapping->host, page->mapping);
+ nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -228,7 +228,7 @@ static int nfs_set_page_writeback(struct page *page)
int ret = test_set_page_writeback(page);
if (!ret) {
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_server *nfss = NFS_SERVER(inode);
if (atomic_long_inc_return(&nfss->writeback) >
@@ -242,7 +242,7 @@ static int nfs_set_page_writeback(struct page *page)
static void nfs_end_page_writeback(struct page *page)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_server *nfss = NFS_SERVER(inode);
end_page_writeback(page);
@@ -252,7 +252,7 @@ static void nfs_end_page_writeback(struct page *page)
static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *req;
int ret;
@@ -313,13 +313,13 @@ out:
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
int ret;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
- nfs_pageio_cond_complete(pgio, page->index);
+ nfs_pageio_cond_complete(pgio, page_file_index(page));
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
@@ -336,8 +336,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
struct nfs_pageio_descriptor pgio;
int err;
- nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
- &nfs_async_write_completion_ops);
+ nfs_pageio_init_write(&pgio, page_file_mapping(page)->host,
+ wb_priority(wbc), &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio);
nfs_pageio_complete(&pgio);
if (err < 0)
@@ -470,7 +470,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(req->wb_page->mapping->backing_dev_info,
+ inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
BDI_RECLAIMABLE);
__mark_inode_dirty(req->wb_context->dentry->d_inode,
I_DIRTY_DATASYNC);
@@ -537,7 +537,7 @@ static void
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
}
static void
@@ -788,7 +788,7 @@ out_err:
static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
struct page *page, unsigned int offset, unsigned int bytes)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *req;
req = nfs_try_to_update_request(inode, page, offset, bytes);
@@ -841,7 +841,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
nfs_release_request(req);
if (!do_flush)
return 0;
- status = nfs_wb_page(page->mapping->host, page);
+ status = nfs_wb_page(page_file_mapping(page)->host, page);
} while (status == 0);
return status;
}
@@ -871,7 +871,7 @@ int nfs_updatepage(struct file *file, struct page *page,
unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -879,7 +879,7 @@ int nfs_updatepage(struct file *file, struct page *page,
dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name, count,
- (long long)(page_offset(page) + offset));
+ (long long)(page_file_offset(page) + offset));
/* If we're not using byte range locks, and we know the page
* is up to date, it may be more efficient to extend the write
@@ -1475,7 +1475,7 @@ void nfs_retry_commit(struct list_head *page_list,
nfs_mark_request_commit(req, lseg, cinfo);
if (!cinfo->dreq) {
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+ dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
BDI_RECLAIMABLE);
}
nfs_unlock_and_release_request(req);
--
1.7.9.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 09/12] nfs: disable data cache revalidation for swapfiles
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
The VM does not like PG_private set on PG_swapcache pages. As suggested
by Trond in http://lkml.org/lkml/2006/8/25/348, this patch disables
NFS data cache revalidation on swap files, as it does not make
sense to have other clients change the file while it is being used as
swap. This avoids setting PG_private on swap pages, since there ought
to be no further races with invalidate_inode_pages2() to deal with.
Since we cannot set PG_private, we cannot use page->private (which
is already used by PG_swapcache pages) to store the nfs_page. Thus
augment the new nfs_page_find_request logic.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
fs/nfs/inode.c | 4 ++++
fs/nfs/write.c | 49 +++++++++++++++++++++++++++++++++++--------------
2 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index edecd05..90810a4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -883,6 +883,10 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
struct nfs_inode *nfsi = NFS_I(inode);
int ret = 0;
+ /* swapfiles are not supposed to be shared. */
+ if (IS_SWAPFILE(inode))
+ goto out;
+
if (nfs_mapping_need_revalidate_inode(inode)) {
ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (ret < 0)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f6a8ebc..947e1e6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -139,15 +139,28 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
}
-static struct nfs_page *nfs_page_find_request_locked(struct page *page)
+static struct nfs_page *
+nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
{
struct nfs_page *req = NULL;
- if (PagePrivate(page)) {
+ if (PagePrivate(page))
req = (struct nfs_page *)page_private(page);
- if (req != NULL)
- kref_get(&req->wb_kref);
+ else if (unlikely(PageSwapCache(page))) {
+ struct nfs_page *freq, *t;
+
+ /* Linearly search the commit list for the correct req */
+ list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
+ if (freq->wb_page == page) {
+ req = freq;
+ break;
+ }
+ }
}
+
+ if (req)
+ kref_get(&req->wb_kref);
+
return req;
}
@@ -157,7 +170,7 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
struct nfs_page *req = NULL;
spin_lock(&inode->i_lock);
- req = nfs_page_find_request_locked(page);
+ req = nfs_page_find_request_locked(NFS_I(inode), page);
spin_unlock(&inode->i_lock);
return req;
}
@@ -258,7 +271,7 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo
spin_lock(&inode->i_lock);
for (;;) {
- req = nfs_page_find_request_locked(page);
+ req = nfs_page_find_request_locked(NFS_I(inode), page);
if (req == NULL)
break;
if (nfs_lock_request(req))
@@ -412,9 +425,15 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
spin_lock(&inode->i_lock);
if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
inode->i_version++;
- set_bit(PG_MAPPED, &req->wb_flags);
- SetPagePrivate(req->wb_page);
- set_page_private(req->wb_page, (unsigned long)req);
+ /*
+ * Swap-space should not get truncated. Hence no need to plug the race
+ * with invalidate/truncate.
+ */
+ if (likely(!PageSwapCache(req->wb_page))) {
+ set_bit(PG_MAPPED, &req->wb_flags);
+ SetPagePrivate(req->wb_page);
+ set_page_private(req->wb_page, (unsigned long)req);
+ }
nfsi->npages++;
kref_get(&req->wb_kref);
spin_unlock(&inode->i_lock);
@@ -431,9 +450,11 @@ static void nfs_inode_remove_request(struct nfs_page *req)
BUG_ON (!NFS_WBACK_BUSY(req));
spin_lock(&inode->i_lock);
- set_page_private(req->wb_page, 0);
- ClearPagePrivate(req->wb_page);
- clear_bit(PG_MAPPED, &req->wb_flags);
+ if (likely(!PageSwapCache(req->wb_page))) {
+ set_page_private(req->wb_page, 0);
+ ClearPagePrivate(req->wb_page);
+ clear_bit(PG_MAPPED, &req->wb_flags);
+ }
nfsi->npages--;
spin_unlock(&inode->i_lock);
nfs_release_request(req);
@@ -729,7 +750,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
spin_lock(&inode->i_lock);
for (;;) {
- req = nfs_page_find_request_locked(page);
+ req = nfs_page_find_request_locked(NFS_I(inode), page);
if (req == NULL)
goto out_unlock;
@@ -1744,7 +1765,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
*/
int nfs_wb_page(struct inode *inode, struct page *page)
{
- loff_t range_start = page_offset(page);
+ loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
--
1.7.9.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 10/12] nfs: enable swap on NFS
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
Implement the new swapfile a_ops for NFS and hook up ->direct_IO. This
will set the NFS socket to SOCK_MEMALLOC and run socket reconnect
under PF_MEMALLOC as well as reset SOCK_MEMALLOC before engaging the
protocol ->connect() method.
PF_MEMALLOC should allow the allocation of struct socket and related
objects and the early (re)setting of SOCK_MEMALLOC should allow us
to receive the packets required for the TCP connection buildup.
[jlayton@redhat.com: Restore PF_MEMALLOC task flags in all cases]
[dfeng@redhat.com: Fix handling of multiple swap files]
[a.p.zijlstra@chello.nl: Original patch]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
fs/nfs/Kconfig | 8 +++++
fs/nfs/direct.c | 82 ++++++++++++++++++++++++++++---------------
fs/nfs/file.c | 22 ++++++++++--
include/linux/nfs_fs.h | 4 +--
include/linux/sunrpc/xprt.h | 3 ++
net/sunrpc/Kconfig | 5 +++
net/sunrpc/clnt.c | 2 ++
net/sunrpc/sched.c | 7 ++--
net/sunrpc/xprtsock.c | 54 ++++++++++++++++++++++++++++
9 files changed, 153 insertions(+), 34 deletions(-)
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b47452f..07f35c6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -85,6 +85,14 @@ config NFS_V4
If unsure, say Y.
+config NFS_SWAP
+ bool "Provide swap over NFS support"
+ default n
+ depends on NFS_FS
+ select SUNRPC_SWAP
+ help
+ This option enables swapon to work on files located on NFS mounts.
+
config NFS_V4_1
bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
depends on NFS_V4 && EXPERIMENTAL
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9a4cbfc..0f9f264 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -115,17 +115,28 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
* @nr_segs: size of iovec array
*
* The presence of this routine in the address space ops vector means
- * the NFS client supports direct I/O. However, we shunt off direct
- * read and write requests before the VFS gets them, so this method
- * should never be called.
+ * the NFS client supports direct I/O. However, for most direct IO, we
+ * shunt off direct read and write requests before the VFS gets them,
+ * so this method is only ever called for swap.
*/
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
+#ifndef CONFIG_NFS_SWAP
dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
iocb->ki_filp->f_path.dentry->d_name.name,
(long long) pos, nr_segs);
return -EINVAL;
+#else
+ VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
+ VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+
+ if (rw == READ || rw == KERNEL_READ)
+ return nfs_file_direct_read(iocb, iov, nr_segs, pos,
+ rw == READ ? true : false);
+ return nfs_file_direct_write(iocb, iov, nr_segs, pos,
+ rw == WRITE ? true : false);
+#endif /* CONFIG_NFS_SWAP */
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -303,7 +314,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
*/
static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
const struct iovec *iov,
- loff_t pos)
+ loff_t pos, bool uio)
{
struct nfs_direct_req *dreq = desc->pg_dreq;
struct nfs_open_context *ctx = dreq->ctx;
@@ -331,12 +342,20 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
GFP_KERNEL);
if (!pagevec)
break;
- down_read(&current->mm->mmap_sem);
- result = get_user_pages(current, current->mm, user_addr,
+ if (uio) {
+ down_read(&current->mm->mmap_sem);
+ result = get_user_pages(current, current->mm, user_addr,
npages, 1, 0, pagevec, NULL);
- up_read(&current->mm->mmap_sem);
- if (result < 0)
- break;
+ up_read(&current->mm->mmap_sem);
+ if (result < 0)
+ break;
+ } else {
+ WARN_ON(npages != 1);
+ result = get_kernel_page(user_addr, 1, pagevec);
+ if (WARN_ON(result != 1))
+ break;
+ }
+
if ((unsigned)result < npages) {
bytes = result * PAGE_SIZE;
if (bytes <= pgbase) {
@@ -386,7 +405,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
const struct iovec *iov,
unsigned long nr_segs,
- loff_t pos)
+ loff_t pos, bool uio)
{
struct nfs_pageio_descriptor desc;
ssize_t result = -EINVAL;
@@ -400,7 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *vec = &iov[seg];
- result = nfs_direct_read_schedule_segment(&desc, vec, pos);
+ result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
if (result < 0)
break;
requested_bytes += result;
@@ -426,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
}
static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ unsigned long nr_segs, loff_t pos, bool uio)
{
ssize_t result = -ENOMEM;
struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -444,7 +463,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
+ result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
if (!result)
result = nfs_direct_wait(dreq);
NFS_I(inode)->read_io += result;
@@ -606,7 +625,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
*/
static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
const struct iovec *iov,
- loff_t pos)
+ loff_t pos, bool uio)
{
struct nfs_direct_req *dreq = desc->pg_dreq;
struct nfs_open_context *ctx = dreq->ctx;
@@ -634,12 +653,19 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
if (!pagevec)
break;
- down_read(&current->mm->mmap_sem);
- result = get_user_pages(current, current->mm, user_addr,
- npages, 0, 0, pagevec, NULL);
- up_read(&current->mm->mmap_sem);
- if (result < 0)
- break;
+ if (uio) {
+ down_read(&current->mm->mmap_sem);
+ result = get_user_pages(current, current->mm, user_addr,
+ npages, 0, 0, pagevec, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (result < 0)
+ break;
+ } else {
+ WARN_ON(npages != 1);
+ result = get_kernel_page(user_addr, 0, pagevec);
+ if (WARN_ON(result != 1))
+ break;
+ }
if ((unsigned)result < npages) {
bytes = result * PAGE_SIZE;
@@ -770,7 +796,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
const struct iovec *iov,
unsigned long nr_segs,
- loff_t pos)
+ loff_t pos, bool uio)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
@@ -786,7 +812,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *vec = &iov[seg];
- result = nfs_direct_write_schedule_segment(&desc, vec, pos);
+ result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
if (result < 0)
break;
requested_bytes += result;
@@ -814,7 +840,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos,
- size_t count)
+ size_t count, bool uio)
{
ssize_t result = -ENOMEM;
struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -832,7 +858,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos);
+ result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
if (!result)
result = nfs_direct_wait(dreq);
out_release:
@@ -863,7 +889,7 @@ out:
* cache.
*/
ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ unsigned long nr_segs, loff_t pos, bool uio)
{
ssize_t retval = -EINVAL;
struct file *file = iocb->ki_filp;
@@ -888,7 +914,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
task_io_account_read(count);
- retval = nfs_direct_read(iocb, iov, nr_segs, pos);
+ retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
if (retval > 0)
iocb->ki_pos = pos + retval;
@@ -919,7 +945,7 @@ out:
* is no atomic O_APPEND write facility in the NFS protocol.
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ unsigned long nr_segs, loff_t pos, bool uio)
{
ssize_t retval = -EINVAL;
struct file *file = iocb->ki_filp;
@@ -951,7 +977,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
task_io_account_write(count);
- retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
+ retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
if (retval > 0) {
struct inode *inode = mapping->host;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 406caac..d010335 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -194,7 +194,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
ssize_t result;
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, iov, nr_segs, pos);
+ return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
dprintk("NFS: read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -494,6 +494,20 @@ static int nfs_launder_page(struct page *page)
return nfs_wb_page(inode, page);
}
+#ifdef CONFIG_NFS_SWAP
+static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ *span = sis->pages;
+ return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+}
+
+static void nfs_swap_deactivate(struct file *file)
+{
+ xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+}
+#endif
+
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
.readpages = nfs_readpages,
@@ -508,6 +522,10 @@ const struct address_space_operations nfs_file_aops = {
.migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_NFS_SWAP
+ .swap_activate = nfs_swap_activate,
+ .swap_deactivate = nfs_swap_deactivate,
+#endif
};
/*
@@ -582,7 +600,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
size_t count = iov_length(iov, nr_segs);
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, iov, nr_segs, pos);
+ return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b23cfc1..fae495a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -477,10 +477,10 @@ extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
unsigned long);
extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
const struct iovec *iov, unsigned long nr_segs,
- loff_t pos);
+ loff_t pos, bool uio);
extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
const struct iovec *iov, unsigned long nr_segs,
- loff_t pos);
+ loff_t pos, bool uio);
/*
* linux/fs/nfs/dir.c
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 77d278d..cff40aa 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -174,6 +174,8 @@ struct rpc_xprt {
unsigned long state; /* transport state */
unsigned char shutdown : 1, /* being shut down */
resvport : 1; /* use a reserved port */
+ unsigned int swapper; /* we're swapping over this
+ transport */
unsigned int bind_index; /* bind function index */
/*
@@ -316,6 +318,7 @@ void xprt_release_rqst_cong(struct rpc_task *task);
void xprt_disconnect_done(struct rpc_xprt *xprt);
void xprt_force_disconnect(struct rpc_xprt *xprt);
void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
+int xs_swapper(struct rpc_xprt *xprt, int enable);
/*
* Reserved bit positions in xprt->state
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 9fe8857..03d03e3 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
+config SUNRPC_SWAP
+ bool
+ depends on SUNRPC
+ select NETVM
+
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism"
depends on SUNRPC && CRYPTO
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f56f045..09e71d1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -717,6 +717,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
task->tk_flags |= RPC_TASK_SOFT;
+ if (task->tk_client->cl_xprt->swapper)
+ task->tk_flags |= RPC_TASK_SWAPPER;
/* Add to the client's list of all tasks */
spin_lock(&clnt->cl_lock);
list_add_tail(&task->tk_task, &clnt->cl_tasks);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 994cfea..83a4c43 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -812,7 +812,10 @@ static void rpc_async_schedule(struct work_struct *work)
void *rpc_malloc(struct rpc_task *task, size_t size)
{
struct rpc_buffer *buf;
- gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
+ gfp_t gfp = GFP_NOWAIT;
+
+ if (RPC_IS_SWAPPER(task))
+ gfp |= __GFP_MEMALLOC;
size += sizeof(struct rpc_buffer);
if (size <= RPC_BUFFER_MAXSIZE)
@@ -886,7 +889,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
static struct rpc_task *
rpc_alloc_task(void)
{
- return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+ return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
}
/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 890b03f..3d58b92 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1930,6 +1930,45 @@ out:
xprt_wake_pending_tasks(xprt, status);
}
+#ifdef CONFIG_SUNRPC_SWAP
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+
+ if (xprt->swapper)
+ sk_set_memalloc(transport->inet);
+}
+
+/**
+ * xs_swapper - Tag this transport as being used for swap.
+ * @xprt: transport to tag
+ * @enable: enable/disable
+ *
+ */
+int xs_swapper(struct rpc_xprt *xprt, int enable)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+ int err = 0;
+
+ if (enable) {
+ xprt->swapper++;
+ xs_set_memalloc(xprt);
+ } else if (xprt->swapper) {
+ xprt->swapper--;
+ sk_clear_memalloc(transport->inet);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xs_swapper);
+#else
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+}
+#endif
+
static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1954,6 +1993,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
transport->sock = sock;
transport->inet = sk;
+ xs_set_memalloc(xprt);
+
write_unlock_bh(&sk->sk_callback_lock);
}
xs_udp_do_set_buffer_size(xprt);
@@ -1965,11 +2006,15 @@ static void xs_udp_setup_socket(struct work_struct *work)
container_of(work, struct sock_xprt, connect_worker.work);
struct rpc_xprt *xprt = &transport->xprt;
struct socket *sock = transport->sock;
+ unsigned long pflags = current->flags;
int status = -EIO;
if (xprt->shutdown)
goto out;
+ if (xprt->swapper)
+ current->flags |= PF_MEMALLOC;
+
/* Start by resetting any existing state */
xs_reset_transport(transport);
sock = xs_create_sock(xprt, transport,
@@ -1988,6 +2033,7 @@ static void xs_udp_setup_socket(struct work_struct *work)
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
}
/*
@@ -2078,6 +2124,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
if (!xprt_bound(xprt))
goto out;
+ xs_set_memalloc(xprt);
+
/* Tell the socket layer to start connecting... */
xprt->stat.connect_count++;
xprt->stat.connect_start = jiffies;
@@ -2108,11 +2156,15 @@ static void xs_tcp_setup_socket(struct work_struct *work)
container_of(work, struct sock_xprt, connect_worker.work);
struct socket *sock = transport->sock;
struct rpc_xprt *xprt = &transport->xprt;
+ unsigned long pflags = current->flags;
int status = -EIO;
if (xprt->shutdown)
goto out;
+ if (xprt->swapper)
+ current->flags |= PF_MEMALLOC;
+
if (!sock) {
clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
sock = xs_create_sock(xprt, transport,
@@ -2162,6 +2214,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EINPROGRESS:
case -EALREADY:
xprt_clear_connecting(xprt);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
return;
case -EINVAL:
/* Happens, for instance, if the user specified a link
@@ -2174,6 +2227,7 @@ out_eagain:
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
}
/**
--
1.7.9.2
^ permalink raw reply related
* [PATCH 12/12] Avoid dereferencing bd_disk during swap_entry_free for network storage
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
Commit [b3a27d: swap: Add swap slot free callback to
block_device_operations] dereferences p->bdev->bd_disk but this is a
NULL dereference if using swap-over-NFS. This patch checks SWP_BLKDEV
on the swap_info_struct before dereferencing.
With reference to this callback, Christoph Hellwig stated "Please
just remove the callback entirely. It has no user outside the staging
tree and was added clearly against the rules for that staging tree".
This would also be my preference but there was not an obvious way of
keeping zram in staging/ happy.
Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
mm/swapfile.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1d77b13..f4c802d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -549,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
/* free if no reference */
if (!usage) {
- struct gendisk *disk = p->bdev->bd_disk;
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -560,9 +559,11 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
nr_swap_pages++;
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
- if ((p->flags & SWP_BLKDEV) &&
- disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev, offset);
+ if (p->flags & SWP_BLKDEV) {
+ struct gendisk *disk = p->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev, offset);
+ }
}
return usage;
--
1.7.9.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 11/12] nfs: Prevent page allocator recursions with swap over NFS.
From: Mel Gorman @ 2012-06-29 13:33 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
Mike Christie, Eric B Munson, Sebastian Andrzej Siewior,
Mel Gorman
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>
GFP_NOFS is _more_ permissive than GFP_NOIO in that it will initiate
IO, just not of any filesystem data.
The problem is that previously NOFS was correct because that avoids
recursion into the NFS code. With swap-over-NFS, it is no longer
correct as swap IO can lead to this recursion.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
fs/nfs/pagelist.c | 2 +-
fs/nfs/write.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9ef8b3c..7de1646 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -70,7 +70,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
static inline struct nfs_page *
nfs_page_alloc(void)
{
- struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
+ struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO);
if (p)
INIT_LIST_HEAD(&p->wb_list);
return p;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 947e1e6..0f7fdb7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -52,7 +52,7 @@ static mempool_t *nfs_commit_mempool;
struct nfs_commit_data *nfs_commitdata_alloc(void)
{
- struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
+ struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
if (p) {
memset(p, 0, sizeof(*p));
@@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
struct nfs_write_header *nfs_writehdr_alloc(void)
{
- struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
+ struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
if (p) {
struct nfs_pgio_header *hdr = &p->header;
--
1.7.9.2
^ permalink raw reply related
* Re: [PATCH net-next 1/2] r8169: support RTL8106E
From: Francois Romieu @ 2012-06-29 13:50 UTC (permalink / raw)
To: Hayes Wang; +Cc: netdev, linux-kernel
In-Reply-To: <1340966060-2749-1-git-send-email-hayeswang@realtek.com>
Hayes Wang <hayeswang@realtek.com> :
[...]
> Support the new chip RTL8106E.
I'll give it a try this weekend.
Thanks.
--
Ueimor
^ permalink raw reply
* Re: [PATCH net-next 2/2] r8169: support RTL8168G
From: Francois Romieu @ 2012-06-29 13:51 UTC (permalink / raw)
To: Hayes Wang; +Cc: netdev, linux-kernel
In-Reply-To: <1340966060-2749-2-git-send-email-hayeswang@realtek.com>
Hayes Wang <hayeswang@realtek.com> :
[...]
> @@ -264,6 +267,11 @@ static const struct {
> [RTL_GIGA_MAC_VER_39] =
> _R("RTL8106e", RTL_TD_1, FIRMWARE_8106E_1,
> JUMBO_1K, true),
> + [RTL_GIGA_MAC_VER_40] =
> + _R("RTL8168g/8111g", RTL_TD_1, FIRMWARE_8168G_1,
> + JUMBO_9K, false),
> + [RTL_GIGA_MAC_VER_41] =
> + _R("RTL8168g/8111g", RTL_TD_1, NULL, JUMBO_9K, false),
You may explicitly state that jumbo operation requires no special action
by completing rtl_init_jumbo_ops.
(no checksumming with jumbo, sigh)
[...]
> static void rtl_lock_work(struct rtl8169_private *tp)
> {
> @@ -919,6 +936,99 @@ static int r8168dp_check_dash(struct rtl8169_private *tp)
> return (ocp_read(tp, 0x0f, reg) & 0x00008000) ? 1 : 0;
> }
>
> +static void r8168_phy_ocp_write(void __iomem *ioaddr, u32 reg, u32 data)
> +{
> + int i;
> +
> + if (reg & 0xffff0001)
> + BUG();
The patch adds a lot of BUG(). BUG is terrible from a system or end user
viewpoint.
Were they only a devel helper or are they still supposed to be of use
in the future ? If the latter applies, why ?
[...]
> +static u16 r8168_phy_ocp_read(void __iomem *ioaddr, u32 reg)
> +{
> + int i;
> + u32 data;
> +
> + if (reg & 0xffff0001)
> + BUG();
> +
> + RTL_W32(GPHY_OCP, (reg << 15));
You can save on parenthesis here.
[...]
> +static void r8168g_mdio_write(void __iomem *ioaddr, int reg_addr, int value)
> +{
> + if (reg_addr == 0x1f)
> + return;
> +
> + r8168_phy_ocp_write(ioaddr, 0xa400 + reg_addr * 2, value);
> +}
> +
> +static int r8168g_mdio_read(void __iomem *ioaddr, int reg_addr)
> +{
> + return r8168_phy_ocp_read(ioaddr, 0xa400 + reg_addr * 2);
> +}
#define XYZ_{BASE/OFFSET} 0xa400 ?
[...]
> @@ -2241,6 +2355,92 @@ static void rtl_phy_write_fw(struct rtl8169_private *tp, struct rtl_fw *rtl_fw)
> }
> }
>
> +static void rtl_ocp_write_fw(struct rtl8169_private *tp, struct rtl_fw *rtl_fw)
> +{
> + struct rtl_fw_phy_action *pa = &rtl_fw->phy_action;
> + void __iomem *ioaddr = tp->mmio_addr;
> + u32 predata, count;
> + u32 base_addr;
> + size_t index;
> +
> + predata = count = 0;
> + base_addr = 0xa400;
> +
> + for (index = 0; index < pa->size; ) {
> + u32 action = le32_to_cpu(pa->code[index]);
> + u32 data = action & 0x0000ffff;
> + u32 regno = (action & 0x0fff0000) >> 16;
> +
> + if (!action)
> + break;
> +
> + switch(action & 0xf0000000) {
> + case PHY_READ:
> + predata = r8168_phy_ocp_read(ioaddr,
> + base_addr + (regno -16) * 2);
> + count++;
> + index++;
> + break;
[duplicated code removed]
> + case PHY_WRITE:
> + if (regno == 0x1f)
> + base_addr = data << 4;
> + else
> + r8168_phy_ocp_write(ioaddr,
> + base_addr + (regno - 0x10) * 2,
> + data);
> + index++;
> + break;
[duplicated code removed]
> + case PHY_WRITE_PREVIOUS:
> + r8168_phy_ocp_write(ioaddr, base_addr + (regno -16) * 2,
> + predata);
> + index++;
> + break;
I can't believe that the hardware people have designed something which
needs a different firmware write method, especially as it copies a lot
of code.
How did you come to the conclusion that it was not possible to hide this
stuff behind r8168g_mdio_{read / write} ?
I would not mind replacing the PHY_{READ/WRITE/WRITE_PREVIOUS} case with
chipset specific {READ/WRITE/WRITE_PREVIOUS} methods as long as the
semantic looks the same but going through a different (*write_fw) does not
trivially seem to be the best abstraction.
[...]
> @@ -3221,6 +3421,56 @@ static void rtl8411_hw_phy_config(struct rtl8169_private *tp)
> rtl_writephy(tp, 0x1f, 0x0000);
> }
>
> +static void rtl8168g_1_hw_phy_config(struct rtl8169_private *tp)
> +{
> + void __iomem *ioaddr = tp->mmio_addr;
> + u32 mac_ocp_addr, i;
> + static const u16 mac_ocp_patch[] = {
> + 0xE008, 0xE01B, 0xE01D, 0xE01F,
> + 0xE021, 0xE023, 0xE025, 0xE027,
> + 0x49D2 ,0xF10D, 0x766C, 0x49E2,
> + 0xF00A, 0x1EC0, 0x8EE1, 0xC60A,
> + 0x77C0, 0x4870, 0x9FC0, 0x1EA0,
> + 0xC707, 0x8EE1, 0x9D6C, 0xC603,
> + 0xBE00, 0xB416, 0x0076, 0xE86C,
> + 0xC602, 0xBE00, 0x0000, 0xC602,
> + 0xBE00, 0x0000, 0xC602, 0xBE00,
> + 0x0000, 0xC602, 0xBE00, 0x0000,
> + 0xC602, 0xBE00, 0x0000, 0xC602,
> + 0xBE00, 0x0000, 0xC602, 0xBE00,
> + 0x0000, 0x0000, 0x0000, 0x0000
Please s/\(.*\)/\L\1/
> + };
> +
> + /* patch code for GPHY reset */
> + mac_ocp_addr = 0xf800;
> + for (i = 0; mac_ocp_addr < 0xf868; i++) {
> + r8168_mac_ocp_write(ioaddr, mac_ocp_addr, mac_ocp_patch[i]);
> + mac_ocp_addr += 2;
> + }
for (i = 0; i < ARRAY_SIZE(mac_ocp_patch); i++)
r8168_mac_ocp_write(ioaddr, 0xf800 + 2*i, mac_ocp_patch[i]);
The array must be correctly sized anyway. :o)
You may save a bit on the 'mac_ocp_patch' identifier and replace 0xf800 with
a #define.
> + r8168_mac_ocp_write(ioaddr, 0xfc26, 0x8000);
> + r8168_mac_ocp_write(ioaddr, 0xfc28, 0x0075);
> +
> + rtl_apply_firmware(tp);
> +
> + if (r8168_phy_ocp_read(ioaddr, 0xa460) & 0x0100)
> + rtl_w1w0_phy_ocp(ioaddr, 0xbcc4, 0x0000, 0x8000);
> + else
> + rtl_w1w0_phy_ocp(ioaddr, 0xbcc4, 0x8000, 0x0000);
> +
> + if (r8168_phy_ocp_read(ioaddr, 0xa466) & 0x0100)
> + rtl_w1w0_phy_ocp(ioaddr, 0xc41a, 0x0002, 0x0000);
> + else
> + rtl_w1w0_phy_ocp(ioaddr, 0xbcc4, 0x0000, 0x0002);
> +
> + rtl_w1w0_phy_ocp(ioaddr, 0xa442, 0x000c, 0x0000);
> + rtl_w1w0_phy_ocp(ioaddr, 0xa4b2, 0x0004, 0x0000);
> +
> + r8168_phy_ocp_write(ioaddr, 0xa436, 0x8012);
> + rtl_w1w0_phy_ocp(ioaddr, 0xa438, 0x8000, 0x0000);
> +
> + rtl_w1w0_phy_ocp(ioaddr, 0xc422, 0x4000, 0x2000);
> +}
Is there any chance for this part to be a bit more literate ?
[...]
> @@ -4921,6 +5193,28 @@ static void rtl_hw_start_8411(struct rtl8169_private *tp)
> ERIAR_EXGMAC);
> }
>
> +static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
> +{
> + void __iomem *ioaddr = tp->mmio_addr;
> + struct pci_dev *pdev = tp->pci_dev;
> +
> + rtl_eri_write(ioaddr, 0xc8, ERIAR_MASK_0101, 0x080002, ERIAR_EXGMAC);
> + rtl_eri_write(ioaddr, 0xcc, ERIAR_MASK_0001, 0x38, ERIAR_EXGMAC);
> + rtl_eri_write(ioaddr, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC);
> + rtl_eri_write(ioaddr, 0xe8, ERIAR_MASK_1111, 0x00100006, ERIAR_EXGMAC);
> + rtl_csi_access_enable_1(tp);
> + rtl_tx_performance_tweak(pdev, 0x5 << MAX_READ_REQUEST_SHIFT);
> + rtl_w1w0_eri(ioaddr, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
> + rtl_w1w0_eri(ioaddr, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
> + RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
> + RTL_W32(MISC, RTL_R32(MISC) & ~RXDV_GATED_EN);
> + RTL_W8(MaxTxPacketSize, EarlySize);
> + rtl_eri_write(ioaddr, 0xc0, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
> + rtl_eri_write(ioaddr, 0xb8, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
> + RTL_W8(EEE_LED, RTL_R8(EEE_LED) & ~0x07);
> + rtl_w1w0_eri(ioaddr, 0x2fc, ERIAR_MASK_0001, 0x01, 0x02, ERIAR_EXGMAC);
> +}
(ok, now it can be compared with similar functions)
[...]
> @@ -6491,6 +6790,47 @@ static unsigned rtl_try_msi(struct rtl8169_private *tp,
> return msi;
> }
>
> +static void __devinit rtl_hw_init_8168g(struct rtl8169_private *tp)
> +{
> + void __iomem *ioaddr = tp->mmio_addr;
> + u32 tmp_data;
> +
> + RTL_W32(MISC, RTL_R32(MISC) | RXDV_GATED_EN);
> + while (!(RTL_R32(TxConfig) & TXCFG_EMPTY))
> + udelay(100);
> +
> + while ((RTL_R8(MCU) & (TX_EMPTY | RX_EMPTY)) != (TX_EMPTY | RX_EMPTY))
> + udelay(100);
#define RXTX_EMPTY (TX_EMPTY | RX_EMPTY) ?
> +
> + RTL_W8(ChipCmd, RTL_R8(ChipCmd) & ~(CmdTxEnb | CmdRxEnb));
> + msleep(1);
> + RTL_W8(MCU, RTL_R8(MCU) & ~NOW_IS_OOB);
> +
> + tmp_data = r8168_mac_ocp_read(ioaddr, 0xe8de);
> + tmp_data &= ~(1 << 14);
> + r8168_mac_ocp_write(ioaddr, 0xe8de, tmp_data);
> + while (!(RTL_R8(MCU) & LINK_LIST_RDY))
> + udelay(100);
> +
> + tmp_data = r8168_mac_ocp_read(ioaddr, 0xe8de);
Same 0xe8de offset used twice. #define ?
> + tmp_data |= (1 << 15);
> + r8168_mac_ocp_write(ioaddr, 0xe8de, tmp_data);
> + while (!(RTL_R8(MCU) & LINK_LIST_RDY))
> + udelay(100);
> +}
> +
> +static void __devinit rtl_hw_initialize(struct rtl8169_private *tp)
> +{
> + switch (tp->mac_version) {
> + case RTL_GIGA_MAC_VER_40:
> + case RTL_GIGA_MAC_VER_41:
> + rtl_hw_init_8168g(tp);
> + break;
> + default:
> + break;
> + }
> +}
Why doesn't it belong to hw_start ?
Is it completely unneeded if the device requires a rtl8169_hw_reset,
resumes or such ?
Thanks.
--
Ueimor
^ permalink raw reply
* [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Andreas Gruenbacher @ 2012-06-29 14:53 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller
Hello,
I'm (still) trying to pass data from the network to the block layer without
copying. The block layer needs blocks to be contiguous in memory, and may have
some alignment restrictions as well. A lot of modern network hardware will
receive large packets into separate buffers, so individual large packets will
end up in contiguous, aligned buffers. I would like to make use of that, but
tcp currently doesn't allow me to control what ends up in which packets.
This patch series introduces a new flag for indicating to tcp when it should
start a new segment. Using that on the sender side, I can get data over the
network with no cpu copying at all.
[My last posting on this topic from May 8 is archived here:
http://www.spinics.net/lists/netdev/msg197788.html ]
Thanks,
Andreas
Andreas Gruenbacher (3):
tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
tcp: Zero-copy receive from a socket into a bio
fs: Export bio_release_pages()
fs/bio.c | 3 +-
include/linux/bio.h | 1 +
include/linux/socket.h | 1 +
include/net/tcp.h | 3 +
net/ipv4/Makefile | 3 +-
net/ipv4/tcp.c | 5 +-
net/ipv4/tcp_recvbio.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 180 insertions(+), 4 deletions(-)
create mode 100644 net/ipv4/tcp_recvbio.c
--
1.7.10.2
^ permalink raw reply
* [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Andreas Gruenbacher @ 2012-06-29 14:54 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller
The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
page should be put into a new packet even when there is still room left in the
previous packet.
In the tcp protocol, messages which are not sent immediately are queued. When
more data is sent, it will be added to the last segment in that queue until
that segment is "full" whenever possible; only then is a new segment added.
Right now, there is no way to indicate when tcp should start a new segment.
The new flag allows the sender to control that.
Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
---
include/linux/socket.h | 1 +
net/ipv4/tcp.c | 5 +++--
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322..be166de 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -266,6 +266,7 @@ struct ucred {
#define MSG_MORE 0x8000 /* Sender will send more */
#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */
#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
+#define MSG_NEW_PACKET 0x40000 /* tcp: try to put message into a new packet */
#define MSG_EOF MSG_FIN
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f..148aebe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -854,7 +854,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
int size = min_t(size_t, psize, PAGE_SIZE - offset);
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+ if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+ (flags & MSG_NEW_PACKET)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -1044,7 +1045,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
copy = max - skb->len;
}
- if (copy <= 0) {
+ if (copy <= 0 || (flags & MSG_NEW_PACKET)) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
--
1.7.10.2
^ permalink raw reply related
* [RFC] [TCP 2/3] tcp: Zero-copy receive from a socket into a bio
From: Andreas Gruenbacher @ 2012-06-29 14:55 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller
"Receive" data from a tcp socket by directly mapping sectors in the socket receive
buffers into a bio without copying. This requires that the receive buffer
contains contiguous sectors which are well-enough aligned for the block device
associated with the bio.
Any data that cannot be mapped into the bio is left in the socket receive
buffers and can be received conventionally, by copying it out of the buffers.
Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
---
include/net/tcp.h | 3 +
net/ipv4/Makefile | 3 +-
net/ipv4/tcp_recvbio.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 173 insertions(+), 1 deletion(-)
create mode 100644 net/ipv4/tcp_recvbio.c
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e79aa48..c4d924b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -538,6 +538,9 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
+/* tcp_recvbio.c */
+extern int tcp_recvbio(struct sock *sk, struct bio *bio, size_t size);
+
extern void tcp_initialize_rcv_mss(struct sock *sk);
extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3b..7ee9f92 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -11,7 +11,8 @@ obj-y := route.o inetpeer.o protocol.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
- inet_fragment.o ping.o
+ inet_fragment.o ping.o \
+ tcp_recvbio.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/tcp_recvbio.c b/net/ipv4/tcp_recvbio.c
new file mode 100644
index 0000000..4d6f833
--- /dev/null
+++ b/net/ipv4/tcp_recvbio.c
@@ -0,0 +1,168 @@
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+static int tcp_recvbio_add(struct bio *bio, struct sk_buff *skb,
+ struct bio_vec *last)
+{
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ unsigned short vcnt = bio->bi_vcnt;
+ int ret;
+
+ if (vcnt == queue_max_segments(q))
+ return 0;
+ if (!blk_rq_aligned(q, last->bv_offset, last->bv_len))
+ return -EOPNOTSUPP;
+ ret = bio_add_page(bio, last->bv_page, last->bv_len, last->bv_offset);
+ if (vcnt != bio->bi_vcnt)
+ get_page(last->bv_page);
+ return ret;
+}
+
+static int tcp_recvbio_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct bio *bio = rd_desc->arg.data;
+ int start = skb_headlen(skb), consumed = 0, frag_len, i;
+ struct sk_buff *frag_iter;
+ struct bio_vec last = { };
+ int ret = 0;
+
+ if (offset > (int)skb->len - len)
+ return -EFAULT;
+
+ /* Do not consume more data than we need. */
+ if (len > rd_desc->count)
+ len = rd_desc->count;
+
+ /* Head of the skb */
+ frag_len = start - offset;
+ if (frag_len > 0) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ frag_len = end - offset;
+ if (frag_len > 0) {
+ if (frag_len > len)
+ frag_len = len;
+
+ last.bv_page = skb_frag_page(frag);
+ last.bv_offset = frag->page_offset + offset - start;
+ last.bv_len = frag_len;
+ ret = tcp_recvbio_add(bio, skb, &last);
+ if (ret <= 0)
+ goto out;
+ consumed += frag_len;
+ len -= frag_len;
+ if (!len)
+ break;
+ offset += frag_len;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ frag_len = end - offset;
+ if (frag_len > 0) {
+ if (frag_len > len)
+ frag_len = len;
+
+ ret = tcp_recvbio_data(rd_desc, frag_iter, offset -
+ start, frag_len);
+ if (ret <= 0)
+ goto out;
+ consumed += frag_len;
+ len -= frag_len;
+ if (!len)
+ break;
+ offset += frag_len;
+ }
+ start = end;
+ }
+
+out:
+ rd_desc->written += consumed;
+ rd_desc->count -= consumed;
+ return consumed ? consumed : ret;
+}
+
+/**
+ * tcp_recvbio - zero-copy receive from a socket into a bio
+ * @sk: socket to receive from
+ * @bio: empty bio to receive into
+ * @size: number of bytes to receive
+ *
+ * Directly add page fragments from @sk's receive buffer to @bio. The page
+ * fragments are held referenced with get_page(). Release those references
+ * with bio_release_pages() when done.
+ *
+ * Returns the number of bytes received into @bio.
+ */
+int tcp_recvbio(struct sock *sk, struct bio *bio, size_t size)
+{
+ long timeo = sock_rcvtimeo(sk, 0);
+ read_descriptor_t rd_desc = {
+ .count = size,
+ .arg = { .data = bio },
+ };
+ int ret = 0;
+
+ BUG_ON(bio->bi_idx != 0);
+
+ lock_sock(sk);
+ while (rd_desc.count) {
+ read_lock(&sk->sk_callback_lock);
+ ret = tcp_read_sock(sk, &rd_desc, tcp_recvbio_data);
+ read_unlock(&sk->sk_callback_lock);
+ if (ret < 0)
+ break;
+ else if (ret > 0)
+ timeo = sock_rcvtimeo(sk, 0);
+ else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ if (!sock_flag(sk, SOCK_DONE))
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ sk_wait_data(sk, &timeo);
+ if (signal_pending(current)) {
+ ret = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+ break;
+ }
+ timeo = 0;
+ }
+ }
+ release_sock(sk);
+ return rd_desc.written ? rd_desc.written : ret;
+}
+EXPORT_SYMBOL(tcp_recvbio);
--
1.7.10.2
^ permalink raw reply related
* [RFC] [TCP 3/3] fs: Export bio_release_pages()
From: Andreas Gruenbacher @ 2012-06-29 14:56 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: Herbert Xu, David S. Miller
Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
---
fs/bio.c | 3 ++-
include/linux/bio.h | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/bio.c b/fs/bio.c
index 73922ab..90501a5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1335,7 +1335,7 @@ void bio_set_pages_dirty(struct bio *bio)
}
}
-static void bio_release_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio)
{
struct bio_vec *bvec = bio->bi_io_vec;
int i;
@@ -1347,6 +1347,7 @@ static void bio_release_pages(struct bio *bio)
put_page(page);
}
}
+EXPORT_SYMBOL(bio_release_pages);
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2643589..268ec49 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -246,6 +246,7 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
gfp_t, int);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
+extern void bio_release_pages(struct bio *bio);
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
--
1.7.10.2
^ permalink raw reply related
* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Eric Dumazet @ 2012-06-29 15:08 UTC (permalink / raw)
To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340981632.25226.2.camel@gurkel.linbit>
On Fri, 2012-06-29 at 16:53 +0200, Andreas Gruenbacher wrote:
> Hello,
>
> I'm (still) trying to pass data from the network to the block layer without
> copying. The block layer needs blocks to be contiguous in memory, and may have
> some alignment restrictions as well. A lot of modern network hardware will
> receive large packets into separate buffers, so individual large packets will
> end up in contiguous, aligned buffers. I would like to make use of that, but
> tcp currently doesn't allow me to control what ends up in which packets.
>
> This patch series introduces a new flag for indicating to tcp when it should
> start a new segment. Using that on the sender side, I can get data over the
> network with no cpu copying at all.
>
> [My last posting on this topic from May 8 is archived here:
> http://www.spinics.net/lists/netdev/msg197788.html ]
>
> Thanks,
> Andreas
>
> Andreas Gruenbacher (3):
> tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
> tcp: Zero-copy receive from a socket into a bio
> fs: Export bio_release_pages()
This looks like yet another zero-copy mechanism, needing another couple of
hundred lines.
Why doesn't the splice infrastructure fit your needs?
^ permalink raw reply
* [patch net-next v2 0/4] net: introduce and use IFF_LIVE_ADDR_CHANGE
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
davem
three drivers updated, but this can be used in many others.
v1->v2:
%s/LIFE/LIVE
Jiri Pirko (4):
net: introduce new priv_flag indicating iface capable of change mac
when running
virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
team: use IFF_LIVE_ADDR_CHANGE priv_flag
dummy: use IFF_LIVE_ADDR_CHANGE priv_flag
drivers/net/dummy.c | 15 ++-------------
drivers/net/team/team.c | 9 +++++----
drivers/net/virtio_net.c | 11 +++++------
include/linux/if.h | 2 ++
net/ethernet/eth.c | 2 +-
5 files changed, 15 insertions(+), 24 deletions(-)
--
1.7.10.4
^ permalink raw reply
* [patch net-next v2 1/4] net: introduce new priv_flag indicating iface capable of change mac when running
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
To: netdev; +Cc: mst, shimoda.hiroaki, virtualization, danny.kukawka, edumazet,
davem
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>
Introduce IFF_LIVE_ADDR_CHANGE priv_flag and use it to disable
netif_running() check in eth_mac_addr()
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
include/linux/if.h | 2 ++
net/ethernet/eth.c | 2 +-
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/if.h b/include/linux/if.h
index f995c66..1ec407b 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -81,6 +81,8 @@
#define IFF_UNICAST_FLT 0x20000 /* Supports unicast filtering */
#define IFF_TEAM_PORT 0x40000 /* device used as team port */
#define IFF_SUPP_NOFCS 0x80000 /* device supports sending custom FCS */
+#define IFF_LIVE_ADDR_CHANGE 0x100000 /* device supports hardware address
+ * change when it's running */
#define IF_GET_IFACE 0x0001 /* for querying only */
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 36e5880..db6a6c1 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -283,7 +283,7 @@ int eth_mac_addr(struct net_device *dev, void *p)
{
struct sockaddr *addr = p;
- if (netif_running(dev))
+ if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
return -EBUSY;
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
--
1.7.10.4
^ permalink raw reply related
* [patch net-next v2 2/4] virtio_net: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
To: netdev
Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/virtio_net.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 36a16d5..1db445b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -679,12 +679,11 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
struct virtnet_info *vi = netdev_priv(dev);
struct virtio_device *vdev = vi->vdev;
- struct sockaddr *addr = p;
+ int ret;
- if (!is_valid_ether_addr(addr->sa_data))
- return -EADDRNOTAVAIL;
- memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
- dev->addr_assign_type &= ~NET_ADDR_RANDOM;
+ ret = eth_mac_addr(dev, p);
+ if (ret)
+ return ret;
if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
vdev->config->set(vdev, offsetof(struct virtio_net_config, mac),
@@ -1063,7 +1062,7 @@ static int virtnet_probe(struct virtio_device *vdev)
return -ENOMEM;
/* Set up network device as normal. */
- dev->priv_flags |= IFF_UNICAST_FLT;
+ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
dev->netdev_ops = &virtnet_netdev;
dev->features = NETIF_F_HIGHDMA;
--
1.7.10.4
^ permalink raw reply related
* [patch net-next v2 3/4] team: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
To: netdev
Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/team/team.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 89853c3..9b94f53 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1188,10 +1188,11 @@ static int team_set_mac_address(struct net_device *dev, void *p)
{
struct team *team = netdev_priv(dev);
struct team_port *port;
- struct sockaddr *addr = p;
+ int err;
- dev->addr_assign_type &= ~NET_ADDR_RANDOM;
- memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+ err = eth_mac_addr(dev, p);
+ if (err)
+ return err;
rcu_read_lock();
list_for_each_entry_rcu(port, &team->port_list, list)
if (team->ops.port_change_mac)
@@ -1393,7 +1394,7 @@ static void team_setup(struct net_device *dev)
* bring us to promisc mode in case a unicast addr is added.
* Let this up to underlay drivers.
*/
- dev->priv_flags |= IFF_UNICAST_FLT;
+ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
dev->features |= NETIF_F_LLTX;
dev->features |= NETIF_F_GRO;
--
1.7.10.4
^ permalink raw reply related
* [patch net-next v2 4/4] dummy: use IFF_LIVE_ADDR_CHANGE priv_flag
From: Jiri Pirko @ 2012-06-29 15:10 UTC (permalink / raw)
To: netdev
Cc: davem, rusty, mst, virtualization, edumazet, danny.kukawka,
shimoda.hiroaki
In-Reply-To: <1340982608-897-1-git-send-email-jpirko@redhat.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/dummy.c | 15 ++-------------
1 file changed, 2 insertions(+), 13 deletions(-)
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index bab0158..9d6a067 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -40,18 +40,6 @@
static int numdummies = 1;
-static int dummy_set_address(struct net_device *dev, void *p)
-{
- struct sockaddr *sa = p;
-
- if (!is_valid_ether_addr(sa->sa_data))
- return -EADDRNOTAVAIL;
-
- dev->addr_assign_type &= ~NET_ADDR_RANDOM;
- memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
- return 0;
-}
-
/* fake multicast ability */
static void set_multicast_list(struct net_device *dev)
{
@@ -118,7 +106,7 @@ static const struct net_device_ops dummy_netdev_ops = {
.ndo_start_xmit = dummy_xmit,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_rx_mode = set_multicast_list,
- .ndo_set_mac_address = dummy_set_address,
+ .ndo_set_mac_address = eth_mac_addr,
.ndo_get_stats64 = dummy_get_stats64,
};
@@ -134,6 +122,7 @@ static void dummy_setup(struct net_device *dev)
dev->tx_queue_len = 0;
dev->flags |= IFF_NOARP;
dev->flags &= ~IFF_MULTICAST;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO;
dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
eth_hw_addr_random(dev);
--
1.7.10.4
^ permalink raw reply related
* Re: [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Eric Dumazet @ 2012-06-29 15:11 UTC (permalink / raw)
To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340981690.25226.3.camel@gurkel.linbit>
On Fri, 2012-06-29 at 16:54 +0200, Andreas Gruenbacher wrote:
> The MSG_NEW_PACKET flag indicates to sendmsg / sendpage that the message or
> page should be put into a new packet even when there is still room left in the
> previous packet.
>
> In the tcp protocol, messages which are not sent immediately are queued. When
> more data is sent, it will be added to the last segment in that queue until
> that segment is "full" whenever possible; only then is a new segment added.
> Right now, there is no way to indicate when tcp should start a new segment.
> The new flag allows to control that.
>
> Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
> ---
I don't understand how maintaining message boundaries at the sender can
prevent a middlebox or the receiver from coalescing frames to whatever
boundaries it prefers?
^ permalink raw reply
* Re: "ADDRCONF(NETDEV_UP): eth0: link is not ready" with IPv6
From: Ben Hutchings @ 2012-06-29 15:24 UTC (permalink / raw)
To: Arvid Brodin; +Cc: netdev@vger.kernel.org, Alexey Kuznetsov, Stephen Hemminger
In-Reply-To: <4FED14C2.9020200@xdin.com>
On Fri, 2012-06-29 at 02:36 +0000, Arvid Brodin wrote:
> Hi,
>
> After 'ip link set eth0 up' on an avr32 board (network driver macb), the device ends up in
> operational mode "UNKNOWN":
>
> # ip link
> 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN qlen 1000
> link/ether 00:24:74:00:17:9d brd ff:ff:ff:ff:ff:ff
>
> Unplugging and plugging in the network cable gets the device to mode "UP".
>
> This is a problem for me because I'm trying to use this device as a "slave" device (for a
> virtual HSR device*) and I need to be able to decide if the slave device is operational or
> not.
>
> Following Stephen's advice here:
> http://kerneltrap.org/mailarchive/linux-netdev/2008/9/24/3398834 I checked the macb.c code
> and noticed they call netif_carrier_off() neither before register_netdev() nor in
> dev_open().
It should be called after register_netdev() and before the driver's
ndo_open implementation returns.
> I added the call before register_netdev(), which fixed the problem. However, if I then
> enable IPv6:
>
> # ip link set eth0 up
> ADDRCONF(NETDEV_UP): eth0: link is not ready
> eth0: link up (100/Full)
> ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
This looks normal.
> Any idea what is happening / what I'm doing wrong? (This is not just cosmetic; in some
> situations this seems to kill the interface - e.g. ping does not work, down/up does not
> help...) Things work fine without IPv6 configured.
Perhaps some packets sent automatically by IPv6 are triggering a driver
bug? Or there is a bug in multicast support, which IPv6 always uses.
Ben.
> *N.B. I'm writing a driver for a network protocol called "High-availability Seamless
> Redundancy".
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* [PATCH 0/5] netfilter fixes for 3.5-rc4
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Pablo Neira Ayuso <pablo@netfilter.org>
Hi David,
The following are 4 fixes and the update of the MAINTAINERS file
to point to my Netfilter trees.
They are:
* One refcount leak fix in IPVS IPv6 support from Eric Dumazet.
* One fix for interface comparison in ipset hash-netiface sets
from Florian Westphal.
* One fix for a missing rcu_read_unlock in nfnetlink from
Tomasz Bursztyka.
* One fix for a kernel crash if IPSET_CMD_NONE is set to ipset via
nfnetlink, again from Tomasz Bursztyka.
You can pull these changes from:
git://1984.lsi.us.es/nf master
Thanks!
Eric Dumazet (1):
netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6
Florian Westphal (1):
netfilter: ipset: fix interface comparision in hash-netiface sets
Pablo Neira Ayuso (1):
netfilter: update location of my trees
Tomasz Bursztyka (2):
netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent
netfilter: nfnetlink: fix missing rcu_read_unlock in nfnetlink_rcv_msg
MAINTAINERS | 4 ++--
net/netfilter/ipset/ip_set_core.c | 12 +++++++++++
net/netfilter/ipset/ip_set_hash_netiface.c | 32 ++++------------------------
net/netfilter/ipvs/ip_vs_ctl.c | 14 ++++++------
net/netfilter/nfnetlink.c | 4 +++-
5 files changed, 28 insertions(+), 38 deletions(-)
--
1.7.10
^ permalink raw reply
* [PATCH 2/5] netfilter: ipvs: fix dst leak in __ip_vs_addr_is_local_v6
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>
From: Eric Dumazet <edumazet@google.com>
After the call to ip6_route_output() we must release dst or we leak it.
We should also test dst->error, as ip6_route_output() never returns NULL.
Use boolean while we are at it.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipvs/ip_vs_ctl.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index dd811b8..d43e3c1 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -76,19 +76,19 @@ static void __ip_vs_del_service(struct ip_vs_service *svc);
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(struct net *net,
- const struct in6_addr *addr)
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
{
- struct rt6_info *rt;
struct flowi6 fl6 = {
.daddr = *addr,
};
+ struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+ bool is_local;
- rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
- if (rt && rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
- return 1;
+ is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
- return 0;
+ dst_release(dst);
+ return is_local;
}
#endif
--
1.7.10
^ permalink raw reply related
* [PATCH 4/5] netfilter: ipset: fix crash if IPSET_CMD_NONE command is sent
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>
From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
This patch fixes a crash that occurs when the IPSET_CMD_NONE command is sent to ipset over nfnetlink.
Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipset/ip_set_core.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 819c342..9730882 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -640,6 +640,14 @@ find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
}
static int
+ip_set_none(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ return -EOPNOTSUPP;
+}
+
+static int
ip_set_create(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const attr[])
@@ -1539,6 +1547,10 @@ nlmsg_failure:
}
static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+ [IPSET_CMD_NONE] = {
+ .call = ip_set_none,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ },
[IPSET_CMD_CREATE] = {
.call = ip_set_create,
.attr_count = IPSET_ATTR_CMD_MAX,
--
1.7.10
^ permalink raw reply related
* [PATCH 3/5] netfilter: update location of my trees
From: pablo @ 2012-06-29 15:37 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1340984255-738-1-git-send-email-pablo@netfilter.org>
From: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
MAINTAINERS | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index f6e62de..302aa00 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4654,8 +4654,8 @@ L: netfilter@vger.kernel.org
L: coreteam@netfilter.org
W: http://www.netfilter.org/
W: http://www.iptables.org/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-2.6.git
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next-2.6.git
+T: git git://1984.lsi.us.es/nf
+T: git git://1984.lsi.us.es/nf-next
S: Supported
F: include/linux/netfilter*
F: include/linux/netfilter/
--
1.7.10
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox