linux-nfs.vger.kernel.org archive mirror
* NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
@ 2020-11-24 21:56 David Wysochanski
  2020-11-25  1:07 ` Trond Myklebust
  0 siblings, 1 reply; 8+ messages in thread
From: David Wysochanski @ 2020-11-24 21:56 UTC (permalink / raw)
  To: linux-nfs

I've started seeing this failure since testing 5.10-rc4 - this does
not happen on 5.9


f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs' registered for caching
[  125.046104] Key type dns_resolver registered
[  125.770354] NFS: Registering the id_resolver key type
[  125.780599] Key type id_resolver registered
[  125.782440] Key type id_legacy registered
[  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
[  178.736479]
[  178.751380] =============================
[  178.753249] [ BUG: Invalid wait context ]
[  178.754886] 5.10.0-rc4 #127 Not tainted
[  178.756423] -----------------------------
[  178.758055] kworker/1:2/848 is trying to lock:
[  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3}, at:
get_page_from_freelist+0x897/0x2190
[  178.763333] other info that might help us debug this:
[  178.765354] context-{5:5}
[  178.766437] 3 locks held by kworker/1:2/848:
[  178.768158]  #0: ffff8946ce825538
((wq_completion)nfsiod){+.+.}-{0:0}, at: process_one_work+0x1be/0x540
[  178.771871]  #1: ffff9e6b408f7e58
((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
process_one_work+0x1be/0x540
[  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2}, at:
kvfree_call_rcu+0x69/0x230
[  178.779803] stack backtrace:
[  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump: loaded Not
tainted 5.10.0-rc4 #127
[  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
[  178.789308] Call Trace:
[  178.790386]  dump_stack+0x8d/0xb5
[  178.791816]  __lock_acquire.cold+0x20b/0x2c8
[  178.793605]  lock_acquire+0xca/0x380
[  178.795113]  ? get_page_from_freelist+0x897/0x2190
[  178.797116]  _raw_spin_lock+0x2c/0x40
[  178.798638]  ? get_page_from_freelist+0x897/0x2190
[  178.800620]  get_page_from_freelist+0x897/0x2190
[  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
[  178.804416]  __get_free_pages+0xd/0x30
[  178.805987]  kvfree_call_rcu+0x168/0x230
[  178.807687]  nfs_free_request+0xab/0x180 [nfs]
[  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
[  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
[  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
[  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
[  178.817451]  process_one_work+0x23e/0x540
[  178.819136]  worker_thread+0x50/0x3a0
[  178.820657]  ? process_one_work+0x540/0x540
[  178.822427]  kthread+0x10f/0x150
[  178.823805]  ? kthread_park+0x90/0x90
[  178.825339]  ret_from_fork+0x22/0x30


# eu-addr2line -e ./fs/nfs/nfs.ko nfs_free_request+0xab
fs/nfs/pagelist.c:559:24

    541 static void nfs_clear_request(struct nfs_page *req)
    542 {
    543         struct page *page = req->wb_page;
    544         struct nfs_lock_context *l_ctx = req->wb_lock_context;
    545         struct nfs_open_context *ctx;
    546
    547         if (page != NULL) {
    548                 put_page(page);
    549                 req->wb_page = NULL;
    550         }
    551         if (l_ctx != NULL) {
    552                 if (atomic_dec_and_test(&l_ctx->io_count)) {
    553                         wake_up_var(&l_ctx->io_count);
    554                         ctx = l_ctx->open_context;
    555                         if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
    556                                 rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
    557                 }
    558                 nfs_put_lock_context(l_ctx);
    559                 req->wb_lock_context = NULL;
    560         }
    561 }

    924 void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
    925 {
    926         struct nfs_open_context *ctx = l_ctx->open_context;
    927         struct inode *inode = d_inode(ctx->dentry);
    928
    929         if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
    930                 return;
    931         list_del_rcu(&l_ctx->list);
    932         spin_unlock(&inode->i_lock);
    933         put_nfs_open_context(ctx);
    934         kfree_rcu(l_ctx, rcu_head);
    935 }
    936 EXPORT_SYMBOL_GPL(nfs_put_lock_context);


# eu-addr2line -e ./vmlinux get_page_from_freelist+0x897
mm/page_alloc.c:2887:2

   2875 /*
   2876  * Obtain a specified number of elements from the buddy allocator, all under
   2877  * a single hold of the lock, for efficiency.  Add them to the supplied list.
   2878  * Returns the number of new pages which were placed at *list.
   2879  */
   2880 static int rmqueue_bulk(struct zone *zone, unsigned int order,
   2881                         unsigned long count, struct list_head *list,
   2882                         int migratetype, unsigned int alloc_flags)
   2883 {
   2884         int i, alloced = 0;
   2885
   2886         spin_lock(&zone->lock);
   2887         for (i = 0; i < count; ++i) {
   2888                 struct page *page = __rmqueue(zone, order, migratetype,
   2889                                                         alloc_flags);



# eu-addr2line -e ./vmlinux kvfree_call_rcu+0x168
kernel/rcu/tree.c:3391:6

   3362         /* Check if a new block is required. */
   3363         if (!krcp->bkvhead[idx] ||
   3364                         krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
   3365                 bnode = get_cached_bnode(krcp);
   3366                 if (!bnode) {
   3367                         /*
   3368                          * To keep this path working on raw non-preemptible
   3369                          * sections, prevent the optional entry into the
   3370                          * allocator as it uses sleeping locks. In fact, even
   3371                          * if the caller of kfree_rcu() is preemptible, this
   3372                          * path still is not, as krcp->lock is a raw spinlock.
   3373                          * With additional page pre-allocation in the works,
   3374                          * hitting this return is going to be much less likely.
   3375                          */
   3376                         if (IS_ENABLED(CONFIG_PREEMPT_RT))
   3377                                 return false;
   3378
   3379                         /*
   3380                          * NOTE: For one argument of kvfree_rcu() we can
   3381                          * drop the lock and get the page in sleepable
   3382                          * context. That would allow to maintain an array
   3383                          * for the CONFIG_PREEMPT_RT as well if no cached
   3384                          * pages are available.
   3385                          */
   3386                         bnode = (struct kvfree_rcu_bulk_data *)
   3387                                 __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
   3388                 }
   3389
   3390                 /* Switch to emergency path. */
   3391                 if (unlikely(!bnode))
   3392                         return false;



* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-24 21:56 NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context David Wysochanski
@ 2020-11-25  1:07 ` Trond Myklebust
  2020-11-25  1:28   ` David Wysochanski
  0 siblings, 1 reply; 8+ messages in thread
From: Trond Myklebust @ 2020-11-25  1:07 UTC (permalink / raw)
  To: linux-nfs@vger.kernel.org, dwysocha@redhat.com

On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> I've started seeing this failure since testing 5.10-rc4 - this does
> not happen on 5.9
> 
> 
> f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs' registered for
> caching
> [  125.046104] Key type dns_resolver registered
> [  125.770354] NFS: Registering the id_resolver key type
> [  125.780599] Key type id_resolver registered
> [  125.782440] Key type id_legacy registered
> [  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
> [  178.736479]
> [  178.751380] =============================
> [  178.753249] [ BUG: Invalid wait context ]
> [  178.754886] 5.10.0-rc4 #127 Not tainted
> [  178.756423] -----------------------------
> [  178.758055] kworker/1:2/848 is trying to lock:
> [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3}, at:
> get_page_from_freelist+0x897/0x2190
> [  178.763333] other info that might help us debug this:
> [  178.765354] context-{5:5}
> [  178.766437] 3 locks held by kworker/1:2/848:
> [  178.768158]  #0: ffff8946ce825538
> ((wq_completion)nfsiod){+.+.}-{0:0}, at: process_one_work+0x1be/0x540
> [  178.771871]  #1: ffff9e6b408f7e58
> ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> process_one_work+0x1be/0x540
> [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2}, at:
> kvfree_call_rcu+0x69/0x230
> [  178.779803] stack backtrace:
> [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump: loaded Not
> tainted 5.10.0-rc4 #127
> [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
> [  178.789308] Call Trace:
> [  178.790386]  dump_stack+0x8d/0xb5
> [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> [  178.793605]  lock_acquire+0xca/0x380
> [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> [  178.797116]  _raw_spin_lock+0x2c/0x40
> [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> [  178.800620]  get_page_from_freelist+0x897/0x2190
> [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> [  178.804416]  __get_free_pages+0xd/0x30
> [  178.805987]  kvfree_call_rcu+0x168/0x230
> [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> [  178.817451]  process_one_work+0x23e/0x540
> [  178.819136]  worker_thread+0x50/0x3a0
> [  178.820657]  ? process_one_work+0x540/0x540
> [  178.822427]  kthread+0x10f/0x150
> [  178.823805]  ? kthread_park+0x90/0x90
> [  178.825339]  ret_from_fork+0x22/0x30
> 

I can't think of any changes that might have caused this. Is this
NFSv3, v4 or other? I haven't been seeing any of this.

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-25  1:07 ` Trond Myklebust
@ 2020-11-25  1:28   ` David Wysochanski
  2020-11-25  1:33     ` Trond Myklebust
  0 siblings, 1 reply; 8+ messages in thread
From: David Wysochanski @ 2020-11-25  1:28 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: linux-nfs@vger.kernel.org

On Tue, Nov 24, 2020 at 8:07 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> > I've started seeing this failure since testing 5.10-rc4 - this does
> > not happen on 5.9
> >
> >
> > f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs' registered for
> > caching
> > [  125.046104] Key type dns_resolver registered
> > [  125.770354] NFS: Registering the id_resolver key type
> > [  125.780599] Key type id_resolver registered
> > [  125.782440] Key type id_legacy registered
> > [  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
> > [  178.736479]
> > [  178.751380] =============================
> > [  178.753249] [ BUG: Invalid wait context ]
> > [  178.754886] 5.10.0-rc4 #127 Not tainted
> > [  178.756423] -----------------------------
> > [  178.758055] kworker/1:2/848 is trying to lock:
> > [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3}, at:
> > get_page_from_freelist+0x897/0x2190
> > [  178.763333] other info that might help us debug this:
> > [  178.765354] context-{5:5}
> > [  178.766437] 3 locks held by kworker/1:2/848:
> > [  178.768158]  #0: ffff8946ce825538
> > ((wq_completion)nfsiod){+.+.}-{0:0}, at: process_one_work+0x1be/0x540
> > [  178.771871]  #1: ffff9e6b408f7e58
> > ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> > process_one_work+0x1be/0x540
> > [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2}, at:
> > kvfree_call_rcu+0x69/0x230
> > [  178.779803] stack backtrace:
> > [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump: loaded Not
> > tainted 5.10.0-rc4 #127
> > [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> > [  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
> > [  178.789308] Call Trace:
> > [  178.790386]  dump_stack+0x8d/0xb5
> > [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> > [  178.793605]  lock_acquire+0xca/0x380
> > [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> > [  178.797116]  _raw_spin_lock+0x2c/0x40
> > [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> > [  178.800620]  get_page_from_freelist+0x897/0x2190
> > [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> > [  178.804416]  __get_free_pages+0xd/0x30
> > [  178.805987]  kvfree_call_rcu+0x168/0x230
> > [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> > [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> > [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> > [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> > [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> > [  178.817451]  process_one_work+0x23e/0x540
> > [  178.819136]  worker_thread+0x50/0x3a0
> > [  178.820657]  ? process_one_work+0x540/0x540
> > [  178.822427]  kthread+0x10f/0x150
> > [  178.823805]  ? kthread_park+0x90/0x90
> > [  178.825339]  ret_from_fork+0x22/0x30
> >
>
> I can't think of any changes that might have caused this. Is this
> NFSv3, v4 or other? I haven't been seeing any of this.
>

It is NFSv4.1 or NFSv4.2.  I am running the xfstests NFS client against
an older, RHEL7-based server (3.10.0-1127.8.2.el7.x86_64), though I'm not
sure if that matters.
My config has these:
CONFIG_LOCK_DEBUGGING_SUPPORT=y
CONFIG_PROVE_LOCKING=y
CONFIG_PROVE_RAW_LOCK_NESTING=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_LOCKDEP=y
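
FWIW, a quick way to double-check those options on the running kernel,
assuming CONFIG_IKCONFIG_PROC is enabled (otherwise grep the build's
.config):

# zgrep -E 'LOCKDEP=|PROVE_LOCKING|PROVE_RAW_LOCK_NESTING|DEBUG_SPINLOCK' /proc/config.gz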



* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-25  1:28   ` David Wysochanski
@ 2020-11-25  1:33     ` Trond Myklebust
  2020-11-25  1:43       ` David Wysochanski
  0 siblings, 1 reply; 8+ messages in thread
From: Trond Myklebust @ 2020-11-25  1:33 UTC (permalink / raw)
  To: dwysocha@redhat.com; +Cc: linux-nfs@vger.kernel.org

On Tue, 2020-11-24 at 20:28 -0500, David Wysochanski wrote:
> On Tue, Nov 24, 2020 at 8:07 PM Trond Myklebust <
> trondmy@hammerspace.com> wrote:
> > 
> > On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> > > I've started seeing this failure since testing 5.10-rc4 - this
> > > does
> > > not happen on 5.9
> > > 
> > > 
> > > f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs' registered
> > > for
> > > caching
> > > [  125.046104] Key type dns_resolver registered
> > > [  125.770354] NFS: Registering the id_resolver key type
> > > [  125.780599] Key type id_resolver registered
> > > [  125.782440] Key type id_legacy registered
> > > [  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
> > > [  178.736479]
> > > [  178.751380] =============================
> > > [  178.753249] [ BUG: Invalid wait context ]
> > > [  178.754886] 5.10.0-rc4 #127 Not tainted
> > > [  178.756423] -----------------------------
> > > [  178.758055] kworker/1:2/848 is trying to lock:
> > > [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3}, at:
> > > get_page_from_freelist+0x897/0x2190
> > > [  178.763333] other info that might help us debug this:
> > > [  178.765354] context-{5:5}
> > > [  178.766437] 3 locks held by kworker/1:2/848:
> > > [  178.768158]  #0: ffff8946ce825538
> > > ((wq_completion)nfsiod){+.+.}-{0:0}, at:
> > > process_one_work+0x1be/0x540
> > > [  178.771871]  #1: ffff9e6b408f7e58
> > > ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> > > process_one_work+0x1be/0x540
> > > [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2}, at:
> > > kvfree_call_rcu+0x69/0x230
> > > [  178.779803] stack backtrace:
> > > [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump: loaded
> > > Not
> > > tainted 5.10.0-rc4 #127
> > > [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> > > [  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
> > > [  178.789308] Call Trace:
> > > [  178.790386]  dump_stack+0x8d/0xb5
> > > [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> > > [  178.793605]  lock_acquire+0xca/0x380
> > > [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> > > [  178.797116]  _raw_spin_lock+0x2c/0x40
> > > [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> > > [  178.800620]  get_page_from_freelist+0x897/0x2190
> > > [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> > > [  178.804416]  __get_free_pages+0xd/0x30
> > > [  178.805987]  kvfree_call_rcu+0x168/0x230
> > > [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> > > [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> > > [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> > > [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> > > [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> > > [  178.817451]  process_one_work+0x23e/0x540
> > > [  178.819136]  worker_thread+0x50/0x3a0
> > > [  178.820657]  ? process_one_work+0x540/0x540
> > > [  178.822427]  kthread+0x10f/0x150
> > > [  178.823805]  ? kthread_park+0x90/0x90
> > > [  178.825339]  ret_from_fork+0x22/0x30
> > > 
> > 
> > I can't think of any changes that might have caused this. Is this
> > NFSv3, v4 or other? I haven't been seeing any of this.
> > 
> 
> It is NFSv4.1 or NFS4.2.  I am running the xfstests NFS client
> against
> an older server, RHEL7 based (3.10.0-1127.8.2.el7.x86_64) though not
> sure if that matters.
> My config has these:
> CONFIG_LOCK_DEBUGGING_SUPPORT=y
> CONFIG_PROVE_LOCKING=y
> CONFIG_PROVE_RAW_LOCK_NESTING=y
> CONFIG_DEBUG_SPINLOCK=y
> CONFIG_DEBUG_LOCK_ALLOC=y
> CONFIG_LOCKDEP=y
> 
That helps. It means we can't blame the new READ_PLUS code, since it
would be completely disabled here.
Are you using any special rsize values? Also, could pNFS be involved
(e.g. the pNFS block/scsi code)?


-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-25  1:33     ` Trond Myklebust
@ 2020-11-25  1:43       ` David Wysochanski
  2020-11-25  1:47         ` Trond Myklebust
  0 siblings, 1 reply; 8+ messages in thread
From: David Wysochanski @ 2020-11-25  1:43 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: linux-nfs@vger.kernel.org

On Tue, Nov 24, 2020 at 8:33 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Tue, 2020-11-24 at 20:28 -0500, David Wysochanski wrote:
> > On Tue, Nov 24, 2020 at 8:07 PM Trond Myklebust <
> > trondmy@hammerspace.com> wrote:
> > >
> > > On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> > > > I've started seeing this failure since testing 5.10-rc4 - this
> > > > does
> > > > not happen on 5.9
> > > >
> > > >
> > > > f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs' registered
> > > > for
> > > > caching
> > > > [  125.046104] Key type dns_resolver registered
> > > > [  125.770354] NFS: Registering the id_resolver key type
> > > > [  125.780599] Key type id_resolver registered
> > > > [  125.782440] Key type id_legacy registered
> > > > [  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
> > > > [  178.736479]
> > > > [  178.751380] =============================
> > > > [  178.753249] [ BUG: Invalid wait context ]
> > > > [  178.754886] 5.10.0-rc4 #127 Not tainted
> > > > [  178.756423] -----------------------------
> > > > [  178.758055] kworker/1:2/848 is trying to lock:
> > > > [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3}, at:
> > > > get_page_from_freelist+0x897/0x2190
> > > > [  178.763333] other info that might help us debug this:
> > > > [  178.765354] context-{5:5}
> > > > [  178.766437] 3 locks held by kworker/1:2/848:
> > > > [  178.768158]  #0: ffff8946ce825538
> > > > ((wq_completion)nfsiod){+.+.}-{0:0}, at:
> > > > process_one_work+0x1be/0x540
> > > > [  178.771871]  #1: ffff9e6b408f7e58
> > > > ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> > > > process_one_work+0x1be/0x540
> > > > [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2}, at:
> > > > kvfree_call_rcu+0x69/0x230
> > > > [  178.779803] stack backtrace:
> > > > [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump: loaded
> > > > Not
> > > > tainted 5.10.0-rc4 #127
> > > > [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> > > > [  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
> > > > [  178.789308] Call Trace:
> > > > [  178.790386]  dump_stack+0x8d/0xb5
> > > > [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> > > > [  178.793605]  lock_acquire+0xca/0x380
> > > > [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> > > > [  178.797116]  _raw_spin_lock+0x2c/0x40
> > > > [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> > > > [  178.800620]  get_page_from_freelist+0x897/0x2190
> > > > [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> > > > [  178.804416]  __get_free_pages+0xd/0x30
> > > > [  178.805987]  kvfree_call_rcu+0x168/0x230
> > > > [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> > > > [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> > > > [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> > > > [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> > > > [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> > > > [  178.817451]  process_one_work+0x23e/0x540
> > > > [  178.819136]  worker_thread+0x50/0x3a0
> > > > [  178.820657]  ? process_one_work+0x540/0x540
> > > > [  178.822427]  kthread+0x10f/0x150
> > > > [  178.823805]  ? kthread_park+0x90/0x90
> > > > [  178.825339]  ret_from_fork+0x22/0x30
> > > >
> > >
> > > I can't think of any changes that might have caused this. Is this
> > > NFSv3, v4 or other? I haven't been seeing any of this.
> > >
> >
> > It is NFSv4.1 or NFS4.2.  I am running the xfstests NFS client
> > against
> > an older server, RHEL7 based (3.10.0-1127.8.2.el7.x86_64) though not
> > sure if that matters.
> > My config has these:
> > CONFIG_LOCK_DEBUGGING_SUPPORT=y
> > CONFIG_PROVE_LOCKING=y
> > CONFIG_PROVE_RAW_LOCK_NESTING=y
> > CONFIG_DEBUG_SPINLOCK=y
> > CONFIG_DEBUG_LOCK_ALLOC=y
> > CONFIG_LOCKDEP=y
> >
> That helps. It means we can't blame the new READ_PLUS code, since it
> would be completely disabled here.
> Are you using any special rsize values? Also, could pNFS be involved
> (e.g. the pNFS block/scsi code)?
>

No special rsize values, and pNFS shouldn't be involved - here's most of
the /proc/mounts entry:
 /mnt/test nfs4
rw,context=system_u:object_r:root_t:s0,relatime,vers=4.1,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys
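
For reference, roughly how the test gets driven (a sketch - the server name
and export path here are placeholders, not the real ones):

# cat local.config
export FSTYP=nfs
export TEST_DEV=server:/export/test
export TEST_DIR=/mnt/test

# ./check generic/074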

If you want I can try against a later server and maybe loopback on the
same 5.10-rc4 kernel and see if it reproduces?

>

FWIW, at least once I've also seen NFSv4.1 produce something slightly different:

$ cat nfs41-fail-074-different-backtrace.txt
[   60.125028] run fstests generic/074 at 2020-11-24 11:57:51
[   62.281576]
[   62.300548] =============================
[   62.302205] [ BUG: Invalid wait context ]
[   62.303812] 5.10.0-rc4 #127 Not tainted
[   62.305351] -----------------------------
[   62.306954] fstest/2035 is trying to lock:
[   62.308588] ffff8fe4bffd23d8 (&zone->lock){..-.}-{3:3}, at:
get_page_from_freelist+0x897/0x2190
[   62.312079] other info that might help us debug this:
[   62.314105] context-{5:5}
[   62.315166] 3 locks held by fstest/2035:
[   62.316722]  #0: ffff8fe3b3d78448 (sb_writers#16){.+.+}-{0:0}, at:
do_syscall_64+0x33/0x40
[   62.320040]  #1: ffff8fe3d8f48488
(&sb->s_type->i_mutex_key#21){++++}-{4:4}, at: do_truncate+0x69/0xd0
[   62.323706]  #2: ffff8fe4b7c9b2b0 (krc.lock){..-.}-{2:2}, at:
kvfree_call_rcu+0x69/0x230
[   62.326974] stack backtrace:
[   62.328151] CPU: 2 PID: 2035 Comm: fstest Kdump: loaded Not tainted
5.10.0-rc4 #127
[   62.331172] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[   62.333449] Call Trace:
[   62.334504]  dump_stack+0x8d/0xb5
[   62.335880]  __lock_acquire.cold+0x20b/0x2c8
[   62.337621]  ? find_get_entries+0x2c7/0x5b0
[   62.339316]  lock_acquire+0xca/0x380
[   62.340795]  ? get_page_from_freelist+0x897/0x2190
[   62.342722]  _raw_spin_lock+0x2c/0x40
[   62.344224]  ? get_page_from_freelist+0x897/0x2190
[   62.346123]  get_page_from_freelist+0x897/0x2190
[   62.347979]  ? __lock_acquire+0x3b1/0x25d0
[   62.349625]  __alloc_pages_nodemask+0x1b4/0x460
[   62.351437]  __get_free_pages+0xd/0x30
[   62.352942]  kvfree_call_rcu+0x168/0x230
[   62.354562]  nfs4_do_setattr+0x1f6/0x4e0 [nfsv4]
[   62.356455]  nfs4_proc_setattr+0xb0/0x160 [nfsv4]
[   62.358380]  nfs_setattr+0x102/0x2c0 [nfs]
[   62.360069]  notify_change+0x340/0x4d0
[   62.361587]  ? do_truncate+0x76/0xd0
[   62.363034]  do_truncate+0x76/0xd0
[   62.364413]  do_sys_ftruncate+0x14a/0x230
[   62.366037]  do_syscall_64+0x33/0x40
[   62.367481]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   62.369489] RIP: 0033:0x7f23c70c2abb
[   62.370927] Code: 77 05 c3 0f 1f 40 00 48 8b 15 c9 73 0c 00 f7 d8
64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 4d 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 99 73 0c 00
f7 d8
[   62.378219] RSP: 002b:00007ffd0780e788 EFLAGS: 00000202 ORIG_RAX:
000000000000004d
[   62.381208] RAX: ffffffffffffffda RBX: 0000000000002000 RCX: 00007f23c70c2abb
[   62.384031] RDX: 0000000000000242 RSI: 0000000000a00000 RDI: 0000000000000003
[   62.386851] RBP: 00000000017022b0 R08: 0000000000000000 R09: 000000000000001f
[   62.389677] R10: 00000000000001a4 R11: 0000000000000202 R12: 0000000000000003
[   62.392493] R13: 0000000000000000 R14: 0000000000a00000 R15: 0000000000000001



# eu-addr2line -e fs/nfs/nfsv4.ko nfs4_do_setattr+0x1f6
fs/nfs/nfs4proc.c:3307:6


   3268 static int _nfs4_do_setattr(struct inode *inode,
   3269                             struct nfs_setattrargs *arg,
   3270                             struct nfs_setattrres *res,
   3271                             const struct cred *cred,
   3272                             struct nfs_open_context *ctx)
   3273 {
   3274         struct nfs_server *server = NFS_SERVER(inode);
   3275         struct rpc_message msg = {
   3276                 .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
   3277                 .rpc_argp       = arg,
   3278                 .rpc_resp       = res,
   3279                 .rpc_cred       = cred,
   3280         };
   3281         const struct cred *delegation_cred = NULL;
   3282         unsigned long timestamp = jiffies;
   3283         bool truncate;
   3284         int status;
   3285
   3286         nfs_fattr_init(res->fattr);
   3287
   3288         /* Servers should only apply open mode checks for file size changes */
   3289         truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
   3290         if (!truncate) {
   3291                 nfs4_inode_make_writeable(inode);
   3292                 goto zero_stateid;
   3293         }
   3294
   3295         if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
   3296                 /* Use that stateid */
   3297         } else if (ctx != NULL && ctx->state) {
   3298                 struct nfs_lock_context *l_ctx;
   3299                 if (!nfs4_valid_open_stateid(ctx->state))
   3300                         return -EBADF;
   3301                 l_ctx = nfs_get_lock_context(ctx);
   3302                 if (IS_ERR(l_ctx))
   3303                         return PTR_ERR(l_ctx);
   3304                 status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx,
   3305                                                 &arg->stateid, &delegation_cred);
   3306                 nfs_put_lock_context(l_ctx);
   3307                 if (status == -EIO)
   3308                         return -EBADF;




> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>



* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-25  1:43       ` David Wysochanski
@ 2020-11-25  1:47         ` Trond Myklebust
  2020-11-25 15:19           ` David Wysochanski
  0 siblings, 1 reply; 8+ messages in thread
From: Trond Myklebust @ 2020-11-25  1:47 UTC (permalink / raw)
  To: dwysocha@redhat.com; +Cc: linux-nfs@vger.kernel.org

On Tue, 2020-11-24 at 20:43 -0500, David Wysochanski wrote:
> On Tue, Nov 24, 2020 at 8:33 PM Trond Myklebust
> <trondmy@hammerspace.com> wrote:
> > 
> > On Tue, 2020-11-24 at 20:28 -0500, David Wysochanski wrote:
> > > On Tue, Nov 24, 2020 at 8:07 PM Trond Myklebust <
> > > trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> > > > > I've started seeing this failure since testing 5.10-rc4 -
> > > > > this
> > > > > does
> > > > > not happen on 5.9
> > > > > 
> > > > > 
> > > > > f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs'
> > > > > registered
> > > > > for
> > > > > caching
> > > > > [  125.046104] Key type dns_resolver registered
> > > > > [  125.770354] NFS: Registering the id_resolver key type
> > > > > [  125.780599] Key type id_resolver registered
> > > > > [  125.782440] Key type id_legacy registered
> > > > > [  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
> > > > > [  178.736479]
> > > > > [  178.751380] =============================
> > > > > [  178.753249] [ BUG: Invalid wait context ]
> > > > > [  178.754886] 5.10.0-rc4 #127 Not tainted
> > > > > [  178.756423] -----------------------------
> > > > > [  178.758055] kworker/1:2/848 is trying to lock:
> > > > > [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3},
> > > > > at:
> > > > > get_page_from_freelist+0x897/0x2190
> > > > > [  178.763333] other info that might help us debug this:
> > > > > [  178.765354] context-{5:5}
> > > > > [  178.766437] 3 locks held by kworker/1:2/848:
> > > > > [  178.768158]  #0: ffff8946ce825538
> > > > > ((wq_completion)nfsiod){+.+.}-{0:0}, at:
> > > > > process_one_work+0x1be/0x540
> > > > > [  178.771871]  #1: ffff9e6b408f7e58
> > > > > ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> > > > > process_one_work+0x1be/0x540
> > > > > [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2},
> > > > > at:
> > > > > kvfree_call_rcu+0x69/0x230
> > > > > [  178.779803] stack backtrace:
> > > > > [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump:
> > > > > loaded
> > > > > Not
> > > > > tainted 5.10.0-rc4 #127
> > > > > [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1
> > > > > 01/01/2011
> > > > > [  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
> > > > > [  178.789308] Call Trace:
> > > > > [  178.790386]  dump_stack+0x8d/0xb5
> > > > > [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> > > > > [  178.793605]  lock_acquire+0xca/0x380
> > > > > [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> > > > > [  178.797116]  _raw_spin_lock+0x2c/0x40
> > > > > [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> > > > > [  178.800620]  get_page_from_freelist+0x897/0x2190
> > > > > [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> > > > > [  178.804416]  __get_free_pages+0xd/0x30
> > > > > [  178.805987]  kvfree_call_rcu+0x168/0x230
> > > > > [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> > > > > [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> > > > > [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> > > > > [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> > > > > [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> > > > > [  178.817451]  process_one_work+0x23e/0x540
> > > > > [  178.819136]  worker_thread+0x50/0x3a0
> > > > > [  178.820657]  ? process_one_work+0x540/0x540
> > > > > [  178.822427]  kthread+0x10f/0x150
> > > > > [  178.823805]  ? kthread_park+0x90/0x90
> > > > > [  178.825339]  ret_from_fork+0x22/0x30
> > > > > 
> > > > 
> > > > I can't think of any changes that might have caused this. Is
> > > > this
> > > > NFSv3, v4 or other? I haven't been seeing any of this.
> > > > 
> > > 
> > > It is NFSv4.1 or NFS4.2.  I am running the xfstests NFS client
> > > against
> > > an older server, RHEL7 based (3.10.0-1127.8.2.el7.x86_64) though
> > > not
> > > sure if that matters.
> > > My config has these:
> > > CONFIG_LOCK_DEBUGGING_SUPPORT=y
> > > CONFIG_PROVE_LOCKING=y
> > > CONFIG_PROVE_RAW_LOCK_NESTING=y
> > > CONFIG_DEBUG_SPINLOCK=y
> > > CONFIG_DEBUG_LOCK_ALLOC=y
> > > CONFIG_LOCKDEP=y
> > > 
> > That helps. It means we can't blame the new READ_PLUS code, since
> > it
> > would be completely disabled here.
> > Are you using any special rsize values? Also, could pNFS be
> > involved
> > (e.g. the pNFS block/scsi code)?
> > 
> 
> No special rsize values or pNFS should be involved - here's most of
> the /proc/mounts
>  /mnt/test nfs4
> rw,context=system_u:object_r:root_t:s0,relatime,vers=4.1,rsize=524288
> ,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys
> 
> If you want I can try against a later server and maybe loopback on
> the
> same 5.10-rc4 kernel and see if it reproduces?
> 
> > 
> 
> FWIW, I've also seen at least once NFS4.1 produces something slightly
> different:
> 
> $ cat nfs41-fail-074-different-backtrace.txt
> [   60.125028] run fstests generic/074 at 2020-11-24 11:57:51
> [   62.281576]
> [   62.300548] =============================
> [   62.302205] [ BUG: Invalid wait context ]
> [   62.303812] 5.10.0-rc4 #127 Not tainted
> [   62.305351] -----------------------------
> [   62.306954] fstest/2035 is trying to lock:
> [   62.308588] ffff8fe4bffd23d8 (&zone->lock){..-.}-{3:3}, at:
> get_page_from_freelist+0x897/0x2190
> [   62.312079] other info that might help us debug this:
> [   62.314105] context-{5:5}
> [   62.315166] 3 locks held by fstest/2035:
> [   62.316722]  #0: ffff8fe3b3d78448 (sb_writers#16){.+.+}-{0:0}, at:
> do_syscall_64+0x33/0x40
> [   62.320040]  #1: ffff8fe3d8f48488
> (&sb->s_type->i_mutex_key#21){++++}-{4:4}, at: do_truncate+0x69/0xd0
> [   62.323706]  #2: ffff8fe4b7c9b2b0 (krc.lock){..-.}-{2:2}, at:
> kvfree_call_rcu+0x69/0x230
> [   62.326974] stack backtrace:
> [   62.328151] CPU: 2 PID: 2035 Comm: fstest Kdump: loaded Not
> tainted
> 5.10.0-rc4 #127
> [   62.331172] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [   62.333449] Call Trace:
> [   62.334504]  dump_stack+0x8d/0xb5
> [   62.335880]  __lock_acquire.cold+0x20b/0x2c8
> [   62.337621]  ? find_get_entries+0x2c7/0x5b0
> [   62.339316]  lock_acquire+0xca/0x380
> [   62.340795]  ? get_page_from_freelist+0x897/0x2190
> [   62.342722]  _raw_spin_lock+0x2c/0x40
> [   62.344224]  ? get_page_from_freelist+0x897/0x2190
> [   62.346123]  get_page_from_freelist+0x897/0x2190
> [   62.347979]  ? __lock_acquire+0x3b1/0x25d0
> [   62.349625]  __alloc_pages_nodemask+0x1b4/0x460
> [   62.351437]  __get_free_pages+0xd/0x30
> [   62.352942]  kvfree_call_rcu+0x168/0x230
> [   62.354562]  nfs4_do_setattr+0x1f6/0x4e0 [nfsv4]
> [   62.356455]  nfs4_proc_setattr+0xb0/0x160 [nfsv4]
> [   62.358380]  nfs_setattr+0x102/0x2c0 [nfs]
> [   62.360069]  notify_change+0x340/0x4d0
> [   62.361587]  ? do_truncate+0x76/0xd0
> [   62.363034]  do_truncate+0x76/0xd0
> [   62.364413]  do_sys_ftruncate+0x14a/0x230
> [   62.366037]  do_syscall_64+0x33/0x40
> [   62.367481]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   62.369489] RIP: 0033:0x7f23c70c2abb
> [   62.370927] Code: 77 05 c3 0f 1f 40 00 48 8b 15 c9 73 0c 00 f7 d8
> 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 4d 00 00
> 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 99 73 0c
> 00
> f7 d8
> [   62.378219] RSP: 002b:00007ffd0780e788 EFLAGS: 00000202 ORIG_RAX:
> 000000000000004d
> [   62.381208] RAX: ffffffffffffffda RBX: 0000000000002000 RCX:
> 00007f23c70c2abb
> [   62.384031] RDX: 0000000000000242 RSI: 0000000000a00000 RDI:
> 0000000000000003
> [   62.386851] RBP: 00000000017022b0 R08: 0000000000000000 R09:
> 000000000000001f
> [   62.389677] R10: 00000000000001a4 R11: 0000000000000202 R12:
> 0000000000000003
> [   62.392493] R13: 0000000000000000 R14: 0000000000a00000 R15:
> 0000000000000001
> 

Hmm... Both suggest a use-after-free situation. Would you be able to
run the test with KASAN enabled? That might help finger the real
culprit.

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-25  1:47         ` Trond Myklebust
@ 2020-11-25 15:19           ` David Wysochanski
  2020-11-25 17:58             ` Trond Myklebust
  0 siblings, 1 reply; 8+ messages in thread
From: David Wysochanski @ 2020-11-25 15:19 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: linux-nfs@vger.kernel.org

On Tue, Nov 24, 2020 at 8:47 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Tue, 2020-11-24 at 20:43 -0500, David Wysochanski wrote:
> > On Tue, Nov 24, 2020 at 8:33 PM Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > >
> > > On Tue, 2020-11-24 at 20:28 -0500, David Wysochanski wrote:
> > > > On Tue, Nov 24, 2020 at 8:07 PM Trond Myklebust <
> > > > trondmy@hammerspace.com> wrote:
> > > > >
> > > > > On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> > > > > > I've started seeing this failure since testing 5.10-rc4 -
> > > > > > this
> > > > > > does
> > > > > > not happen on 5.9
> > > > > >
> > > > > >
> > > > > > f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs'
> > > > > > registered
> > > > > > for
> > > > > > caching
> > > > > > [  125.046104] Key type dns_resolver registered
> > > > > > [  125.770354] NFS: Registering the id_resolver key type
> > > > > > [  125.780599] Key type id_resolver registered
> > > > > > [  125.782440] Key type id_legacy registered
> > > > > > [  126.563717] run fstests generic/074 at 2020-11-24 11:23:49
> > > > > > [  178.736479]
> > > > > > [  178.751380] =============================
> > > > > > [  178.753249] [ BUG: Invalid wait context ]
> > > > > > [  178.754886] 5.10.0-rc4 #127 Not tainted
> > > > > > [  178.756423] -----------------------------
> > > > > > [  178.758055] kworker/1:2/848 is trying to lock:
> > > > > > [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-{3:3},
> > > > > > at:
> > > > > > get_page_from_freelist+0x897/0x2190
> > > > > > [  178.763333] other info that might help us debug this:
> > > > > > [  178.765354] context-{5:5}
> > > > > > [  178.766437] 3 locks held by kworker/1:2/848:
> > > > > > [  178.768158]  #0: ffff8946ce825538
> > > > > > ((wq_completion)nfsiod){+.+.}-{0:0}, at:
> > > > > > process_one_work+0x1be/0x540
> > > > > > [  178.771871]  #1: ffff9e6b408f7e58
> > > > > > ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> > > > > > process_one_work+0x1be/0x540
> > > > > > [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-{2:2},
> > > > > > at:
> > > > > > kvfree_call_rcu+0x69/0x230
> > > > > > [  178.779803] stack backtrace:
> > > > > > [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump:
> > > > > > loaded
> > > > > > Not
> > > > > > tainted 5.10.0-rc4 #127
> > > > > > [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1
> > > > > > 01/01/2011
> > > > > > [  178.787071] Workqueue: nfsiod rpc_async_release [sunrpc]
> > > > > > [  178.789308] Call Trace:
> > > > > > [  178.790386]  dump_stack+0x8d/0xb5
> > > > > > [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> > > > > > [  178.793605]  lock_acquire+0xca/0x380
> > > > > > [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> > > > > > [  178.797116]  _raw_spin_lock+0x2c/0x40
> > > > > > [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> > > > > > [  178.800620]  get_page_from_freelist+0x897/0x2190
> > > > > > [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> > > > > > [  178.804416]  __get_free_pages+0xd/0x30
> > > > > > [  178.805987]  kvfree_call_rcu+0x168/0x230
> > > > > > [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> > > > > > [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> > > > > > [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> > > > > > [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> > > > > > [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> > > > > > [  178.817451]  process_one_work+0x23e/0x540
> > > > > > [  178.819136]  worker_thread+0x50/0x3a0
> > > > > > [  178.820657]  ? process_one_work+0x540/0x540
> > > > > > [  178.822427]  kthread+0x10f/0x150
> > > > > > [  178.823805]  ? kthread_park+0x90/0x90
> > > > > > [  178.825339]  ret_from_fork+0x22/0x30
> > > > > >
> > > > >
> > > > > I can't think of any changes that might have caused this. Is
> > > > > this
> > > > > NFSv3, v4 or other? I haven't been seeing any of this.
> > > > >
> > > >
> > > > It is NFSv4.1 or NFS4.2.  I am running the xfstests NFS client
> > > > against
> > > > an older server, RHEL7 based (3.10.0-1127.8.2.el7.x86_64) though
> > > > not
> > > > sure if that matters.
> > > > My config has these:
> > > > CONFIG_LOCK_DEBUGGING_SUPPORT=y
> > > > CONFIG_PROVE_LOCKING=y
> > > > CONFIG_PROVE_RAW_LOCK_NESTING=y
> > > > CONFIG_DEBUG_SPINLOCK=y
> > > > CONFIG_DEBUG_LOCK_ALLOC=y
> > > > CONFIG_LOCKDEP=y
> > > >
> > > That helps. It means we can't blame the new READ_PLUS code, since
> > > it
> > > would be completely disabled here.
> > > Are you using any special rsize values? Also, could pNFS be
> > > involved
> > > (e.g. the pNFS block/scsi code)?
> > >
> >
> > No special rsize values or pNFS should be involved - here's most of
> > the /proc/mounts
> >  /mnt/test nfs4
> > rw,context=system_u:object_r:root_t:s0,relatime,vers=4.1,rsize=524288
> > ,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys
> >
> > If you want I can try against a later server and maybe loopback on
> > the
> > same 5.10-rc4 kernel and see if it reproduces?
> >
> > >
> >
> > FWIW, I've also seen at least once NFS4.1 produces something slightly
> > different:
> >
> > $ cat nfs41-fail-074-different-backtrace.txt
> > [   60.125028] run fstests generic/074 at 2020-11-24 11:57:51
> > [   62.281576]
> > [   62.300548] =============================
> > [   62.302205] [ BUG: Invalid wait context ]
> > [   62.303812] 5.10.0-rc4 #127 Not tainted
> > [   62.305351] -----------------------------
> > [   62.306954] fstest/2035 is trying to lock:
> > [   62.308588] ffff8fe4bffd23d8 (&zone->lock){..-.}-{3:3}, at:
> > get_page_from_freelist+0x897/0x2190
> > [   62.312079] other info that might help us debug this:
> > [   62.314105] context-{5:5}
> > [   62.315166] 3 locks held by fstest/2035:
> > [   62.316722]  #0: ffff8fe3b3d78448 (sb_writers#16){.+.+}-{0:0}, at:
> > do_syscall_64+0x33/0x40
> > [   62.320040]  #1: ffff8fe3d8f48488
> > (&sb->s_type->i_mutex_key#21){++++}-{4:4}, at: do_truncate+0x69/0xd0
> > [   62.323706]  #2: ffff8fe4b7c9b2b0 (krc.lock){..-.}-{2:2}, at:
> > kvfree_call_rcu+0x69/0x230
> > [   62.326974] stack backtrace:
> > [   62.328151] CPU: 2 PID: 2035 Comm: fstest Kdump: loaded Not
> > tainted
> > 5.10.0-rc4 #127
> > [   62.331172] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> > [   62.333449] Call Trace:
> > [   62.334504]  dump_stack+0x8d/0xb5
> > [   62.335880]  __lock_acquire.cold+0x20b/0x2c8
> > [   62.337621]  ? find_get_entries+0x2c7/0x5b0
> > [   62.339316]  lock_acquire+0xca/0x380
> > [   62.340795]  ? get_page_from_freelist+0x897/0x2190
> > [   62.342722]  _raw_spin_lock+0x2c/0x40
> > [   62.344224]  ? get_page_from_freelist+0x897/0x2190
> > [   62.346123]  get_page_from_freelist+0x897/0x2190
> > [   62.347979]  ? __lock_acquire+0x3b1/0x25d0
> > [   62.349625]  __alloc_pages_nodemask+0x1b4/0x460
> > [   62.351437]  __get_free_pages+0xd/0x30
> > [   62.352942]  kvfree_call_rcu+0x168/0x230
> > [   62.354562]  nfs4_do_setattr+0x1f6/0x4e0 [nfsv4]
> > [   62.356455]  nfs4_proc_setattr+0xb0/0x160 [nfsv4]
> > [   62.358380]  nfs_setattr+0x102/0x2c0 [nfs]
> > [   62.360069]  notify_change+0x340/0x4d0
> > [   62.361587]  ? do_truncate+0x76/0xd0
> > [   62.363034]  do_truncate+0x76/0xd0
> > [   62.364413]  do_sys_ftruncate+0x14a/0x230
> > [   62.366037]  do_syscall_64+0x33/0x40
> > [   62.367481]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > [   62.369489] RIP: 0033:0x7f23c70c2abb
> > [   62.370927] Code: 77 05 c3 0f 1f 40 00 48 8b 15 c9 73 0c 00 f7 d8
> > 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 4d 00 00
> > 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 99 73 0c
> > 00
> > f7 d8
> > [   62.378219] RSP: 002b:00007ffd0780e788 EFLAGS: 00000202 ORIG_RAX:
> > 000000000000004d
> > [   62.381208] RAX: ffffffffffffffda RBX: 0000000000002000 RCX:
> > 00007f23c70c2abb
> > [   62.384031] RDX: 0000000000000242 RSI: 0000000000a00000 RDI:
> > 0000000000000003
> > [   62.386851] RBP: 00000000017022b0 R08: 0000000000000000 R09:
> > 000000000000001f
> > [   62.389677] R10: 00000000000001a4 R11: 0000000000000202 R12:
> > 0000000000000003
> > [   62.392493] R13: 0000000000000000 R14: 0000000000a00000 R15:
> > 0000000000000001
> >
>
> Hmm... Both suggest a use-after-free situation. Would you be able to
> run the test with KASAN enabled? That might help finger the real
> culprit.
>

Trying to get something out of KASAN but so far no luck.
Could this be a junk report from lockdep?
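
For reference, the KASAN options I'm enabling are along these lines (exact
option names can vary by arch and kernel version):

CONFIG_KASAN=y
CONFIG_KASAN_GENERIC=y
CONFIG_KASAN_INLINE=y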



* Re: NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context
  2020-11-25 15:19           ` David Wysochanski
@ 2020-11-25 17:58             ` Trond Myklebust
  0 siblings, 0 replies; 8+ messages in thread
From: Trond Myklebust @ 2020-11-25 17:58 UTC (permalink / raw)
  To: dwysocha@redhat.com; +Cc: linux-nfs@vger.kernel.org

On Wed, 2020-11-25 at 10:19 -0500, David Wysochanski wrote:
> On Tue, Nov 24, 2020 at 8:47 PM Trond Myklebust <
> trondmy@hammerspace.com> wrote:
> > 
> > On Tue, 2020-11-24 at 20:43 -0500, David Wysochanski wrote:
> > > On Tue, Nov 24, 2020 at 8:33 PM Trond Myklebust
> > > <trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Tue, 2020-11-24 at 20:28 -0500, David Wysochanski wrote:
> > > > > On Tue, Nov 24, 2020 at 8:07 PM Trond Myklebust <
> > > > > trondmy@hammerspace.com> wrote:
> > > > > > 
> > > > > > On Tue, 2020-11-24 at 16:56 -0500, David Wysochanski wrote:
> > > > > > > I've started seeing this failure since testing 5.10-rc4 -
> > > > > > > this
> > > > > > > does
> > > > > > > not happen on 5.9
> > > > > > > 
> > > > > > > 
> > > > > > > f31-node1 login: [  124.055768] FS-Cache: Netfs 'nfs'
> > > > > > > registered
> > > > > > > for
> > > > > > > caching
> > > > > > > [  125.046104] Key type dns_resolver registered
> > > > > > > [  125.770354] NFS: Registering the id_resolver key type
> > > > > > > [  125.780599] Key type id_resolver registered
> > > > > > > [  125.782440] Key type id_legacy registered
> > > > > > > [  126.563717] run fstests generic/074 at 2020-11-24
> > > > > > > 11:23:49
> > > > > > > [  178.736479]
> > > > > > > [  178.751380] =============================
> > > > > > > [  178.753249] [ BUG: Invalid wait context ]
> > > > > > > [  178.754886] 5.10.0-rc4 #127 Not tainted
> > > > > > > [  178.756423] -----------------------------
> > > > > > > [  178.758055] kworker/1:2/848 is trying to lock:
> > > > > > > [  178.759866] ffff8947fffd33d8 (&zone->lock){..-.}-
> > > > > > > {3:3},
> > > > > > > at:
> > > > > > > get_page_from_freelist+0x897/0x2190
> > > > > > > [  178.763333] other info that might help us debug this:
> > > > > > > [  178.765354] context-{5:5}
> > > > > > > [  178.766437] 3 locks held by kworker/1:2/848:
> > > > > > > [  178.768158]  #0: ffff8946ce825538
> > > > > > > ((wq_completion)nfsiod){+.+.}-{0:0}, at:
> > > > > > > process_one_work+0x1be/0x540
> > > > > > > [  178.771871]  #1: ffff9e6b408f7e58
> > > > > > > ((work_completion)(&task->u.tk_work)#2){+.+.}-{0:0}, at:
> > > > > > > process_one_work+0x1be/0x540
> > > > > > > [  178.776562]  #2: ffff8947f7c5b2b0 (krc.lock){..-.}-
> > > > > > > {2:2},
> > > > > > > at:
> > > > > > > kvfree_call_rcu+0x69/0x230
> > > > > > > [  178.779803] stack backtrace:
> > > > > > > [  178.780996] CPU: 1 PID: 848 Comm: kworker/1:2 Kdump:
> > > > > > > loaded
> > > > > > > Not
> > > > > > > tainted 5.10.0-rc4 #127
> > > > > > > [  178.784374] Hardware name: Red Hat KVM, BIOS 0.5.1
> > > > > > > 01/01/2011
> > > > > > > [  178.787071] Workqueue: nfsiod rpc_async_release
> > > > > > > [sunrpc]
> > > > > > > [  178.789308] Call Trace:
> > > > > > > [  178.790386]  dump_stack+0x8d/0xb5
> > > > > > > [  178.791816]  __lock_acquire.cold+0x20b/0x2c8
> > > > > > > [  178.793605]  lock_acquire+0xca/0x380
> > > > > > > [  178.795113]  ? get_page_from_freelist+0x897/0x2190
> > > > > > > [  178.797116]  _raw_spin_lock+0x2c/0x40
> > > > > > > [  178.798638]  ? get_page_from_freelist+0x897/0x2190
> > > > > > > [  178.800620]  get_page_from_freelist+0x897/0x2190
> > > > > > > [  178.802537]  __alloc_pages_nodemask+0x1b4/0x460
> > > > > > > [  178.804416]  __get_free_pages+0xd/0x30
> > > > > > > [  178.805987]  kvfree_call_rcu+0x168/0x230
> > > > > > > [  178.807687]  nfs_free_request+0xab/0x180 [nfs]
> > > > > > > [  178.809547]  nfs_page_group_destroy+0x41/0x80 [nfs]
> > > > > > > [  178.811588]  nfs_read_completion+0x129/0x1f0 [nfs]
> > > > > > > [  178.813633]  rpc_free_task+0x39/0x60 [sunrpc]
> > > > > > > [  178.815481]  rpc_async_release+0x29/0x40 [sunrpc]
> > > > > > > [  178.817451]  process_one_work+0x23e/0x540
> > > > > > > [  178.819136]  worker_thread+0x50/0x3a0
> > > > > > > [  178.820657]  ? process_one_work+0x540/0x540
> > > > > > > [  178.822427]  kthread+0x10f/0x150
> > > > > > > [  178.823805]  ? kthread_park+0x90/0x90
> > > > > > > [  178.825339]  ret_from_fork+0x22/0x30
> > > > > > > 
> > > > > > 
> > > > > > I can't think of any changes that might have caused this.
> > > > > > Is
> > > > > > this
> > > > > > NFSv3, v4 or other? I haven't been seeing any of this.
> > > > > > 
> > > > > 
> > > > > It is NFSv4.1 or NFS4.2.  I am running the xfstests NFS
> > > > > client
> > > > > against
> > > > > an older server, RHEL7 based (3.10.0-1127.8.2.el7.x86_64)
> > > > > though
> > > > > not
> > > > > sure if that matters.
> > > > > My config has these:
> > > > > CONFIG_LOCK_DEBUGGING_SUPPORT=y
> > > > > CONFIG_PROVE_LOCKING=y
> > > > > CONFIG_PROVE_RAW_LOCK_NESTING=y
> > > > > CONFIG_DEBUG_SPINLOCK=y
> > > > > CONFIG_DEBUG_LOCK_ALLOC=y
> > > > > CONFIG_LOCKDEP=y
> > > > > 
> > > > That helps. It means we can't blame the new READ_PLUS code,
> > > > since
> > > > it
> > > > would be completely disabled here.
> > > > Are you using any special rsize values? Also, could pNFS be
> > > > involved
> > > > (e.g. the pNFS block/scsi code)?
> > > > 
> > > 
> > > No special rsize values or pNFS should be involved - here's most
> > > of
> > > the /proc/mounts
> > >  /mnt/test nfs4
> > > rw,context=system_u:object_r:root_t:s0,relatime,vers=4.1,rsize=52
> > > 4288
> > > ,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=s
> > > ys
> > > 
> > > If you want I can try against a later server and maybe loopback
> > > on
> > > the
> > > same 5.10-rc4 kernel and see if it reproduces?
> > > 
> > > > 
> > > 
> > > FWIW, I've also seen at least once NFS4.1 produces something
> > > slightly
> > > different:
> > > 
> > > $ cat nfs41-fail-074-different-backtrace.txt
> > > [   60.125028] run fstests generic/074 at 2020-11-24 11:57:51
> > > [   62.281576]
> > > [   62.300548] =============================
> > > [   62.302205] [ BUG: Invalid wait context ]
> > > [   62.303812] 5.10.0-rc4 #127 Not tainted
> > > [   62.305351] -----------------------------
> > > [   62.306954] fstest/2035 is trying to lock:
> > > [   62.308588] ffff8fe4bffd23d8 (&zone->lock){..-.}-{3:3}, at:
> > > get_page_from_freelist+0x897/0x2190
> > > [   62.312079] other info that might help us debug this:
> > > [   62.314105] context-{5:5}
> > > [   62.315166] 3 locks held by fstest/2035:
> > > [   62.316722]  #0: ffff8fe3b3d78448 (sb_writers#16){.+.+}-{0:0},
> > > at:
> > > do_syscall_64+0x33/0x40
> > > [   62.320040]  #1: ffff8fe3d8f48488
> > > (&sb->s_type->i_mutex_key#21){++++}-{4:4}, at:
> > > do_truncate+0x69/0xd0
> > > [   62.323706]  #2: ffff8fe4b7c9b2b0 (krc.lock){..-.}-{2:2}, at:
> > > kvfree_call_rcu+0x69/0x230
> > > [   62.326974] stack backtrace:
> > > [   62.328151] CPU: 2 PID: 2035 Comm: fstest Kdump: loaded Not
> > > tainted
> > > 5.10.0-rc4 #127
> > > [   62.331172] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> > > [   62.333449] Call Trace:
> > > [   62.334504]  dump_stack+0x8d/0xb5
> > > [   62.335880]  __lock_acquire.cold+0x20b/0x2c8
> > > [   62.337621]  ? find_get_entries+0x2c7/0x5b0
> > > [   62.339316]  lock_acquire+0xca/0x380
> > > [   62.340795]  ? get_page_from_freelist+0x897/0x2190
> > > [   62.342722]  _raw_spin_lock+0x2c/0x40
> > > [   62.344224]  ? get_page_from_freelist+0x897/0x2190
> > > [   62.346123]  get_page_from_freelist+0x897/0x2190
> > > [   62.347979]  ? __lock_acquire+0x3b1/0x25d0
> > > [   62.349625]  __alloc_pages_nodemask+0x1b4/0x460
> > > [   62.351437]  __get_free_pages+0xd/0x30
> > > [   62.352942]  kvfree_call_rcu+0x168/0x230
> > > [   62.354562]  nfs4_do_setattr+0x1f6/0x4e0 [nfsv4]
> > > [   62.356455]  nfs4_proc_setattr+0xb0/0x160 [nfsv4]
> > > [   62.358380]  nfs_setattr+0x102/0x2c0 [nfs]
> > > [   62.360069]  notify_change+0x340/0x4d0
> > > [   62.361587]  ? do_truncate+0x76/0xd0
> > > [   62.363034]  do_truncate+0x76/0xd0
> > > [   62.364413]  do_sys_ftruncate+0x14a/0x230
> > > [   62.366037]  do_syscall_64+0x33/0x40
> > > [   62.367481]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > > [   62.369489] RIP: 0033:0x7f23c70c2abb
> > > [   62.370927] Code: 77 05 c3 0f 1f 40 00 48 8b 15 c9 73 0c 00 f7
> > > d8
> > > 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 4d 00
> > > 00
> > > 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 99 73
> > > 0c
> > > 00
> > > f7 d8
> > > [   62.378219] RSP: 002b:00007ffd0780e788 EFLAGS: 00000202
> > > ORIG_RAX:
> > > 000000000000004d
> > > [   62.381208] RAX: ffffffffffffffda RBX: 0000000000002000 RCX:
> > > 00007f23c70c2abb
> > > [   62.384031] RDX: 0000000000000242 RSI: 0000000000a00000 RDI:
> > > 0000000000000003
> > > [   62.386851] RBP: 00000000017022b0 R08: 0000000000000000 R09:
> > > 000000000000001f
> > > [   62.389677] R10: 00000000000001a4 R11: 0000000000000202 R12:
> > > 0000000000000003
> > > [   62.392493] R13: 0000000000000000 R14: 0000000000a00000 R15:
> > > 0000000000000001
> > > 
> > 
> > Hmm... Both suggest a use-after-free situation. Would you be able
> > to
> > run the test with KASAN enabled? That might help finger the real
> > culprit.
> > 
> 
> Trying to get something out of KASAN but so far no luck.
> Could this be a junk report from lockdep?

If you're not seeing anything from KASAN, then probably, yes.


-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




end of thread

Thread overview: 8+ messages
2020-11-24 21:56 NFS failure with generic/074 when lockdep is enabled - BUG: Invalid wait context David Wysochanski
2020-11-25  1:07 ` Trond Myklebust
2020-11-25  1:28   ` David Wysochanski
2020-11-25  1:33     ` Trond Myklebust
2020-11-25  1:43       ` David Wysochanski
2020-11-25  1:47         ` Trond Myklebust
2020-11-25 15:19           ` David Wysochanski
2020-11-25 17:58             ` Trond Myklebust
