Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v7 01/27] Documentation/x86: Add CET description
From: Yu-cheng Yu @ 2019-06-06 20:06 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190606200646.3951-1-yu-cheng.yu@intel.com>

Explain how CET works and the no_cet_shstk/no_cet_ibt kernel
parameters.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 .../admin-guide/kernel-parameters.txt         |   6 +
 Documentation/x86/index.rst                   |   1 +
 Documentation/x86/intel_cet.rst               | 268 ++++++++++++++++++
 3 files changed, 275 insertions(+)
 create mode 100644 Documentation/x86/intel_cet.rst

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..b9b054f6c732 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2906,6 +2906,12 @@
 			noexec=on: enable non-executable mappings (default)
 			noexec=off: disable non-executable mappings
 
+	no_cet_ibt	[X86-64] Disable indirect branch tracking for user-mode
+			applications
+
+	no_cet_shstk	[X86-64] Disable shadow stack support for user-mode
+			applications
+
 	nosmap		[X86,PPC]
 			Disable SMAP (Supervisor Mode Access Prevention)
 			even if it is supported by processor.
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index ae36fc5fc649..47822ac02fe0 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -21,6 +21,7 @@ x86-specific Documentation
    pat
    protection-keys
    intel_mpx
+   intel_cet
    amd-memory-encryption
    pti
    mds
diff --git a/Documentation/x86/intel_cet.rst b/Documentation/x86/intel_cet.rst
new file mode 100644
index 000000000000..dac83bbf8a24
--- /dev/null
+++ b/Documentation/x86/intel_cet.rst
@@ -0,0 +1,268 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Control-flow Enforcement Technology (CET)
+=========================================
+
+[1] Overview
+============
+
+Control-flow Enforcement Technology (CET) provides protection against
+return/jump-oriented programming (ROP) attacks.  It can be setup to
+protect both the kernel and applications.  In the first phase,
+only the user-mode protection is implemented in 64-bit mode; 32-bit
+applications are supported in compatibility mode.
+
+CET introduces shadow stack (SHSTK) and indirect branch tracking
+(IBT).  SHSTK is a secondary stack allocated from memory and cannot
+be directly modified by applications.  When executing a CALL, the
+processor pushes a copy of the return address to SHSTK.  Upon
+function return, the processor pops the SHSTK copy and compares it
+to the one from the program stack.  If the two copies differ, the
+processor raises a control-protection exception.  IBT verifies all
+indirect CALL/JMP targets are intended as marked by the compiler
+with 'ENDBR' opcodes (see CET instructions below).
+
+There are two kernel configuration options:
+
+    INTEL_X86_SHADOW_STACK_USER, and
+    INTEL_X86_BRANCH_TRACKING_USER.
+
+To build a CET-enabled kernel, Binutils v2.31 and GCC v8.1 or later
+are required.  To build a CET-enabled application, GLIBC v2.28 or
+later is also required.
+
+There are two command-line options for disabling CET features:
+
+    no_cet_shstk - disables SHSTK, and
+    no_cet_ibt - disables IBT.
+
+At run time, /proc/cpuinfo shows the availability of SHSTK and IBT.
+
+[2] CET assembly instructions
+=============================
+
+RDSSP %r
+    Read the SHSTK pointer into %r.
+
+INCSSP %r
+    Unwind (increment) the SHSTK pointer (0 ~ 255) steps as indicated
+    in the operand register.  The GLIBC longjmp uses INCSSP to unwind
+    the SHSTK until that matches the program stack.  When it is
+    necessary to unwind beyond 255 steps, longjmp divides and repeats
+    the process.
+
+RSTORSSP (%r)
+    Switch to the SHSTK indicated in the 'restore token' pointed by
+    the operand register and replace the 'restore token' with a new
+    token to be saved (with SAVEPREVSSP) for the outgoing SHSTK.
+
+::
+
+                               Before RSTORSSP
+
+             Incoming SHSTK                   Current/Outgoing SHSTK
+
+        |----------------------|             |----------------------|
+ addr=x |                      |       ssp-> |                      |
+        |----------------------|             |----------------------|
+ (%r)-> | rstor_token=(x|Lg)   |    addr=y-8 |                      |
+        |----------------------|             |----------------------|
+
+                               After RSTORSSP
+
+        |----------------------|             |----------------------|
+        |                      |             |                      |
+        |----------------------|             |----------------------|
+  ssp-> | rstor_token=(y|Bz|Lg)|    addr=y-8 |                      |
+        |----------------------|             |----------------------|
+
+    note:
+        1. Only valid addresses and restore tokens can be on the
+           user-mode SHSTK.
+        2. A token is always of type u64 and must align to u64.
+        3. The incoming SHSTK pointer in a rstor_token must point to
+           immediately above the token.
+        4. 'Lg' is bit[0] of a rstor_token indicating a 64-bit SHSTK.
+        5. 'Bz' is bit[1] of a rstor_token indicating the token is to
+           be used only for the next SAVEPREVSSP and invalid for the
+           RSTORSSP.
+
+SAVEPREVSSP
+    Store the SHSTK 'restore token' pointed by
+        (current_SHSTK_pointer + 8).
+
+::
+
+                             After SAVEPREVSSP
+
+        |----------------------|             |----------------------|
+  ssp-> |                      |             |                      |
+        |----------------------|             |----------------------|
+        | rstor_token=(y|Bz|Lg)|    addr=y-8 | rstor_token(y|Lg)    |
+        |----------------------|             |----------------------|
+
+WRUSS %r0, (%r1)
+    Write the value in %r0 to the SHSTK address pointed by (%r1).
+    This is a kernel-mode only instruction.
+
+ENDBR
+    The compiler inserts an ENDBR at all valid branch targets.  Any
+    CALL/JMP to a target without an ENDBR triggers a control
+    protection fault.
+
+[3] Application Enabling
+========================
+
+An application's CET capability is marked in its ELF header and can
+be verified from the following command output, in the
+NT_GNU_PROPERTY_TYPE_0 field:
+
+    readelf -n <application>
+
+If an application supports CET and is statically linked, it will run
+with CET protection.  If the application needs any shared libraries,
+the loader checks all dependencies and enables CET only when all
+requirements are met.
+
+[4] Legacy Libraries
+====================
+
+GLIBC provides a few tunables for backward compatibility.
+
+GLIBC_TUNABLES=glibc.tune.hwcaps=-SHSTK,-IBT
+    Turn off SHSTK/IBT for the current shell.
+
+GLIBC_TUNABLES=glibc.tune.x86_shstk=<on, permissive>
+    This controls how dlopen() handles SHSTK legacy libraries:
+        on: continue with SHSTK enabled;
+        permissive: continue with SHSTK off.
+
+[5] CET system calls
+====================
+
+The following arch_prctl() system calls are added for CET:
+
+arch_prctl(ARCH_X86_CET_STATUS, unsigned long *addr)
+    Return CET feature status.
+
+    The parameter 'addr' is a pointer to a user buffer.
+    On returning to the caller, the kernel fills the following
+    information:
+
+    *addr = SHSTK/IBT status
+    *(addr + 1) = SHSTK base address
+    *(addr + 2) = SHSTK size
+
+arch_prctl(ARCH_X86_CET_DISABLE, unsigned long features)
+    Disable SHSTK and/or IBT specified in 'features'.  Return -EPERM
+    if CET is locked.
+
+arch_prctl(ARCH_X86_CET_LOCK)
+    Lock in CET feature.
+
+arch_prctl(ARCH_X86_CET_ALLOC_SHSTK, unsigned long *addr)
+    Allocate a new SHSTK and put a restore token at top.
+
+    The parameter 'addr' is a pointer to a user buffer and indicates
+    the desired SHSTK size to allocate.  On returning to the caller,
+    the kernel fills *addr with the base address of the new SHSTK.
+
+arch_prctl(ARCH_X86_CET_SET_LEGACY_BITMAP, unsigned long *addr)
+    Setup an IBT legacy code bitmap.
+
+    The parameter 'addr' is a pointer to a user buffer that has the
+    following information:
+
+    *addr = IBT bitmap base address
+    *(addr + 1) = IBT bitmap size
+
+Note:
+  There is no CET enabling arch_prctl function.  By design, CET is
+  enabled automatically if the binary and the system can support it.
+
+  The parameters passed are always unsigned 64-bit.  When an ia32
+  application passing pointers, it should only use the lower 32 bits.
+
+[6] The implementation of the SHSTK
+===================================
+
+SHSTK size
+----------
+
+A task's SHSTK is allocated from memory to a fixed size of
+RLIMIT_STACK.  A compat-mode thread's SHSTK size is 1/4 of
+RLIMIT_STACK.  The smaller 32-bit thread SHSTK allows more threads to
+share a 32-bit address space.
+
+Signal
+------
+
+The main program and its signal handlers use the same SHSTK.  Because
+the SHSTK stores only return addresses, a large SHSTK will cover the
+condition that both the program stack and the sigaltstack run out.
+
+The kernel creates a restore token at the SHSTK restoring address and
+verifies that token when restoring from the signal handler.
+
+Fork
+----
+
+The SHSTK's vma has VM_SHSTK flag set; its PTEs are required to be
+read-only and dirty.  When a SHSTK PTE is not present, RO, and dirty,
+a SHSTK access triggers a page fault with an additional SHSTK bit set
+in the page fault error code.
+
+When a task forks a child, its SHSTK PTEs are copied and both the
+parent's and the child's SHSTK PTEs are cleared of the dirty bit.
+Upon the next SHSTK access, the resulting SHSTK page fault is handled
+by page copy/re-use.
+
+When a pthread child is created, the kernel allocates a new SHSTK for
+the new thread.
+
+Setjmp/Longjmp
+--------------
+
+Longjmp unwinds SHSTK until it matches the program stack.
+
+Ucontext
+--------
+
+In GLIBC, getcontext/setcontext is implemented in similar way as
+setjmp/longjmp.
+
+When makecontext creates a new ucontext, a new SHSTK is allocated for
+that context with ARCH_X86_CET_ALLOC_SHSTK the syscall.  The kernel
+creates a restore token at the top of the new SHSTK and the user-mode
+code switches to the new SHSTK with the RSTORSSP instruction.
+
+[7] The management of read-only & dirty PTEs for SHSTK
+======================================================
+
+A RO and dirty PTE exists in the following cases:
+
+(a) A page is modified and then shared with a fork()'ed child;
+(b) A R/O page that has been COW'ed;
+(c) A SHSTK page.
+
+The processor only checks the dirty bit for (c).  To prevent the use
+of non-SHSTK memory as SHSTK, we use a spare bit of the 64-bit PTE as
+DIRTY_SW for (a) and (b) above.  This results to the following PTE
+settings:
+
+Modified PTE:             (R/W + DIRTY_HW)
+Modified and shared PTE:  (R/O + DIRTY_SW)
+R/O PTE, COW'ed:          (R/O + DIRTY_SW)
+SHSTK PTE:                (R/O + DIRTY_HW)
+SHSTK PTE, COW'ed:        (R/O + DIRTY_HW)
+SHSTK PTE, shared:        (R/O + DIRTY_SW)
+
+Note that DIRTY_SW is only used in R/O PTEs but not R/W PTEs.
+
+[8] The implementation of IBT
+=============================
+
+The kernel provides IBT support in mmap() of the legacy code bit map.
+However, the management of the bitmap is done in the GLIBC or the
+application.
-- 
2.17.1

^ permalink raw reply related

* [PATCH v7 00/27] Control-flow Enforcement: Shadow Stack
From: Yu-cheng Yu @ 2019-06-06 20:06 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu

Intel has published Control-flow Enforcement (CET) in the Architecture
Instruction Set Extensions Programming Reference:

  https://software.intel.com/en-us/download/intel-architecture-instruction-set-
  extensions-programming-reference

The previous version (v6) of CET Shadow Stack patches is here:

  https://lkml.org/lkml/2018/11/20/225

Summary of changes from v6:

  Rebase to v5.2-rc3.

  Adapt XSAVES system state changes to the recent FPU changes:
    - Add modify_fpu_regs_begin(), modify_fpu_regs_end() to protect
      CET MSR changes.
    - Update signal handling.

  Move ELF header-parsing out of x86.

  Other small fixes in response to comments.

This version passed GLIBC built-in tests.

Hi Maintainers,

What else is needed before this series can be merged?

Thanks,
Yu-cheng


Yu-cheng Yu (27):
  Documentation/x86: Add CET description
  x86/cpufeatures: Add CET CPU feature flags for Control-flow
    Enforcement Technology (CET)
  x86/fpu/xstate: Change names to separate XSAVES system and user states
  x86/fpu/xstate: Introduce XSAVES system states
  x86/fpu/xstate: Add XSAVES system states for shadow stack
  x86/cet: Add control protection exception handler
  x86/cet/shstk: Add Kconfig option for user-mode shadow stack
  mm: Introduce VM_SHSTK for shadow stack memory
  mm/mmap: Prevent Shadow Stack VMA merges
  x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
  x86/mm: Introduce _PAGE_DIRTY_SW
  drm/i915/gvt: Update _PAGE_DIRTY to _PAGE_DIRTY_BITS
  x86/mm: Modify ptep_set_wrprotect and pmdp_set_wrprotect for
    _PAGE_DIRTY_SW
  x86/mm: Shadow stack page fault error checking
  mm: Handle shadow stack page fault
  mm: Handle THP/HugeTLB shadow stack page fault
  mm: Update can_follow_write_pte/pmd for shadow stack
  mm: Introduce do_mmap_locked()
  x86/cet/shstk: User-mode shadow stack support
  x86/cet/shstk: Introduce WRUSS instruction
  x86/cet/shstk: Handle signals for shadow stack
  binfmt_elf: Extract .note.gnu.property from an ELF file
  x86/cet/shstk: ELF header parsing of Shadow Stack
  x86/cet/shstk: Handle thread shadow stack
  mm/mmap: Add Shadow stack pages to memory accounting
  x86/cet/shstk: Add arch_prctl functions for Shadow Stack
  x86/cet/shstk: Add Shadow Stack instructions to opcode map

 .../admin-guide/kernel-parameters.txt         |   6 +
 Documentation/x86/index.rst                   |   1 +
 Documentation/x86/intel_cet.rst               | 268 +++++++++++++
 arch/x86/Kconfig                              |  26 ++
 arch/x86/Makefile                             |   7 +
 arch/x86/entry/entry_64.S                     |   2 +-
 arch/x86/ia32/ia32_signal.c                   |   8 +
 arch/x86/include/asm/cet.h                    |  48 +++
 arch/x86/include/asm/cpufeatures.h            |   2 +
 arch/x86/include/asm/disabled-features.h      |   8 +-
 arch/x86/include/asm/fpu/internal.h           |  25 +-
 arch/x86/include/asm/fpu/signal.h             |   2 +
 arch/x86/include/asm/fpu/types.h              |  22 ++
 arch/x86/include/asm/fpu/xstate.h             |  26 +-
 arch/x86/include/asm/mmu_context.h            |   3 +
 arch/x86/include/asm/msr-index.h              |  18 +
 arch/x86/include/asm/pgtable.h                | 191 ++++++++--
 arch/x86/include/asm/pgtable_types.h          |  38 +-
 arch/x86/include/asm/processor.h              |   5 +
 arch/x86/include/asm/special_insns.h          |  32 ++
 arch/x86/include/asm/traps.h                  |   5 +
 arch/x86/include/uapi/asm/prctl.h             |   5 +
 arch/x86/include/uapi/asm/processor-flags.h   |   2 +
 arch/x86/include/uapi/asm/sigcontext.h        |  15 +
 arch/x86/kernel/Makefile                      |   2 +
 arch/x86/kernel/cet.c                         | 327 ++++++++++++++++
 arch/x86/kernel/cet_prctl.c                   |  85 +++++
 arch/x86/kernel/cpu/common.c                  |  25 ++
 arch/x86/kernel/cpu/cpuid-deps.c              |   2 +
 arch/x86/kernel/fpu/core.c                    |  26 +-
 arch/x86/kernel/fpu/init.c                    |  10 -
 arch/x86/kernel/fpu/signal.c                  |  81 +++-
 arch/x86/kernel/fpu/xstate.c                  | 156 +++++---
 arch/x86/kernel/idt.c                         |   4 +
 arch/x86/kernel/process.c                     |   8 +-
 arch/x86/kernel/process_64.c                  |  31 ++
 arch/x86/kernel/relocate_kernel_64.S          |   2 +-
 arch/x86/kernel/signal.c                      |  10 +-
 arch/x86/kernel/signal_compat.c               |   2 +-
 arch/x86/kernel/traps.c                       |  57 +++
 arch/x86/kvm/vmx/vmx.c                        |   2 +-
 arch/x86/lib/x86-opcode-map.txt               |  26 +-
 arch/x86/mm/fault.c                           |  18 +
 arch/x86/mm/pgtable.c                         |  41 ++
 drivers/gpu/drm/i915/gvt/gtt.c                |   2 +-
 fs/Kconfig.binfmt                             |   3 +
 fs/Makefile                                   |   1 +
 fs/binfmt_elf.c                               |  13 +
 fs/gnu_property.c                             | 351 ++++++++++++++++++
 fs/proc/task_mmu.c                            |   3 +
 include/asm-generic/pgtable.h                 |  14 +
 include/linux/elf.h                           |  12 +
 include/linux/mm.h                            |  26 ++
 include/uapi/asm-generic/siginfo.h            |   3 +-
 include/uapi/linux/elf.h                      |  14 +
 mm/gup.c                                      |   8 +-
 mm/huge_memory.c                              |  12 +-
 mm/memory.c                                   |   7 +-
 mm/mmap.c                                     |  11 +
 .../arch/x86/include/asm/disabled-features.h  |   8 +-
 tools/objtool/arch/x86/lib/x86-opcode-map.txt |  26 +-
 61 files changed, 2029 insertions(+), 165 deletions(-)
 create mode 100644 Documentation/x86/intel_cet.rst
 create mode 100644 arch/x86/include/asm/cet.h
 create mode 100644 arch/x86/kernel/cet.c
 create mode 100644 arch/x86/kernel/cet_prctl.c
 create mode 100644 fs/gnu_property.c

-- 
2.17.1

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Andy Lutomirski @ 2019-06-06 19:54 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: Stephen Smalley, David Howells, Al Viro, Greg Kroah-Hartman,
	USB list, raven, Linux FS Devel, Linux API, linux-block, keyrings,
	LSM List, LKML, Paul Moore
In-Reply-To: <07e92045-2d80-8573-4d36-643deeaff9ec@schaufler-ca.com>

On Thu, Jun 6, 2019 at 11:56 AM Casey Schaufler <casey@schaufler-ca.com> wrote:
>
> On 6/6/2019 10:16 AM, Stephen Smalley wrote:
> > On 6/6/19 12:43 PM, Casey Schaufler wrote:
> >> ...
> >> I don't agree. That is, I don't believe it is sufficient.
> >> There is no guarantee that being able to set a watch on an
> >> object implies that every process that can trigger the event
> >> can send it to you.
> >>
> >>     Watcher has Smack label W
> >>     Triggerer has Smack label T
> >>     Watched object has Smack label O
> >>
> >>     Relevant Smack rules are
> >>
> >>     W O rw
> >>     T O rw
> >>
> >> The watcher will be able to set the watch,
> >> the triggerer will be able to trigger the event,
> >> but there is nothing that would allow the watcher
> >> to receive the event. This is not a case of watcher
> >> reading the watched object, as the event is delivered
> >> without any action by watcher.
> >
> > You are allowing arbitrary information flow between T and W above.  Who cares about notifications?
>
> I do. If Watched object is /dev/null no data flow is possible.
> There are many objects on a modern Linux system for which this
> is true. Even if it's "just a file" the existence of one path
> for data to flow does not justify ignoring the rules for other
> data paths.

Aha!

Even ignoring security, writes to things like /dev/null should
probably not trigger notifications to people who are watching
/dev/null.  (There are probably lots of things like this: /dev/zero,
/dev/urandom, etc.)  David, are there any notification types that have
this issue in your patchset?  If so, is there a straightforward way to
fix it?  Generically, it seems like maybe writes to device nodes
shouldn't trigger notifications since, despite the fact that different
openers of a device node share an inode, there isn't necessarily any
connection between them.

Casey, if this is fixed in general, do you have another case where the
right to write and the right to read do not imply the right to
communicate?

> An analogy is that two processes with different UIDs can open a file,
> but still can't signal each other.

What do you mean "signal"?  If two processes with different UIDs can
open the same file for read and write, then they can communicate with
each other in many ways.  For example, one can write to the file and
the other can read it.  One can take locks and the other can read the
lock state.  They can both map it and use any number of memory access
side channels to communicate.  But, of course, they can't send each
other signals with kill().

If, however, one of these processes is using some fancy mechanism
(inotify, dnotify, kqueue, fanotify, whatever) to watch the file, and
the other one writes it, then it seems inconsistent to lie to the
watching process and say that the file wasn't written because some
security policy has decided to allow the write, allow the read, but
suppress this particular notification.  Hence my request for a real
example: when would it make sense to do this?

^ permalink raw reply

* Re: [PATCH v4 00/16] fs-verity: read-only file-based authenticity protection
From: Eric Biggers @ 2019-06-06 19:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Theodore Y . Ts'o, Darrick J . Wong, Linux API, Dave Chinner,
	linux-f2fs-devel, linux-fscrypt, linux-fsdevel, Jaegeuk Kim,
	linux-integrity, linux-ext4, Christoph Hellwig, Victor Hsieh
In-Reply-To: <CAHk-=wgSzRzoro8ATO5xb6OFxN1A0fjUCQSAHfGuEPbEu+zWvA@mail.gmail.com>

On Thu, Jun 06, 2019 at 10:21:12AM -0700, Linus Torvalds wrote:
> On Thu, Jun 6, 2019 at 8:54 AM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > This is a redesigned version of the fs-verity patchset, implementing
> > Ted's suggestion to build the Merkle tree in the kernel
> > (https://lore.kernel.org/linux-fsdevel/20190207031101.GA7387@mit.edu/).
> > This greatly simplifies the UAPI, since the verity metadata no longer
> > needs to be transferred to the kernel.
> 
> Interfaces look sane to me. My only real concern is whether it would
> make sense to make the FS_IOC_ENABLE_VERITY ioctl be something that
> could be done incrementally, since the way it is done now it looks
> like any random user could create a big file and then do the
> FS_IOC_ENABLE_VERITY to make the kernel do a _very_ expensive
> operation.
> 
> Yes, I see the
> 
> +               if (fatal_signal_pending(current))
> +                       return -EINTR;
> +               cond_resched();
> 
> in there, so it's not like it's some entirely unkillable thing, and
> maybe we don't care as a result. But maybe the ioctl interface could
> be fundamentally restartable?
> 
> If that was already considered and people just went "too complex", never mind.
> 
>                Linus

Making it incremental would be complex.  We could make FS_IOC_ENABLE_VERITY
write checkpoints periodically, and make it resume from the checkpoint if
present.  But then we'd have to worry about sync'ing the Merkle tree before
writing each checkpoint, and storing the Merkle tree parameters in each
checkpoint so that if the second call to FS_IOC_ENABLE_VERITY is made with
different parameters it knows to delete everything and restart from scratch.

Or we could make it explicit in the UAPI, where userspace calls ioctls to build
blocks 0 through 9999, then 10000 through 19999, etc.  But that would make the
UAPI much more complex, and the kernel would need to do lots of extra validation
of the parameters passed in.  This approach would also not be crash-safe unless
userspace did its own checkpointing, whereas the all-or-nothing API naturally
avoids inconsistent states.

And either way of making it incremental, the "partial Merkle tree" would also
become a valid on-disk state.  Conceptually that adds a lot of complexity, and
probably people would want fsck to support removing all the partial trees,
similar to how e2fsck supports optimizing directories and extent trees.

So in the end, it's not something I decided to add.

- Eric

^ permalink raw reply

* Re: [PATCH 01/10] security: Override creds in __fput() with last fputter's creds [ver #3]
From: Andy Lutomirski @ 2019-06-06 19:34 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: Andy Lutomirski, David Howells, Al Viro, raven, Linux FS Devel,
	Linux API, linux-block, keyrings, LSM List, LKML, Jann Horn
In-Reply-To: <e434a62a-d92a-c6e2-cda1-309ac99d4d5c@schaufler-ca.com>

On Thu, Jun 6, 2019 at 12:09 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
>
> On 6/6/2019 10:18 AM, Andy Lutomirski wrote:
> > On Thu, Jun 6, 2019 at 8:06 AM David Howells <dhowells@redhat.com> wrote:
> >> Andy Lutomirski <luto@amacapital.net> wrote:

> > Casey, I think you need to state your requirement in a way that's well
> > defined, and I think you need to make a compelling case that your
> > requirement is indeed worth dictating the design of parts of the
> > kernel outside LSM.
>
> Err, no, I don't believe so. There's a whole lot more
> going on in this discussion than just what's going on
> within the LSMs. Using examples from the LSMs makes it
> easier, because their policies are better defined than
> the "legacy" policies are. The most important part of the
> discussion is about ensuring that the event mechanism
> doesn't circumvent the legacy policies. Yes, I understand
> that you don't know what that means, or has to do with
> anything.
>
>

Indeed, I do not know what you have in mind about making sure this
mechanism doesn't circumvent legacy policies.  Can you elaborate?

--Andy

^ permalink raw reply

* Re: [PATCH 01/10] security: Override creds in __fput() with last fputter's creds [ver #3]
From: Casey Schaufler @ 2019-06-06 19:09 UTC (permalink / raw)
  To: Andy Lutomirski, David Howells
  Cc: Al Viro, raven, Linux FS Devel, Linux API, linux-block, keyrings,
	LSM List, LKML, Jann Horn
In-Reply-To: <CALCETrUukxNNhbBAifxz1EADzLOvYKoh9oqqZFJteU+MMhh1ig@mail.gmail.com>

On 6/6/2019 10:18 AM, Andy Lutomirski wrote:
> On Thu, Jun 6, 2019 at 8:06 AM David Howells <dhowells@redhat.com> wrote:
>> Andy Lutomirski <luto@amacapital.net> wrote:
>>
>>>> So that the LSM can see the credentials of the last process to do an fput()
>>>> on a file object when the file object is being dismantled, do the following
>>>> steps:
>>>>
>>> I still maintain that this is a giant design error.
>> Yes, I know.  This was primarily a post so that Greg could play with the USB
>> notifications stuff I added.  The LSM support isn't resolved and is unchanged.
>>
>>> Can someone at least come up with a single valid use case that isn't
>>> entirely full of bugs?
>> "Entirely full of bugs"?
> I can say "hey, I have this policy that the person who triggered an
> event needs such-and-such permission, otherwise the event gets
> suppressed".  But this isn't a full use case, and it's buggy.  It's
> not a full use case because I haven't specified what my actual goal is
> and why this particular policy achieves my goals.  And it's entirely
> full of bugs because, as this patch so nicely illustrates, it's not
> well defined who triggered the event.  For example, if I exec a setuid
> process, who triggers the close?  What if I send the fd to systemd
> over a socket and immediately close my copy before systemd gets (and
> ignores) the message?  Or if I send it to Wayland, or to any other
> process?
>
> A file is closed when everyone is done with it.  Trying to figure out
> who the last intentional user of the file was seems little better than
> random guessing.  Defining a security policy based on it seems like a
> poor idea.
>
>> How would you propose I deal with Casey's requirement?  I'm getting the
>> feeling you're going to nak it if I try to fulfil that and he's going to nak
>> it if I don't.
>>
> Casey, I think you need to state your requirement in a way that's well
> defined, and I think you need to make a compelling case that your
> requirement is indeed worth dictating the design of parts of the
> kernel outside LSM.

Err, no, I don't believe so. There's a whole lot more
going on in this discussion than just what's going on
within the LSMs. Using examples from the LSMs makes it
easier, because their policies are better defined than
the "legacy" policies are. The most important part of the
discussion is about ensuring that the event mechanism
doesn't circumvent the legacy policies. Yes, I understand
that you don't know what that means, or has to do with
anything.

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Casey Schaufler @ 2019-06-06 18:56 UTC (permalink / raw)
  To: Stephen Smalley, David Howells
  Cc: viro, Greg Kroah-Hartman, linux-usb, raven, linux-fsdevel,
	linux-api, linux-block, keyrings, linux-security-module,
	linux-kernel, Paul Moore, casey
In-Reply-To: <c82052e5-ca11-67b5-965e-8f828081f31c@tycho.nsa.gov>

On 6/6/2019 10:16 AM, Stephen Smalley wrote:
> On 6/6/19 12:43 PM, Casey Schaufler wrote:
>> ...
>> I don't agree. That is, I don't believe it is sufficient.
>> There is no guarantee that being able to set a watch on an
>> object implies that every process that can trigger the event
>> can send it to you.
>>
>>     Watcher has Smack label W
>>     Triggerer has Smack label T
>>     Watched object has Smack label O
>>
>>     Relevant Smack rules are
>>
>>     W O rw
>>     T O rw
>>
>> The watcher will be able to set the watch,
>> the triggerer will be able to trigger the event,
>> but there is nothing that would allow the watcher
>> to receive the event. This is not a case of watcher
>> reading the watched object, as the event is delivered
>> without any action by watcher.
>
> You are allowing arbitrary information flow between T and W above.  Who cares about notifications?

I do. If Watched object is /dev/null no data flow is possible.
There are many objects on a modern Linux system for which this
is true. Even if it's "just a file" the existence of one path
for data to flow does not justify ignoring the rules for other
data paths.

>
> How is it different from W and T mapping the same file as a shared mapping and communicating by reading and writing the shared memory?  You aren't performing a permission check directly between W and T there.

In this case there is one object O, two subjects, W and T and two accesses.

	W open O
	T open O

They fiddle about with the data in O.

In the event case, there are two objects, O and W, two subjects W and T, and
three accesses.

	W watch O
	T trigger O
	T write-event W

You can't wave away the flow of data. Different objects are involved.

An analogy is that two processes with different UIDs can open a file,
but still can't signal each other. Different mechanisms have different
policies. I'm not saying that's good, but it's the context we're in.

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Andy Lutomirski @ 2019-06-06 18:51 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: Andy Lutomirski, Stephen Smalley, David Howells, Al Viro,
	Greg Kroah-Hartman, USB list, raven, Linux FS Devel, Linux API,
	linux-block, keyrings, LSM List, LKML, Paul Moore
In-Reply-To: <7afe1a85-bf19-b5b4-fdf3-69d9be475dab@schaufler-ca.com>



> On Jun 6, 2019, at 11:33 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> 
>> On 6/6/2019 10:11 AM, Andy Lutomirski wrote:
>>> On Thu, Jun 6, 2019 at 9:43 AM Casey Schaufler <casey@schaufler-ca.com> wrote:
>>> ...
>>> I don't agree. That is, I don't believe it is sufficient.
>>> There is no guarantee that being able to set a watch on an
>>> object implies that every process that can trigger the event
>>> can send it to you.
>>> 
>>>        Watcher has Smack label W
>>>        Triggerer has Smack label T
>>>        Watched object has Smack label O
>>> 
>>>        Relevant Smack rules are
>>> 
>>>        W O rw
>>>        T O rw
>>> 
>>> The watcher will be able to set the watch,
>>> the triggerer will be able to trigger the event,
>>> but there is nothing that would allow the watcher
>>> to receive the event. This is not a case of watcher
>>> reading the watched object, as the event is delivered
>>> without any action by watcher.
>> I think this is an example of a bogus policy that should not be
>> supported by the kernel.
> 
> At this point it's pretty hard for me to care much what
> you think. You don't seem to have any insight into the
> implications of the features you're advocating, or their
> potential consequences.
> 
> 

Can you try to spell it out, then?  A mostly or fully worked out example might help.

As Stephen said, it looks like you are considering cases where there is already a full communication channel between two processes, and you’re concerned that this new mechanism might add a side channel too.  If this is wrong, can you explain how?

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Casey Schaufler @ 2019-06-06 18:33 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stephen Smalley, David Howells, Al Viro, Greg Kroah-Hartman,
	USB list, raven, Linux FS Devel, Linux API, linux-block, keyrings,
	LSM List, LKML, Paul Moore, casey
In-Reply-To: <CALCETrWn_C8oReKXGMXiJDOGoYWMs+jg2DWa5ZipKAceyXkx5w@mail.gmail.com>

On 6/6/2019 10:11 AM, Andy Lutomirski wrote:
> On Thu, Jun 6, 2019 at 9:43 AM Casey Schaufler <casey@schaufler-ca.com> wrote:
>> ...
>> I don't agree. That is, I don't believe it is sufficient.
>> There is no guarantee that being able to set a watch on an
>> object implies that every process that can trigger the event
>> can send it to you.
>>
>>         Watcher has Smack label W
>>         Triggerer has Smack label T
>>         Watched object has Smack label O
>>
>>         Relevant Smack rules are
>>
>>         W O rw
>>         T O rw
>>
>> The watcher will be able to set the watch,
>> the triggerer will be able to trigger the event,
>> but there is nothing that would allow the watcher
>> to receive the event. This is not a case of watcher
>> reading the watched object, as the event is delivered
>> without any action by watcher.
> I think this is an example of a bogus policy that should not be
> supported by the kernel.

At this point it's pretty hard for me to care much what
you think. You don't seem to have any insight into the
implications of the features you're advocating, or their
potential consequences.

^ permalink raw reply

* Re: [PATCH v2 for 5.2 08/12] rseq/selftests: arm: use udf instruction for RSEQ_SIG
From: Mathieu Desnoyers @ 2019-06-06 18:02 UTC (permalink / raw)
  To: Will Deacon, Russell King
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, shuah, Andy Lutomirski,
	Dave Watson, Paul Turner, Andrew Morton, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas,
	Michael Kerrisk <mtk.manpages@
In-Reply-To: <20190503193858.9676-1-mathieu.desnoyers@efficios.com>

----- On May 3, 2019, at 3:38 PM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:

> Use udf as the guard instruction for the restartable sequence abort
> handler.
> 
> Previously, the chosen signature was not a valid instruction, based
> on the assumption that it could always sit in a literal pool. However,
> there are compilation environments in which literal pools are not
> available, for instance execute-only code. Therefore, we need to
> choose a signature value that is also a valid instruction.
> 
> Handle compiling with -mbig-endian on ARMv6+, which generates binaries
> with mixed code vs data endianness (little endian code, big endian
> data).
> 
> Else mismatch between code endianness for the generated signatures and
> data endianness for the RSEQ_SIG parameter passed to the rseq
> registration will trigger application segmentation faults when the
> kernel try to abort rseq critical sections.
> 
> Prior to ARMv6, -mbig-endian generates big-endian code and data, so
> endianness should not be reversed in that case.

And of course it cannot be that easy. This breaks when building in
thumb mode (-mthumb). Output from librseq arm32 build [1] (code similar
to what is found in the rseq selftests):

  CC       rseq.lo
/tmp/ccu6Jw1b.s: Assembler messages:
/tmp/ccu6Jw1b.s:297: Error: cannot determine Thumb instruction size. Use .inst.n/.inst.w instead
/tmp/ccu6Jw1b.s:490: Error: cannot determine Thumb instruction size. Use .inst.n/.inst.w instead
Makefile:460: recipe for target 'rseq.lo' failed

This appears to be caused by a missing .arm directive in RSEQ_SIG_DATA.
Fixing with:

-               asm volatile ("b 2f\n\t"                                \
+               asm volatile (".arm\n\t"                                \
+                             "b 2f\n\t"                                \

gets the build to go further, but breaks at:

  CC       basic_percpu_ops_test.o
/tmp/ccpHOMHZ.s: Assembler messages:
/tmp/ccpHOMHZ.s:148: Error: misaligned branch destination
/tmp/ccpHOMHZ.s:956: Error: misaligned branch destination
Makefile:378: recipe for target 'basic_percpu_ops_test.o' failed

I suspect it's caused by the change from:

-               ".word " __rseq_str(RSEQ_SIG) "\n\t"                    \

to

+               ".arm\n\t"                                              \
+               ".inst " __rseq_str(RSEQ_SIG_CODE) "\n\t"               \

which changes the mode from thumb to arm for the rest of the
inline asm within __RSEQ_ASM_DEFINE_ABORT. Better yet, there appears
to be no way to save the arm/thumb state and restore it afterwards.

I'm really starting to wonder if we should go our of our way to try
to get this signature to be a valid instruction on arm32. Perhaps
we should consider going back to use ".word" on arm32 so it ensures
it uses data endianness (which matches the parameter received by the
sys_rseq system call), let objdump and friends print it as a literal
pool (which it is), and just choose an instruction which has little
chances to appear for the cases we care about between ARM32 BE, LE
and THUMB. Perhaps a 32-bit palindrome ? Bonus points if this is a
trap instruction in common configurations for odd-cases-debugging
purposes.

Thoughts ?

Thanks,

Mathieu

[1] https://github.com/compudj/librseq


> 
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: Joel Fernandes <joelaf@google.com>
> CC: Catalin Marinas <catalin.marinas@arm.com>
> CC: Dave Watson <davejwatson@fb.com>
> CC: Will Deacon <will.deacon@arm.com>
> CC: Shuah Khan <shuah@kernel.org>
> CC: Andi Kleen <andi@firstfloor.org>
> CC: linux-kselftest@vger.kernel.org
> CC: "H . Peter Anvin" <hpa@zytor.com>
> CC: Chris Lameter <cl@linux.com>
> CC: Russell King <linux@arm.linux.org.uk>
> CC: Michael Kerrisk <mtk.manpages@gmail.com>
> CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
> CC: Paul Turner <pjt@google.com>
> CC: Boqun Feng <boqun.feng@gmail.com>
> CC: Josh Triplett <josh@joshtriplett.org>
> CC: Steven Rostedt <rostedt@goodmis.org>
> CC: Ben Maurer <bmaurer@fb.com>
> CC: linux-api@vger.kernel.org
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: Linus Torvalds <torvalds@linux-foundation.org>
> ---
> Changes since v1:
> - Fix checkpatch error and warning.
> 
> ---
> tools/testing/selftests/rseq/rseq-arm.h | 52 +++++++++++++++++++++++++++++++--
> 1 file changed, 50 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/rseq/rseq-arm.h
> b/tools/testing/selftests/rseq/rseq-arm.h
> index 5f262c54364f..84f28f147fb6 100644
> --- a/tools/testing/selftests/rseq/rseq-arm.h
> +++ b/tools/testing/selftests/rseq/rseq-arm.h
> @@ -5,7 +5,54 @@
>  * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>  */
> 
> -#define RSEQ_SIG	0x53053053
> +/*
> + * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
> + * value 0x5de3. This traps if user-space reaches this instruction by mistake,
> + * and the uncommon operand ensures the kernel does not move the instruction
> + * pointer to attacker-controlled code on rseq abort.
> + *
> + * The instruction pattern in the A32 instruction set is:
> + *
> + * e7f5def3    udf    #24035    ; 0x5de3
> + *
> + * This translates to the following instruction pattern in the T16 instruction
> + * set:
> + *
> + * little endian:
> + * def3        udf    #243      ; 0xf3
> + * e7f5        b.n    <7f5>
> + *
> + * pre-ARMv6 big endian code:
> + * e7f5        b.n    <7f5>
> + * def3        udf    #243      ; 0xf3
> + *
> + * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
> + * code and big-endian data. Ensure the RSEQ_SIG data signature matches code
> + * endianness. Prior to ARMv6, -mbig-endian generates big-endian code and data
> + * (which match), so there is no need to reverse the endianness of the data
> + * representation of the signature. However, the choice between BE32 and BE8
> + * is done by the linker, so we cannot know whether code and data endianness
> + * will be mixed before the linker is invoked.
> + */
> +
> +#define RSEQ_SIG_CODE	0xe7f5def3
> +
> +#ifndef __ASSEMBLER__
> +
> +#define RSEQ_SIG_DATA							\
> +	({								\
> +		int sig;						\
> +		asm volatile ("b 2f\n\t"				\
> +			      "1: .inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
> +			      "2:\n\t"					\
> +			      "ldr %[sig], 1b\n\t"			\
> +			      : [sig] "=r" (sig));			\
> +		sig;							\
> +	})
> +
> +#define RSEQ_SIG	RSEQ_SIG_DATA
> +
> +#endif
> 
> #define rseq_smp_mb()	__asm__ __volatile__ ("dmb" ::: "memory", "cc")
> #define rseq_smp_rmb()	__asm__ __volatile__ ("dmb" ::: "memory", "cc")
> @@ -78,7 +125,8 @@ do {									\
> 		__rseq_str(table_label) ":\n\t"				\
> 		".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
> 		".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, "
> 		__rseq_str(abort_ip) ", 0x0\n\t" \
> -		".word " __rseq_str(RSEQ_SIG) "\n\t"			\
> +		".arm\n\t"						\
> +		".inst " __rseq_str(RSEQ_SIG_CODE) "\n\t"		\
> 		__rseq_str(label) ":\n\t"				\
> 		teardown						\
> 		"b %l[" __rseq_str(abort_label) "]\n\t"
> --
> 2.11.0

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCH v4 00/16] fs-verity: read-only file-based authenticity protection
From: Linus Torvalds @ 2019-06-06 17:21 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Theodore Y . Ts'o, Darrick J . Wong, Linux API, Dave Chinner,
	linux-f2fs-devel, linux-fscrypt, linux-fsdevel, Jaegeuk Kim,
	linux-integrity, linux-ext4, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

On Thu, Jun 6, 2019 at 8:54 AM Eric Biggers <ebiggers@kernel.org> wrote:
>
> This is a redesigned version of the fs-verity patchset, implementing
> Ted's suggestion to build the Merkle tree in the kernel
> (https://lore.kernel.org/linux-fsdevel/20190207031101.GA7387@mit.edu/).
> This greatly simplifies the UAPI, since the verity metadata no longer
> needs to be transferred to the kernel.

Interfaces look sane to me. My only real concern is whether it would
make sense to make the FS_IOC_ENABLE_VERITY ioctl be something that
could be done incrementally, since the way it is done now it looks
like any random user could create a big file and then do the
FS_IOC_ENABLE_VERITY to make the kernel do a _very_ expensive
operation.

Yes, I see the

+               if (fatal_signal_pending(current))
+                       return -EINTR;
+               cond_resched();

in there, so it's not like it's some entirely unkillable thing, and
maybe we don't care as a result. But maybe the ioctl interface could
be fundamentally restartable?

If that was already considered and people just went "too complex", never mind.

               Linus

^ permalink raw reply

* Re: [PATCH 01/10] security: Override creds in __fput() with last fputter's creds [ver #3]
From: Andy Lutomirski @ 2019-06-06 17:18 UTC (permalink / raw)
  To: David Howells
  Cc: Al Viro, Casey Schaufler, raven, Linux FS Devel, Linux API,
	linux-block, keyrings, LSM List, LKML, Jann Horn
In-Reply-To: <11031.1559833574@warthog.procyon.org.uk>

On Thu, Jun 6, 2019 at 8:06 AM David Howells <dhowells@redhat.com> wrote:
>
> Andy Lutomirski <luto@amacapital.net> wrote:
>
> > > So that the LSM can see the credentials of the last process to do an fput()
> > > on a file object when the file object is being dismantled, do the following
> > > steps:
> > >
> >
> > I still maintain that this is a giant design error.
>
> Yes, I know.  This was primarily a post so that Greg could play with the USB
> notifications stuff I added.  The LSM support isn't resolved and is unchanged.
>
> > Can someone at least come up with a single valid use case that isn't
> > entirely full of bugs?
>
> "Entirely full of bugs"?

I can say "hey, I have this policy that the person who triggered an
event needs such-and-such permission, otherwise the event gets
suppressed".  But this isn't a full use case, and it's buggy.  It's
not a full use case because I haven't specified what my actual goal is
and why this particular policy achieves my goals.  And it's entirely
full of bugs because, as this patch so nicely illustrates, it's not
well defined who triggered the event.  For example, if I exec a setuid
process, who triggers the close?  What if I send the fd to systemd
over a socket and immediately close my copy before systemd gets (and
ignores) the message?  Or if I send it to Wayland, or to any other
process?

A file is closed when everyone is done with it.  Trying to figure out
who the last intentional user of the file was seems little better than
random guessing.  Defining a security policy based on it seems like a
poor idea.

>
> How would you propose I deal with Casey's requirement?  I'm getting the
> feeling you're going to nak it if I try to fulfil that and he's going to nak
> it if I don't.
>

Casey, I think you need to state your requirement in a way that's well
defined, and I think you need to make a compelling case that your
requirement is indeed worth dictating the design of parts of the
kernel outside LSM.

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Stephen Smalley @ 2019-06-06 17:16 UTC (permalink / raw)
  To: Casey Schaufler, David Howells
  Cc: viro, Greg Kroah-Hartman, linux-usb, raven, linux-fsdevel,
	linux-api, linux-block, keyrings, linux-security-module,
	linux-kernel, Paul Moore
In-Reply-To: <0eb007c5-b4a0-9384-d915-37b0e5a158bf@schaufler-ca.com>

On 6/6/19 12:43 PM, Casey Schaufler wrote:
> On 6/6/2019 7:05 AM, Stephen Smalley wrote:
>> On 6/6/19 9:16 AM, David Howells wrote:
>>> Stephen Smalley <sds@tycho.nsa.gov> wrote:
>>>
>>> This might be easier to discuss if you can reply to:
>>>
>>>      https://lore.kernel.org/lkml/5393.1559768763@warthog.procyon.org.uk/
>>>
>>> which is on the ver #2 posting of this patchset.
>>
>> Sorry for being late to the party.  Not sure whether you're asking me to respond only there or both there and here to your comments below.  I'll start here but can revisit if it's a problem.
>>>
>>>>> LSM support is included, but controversial:
>>>>>
>>>>>     (1) The creds of the process that did the fput() that reduced the refcount
>>>>>         to zero are cached in the file struct.
>>>>>
>>>>>     (2) __fput() overrides the current creds with the creds from (1) whilst
>>>>>         doing the cleanup, thereby making sure that the creds seen by the
>>>>>         destruction notification generated by mntput() appears to come from
>>>>>         the last fputter.
>>>>>
>>>>>     (3) security_post_notification() is called for each queue that we might
>>>>>         want to post a notification into, thereby allowing the LSM to prevent
>>>>>         covert communications.
>>>>>
>>>>>     (?) Do I need to add security_set_watch(), say, to rule on whether a watch
>>>>>         may be set in the first place?  I might need to add a variant per
>>>>>         watch-type.
>>>>>
>>>>>     (?) Do I really need to keep track of the process creds in which an
>>>>>         implicit object destruction happened?  For example, imagine you create
>>>>>         an fd with fsopen()/fsmount().  It is marked to dissolve the mount it
>>>>>         refers to on close unless move_mount() clears that flag.  Now, imagine
>>>>>         someone looking at that fd through procfs at the same time as you exit
>>>>>         due to an error.  The LSM sees the destruction notification come from
>>>>>         the looker if they happen to do their fput() after yours.
>>>>
>>>>
>>>> I'm not in favor of this approach.
>>>
>>> Which bit?  The last point?  Keeping track of the process creds after an
>>> implicit object destruction.
>>
>> (1), (2), (3), and the last point.
>>
>>>> Can we check permission to the object being watched when a watch is set
>>>> (read-like access),
>>>
>>> Yes, and I need to do that.  I think it's likely to require an extra hook for
>>> each entry point added because the objects are different:
>>>
>>>      int security_watch_key(struct watch *watch, struct key *key);
>>>      int security_watch_sb(struct watch *watch, struct path *path);
>>>      int security_watch_mount(struct watch *watch, struct path *path);
>>>      int security_watch_devices(struct watch *watch);
>>>
>>>> make sure every access that can trigger a notification requires a
>>>> (write-like) permission to the accessed object,
>>>
>>> "write-like permssion" for whom?  The triggerer or the watcher?
>>
>> The former, i.e. the process that performed the operation that triggered the notification.  Think of it as a write from the process to the accessed object, which triggers a notification (another write) on some related object (the watched object), which is then read by the watcher.
> 
> We agree that the process that performed the operation that triggered
> the notification is the initial subject. Smack will treat the process
> with the watch set (in particular, its ring buffer) as the object
> being written to. SELinux, with its finer grained controls, will
> involve other things as noted above. There are other place where we
> see this, notably IP packet delivery.
> 
> The implication is that the information about the triggering
> process needs to be available throughout.
> 
>>
>>> There are various 'classes' of events:
>>>
>>>    (1) System events (eg. hardware I/O errors, automount points expiring).
>>>
>>>    (2) Direct events (eg. automounts, manual mounts, EDQUOT, key linkage).
>>>
>>>    (3) Indirect events (eg. exit/close doing the last fput and causing an
>>>        unmount).
>>>
>>> Class (1) are uncaused by a process, so I use init_cred for them.  One could
>>> argue that the automount point expiry should perhaps take place under the
>>> creds of whoever triggered it in the first place, but we need to be careful
>>> about long-term cred pinning.
>>
>> This seems equivalent to just checking whether the watcher is allowed to get that kind of event, no other cred truly needed.
>>
>>> Class (2) the causing process must've had permission to cause them - otherwise
>>> we wouldn't have got the event.
>>
>> So we've already done a check on the causing process, and we're going to check whether the watcher can set the watch. We just need to establish the connection between the accessed object and the watched object in some manner.
> 
> I don't agree. That is, I don't believe it is sufficient.
> There is no guarantee that being able to set a watch on an
> object implies that every process that can trigger the event
> can send it to you.
> 
> 	Watcher has Smack label W
> 	Triggerer has Smack label T
> 	Watched object has Smack label O
> 
> 	Relevant Smack rules are
> 
> 	W O rw
> 	T O rw
> 
> The watcher will be able to set the watch,
> the triggerer will be able to trigger the event,
> but there is nothing that would allow the watcher
> to receive the event. This is not a case of watcher
> reading the watched object, as the event is delivered
> without any action by watcher.

You are allowing arbitrary information flow between T and W above.  Who 
cares about notifications?

How is it different from W and T mapping the same file as a shared 
mapping and communicating by reading and writing the shared memory?  You 
aren't performing a permission check directly between W and T there.

> 
>>
>>> Class (3) is interesting since it's currently entirely cleanup events and the
>>> process may have the right to do them (close, dup2, exit, but also execve)
>>> whether the LSM thinks it should be able to cause the object to be destroyed
>>> or not.
>>>
>>> It gets more complicated than that, though: multiple processes with different
>>> security attributes can all have fds pointing to a common file object - and
>>> the last one to close carries the can as far as the LSM is concerned.
>>
>> Yes, I'd prefer to avoid that.  You can't write policy that is stable and meaningful that way.  This may fall under a similar situation as class (1) - all we can meaningfully do is check whether the watcher is allowed to see all such events.
> 
> Back in the day when we were doing security evaluations
> the implications of "deleting" an object gave the security
> evaluators fits. UNIX/Linux files don't get deleted, they
> simply fall off the filesystem namespace when no one cares
> about them anymore. The model we used back in the day was
> that "delete" wasn't an operation that occurs on filesystem
> objects.
> 
> But now you want to do something with security implications
> when the object disappears. We could say that the event does
> nothing but signal that the system has removed the watch on
> your behalf because it is now meaningless. No reason to worry
> about who dropped the last reference. We don't care about
> that from a policy viewpoint anyway.
> 
>>
>>> And yet more complicated when you throw in unix sockets with partially passed
>>> fds still in their queues.  That's what patch 01 is designed to try and cope
>>> with.
>>>
>>>> and make sure there is some sane way to control the relationship between the
>>>> accessed object and the watched object (write-like)?
>>>
>>> This is the trick.  Keys and superblocks have object labels of their own and
>>> don't - for now - propagate their watches.  With these, the watch is on the
>>> object you initially assign it to and it goes no further than that.
>>>
>>> mount_notify() is the interesting case since we want to be able to detect
>>> mount topology change events from within the vfs subtree rooted at the watched
>>> directory without having to manually put a watch on every directory in that
>>> subtree - or even just every mount object. >
>>> Or, maybe, that's what I'll have to do: make it mount_notify() can only apply
>>> to the subtree within its superblock, and the caller must call mount_notify()
>>> for every mount object it wants to monitor.  That would at least ensure that
>>> the caller can, at that point, reach all those mount points.
>>
>> Would that at least make it consistent with fanotify (not that it provides a great example)?
>>
>>>> For cases where we have no object per se or at least no security
>>>> structure/label associated with it, we may have to fall back to a
>>>> coarse-grained "Can the watcher get this kind of notification in general?".
>>>
>>> Agreed - and we should probably have that anyway.
>>>
>>> David
> 

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Andy Lutomirski @ 2019-06-06 17:11 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: Stephen Smalley, David Howells, Al Viro, Greg Kroah-Hartman,
	USB list, raven, Linux FS Devel, Linux API, linux-block, keyrings,
	LSM List, LKML, Paul Moore
In-Reply-To: <0eb007c5-b4a0-9384-d915-37b0e5a158bf@schaufler-ca.com>

On Thu, Jun 6, 2019 at 9:43 AM Casey Schaufler <casey@schaufler-ca.com> wrote:
>
> On 6/6/2019 7:05 AM, Stephen Smalley wrote:
> > On 6/6/19 9:16 AM, David Howells wrote:
> >> Stephen Smalley <sds@tycho.nsa.gov> wrote:
> >>
> >> This might be easier to discuss if you can reply to:
> >>
> >>     https://lore.kernel.org/lkml/5393.1559768763@warthog.procyon.org.uk/
> >>
> >> which is on the ver #2 posting of this patchset.
> >
> > Sorry for being late to the party.  Not sure whether you're asking me to respond only there or both there and here to your comments below.  I'll start here but can revisit if it's a problem.
> >>
> >>>> LSM support is included, but controversial:
> >>>>
> >>>>    (1) The creds of the process that did the fput() that reduced the refcount
> >>>>        to zero are cached in the file struct.
> >>>>
> >>>>    (2) __fput() overrides the current creds with the creds from (1) whilst
> >>>>        doing the cleanup, thereby making sure that the creds seen by the
> >>>>        destruction notification generated by mntput() appears to come from
> >>>>        the last fputter.
> >>>>
> >>>>    (3) security_post_notification() is called for each queue that we might
> >>>>        want to post a notification into, thereby allowing the LSM to prevent
> >>>>        covert communications.
> >>>>
> >>>>    (?) Do I need to add security_set_watch(), say, to rule on whether a watch
> >>>>        may be set in the first place?  I might need to add a variant per
> >>>>        watch-type.
> >>>>
> >>>>    (?) Do I really need to keep track of the process creds in which an
> >>>>        implicit object destruction happened?  For example, imagine you create
> >>>>        an fd with fsopen()/fsmount().  It is marked to dissolve the mount it
> >>>>        refers to on close unless move_mount() clears that flag.  Now, imagine
> >>>>        someone looking at that fd through procfs at the same time as you exit
> >>>>        due to an error.  The LSM sees the destruction notification come from
> >>>>        the looker if they happen to do their fput() after yours.
> >>>
> >>>
> >>> I'm not in favor of this approach.
> >>
> >> Which bit?  The last point?  Keeping track of the process creds after an
> >> implicit object destruction.
> >
> > (1), (2), (3), and the last point.
> >
> >>> Can we check permission to the object being watched when a watch is set
> >>> (read-like access),
> >>
> >> Yes, and I need to do that.  I think it's likely to require an extra hook for
> >> each entry point added because the objects are different:
> >>
> >>     int security_watch_key(struct watch *watch, struct key *key);
> >>     int security_watch_sb(struct watch *watch, struct path *path);
> >>     int security_watch_mount(struct watch *watch, struct path *path);
> >>     int security_watch_devices(struct watch *watch);
> >>
> >>> make sure every access that can trigger a notification requires a
> >>> (write-like) permission to the accessed object,
> >>
> >> "write-like permssion" for whom?  The triggerer or the watcher?
> >
> > The former, i.e. the process that performed the operation that triggered the notification.  Think of it as a write from the process to the accessed object, which triggers a notification (another write) on some related object (the watched object), which is then read by the watcher.
>
> We agree that the process that performed the operation that triggered
> the notification is the initial subject. Smack will treat the process
> with the watch set (in particular, its ring buffer) as the object
> being written to. SELinux, with its finer grained controls, will
> involve other things as noted above. There are other place where we
> see this, notably IP packet delivery.
>
> The implication is that the information about the triggering
> process needs to be available throughout.
>
> >
> >> There are various 'classes' of events:
> >>
> >>   (1) System events (eg. hardware I/O errors, automount points expiring).
> >>
> >>   (2) Direct events (eg. automounts, manual mounts, EDQUOT, key linkage).
> >>
> >>   (3) Indirect events (eg. exit/close doing the last fput and causing an
> >>       unmount).
> >>
> >> Class (1) are uncaused by a process, so I use init_cred for them.  One could
> >> argue that the automount point expiry should perhaps take place under the
> >> creds of whoever triggered it in the first place, but we need to be careful
> >> about long-term cred pinning.
> >
> > This seems equivalent to just checking whether the watcher is allowed to get that kind of event, no other cred truly needed.
> >
> >> Class (2) the causing process must've had permission to cause them - otherwise
> >> we wouldn't have got the event.
> >
> > So we've already done a check on the causing process, and we're going to check whether the watcher can set the watch. We just need to establish the connection between the accessed object and the watched object in some manner.
>
> I don't agree. That is, I don't believe it is sufficient.
> There is no guarantee that being able to set a watch on an
> object implies that every process that can trigger the event
> can send it to you.
>
>         Watcher has Smack label W
>         Triggerer has Smack label T
>         Watched object has Smack label O
>
>         Relevant Smack rules are
>
>         W O rw
>         T O rw
>
> The watcher will be able to set the watch,
> the triggerer will be able to trigger the event,
> but there is nothing that would allow the watcher
> to receive the event. This is not a case of watcher
> reading the watched object, as the event is delivered
> without any action by watcher.

I think this is an example of a bogus policy that should not be
supported by the kernel.

^ permalink raw reply

* Re: [RFC][PATCH 00/10] Mount, FS, Block and Keyrings notifications [ver #3]
From: Casey Schaufler @ 2019-06-06 16:43 UTC (permalink / raw)
  To: Stephen Smalley, David Howells
  Cc: viro, Greg Kroah-Hartman, linux-usb, raven, linux-fsdevel,
	linux-api, linux-block, keyrings, linux-security-module,
	linux-kernel, Paul Moore, casey
In-Reply-To: <8382af23-548c-f162-0e82-11e308049735@tycho.nsa.gov>

On 6/6/2019 7:05 AM, Stephen Smalley wrote:
> On 6/6/19 9:16 AM, David Howells wrote:
>> Stephen Smalley <sds@tycho.nsa.gov> wrote:
>>
>> This might be easier to discuss if you can reply to:
>>
>>     https://lore.kernel.org/lkml/5393.1559768763@warthog.procyon.org.uk/
>>
>> which is on the ver #2 posting of this patchset.
>
> Sorry for being late to the party.  Not sure whether you're asking me to respond only there or both there and here to your comments below.  I'll start here but can revisit if it's a problem.
>>
>>>> LSM support is included, but controversial:
>>>>
>>>>    (1) The creds of the process that did the fput() that reduced the refcount
>>>>        to zero are cached in the file struct.
>>>>
>>>>    (2) __fput() overrides the current creds with the creds from (1) whilst
>>>>        doing the cleanup, thereby making sure that the creds seen by the
>>>>        destruction notification generated by mntput() appears to come from
>>>>        the last fputter.
>>>>
>>>>    (3) security_post_notification() is called for each queue that we might
>>>>        want to post a notification into, thereby allowing the LSM to prevent
>>>>        covert communications.
>>>>
>>>>    (?) Do I need to add security_set_watch(), say, to rule on whether a watch
>>>>        may be set in the first place?  I might need to add a variant per
>>>>        watch-type.
>>>>
>>>>    (?) Do I really need to keep track of the process creds in which an
>>>>        implicit object destruction happened?  For example, imagine you create
>>>>        an fd with fsopen()/fsmount().  It is marked to dissolve the mount it
>>>>        refers to on close unless move_mount() clears that flag.  Now, imagine
>>>>        someone looking at that fd through procfs at the same time as you exit
>>>>        due to an error.  The LSM sees the destruction notification come from
>>>>        the looker if they happen to do their fput() after yours.
>>>
>>>
>>> I'm not in favor of this approach.
>>
>> Which bit?  The last point?  Keeping track of the process creds after an
>> implicit object destruction.
>
> (1), (2), (3), and the last point.
>
>>> Can we check permission to the object being watched when a watch is set
>>> (read-like access),
>>
>> Yes, and I need to do that.  I think it's likely to require an extra hook for
>> each entry point added because the objects are different:
>>
>>     int security_watch_key(struct watch *watch, struct key *key);
>>     int security_watch_sb(struct watch *watch, struct path *path);
>>     int security_watch_mount(struct watch *watch, struct path *path);
>>     int security_watch_devices(struct watch *watch);
>>
>>> make sure every access that can trigger a notification requires a
>>> (write-like) permission to the accessed object,
>>
>> "write-like permssion" for whom?  The triggerer or the watcher?
>
> The former, i.e. the process that performed the operation that triggered the notification.  Think of it as a write from the process to the accessed object, which triggers a notification (another write) on some related object (the watched object), which is then read by the watcher.

We agree that the process that performed the operation that triggered
the notification is the initial subject. Smack will treat the process
with the watch set (in particular, its ring buffer) as the object
being written to. SELinux, with its finer grained controls, will
involve other things as noted above. There are other place where we
see this, notably IP packet delivery.

The implication is that the information about the triggering
process needs to be available throughout.

>
>> There are various 'classes' of events:
>>
>>   (1) System events (eg. hardware I/O errors, automount points expiring).
>>
>>   (2) Direct events (eg. automounts, manual mounts, EDQUOT, key linkage).
>>
>>   (3) Indirect events (eg. exit/close doing the last fput and causing an
>>       unmount).
>>
>> Class (1) are uncaused by a process, so I use init_cred for them.  One could
>> argue that the automount point expiry should perhaps take place under the
>> creds of whoever triggered it in the first place, but we need to be careful
>> about long-term cred pinning.
>
> This seems equivalent to just checking whether the watcher is allowed to get that kind of event, no other cred truly needed.
>
>> Class (2) the causing process must've had permission to cause them - otherwise
>> we wouldn't have got the event.
>
> So we've already done a check on the causing process, and we're going to check whether the watcher can set the watch. We just need to establish the connection between the accessed object and the watched object in some manner.

I don't agree. That is, I don't believe it is sufficient.
There is no guarantee that being able to set a watch on an
object implies that every process that can trigger the event
can send it to you.

	Watcher has Smack label W
	Triggerer has Smack label T
	Watched object has Smack label O

	Relevant Smack rules are

	W O rw
	T O rw

The watcher will be able to set the watch,
the triggerer will be able to trigger the event,
but there is nothing that would allow the watcher
to receive the event. This is not a case of watcher
reading the watched object, as the event is delivered
without any action by watcher.

>
>> Class (3) is interesting since it's currently entirely cleanup events and the
>> process may have the right to do them (close, dup2, exit, but also execve)
>> whether the LSM thinks it should be able to cause the object to be destroyed
>> or not.
>>
>> It gets more complicated than that, though: multiple processes with different
>> security attributes can all have fds pointing to a common file object - and
>> the last one to close carries the can as far as the LSM is concerned.
>
> Yes, I'd prefer to avoid that.  You can't write policy that is stable and meaningful that way.  This may fall under a similar situation as class (1) - all we can meaningfully do is check whether the watcher is allowed to see all such events.

Back in the day when we were doing security evaluations
the implications of "deleting" an object gave the security
evaluators fits. UNIX/Linux files don't get deleted, they
simply fall off the filesystem namespace when no one cares
about them anymore. The model we used back in the day was
that "delete" wasn't an operation that occurs on filesystem
objects.

But now you want to do something with security implications
when the object disappears. We could say that the event does
nothing but signal that the system has removed the watch on
your behalf because it is now meaningless. No reason to worry
about who dropped the last reference. We don't care about
that from a policy viewpoint anyway.

>
>> And yet more complicated when you throw in unix sockets with partially passed
>> fds still in their queues.  That's what patch 01 is designed to try and cope
>> with.
>>
>>> and make sure there is some sane way to control the relationship between the
>>> accessed object and the watched object (write-like)?
>>
>> This is the trick.  Keys and superblocks have object labels of their own and
>> don't - for now - propagate their watches.  With these, the watch is on the
>> object you initially assign it to and it goes no further than that.
>>
>> mount_notify() is the interesting case since we want to be able to detect
>> mount topology change events from within the vfs subtree rooted at the watched
>> directory without having to manually put a watch on every directory in that
>> subtree - or even just every mount object. >
>> Or, maybe, that's what I'll have to do: make it mount_notify() can only apply
>> to the subtree within its superblock, and the caller must call mount_notify()
>> for every mount object it wants to monitor.  That would at least ensure that
>> the caller can, at that point, reach all those mount points.
>
> Would that at least make it consistent with fanotify (not that it provides a great example)?
>
>>> For cases where we have no object per se or at least no security
>>> structure/label associated with it, we may have to fall back to a
>>> coarse-grained "Can the watcher get this kind of notification in general?".
>>
>> Agreed - and we should probably have that anyway.
>>
>> David

^ permalink raw reply

* [PATCH v4 16/16] f2fs: add fs-verity support
From: Eric Biggers @ 2019-06-06 15:52 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add fs-verity support to f2fs.  fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files.  It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time.  It
is implemented mainly by helper functions in fs/verity/.  See
Documentation/filesystems/fsverity.rst for the full documentation.

The f2fs support for fs-verity consists of:

- Adding a filesystem feature flag and an inode flag for fs-verity.

- Implementing the fsverity_operations to support enabling verity on an
  inode and reading/writing the verity metadata.

- Updating ->readpages() to verify data as it's read from verity files
  and to support reading verity metadata pages.

- Updating ->write_begin(), ->write_end(), and ->writepages() to support
  writing verity metadata pages.

- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().

Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first
page fully beyond i_size.  This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs.  Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/f2fs/Makefile |   1 +
 fs/f2fs/data.c   |  72 +++++++++++++--
 fs/f2fs/f2fs.h   |  23 ++++-
 fs/f2fs/file.c   |  40 +++++++++
 fs/f2fs/inode.c  |   5 +-
 fs/f2fs/super.c  |   3 +
 fs/f2fs/sysfs.c  |  11 +++
 fs/f2fs/verity.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/f2fs/xattr.h  |   2 +
 9 files changed, 367 insertions(+), 14 deletions(-)
 create mode 100644 fs/f2fs/verity.c

diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 776c4b936504..2aaecc63834f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -8,3 +8,4 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
 f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
+f2fs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index eda4181d2092..8f175d47291d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -73,6 +73,7 @@ static enum count_type __read_io_type(struct page *page)
 enum bio_post_read_step {
 	STEP_INITIAL = 0,
 	STEP_DECRYPT,
+	STEP_VERITY,
 };
 
 struct bio_post_read_ctx {
@@ -119,8 +120,23 @@ static void decrypt_work(struct work_struct *work)
 	bio_post_read_processing(ctx);
 }
 
+static void verity_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+	fsverity_verify_bio(ctx->bio);
+
+	bio_post_read_processing(ctx);
+}
+
 static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
 {
+	/*
+	 * We use different work queues for decryption and for verity because
+	 * verity may require reading metadata pages that need decryption, and
+	 * we shouldn't recurse to the same workqueue.
+	 */
 	switch (++ctx->cur_step) {
 	case STEP_DECRYPT:
 		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
@@ -130,6 +146,14 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
 		}
 		ctx->cur_step++;
 		/* fall-through */
+	case STEP_VERITY:
+		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+			INIT_WORK(&ctx->work, verity_work);
+			fsverity_enqueue_verify_work(&ctx->work);
+			return;
+		}
+		ctx->cur_step++;
+		/* fall-through */
 	default:
 		__read_end_io(ctx->bio);
 	}
@@ -553,8 +577,15 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	up_write(&io->io_rwsem);
 }
 
+static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+{
+	return fsverity_active(inode) &&
+	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
 static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
-					unsigned nr_pages, unsigned op_flag)
+				      unsigned nr_pages, unsigned op_flag,
+				      pgoff_t first_idx)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct bio *bio;
@@ -570,6 +601,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 
 	if (f2fs_encrypted_file(inode))
 		post_read_steps |= 1 << STEP_DECRYPT;
+
+	if (f2fs_need_verity(inode, first_idx))
+		post_read_steps |= 1 << STEP_VERITY;
+
 	if (post_read_steps) {
 		ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
 		if (!ctx) {
@@ -591,7 +626,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct bio *bio;
 
-	bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0);
+	bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
@@ -1514,6 +1549,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	return ret;
 }
 
+static inline loff_t f2fs_readpage_limit(struct inode *inode)
+{
+	if (IS_ENABLED(CONFIG_FS_VERITY) &&
+	    (IS_VERITY(inode) || f2fs_verity_in_progress(inode)))
+		return inode->i_sb->s_maxbytes;
+
+	return i_size_read(inode);
+}
+
 static int f2fs_read_single_page(struct inode *inode, struct page *page,
 					unsigned nr_pages,
 					struct f2fs_map_blocks *map,
@@ -1532,7 +1576,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
 
 	block_in_file = (sector_t)page->index;
 	last_block = block_in_file + nr_pages;
-	last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
+	last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >>
 							blkbits;
 	if (last_block > last_block_in_file)
 		last_block = last_block_in_file;
@@ -1576,6 +1620,11 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
 	} else {
 zero_out:
 		zero_user_segment(page, 0, PAGE_SIZE);
+		if (f2fs_need_verity(inode, page->index) &&
+		    !fsverity_verify_page(page)) {
+			ret = -EIO;
+			goto out;
+		}
 		if (!PageUptodate(page))
 			SetPageUptodate(page);
 		unlock_page(page);
@@ -1594,7 +1643,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
 	}
 	if (bio == NULL) {
 		bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
-				is_readahead ? REQ_RAHEAD : 0);
+				is_readahead ? REQ_RAHEAD : 0, page->index);
 		if (IS_ERR(bio)) {
 			ret = PTR_ERR(bio);
 			bio = NULL;
@@ -1991,7 +2040,7 @@ static int __write_data_page(struct page *page, bool *submitted,
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 
-	if (page->index < end_index)
+	if (page->index < end_index || f2fs_verity_in_progress(inode))
 		goto write;
 
 	/*
@@ -2383,7 +2432,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
 	 * the block addresses when there is no need to fill the page.
 	 */
 	if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE &&
-			!is_inode_flag_set(inode, FI_NO_PREALLOC))
+	    !is_inode_flag_set(inode, FI_NO_PREALLOC) &&
+	    !f2fs_verity_in_progress(inode))
 		return 0;
 
 	/* f2fs_lock_op avoids race between write CP and convert_inline_page */
@@ -2522,7 +2572,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 	if (len == PAGE_SIZE || PageUptodate(page))
 		return 0;
 
-	if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) {
+	if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
+	    !f2fs_verity_in_progress(inode)) {
 		zero_user_segment(page, len, PAGE_SIZE);
 		return 0;
 	}
@@ -2585,7 +2636,8 @@ static int f2fs_write_end(struct file *file,
 
 	set_page_dirty(page);
 
-	if (pos + copied > i_size_read(inode))
+	if (pos + copied > i_size_read(inode) &&
+	    !f2fs_verity_in_progress(inode))
 		f2fs_i_size_write(inode, pos + copied);
 unlock_out:
 	f2fs_put_page(page, 1);
@@ -2906,7 +2958,9 @@ void f2fs_clear_page_cache_dirty_tag(struct page *page)
 
 int __init f2fs_init_post_read_processing(void)
 {
-	bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0);
+	bio_post_read_ctx_cache =
+		kmem_cache_create("f2fs_bio_post_read_ctx",
+				  sizeof(struct bio_post_read_ctx), 0, 0, NULL);
 	if (!bio_post_read_ctx_cache)
 		goto fail;
 	bio_post_read_ctx_pool =
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 06b89a9862ab..8477191ad1c9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -25,6 +25,7 @@
 #include <crypto/hash.h>
 
 #include <linux/fscrypt.h>
+#include <linux/fsverity.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
 #define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
@@ -148,7 +149,7 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_QUOTA_INO		0x0080
 #define F2FS_FEATURE_INODE_CRTIME	0x0100
 #define F2FS_FEATURE_LOST_FOUND		0x0200
-#define F2FS_FEATURE_VERITY		0x0400	/* reserved */
+#define F2FS_FEATURE_VERITY		0x0400
 #define F2FS_FEATURE_SB_CHKSUM		0x0800
 
 #define __F2FS_HAS_FEATURE(raw_super, mask)				\
@@ -626,7 +627,7 @@ enum {
 #define FADVISE_ENC_NAME_BIT	0x08
 #define FADVISE_KEEP_SIZE_BIT	0x10
 #define FADVISE_HOT_BIT		0x20
-#define FADVISE_VERITY_BIT	0x40	/* reserved */
+#define FADVISE_VERITY_BIT	0x40
 
 #define FADVISE_MODIFIABLE_BITS	(FADVISE_COLD_BIT | FADVISE_HOT_BIT)
 
@@ -646,6 +647,8 @@ enum {
 #define file_is_hot(inode)	is_file(inode, FADVISE_HOT_BIT)
 #define file_set_hot(inode)	set_file(inode, FADVISE_HOT_BIT)
 #define file_clear_hot(inode)	clear_file(inode, FADVISE_HOT_BIT)
+#define file_is_verity(inode)	is_file(inode, FADVISE_VERITY_BIT)
+#define file_set_verity(inode)	set_file(inode, FADVISE_VERITY_BIT)
 
 #define DEF_DIR_LEVEL		0
 
@@ -2344,6 +2347,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 #define F2FS_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define F2FS_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define F2FS_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define F2FS_VERITY_FL			0x00100000 /* Verity protected inode */
 #define F2FS_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define F2FS_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define F2FS_NOCOW_FL			0x00800000 /* Do not cow file */
@@ -2351,7 +2355,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 #define F2FS_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
 #define F2FS_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define F2FS_FL_USER_VISIBLE		0x30CBDFFF /* User visible flags */
+#define F2FS_FL_USER_VISIBLE		0x30DBDFFF /* User visible flags */
 #define F2FS_FL_USER_MODIFIABLE		0x204BC0FF /* User modifiable flags */
 
 /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */
@@ -2417,6 +2421,7 @@ enum {
 	FI_PROJ_INHERIT,	/* indicate file inherits projectid */
 	FI_PIN_FILE,		/* indicate file should not be gced */
 	FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+	FI_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 };
 
 static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -2456,6 +2461,12 @@ static inline void clear_inode_flag(struct inode *inode, int flag)
 	__mark_inode_dirty_flag(inode, flag, false);
 }
 
+static inline bool f2fs_verity_in_progress(struct inode *inode)
+{
+	return IS_ENABLED(CONFIG_FS_VERITY) &&
+	       is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS);
+}
+
 static inline void set_acl_inode(struct inode *inode, umode_t mode)
 {
 	F2FS_I(inode)->i_acl_mode = mode;
@@ -3524,6 +3535,9 @@ void f2fs_exit_sysfs(void);
 int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
 void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
 
+/* verity.c */
+extern const struct fsverity_operations f2fs_verityops;
+
 /*
  * crypto support
  */
@@ -3546,7 +3560,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
  */
 static inline bool f2fs_post_read_required(struct inode *inode)
 {
-	return f2fs_encrypted_file(inode);
+	return f2fs_encrypted_file(inode) || fsverity_active(inode);
 }
 
 #define F2FS_FEATURE_FUNCS(name, flagname) \
@@ -3564,6 +3578,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
 F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
 F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
 F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
+F2FS_FEATURE_FUNCS(verity, VERITY);
 F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
 
 #ifdef CONFIG_BLK_DEV_ZONED
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 45b45f37d347..6706c2081941 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -493,6 +493,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 {
 	int err = fscrypt_file_open(inode, filp);
 
+	if (err)
+		return err;
+
+	err = fsverity_file_open(inode, filp);
 	if (err)
 		return err;
 
@@ -781,6 +785,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
+	err = fsverity_prepare_setattr(dentry, attr);
+	if (err)
+		return err;
+
 	if (is_quota_modification(inode, attr)) {
 		err = dquot_initialize(inode);
 		if (err)
@@ -1656,6 +1664,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
 
 	if (IS_ENCRYPTED(inode))
 		flags |= F2FS_ENCRYPT_FL;
+	if (IS_VERITY(inode))
+		flags |= F2FS_VERITY_FL;
 	if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
 		flags |= F2FS_INLINE_DATA_FL;
 	if (is_inode_flag_set(inode, FI_PIN_FILE))
@@ -2980,6 +2990,30 @@ static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg)
 	return f2fs_precache_extents(file_inode(filp));
 }
 
+static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+	if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) {
+		f2fs_msg(inode->i_sb, KERN_WARNING,
+			 "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n",
+			 inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	return fsverity_ioctl_enable(filp, (const void __user *)arg);
+}
+
+static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
+{
+	if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fsverity_ioctl_measure(filp, (void __user *)arg);
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3036,6 +3070,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_set_pin_file(filp, arg);
 	case F2FS_IOC_PRECACHE_EXTENTS:
 		return f2fs_ioc_precache_extents(filp, arg);
+	case FS_IOC_ENABLE_VERITY:
+		return f2fs_ioc_enable_verity(filp, arg);
+	case FS_IOC_MEASURE_VERITY:
+		return f2fs_ioc_measure_verity(filp, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -3149,6 +3187,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case F2FS_IOC_GET_PIN_FILE:
 	case F2FS_IOC_SET_PIN_FILE:
 	case F2FS_IOC_PRECACHE_EXTENTS:
+	case FS_IOC_ENABLE_VERITY:
+	case FS_IOC_MEASURE_VERITY:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index ccb02226dd2c..b2f945b1afe5 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -46,9 +46,11 @@ void f2fs_set_inode_flags(struct inode *inode)
 		new_fl |= S_DIRSYNC;
 	if (file_is_encrypt(inode))
 		new_fl |= S_ENCRYPTED;
+	if (file_is_verity(inode))
+		new_fl |= S_VERITY;
 	inode_set_flags(inode, new_fl,
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
-			S_ENCRYPTED);
+			S_ENCRYPTED|S_VERITY);
 }
 
 static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -749,6 +751,7 @@ void f2fs_evict_inode(struct inode *inode)
 	}
 out_clear:
 	fscrypt_put_encryption_info(inode);
+	fsverity_cleanup_inode(inode);
 	clear_inode(inode);
 }
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6b959bbb336a..ea4a247d6ed6 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3177,6 +3177,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &f2fs_sops;
 #ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &f2fs_cryptops;
+#endif
+#ifdef CONFIG_FS_VERITY
+	sb->s_vop = &f2fs_verityops;
 #endif
 	sb->s_xattr = f2fs_xattr_handlers;
 	sb->s_export_op = &f2fs_export_ops;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 729f46a3c9ee..b3e28467db72 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -117,6 +117,9 @@ static ssize_t features_show(struct f2fs_attr *a,
 	if (f2fs_sb_has_lost_found(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "lost_found");
+	if (f2fs_sb_has_verity(sbi))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "verity");
 	if (f2fs_sb_has_sb_chksum(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "sb_checksum");
@@ -350,6 +353,7 @@ enum feat_id {
 	FEAT_QUOTA_INO,
 	FEAT_INODE_CRTIME,
 	FEAT_LOST_FOUND,
+	FEAT_VERITY,
 	FEAT_SB_CHECKSUM,
 };
 
@@ -367,6 +371,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
 	case FEAT_QUOTA_INO:
 	case FEAT_INODE_CRTIME:
 	case FEAT_LOST_FOUND:
+	case FEAT_VERITY:
 	case FEAT_SB_CHECKSUM:
 		return snprintf(buf, PAGE_SIZE, "supported\n");
 	}
@@ -455,6 +460,9 @@ F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
 F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
 F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
 F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
+#ifdef CONFIG_FS_VERITY
+F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
+#endif
 F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
@@ -517,6 +525,9 @@ static struct attribute *f2fs_feat_attrs[] = {
 	ATTR_LIST(quota_ino),
 	ATTR_LIST(inode_crtime),
 	ATTR_LIST(lost_found),
+#ifdef CONFIG_FS_VERITY
+	ATTR_LIST(verity),
+#endif
 	ATTR_LIST(sb_checksum),
 	NULL,
 };
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
new file mode 100644
index 000000000000..259f475de07a
--- /dev/null
+++ b/fs/f2fs/verity.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/f2fs/verity.c: fs-verity support for f2fs
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for f2fs.
+ *
+ * Like ext4, f2fs stores the verity metadata (Merkle tree and
+ * fsverity_descriptor) past the end of the file, starting at the first page
+ * fully beyond i_size.  This approach works because (a) verity files are
+ * readonly, and (b) pages fully beyond i_size aren't visible to userspace but
+ * can be read/written internally by f2fs with only some relatively small
+ * changes to f2fs.  Extended attributes cannot be used because (a) f2fs limits
+ * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be
+ * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't
+ * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is
+ * because it contains hashes of the plaintext data.
+ */
+
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+/*
+ * Read some verity metadata from the inode.  __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+			  loff_t pos)
+{
+	const size_t orig_count = count;
+
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *addr;
+
+		page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+					 NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		addr = kmap_atomic(page);
+		memcpy(buf, addr + offset_in_page(pos), n);
+		kunmap_atomic(addr);
+
+		put_page(page);
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return orig_count;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+			   loff_t pos)
+{
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *fsdata;
+		void *addr;
+		int res;
+
+		res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+					    &page, &fsdata);
+		if (res)
+			return res;
+
+		addr = kmap_atomic(page);
+		memcpy(addr + offset_in_page(pos), buf, n);
+		kunmap_atomic(addr);
+
+		res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+					  page, fsdata);
+		if (res < 0)
+			return res;
+		if (res != n)
+			return -EIO;
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
+
+/*
+ * Format of f2fs verity xattr.  This points to the location of the verity
+ * descriptor within the file data rather than containing it directly because
+ * the verity descriptor *must* be encrypted when f2fs encryption is used.  But,
+ * f2fs encryption does not encrypt xattrs.
+ */
+struct fsverity_descriptor_location {
+	__le32 version;
+	__le32 size;
+	__le64 pos;
+};
+
+static int f2fs_begin_enable_verity(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int err;
+
+	err = f2fs_convert_inline_inode(inode);
+	if (err)
+		return err;
+
+	err = dquot_initialize(inode);
+	if (err)
+		return err;
+
+	set_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+	return 0;
+}
+
+static int f2fs_end_enable_verity(struct file *filp, const void *desc,
+				  size_t desc_size, u64 merkle_tree_size)
+{
+	struct inode *inode = file_inode(filp);
+	u64 desc_pos = round_up(inode->i_size, PAGE_SIZE) + merkle_tree_size;
+	struct fsverity_descriptor_location dloc = {
+		.version = cpu_to_le32(1),
+		.size = cpu_to_le32(desc_size),
+		.pos = cpu_to_le64(desc_pos),
+	};
+	int err = 0;
+
+	if (desc != NULL) {
+		/* Succeeded; write the verity descriptor. */
+		err = pagecache_write(inode, desc, desc_size, desc_pos);
+
+		/* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
+		if (!err)
+			err = filemap_write_and_wait(inode->i_mapping);
+	} else {
+		/* Failed; truncate anything we wrote past i_size. */
+		f2fs_truncate(inode);
+	}
+
+	clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+
+	if (desc != NULL && !err) {
+		err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
+				    F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
+				    NULL, XATTR_CREATE);
+		if (!err) {
+			file_set_verity(inode);
+			f2fs_set_inode_flags(inode);
+			f2fs_mark_inode_dirty_sync(inode, true);
+		}
+	}
+	return err;
+}
+
+static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
+				      size_t buf_size)
+{
+	struct fsverity_descriptor_location dloc;
+	int res;
+	u32 size;
+	u64 pos;
+
+	/* Get the descriptor location */
+	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY,
+			    F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL);
+	if (res < 0 && res != -ERANGE)
+		return res;
+	if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
+		f2fs_msg(inode->i_sb, KERN_WARNING,
+			 "unknown verity xattr format");
+		return -EINVAL;
+	}
+	size = le32_to_cpu(dloc.size);
+	pos = le64_to_cpu(dloc.pos);
+
+	/* Get the descriptor */
+	if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
+	    pos < round_up(inode->i_size, PAGE_SIZE) || size > INT_MAX) {
+		f2fs_msg(inode->i_sb, KERN_WARNING, "invalid verity xattr");
+		return -EUCLEAN; /* EFSCORRUPTED */
+	}
+	if (buf_size == 0)
+		return size;
+	if (size > buf_size)
+		return -ERANGE;
+	return pagecache_read(inode, buf, size, pos);
+}
+
+static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
+					       pgoff_t index)
+{
+	index += DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+
+	return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
+					u64 index, int log_blocksize)
+{
+	loff_t pos = round_up(inode->i_size, PAGE_SIZE) +
+		     (index << log_blocksize);
+
+	return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations f2fs_verityops = {
+	.begin_enable_verity	= f2fs_begin_enable_verity,
+	.end_enable_verity	= f2fs_end_enable_verity,
+	.get_verity_descriptor	= f2fs_get_verity_descriptor,
+	.read_merkle_tree_page	= f2fs_read_merkle_tree_page,
+	.write_merkle_tree_block = f2fs_write_merkle_tree_block,
+};
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index a90920e2f949..de0c600b9cab 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -34,8 +34,10 @@
 #define F2FS_XATTR_INDEX_ADVISE			7
 /* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */
 #define F2FS_XATTR_INDEX_ENCRYPTION		9
+#define F2FS_XATTR_INDEX_VERITY			11
 
 #define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT	"c"
+#define F2FS_XATTR_NAME_VERITY			"v"
 
 struct f2fs_xattr_header {
 	__le32  h_magic;        /* magic number for identification */
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 15/16] ext4: add fs-verity read support
From: Eric Biggers @ 2019-06-06 15:52 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Make ext4_mpage_readpages() verify data as it is read from fs-verity
files, using the helper functions from fs/verity/.

To support both encryption and verity simultaneously, this required
refactoring the decryption workflow into a generic "post-read
processing" workflow which can do decryption, verification, or both.

The case where the ext4 block size is not equal to the PAGE_SIZE is not
supported yet, since in that case ext4_mpage_readpages() sometimes falls
back to block_read_full_page(), which does not support fs-verity yet.

Co-developed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/ext4/ext4.h     |   2 +
 fs/ext4/inode.c    |   2 +
 fs/ext4/readpage.c | 207 ++++++++++++++++++++++++++++++++++++++-------
 fs/ext4/super.c    |   9 +-
 4 files changed, 190 insertions(+), 30 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5a1deea3fb3e..3c0d491c4970 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3158,6 +3158,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
 extern int ext4_mpage_readpages(struct address_space *mapping,
 				struct list_head *pages, struct page *page,
 				unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
 
 /* symlink.c */
 extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 514e24f88f90..37571d080b3c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3893,6 +3893,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
 		return 0;
 #endif
+	if (fsverity_active(inode))
+		return 0;
 
 	/*
 	 * If we are doing data journalling we don't support O_DIRECT
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c916017db334..84152b686e49 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -47,6 +47,11 @@
 
 #include "ext4.h"
 
+#define NUM_PREALLOC_POST_READ_CTXS	128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
 static inline bool ext4_bio_encrypted(struct bio *bio)
 {
 #ifdef CONFIG_FS_ENCRYPTION
@@ -56,6 +61,100 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
 #endif
 }
 
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+	STEP_INITIAL = 0,
+	STEP_DECRYPT,
+	STEP_VERITY,
+};
+
+struct bio_post_read_ctx {
+	struct bio *bio;
+	struct work_struct work;
+	unsigned int cur_step;
+	unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
+{
+	struct page *page;
+	struct bio_vec *bv;
+	struct bvec_iter_all iter_all;
+
+	bio_for_each_segment_all(bv, bio, iter_all) {
+		page = bv->bv_page;
+
+		/* PG_error was set if any post_read step failed */
+		if (bio->bi_status || PageError(page)) {
+			ClearPageUptodate(page);
+			/* will re-read again later */
+			ClearPageError(page);
+		} else {
+			SetPageUptodate(page);
+		}
+		unlock_page(page);
+	}
+	if (bio->bi_private)
+		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+	bio_put(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+	fscrypt_decrypt_bio(ctx->bio);
+
+	bio_post_read_processing(ctx);
+}
+
+static void verity_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+	fsverity_verify_bio(ctx->bio);
+
+	bio_post_read_processing(ctx);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+	/*
+	 * We use different work queues for decryption and for verity because
+	 * verity may require reading metadata pages that need decryption, and
+	 * we shouldn't recurse to the same workqueue.
+	 */
+	switch (++ctx->cur_step) {
+	case STEP_DECRYPT:
+		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+			INIT_WORK(&ctx->work, decrypt_work);
+			fscrypt_enqueue_decrypt_work(&ctx->work);
+			return;
+		}
+		ctx->cur_step++;
+		/* fall-through */
+	case STEP_VERITY:
+		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+			INIT_WORK(&ctx->work, verity_work);
+			fsverity_enqueue_verify_work(&ctx->work);
+			return;
+		}
+		ctx->cur_step++;
+		/* fall-through */
+	default:
+		__read_end_io(ctx->bio);
+	}
+}
+
+static bool bio_post_read_required(struct bio *bio)
+{
+	return bio->bi_private && !bio->bi_status;
+}
+
 /*
  * I/O completion handler for multipage BIOs.
  *
@@ -70,30 +169,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
  */
 static void mpage_end_io(struct bio *bio)
 {
-	struct bio_vec *bv;
-	struct bvec_iter_all iter_all;
+	if (bio_post_read_required(bio)) {
+		struct bio_post_read_ctx *ctx = bio->bi_private;
 
-	if (ext4_bio_encrypted(bio)) {
-		if (bio->bi_status) {
-			fscrypt_release_ctx(bio->bi_private);
-		} else {
-			fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
-			return;
-		}
+		ctx->cur_step = STEP_INITIAL;
+		bio_post_read_processing(ctx);
+		return;
 	}
-	bio_for_each_segment_all(bv, bio, iter_all) {
-		struct page *page = bv->bv_page;
+	__read_end_io(bio);
+}
 
-		if (!bio->bi_status) {
-			SetPageUptodate(page);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-		unlock_page(page);
+static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
+{
+	return fsverity_active(inode) &&
+	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
+static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
+						       struct bio *bio,
+						       pgoff_t first_idx)
+{
+	unsigned int post_read_steps = 0;
+	struct bio_post_read_ctx *ctx = NULL;
+
+	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+		post_read_steps |= 1 << STEP_DECRYPT;
+
+	if (ext4_need_verity(inode, first_idx))
+		post_read_steps |= 1 << STEP_VERITY;
+
+	if (post_read_steps) {
+		ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+		if (!ctx)
+			return ERR_PTR(-ENOMEM);
+		ctx->bio = bio;
+		ctx->enabled_steps = post_read_steps;
+		bio->bi_private = ctx;
 	}
+	return ctx;
+}
 
-	bio_put(bio);
+static inline loff_t ext4_readpage_limit(struct inode *inode)
+{
+	if (IS_ENABLED(CONFIG_FS_VERITY) &&
+	    (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+		return inode->i_sb->s_maxbytes;
+
+	return i_size_read(inode);
 }
 
 int ext4_mpage_readpages(struct address_space *mapping,
@@ -141,7 +263,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
 
 		block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
 		last_block = block_in_file + nr_pages * blocks_per_page;
-		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+		last_block_in_file = (ext4_readpage_limit(inode) +
+				      blocksize - 1) >> blkbits;
 		if (last_block > last_block_in_file)
 			last_block = last_block_in_file;
 		page_block = 0;
@@ -218,6 +341,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
 			zero_user_segment(page, first_hole << blkbits,
 					  PAGE_SIZE);
 			if (first_hole == 0) {
+				if (ext4_need_verity(inode, page->index) &&
+				    !fsverity_verify_page(page))
+					goto set_error_page;
 				SetPageUptodate(page);
 				unlock_page(page);
 				goto next_page;
@@ -241,18 +367,15 @@ int ext4_mpage_readpages(struct address_space *mapping,
 			bio = NULL;
 		}
 		if (bio == NULL) {
-			struct fscrypt_ctx *ctx = NULL;
+			struct bio_post_read_ctx *ctx;
 
-			if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
-				ctx = fscrypt_get_ctx(GFP_NOFS);
-				if (IS_ERR(ctx))
-					goto set_error_page;
-			}
 			bio = bio_alloc(GFP_KERNEL,
 				min_t(int, nr_pages, BIO_MAX_PAGES));
-			if (!bio) {
-				if (ctx)
-					fscrypt_release_ctx(ctx);
+			if (!bio)
+				goto set_error_page;
+			ctx = get_bio_post_read_ctx(inode, bio, page->index);
+			if (IS_ERR(ctx)) {
+				bio_put(bio);
 				goto set_error_page;
 			}
 			bio_set_dev(bio, bdev);
@@ -293,3 +416,29 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		submit_bio(bio);
 	return 0;
 }
+
+int __init ext4_init_post_read_processing(void)
+{
+	bio_post_read_ctx_cache =
+		kmem_cache_create("ext4_bio_post_read_ctx",
+				  sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+	if (!bio_post_read_ctx_cache)
+		goto fail;
+	bio_post_read_ctx_pool =
+		mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+					 bio_post_read_ctx_cache);
+	if (!bio_post_read_ctx_pool)
+		goto fail_free_cache;
+	return 0;
+
+fail_free_cache:
+	kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+	return -ENOMEM;
+}
+
+void ext4_exit_post_read_processing(void)
+{
+	mempool_destroy(bio_post_read_ctx_pool);
+	kmem_cache_destroy(bio_post_read_ctx_cache);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 05a9874687c3..23e7acd43e4e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6103,6 +6103,10 @@ static int __init ext4_init_fs(void)
 		return err;
 
 	err = ext4_init_pending();
+	if (err)
+		goto out7;
+
+	err = ext4_init_post_read_processing();
 	if (err)
 		goto out6;
 
@@ -6144,8 +6148,10 @@ static int __init ext4_init_fs(void)
 out4:
 	ext4_exit_pageio();
 out5:
-	ext4_exit_pending();
+	ext4_exit_post_read_processing();
 out6:
+	ext4_exit_pending();
+out7:
 	ext4_exit_es();
 
 	return err;
@@ -6162,6 +6168,7 @@ static void __exit ext4_exit_fs(void)
 	ext4_exit_sysfs();
 	ext4_exit_system_zone();
 	ext4_exit_pageio();
+	ext4_exit_post_read_processing();
 	ext4_exit_es();
 	ext4_exit_pending();
 }
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 14/16] ext4: add basic fs-verity support
From: Eric Biggers @ 2019-06-06 15:52 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add most of fs-verity support to ext4.  fs-verity is a filesystem
feature that enables transparent integrity protection and authentication
of read-only files.  It uses a dm-verity like mechanism at the file
level: a Merkle tree is used to verify any block in the file in
log(filesize) time.  It is implemented mainly by helper functions in
fs/verity/.  See Documentation/filesystems/fsverity.rst for the full
documentation.

This commit adds all of ext4 fs-verity support except for the actual
data verification, including:

- Adding a filesystem feature flag and an inode flag for fs-verity.

- Implementing the fsverity_operations to support enabling verity on an
  inode and reading/writing the verity metadata.

- Updating ->write_begin(), ->write_end(), and ->writepages() to support
  writing verity metadata pages.

- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().

ext4 stores the verity metadata (Merkle tree and fsverity_descriptor)
past the end of the file, starting at the first page fully beyond
i_size.  This approach works because (a) verity files are readonly, and
(b) pages fully beyond i_size aren't visible to userspace but can be
read/written internally by ext4 with only some relatively small changes
to ext4.  This approach avoids having to depend on the EA_INODE feature
and on rearchitecturing ext4's xattr support to support paging
multi-gigabyte xattrs into memory, and to support encrypting xattrs.
Note that the verity metadata *must* be encrypted when the file is,
since it contains hashes of the plaintext data.

This patch incorporates work by Theodore Ts'o and Chandan Rajendra.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/ext4/Makefile |   1 +
 fs/ext4/ext4.h   |  21 +++-
 fs/ext4/file.c   |   4 +
 fs/ext4/inode.c  |  46 +++++---
 fs/ext4/ioctl.c  |  12 +++
 fs/ext4/super.c  |   9 ++
 fs/ext4/sysfs.c  |   6 ++
 fs/ext4/verity.c | 272 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/xattr.h  |   2 +
 9 files changed, 358 insertions(+), 15 deletions(-)
 create mode 100644 fs/ext4/verity.c

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3c3e04..b17ddc229ac5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,4 @@ ext4-y	:= balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
+ext4-$(CONFIG_FS_VERITY)		+= verity.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1cb67859e051..5a1deea3fb3e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -41,6 +41,7 @@
 #endif
 
 #include <linux/fscrypt.h>
+#include <linux/fsverity.h>
 
 #include <linux/compiler.h>
 
@@ -395,6 +396,7 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL			0x00100000 /* Verity protected inode */
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
@@ -402,7 +404,7 @@ struct flex_groups {
 #define EXT4_CASEFOLD_FL		0x40000000 /* Casefolded file */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x704BDFFF /* User visible flags */
+#define EXT4_FL_USER_VISIBLE		0x705BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x604BC0FF /* User modifiable flags */
 
 /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
@@ -466,6 +468,7 @@ enum {
 	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies*/
 	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
 	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_VERITY	= 20,	/* Verity protected inode */
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
 	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
@@ -511,6 +514,7 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(TOPDIR);
 	CHECK_FLAG_VALUE(HUGE_FILE);
 	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(VERITY);
 	CHECK_FLAG_VALUE(EA_INODE);
 	CHECK_FLAG_VALUE(EOFBLOCKS);
 	CHECK_FLAG_VALUE(INLINE_DATA);
@@ -1559,6 +1563,7 @@ enum {
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
+	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -1609,6 +1614,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_SB(sb)	(sb)
 #endif
 
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+	return IS_ENABLED(CONFIG_FS_VERITY) &&
+	       ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
 #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
 
 /*
@@ -1661,6 +1672,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
 #define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
 #define EXT4_FEATURE_RO_COMPAT_PROJECT		0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY		0x8000
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1755,6 +1767,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,		BIGALLOC)
 EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,	METADATA_CSUM)
 EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,		READONLY)
 EXT4_FEATURE_RO_COMPAT_FUNCS(project,		PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity,		VERITY)
 
 EXT4_FEATURE_INCOMPAT_FUNCS(compression,	COMPRESSION)
 EXT4_FEATURE_INCOMPAT_FUNCS(filetype,		FILETYPE)
@@ -1812,7 +1825,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold,		CASEFOLD)
 					 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
 					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
 					 EXT4_FEATURE_RO_COMPAT_QUOTA |\
-					 EXT4_FEATURE_RO_COMPAT_PROJECT)
+					 EXT4_FEATURE_RO_COMPAT_PROJECT |\
+					 EXT4_FEATURE_RO_COMPAT_VERITY)
 
 #define EXTN_FEATURE_FUNCS(ver) \
 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -3250,6 +3264,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2c5baa5e8291..ed59fb8f268e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -451,6 +451,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 	if (ret)
 		return ret;
 
+	ret = fsverity_file_open(inode, filp);
+	if (ret)
+		return ret;
+
 	/*
 	 * Set up the jbd2_inode if we are opening the inode for
 	 * writing and the journal is present
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c7f77c643008..514e24f88f90 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1390,6 +1390,7 @@ static int ext4_write_end(struct file *file,
 	int ret = 0, ret2;
 	int i_size_changed = 0;
 	int inline_data = ext4_has_inline_data(inode);
+	bool verity = ext4_verity_in_progress(inode);
 
 	trace_ext4_write_end(inode, pos, len, copied);
 	if (inline_data) {
@@ -1407,12 +1408,16 @@ static int ext4_write_end(struct file *file,
 	/*
 	 * it's important to update i_size while still holding page lock:
 	 * page writeout could otherwise come in and zero beyond i_size.
+	 *
+	 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
+	 * blocks are being written past EOF, so skip the i_size update.
 	 */
-	i_size_changed = ext4_update_inode_size(inode, pos + copied);
+	if (!verity)
+		i_size_changed = ext4_update_inode_size(inode, pos + copied);
 	unlock_page(page);
 	put_page(page);
 
-	if (old_size < pos)
+	if (old_size < pos && !verity)
 		pagecache_isize_extended(inode, old_size, pos);
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1423,7 +1428,7 @@ static int ext4_write_end(struct file *file,
 	if (i_size_changed || inline_data)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
@@ -1434,7 +1439,7 @@ static int ext4_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 
-	if (pos + len > inode->i_size) {
+	if (pos + len > inode->i_size && !verity) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
@@ -1495,6 +1500,7 @@ static int ext4_journalled_write_end(struct file *file,
 	unsigned from, to;
 	int size_changed = 0;
 	int inline_data = ext4_has_inline_data(inode);
+	bool verity = ext4_verity_in_progress(inode);
 
 	trace_ext4_journalled_write_end(inode, pos, len, copied);
 	from = pos & (PAGE_SIZE - 1);
@@ -1524,13 +1530,14 @@ static int ext4_journalled_write_end(struct file *file,
 		if (!partial)
 			SetPageUptodate(page);
 	}
-	size_changed = ext4_update_inode_size(inode, pos + copied);
+	if (!verity)
+		size_changed = ext4_update_inode_size(inode, pos + copied);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
 	unlock_page(page);
 	put_page(page);
 
-	if (old_size < pos)
+	if (old_size < pos && !verity)
 		pagecache_isize_extended(inode, old_size, pos);
 
 	if (size_changed || inline_data) {
@@ -1539,7 +1546,7 @@ static int ext4_journalled_write_end(struct file *file,
 			ret = ret2;
 	}
 
-	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
@@ -1550,7 +1557,7 @@ static int ext4_journalled_write_end(struct file *file,
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	if (pos + len > inode->i_size) {
+	if (pos + len > inode->i_size && !verity) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
@@ -2146,7 +2153,8 @@ static int ext4_writepage(struct page *page,
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
-	if (page->index == size >> PAGE_SHIFT)
+	if (page->index == size >> PAGE_SHIFT &&
+	    !ext4_verity_in_progress(inode))
 		len = size & ~PAGE_MASK;
 	else
 		len = PAGE_SIZE;
@@ -2230,7 +2238,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
 	 * after page tables are updated.
 	 */
 	size = i_size_read(mpd->inode);
-	if (page->index == size >> PAGE_SHIFT)
+	if (page->index == size >> PAGE_SHIFT &&
+	    !ext4_verity_in_progress(mpd->inode))
 		len = size & ~PAGE_MASK;
 	else
 		len = PAGE_SIZE;
@@ -2329,6 +2338,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 	ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
 							>> inode->i_blkbits;
 
+	if (ext4_verity_in_progress(inode))
+		blocks = EXT_MAX_BLOCKS;
+
 	do {
 		BUG_ON(buffer_locked(bh));
 
@@ -3045,8 +3057,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 
 	index = pos >> PAGE_SHIFT;
 
-	if (ext4_nonda_switch(inode->i_sb) ||
-	    S_ISLNK(inode->i_mode)) {
+	if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
+	    ext4_verity_in_progress(inode)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
 		return ext4_write_begin(file, mapping, pos,
 					len, flags, pagep, fsdata);
@@ -4720,6 +4732,8 @@ static bool ext4_should_use_dax(struct inode *inode)
 		return false;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
 		return false;
+	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
+		return false;
 	return true;
 }
 
@@ -4744,9 +4758,11 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_ENCRYPTED;
 	if (flags & EXT4_CASEFOLD_FL)
 		new_fl |= S_CASEFOLD;
+	if (flags & EXT4_VERITY_FL)
+		new_fl |= S_VERITY;
 	inode_set_flags(inode, new_fl,
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
-			S_ENCRYPTED|S_CASEFOLD);
+			S_ENCRYPTED|S_CASEFOLD|S_VERITY);
 }
 
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5528,6 +5544,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
+	error = fsverity_prepare_setattr(dentry, attr);
+	if (error)
+		return error;
+
 	if (is_quota_modification(inode, attr)) {
 		error = dquot_initialize(inode);
 		if (error)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e486e49b31ed..93b63697f5dc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1092,6 +1092,16 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
 		return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 
+	case FS_IOC_ENABLE_VERITY:
+		if (!ext4_has_feature_verity(sb))
+			return -EOPNOTSUPP;
+		return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+	case FS_IOC_MEASURE_VERITY:
+		if (!ext4_has_feature_verity(sb))
+			return -EOPNOTSUPP;
+		return fsverity_ioctl_measure(filp, (void __user *)arg);
+
 	case EXT4_IOC_FSGETXATTR:
 	{
 		struct fsxattr fa;
@@ -1210,6 +1220,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_SET_ENCRYPTION_POLICY:
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+	case FS_IOC_ENABLE_VERITY:
+	case FS_IOC_MEASURE_VERITY:
 	case EXT4_IOC_SHUTDOWN:
 	case FS_IOC_GETFSMAP:
 		break;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..05a9874687c3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1179,6 +1179,7 @@ void ext4_clear_inode(struct inode *inode)
 		EXT4_I(inode)->jinode = NULL;
 	}
 	fscrypt_put_encryption_info(inode);
+	fsverity_cleanup_inode(inode);
 }
 
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -4272,6 +4273,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &ext4_cryptops;
 #endif
+#ifdef CONFIG_FS_VERITY
+	sb->s_vop = &ext4_verityops;
+#endif
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &ext4_quota_operations;
 	if (ext4_has_feature_quota(sb))
@@ -4419,6 +4423,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+		ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
+		goto failed_mount_wq;
+	}
+
 	if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
 	    !ext4_has_feature_encrypt(sb)) {
 		ext4_set_feature_encrypt(sb);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 04b4f53f0659..534531747bf1 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -241,6 +241,9 @@ EXT4_ATTR_FEATURE(encryption);
 #ifdef CONFIG_UNICODE
 EXT4_ATTR_FEATURE(casefold);
 #endif
+#ifdef CONFIG_FS_VERITY
+EXT4_ATTR_FEATURE(verity);
+#endif
 EXT4_ATTR_FEATURE(metadata_csum_seed);
 
 static struct attribute *ext4_feat_attrs[] = {
@@ -252,6 +255,9 @@ static struct attribute *ext4_feat_attrs[] = {
 #endif
 #ifdef CONFIG_UNICODE
 	ATTR_LIST(casefold),
+#endif
+#ifdef CONFIG_FS_VERITY
+	ATTR_LIST(verity),
 #endif
 	ATTR_LIST(metadata_csum_seed),
 	NULL,
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
new file mode 100644
index 000000000000..6333b9dd2dff
--- /dev/null
+++ b/fs/ext4/verity.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/verity.c: fs-verity support for ext4
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for ext4.
+ *
+ * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
+ * the end of the file, starting at the first page fully beyond i_size.  This
+ * approach works because (a) verity files are readonly, and (b) pages fully
+ * beyond i_size aren't visible to userspace but can be read/written internally
+ * by ext4 with only some relatively small changes to ext4.  This approach
+ * avoids having to depend on the EA_INODE feature and on rearchitecturing
+ * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
+ * to support encrypting xattrs.  Note that the verity metadata *must* be
+ * encrypted when the file is, since it contains hashes of the plaintext data.
+ */
+
+#include <linux/quotaops.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+
+/*
+ * Read some verity metadata from the inode.  __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+			  loff_t pos)
+{
+	const size_t orig_count = count;
+
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *addr;
+
+		page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+					 NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		addr = kmap_atomic(page);
+		memcpy(buf, addr + offset_in_page(pos), n);
+		kunmap_atomic(addr);
+
+		put_page(page);
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return orig_count;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+			   loff_t pos)
+{
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *fsdata;
+		void *addr;
+		int res;
+
+		res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+					    &page, &fsdata);
+		if (res)
+			return res;
+
+		addr = kmap_atomic(page);
+		memcpy(addr + offset_in_page(pos), buf, n);
+		kunmap_atomic(addr);
+
+		res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+					  page, fsdata);
+		if (res < 0)
+			return res;
+		if (res != n)
+			return -EIO;
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
+
+/*
+ * Format of ext4 verity xattr.  This points to the location of the verity
+ * descriptor within the file data rather than containing it directly because
+ * the verity descriptor *must* be encrypted when ext4 encryption is used.  But,
+ * ext4 encryption does not encrypt xattrs.
+ */
+struct fsverity_descriptor_location {
+	__le32 version;
+	__le32 size;
+	__le64 pos;
+};
+
+static int ext4_begin_enable_verity(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int credits = 2; /* superblock and inode for ext4_orphan_add() */
+	handle_t *handle;
+	int err;
+
+	err = ext4_convert_inline_data(inode);
+	if (err)
+		return err;
+
+	err = ext4_inode_attach_jinode(inode);
+	if (err)
+		return err;
+
+	err = dquot_initialize(inode);
+	if (err)
+		return err;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_orphan_add(handle, inode);
+	if (err == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+	ext4_journal_stop(handle);
+	return err;
+}
+
+static int ext4_end_enable_verity(struct file *filp, const void *desc,
+				  size_t desc_size, u64 merkle_tree_size)
+{
+	struct inode *inode = file_inode(filp);
+	u64 desc_pos = round_up(inode->i_size, PAGE_SIZE) + merkle_tree_size;
+	struct fsverity_descriptor_location dloc = {
+		.version = cpu_to_le32(1),
+		.size = cpu_to_le32(desc_size),
+		.pos = cpu_to_le64(desc_pos),
+	};
+	int credits = 0;
+	handle_t *handle;
+	int err1 = 0;
+	int err;
+
+	if (desc != NULL) {
+		/* Succeeded; write the verity descriptor. */
+		err1 = pagecache_write(inode, desc, desc_size, desc_pos);
+
+		/* Write all pages before clearing VERITY_IN_PROGRESS. */
+		if (!err1)
+			err1 = filemap_write_and_wait(inode->i_mapping);
+
+		if (!err1)
+			err1 = ext4_xattr_set_credits(inode, sizeof(dloc), true,
+						      &credits);
+	} else {
+		/* Failed; truncate anything we wrote past i_size. */
+		ext4_truncate(inode);
+	}
+
+	/*
+	 * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
+	 * deleting the inode from the orphan list, even if something failed.
+	 * If everything succeeded, we'll also set the verity bit and descriptor
+	 * location xattr in the same transaction.
+	 */
+
+	ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+	credits += 2; /* superblock and inode for ext4_orphan_del() */
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+	if (IS_ERR(handle)) {
+		ext4_orphan_del(NULL, inode);
+		return PTR_ERR(handle);
+	}
+
+	err = ext4_orphan_del(handle, inode);
+	if (err)
+		goto out_stop;
+
+	if (desc != NULL && !err1) {
+		struct ext4_iloc iloc;
+
+		err = ext4_xattr_set_handle(handle, inode,
+					    EXT4_XATTR_INDEX_VERITY,
+					    EXT4_XATTR_NAME_VERITY,
+					    &dloc, sizeof(dloc), XATTR_CREATE);
+		if (err)
+			goto out_stop;
+
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err)
+			goto out_stop;
+		ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+		ext4_set_inode_flags(inode);
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	}
+out_stop:
+	ext4_journal_stop(handle);
+	return err ?: err1;
+}
+
+static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
+				      size_t buf_size)
+{
+	struct fsverity_descriptor_location dloc;
+	int res;
+	u32 size;
+	u64 pos;
+
+	/* Get the descriptor location */
+	res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_VERITY,
+			     EXT4_XATTR_NAME_VERITY, &dloc, sizeof(dloc));
+	if (res < 0 && res != -ERANGE)
+		return res;
+	if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
+		ext4_warning_inode(inode, "unknown verity xattr format");
+		return -EINVAL;
+	}
+	size = le32_to_cpu(dloc.size);
+	pos = le64_to_cpu(dloc.pos);
+
+	/* Get the descriptor */
+	if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
+	    pos < round_up(inode->i_size, PAGE_SIZE) || size > INT_MAX) {
+		ext4_warning_inode(inode, "invalid verity xattr");
+		return -EFSCORRUPTED;
+	}
+	if (buf_size == 0)
+		return size;
+	if (size > buf_size)
+		return -ERANGE;
+	return pagecache_read(inode, buf, size, pos);
+}
+
+static struct page *ext4_read_merkle_tree_page(struct inode *inode,
+					       pgoff_t index)
+{
+	index += DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+
+	return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+					u64 index, int log_blocksize)
+{
+	loff_t pos = round_up(inode->i_size, PAGE_SIZE) +
+		     (index << log_blocksize);
+
+	return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations ext4_verityops = {
+	.begin_enable_verity	= ext4_begin_enable_verity,
+	.end_enable_verity	= ext4_end_enable_verity,
+	.get_verity_descriptor	= ext4_get_verity_descriptor,
+	.read_merkle_tree_page	= ext4_read_merkle_tree_page,
+	.write_merkle_tree_block = ext4_write_merkle_tree_block,
+};
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index f39cad2abe2a..029d3511092d 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -26,6 +26,7 @@
 #define EXT4_XATTR_INDEX_RICHACL		8
 #define EXT4_XATTR_INDEX_ENCRYPTION		9
 #define EXT4_XATTR_INDEX_HURD			10 /* Reserved for Hurd */
+#define EXT4_XATTR_INDEX_VERITY			11
 
 struct ext4_xattr_header {
 	__le32	h_magic;	/* magic number for identification */
@@ -126,6 +127,7 @@ extern const struct xattr_handler ext4_xattr_trusted_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
 
 #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+#define EXT4_XATTR_NAME_VERITY		   "v"
 
 /*
  * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes.
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 13/16] fs-verity: support builtin file signatures
From: Eric Biggers @ 2019-06-06 15:52 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

To meet some users' needs, add optional support for having fs-verity
handle a portion of the authentication policy in the kernel.  An
".fs-verity" keyring is created to which X.509 certificates can be
added; then a sysctl 'fs.verity.require_signatures' can be set to cause
the kernel to enforce that all fs-verity files contain a signature of
their file measurement by a key in this keyring.

See the "Built-in signature verification" section of
Documentation/filesystems/fsverity.rst for the full documentation.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/Kconfig            |  17 +++
 fs/verity/Makefile           |   2 +
 fs/verity/enable.c           |  20 +++-
 fs/verity/fsverity_private.h |  48 +++++++-
 fs/verity/init.c             |   6 +
 fs/verity/open.c             |  25 +++--
 fs/verity/signature.c        | 207 +++++++++++++++++++++++++++++++++++
 fs/verity/verify.c           |   6 +
 8 files changed, 318 insertions(+), 13 deletions(-)
 create mode 100644 fs/verity/signature.c

diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index c2bca0b01ecf..88fb25119899 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -36,3 +36,20 @@ config FS_VERITY_DEBUG
 	  Enable debugging messages related to fs-verity by default.
 
 	  Say N unless you are an fs-verity developer.
+
+config FS_VERITY_BUILTIN_SIGNATURES
+	bool "FS Verity builtin signature support"
+	depends on FS_VERITY
+	select SYSTEM_DATA_VERIFICATION
+	help
+	  Support verifying signatures of verity files against the X.509
+	  certificates that have been loaded into the ".fs-verity"
+	  kernel keyring.
+
+	  This is meant as a relatively simple mechanism that can be
+	  used to provide an authenticity guarantee for verity files, as
+	  an alternative to IMA appraisal.  Userspace programs still
+	  need to check that the verity bit is set in order to get an
+	  authenticity guarantee.
+
+	  If unsure, say N.
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 6f7675ae0a31..570e9136334d 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -6,3 +6,5 @@ obj-$(CONFIG_FS_VERITY) += enable.o \
 			   measure.o \
 			   open.o \
 			   verify.o
+
+obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 7e7ef9d3c376..ee9dd578e59f 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -147,7 +147,7 @@ static int enable_verity(struct file *filp,
 	const struct fsverity_operations *vops = inode->i_sb->s_vop;
 	struct merkle_tree_params params = { };
 	struct fsverity_descriptor *desc;
-	size_t desc_size = sizeof(*desc);
+	size_t desc_size = sizeof(*desc) + arg->sig_size;
 	struct fsverity_info *vi;
 	int err;
 
@@ -169,6 +169,16 @@ static int enable_verity(struct file *filp,
 	}
 	desc->salt_size = arg->salt_size;
 
+	/* Get the signature if the user provided one */
+	if (arg->sig_size &&
+	    copy_from_user(desc->signature,
+			   (const u8 __user *)(uintptr_t)arg->sig_ptr,
+			   arg->sig_size)) {
+		err = -EFAULT;
+		goto out;
+	}
+	desc->sig_size = cpu_to_le32(arg->sig_size);
+
 	desc->data_size = cpu_to_le64(inode->i_size);
 
 	pr_debug("Building Merkle tree...\n");
@@ -209,6 +219,10 @@ static int enable_verity(struct file *filp,
 		goto rollback;
 	}
 
+	if (arg->sig_size)
+		pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n",
+			 arg->sig_size);
+
 	/* Tell the filesystem to finish enabling verity on the file */
 	err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size);
 	if (err) {
@@ -267,8 +281,8 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
 	if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt))
 		return -EMSGSIZE;
 
-	if (arg.sig_size)
-		return -EINVAL;
+	if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE)
+		return -EMSGSIZE;
 
 	/*
 	 * Require a regular file with write access.  But the actual fd must
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 693ef030adbe..51a63fc8f1df 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -78,23 +78,41 @@ struct fsverity_info {
 };
 
 /*
- * Merkle tree properties.  The file measurement is the hash of this structure.
+ * Merkle tree properties.  The file measurement is the hash of this structure
+ * excluding the signature and with the sig_size field set to 0.
  */
 struct fsverity_descriptor {
 	__u8 version;		/* must be 1 */
 	__u8 hash_algorithm;	/* Merkle tree hash algorithm */
 	__u8 log_blocksize;	/* log2 of size of data and tree blocks */
 	__u8 salt_size;		/* size of salt in bytes; 0 if none */
-	__le32 sig_size;	/* reserved, must be 0 */
+	__le32 sig_size;	/* size of signature in bytes; 0 if none */
 	__le64 data_size;	/* size of file the Merkle tree is built over */
 	__u8 root_hash[64];	/* Merkle tree root hash */
 	__u8 salt[32];		/* salt prepended to each hashed block */
 	__u8 __reserved[144];	/* must be 0's */
+	__u8 signature[];	/* optional PKCS#7 signature */
 };
 
 /* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
 #define FS_VERITY_MAX_DESCRIPTOR_SIZE	16384
 
+#define FS_VERITY_MAX_SIGNATURE_SIZE	(FS_VERITY_MAX_DESCRIPTOR_SIZE - \
+					 sizeof(struct fsverity_descriptor))
+
+/*
+ * Format in which verity file measurements are signed.  This is the same as
+ * 'struct fsverity_digest', except here some magic bytes are prepended to
+ * provide some context about what is being signed in case the same key is used
+ * for non-fsverity purposes, and here the fields have fixed endianness.
+ */
+struct fsverity_signed_digest {
+	char magic[8];			/* must be "FSVerity" */
+	__le16 digest_algorithm;
+	__le16 digest_size;
+	__u8 digest[];
+};
+
 /* hash_algs.c */
 
 extern struct fsverity_hash_alg fsverity_hash_algs[];
@@ -130,7 +148,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 				     const u8 *salt, size_t salt_size);
 
 struct fsverity_info *fsverity_create_info(const struct inode *inode,
-					   const void *desc, size_t desc_size);
+					   void *desc, size_t desc_size);
 
 void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
 
@@ -139,8 +157,32 @@ void fsverity_free_info(struct fsverity_info *vi);
 int __init fsverity_init_info_cache(void);
 void __init fsverity_exit_info_cache(void);
 
+/* signature.c */
+
+#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+int fsverity_verify_signature(const struct fsverity_info *vi,
+			      const struct fsverity_descriptor *desc,
+			      size_t desc_size);
+
+int __init fsverity_init_signature(void);
+#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
+static inline int
+fsverity_verify_signature(const struct fsverity_info *vi,
+			  const struct fsverity_descriptor *desc,
+			  size_t desc_size)
+{
+	return 0;
+}
+
+static inline int fsverity_init_signature(void)
+{
+	return 0;
+}
+#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
+
 /* verify.c */
 
 int __init fsverity_init_workqueue(void);
+void __init fsverity_exit_workqueue(void);
 
 #endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/init.c b/fs/verity/init.c
index b593805aafcc..94c104e00861 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -45,9 +45,15 @@ static int __init fsverity_init(void)
 	if (err)
 		goto err_exit_info_cache;
 
+	err = fsverity_init_signature();
+	if (err)
+		goto err_exit_workqueue;
+
 	pr_debug("Initialized fs-verity\n");
 	return 0;
 
+err_exit_workqueue:
+	fsverity_exit_workqueue();
 err_exit_info_cache:
 	fsverity_exit_info_cache();
 	return err;
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 731c7e810719..005ee057a997 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -128,12 +128,22 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 	return err;
 }
 
-/* Compute the file measurement by hashing the fsverity_descriptor. */
+/*
+ * Compute the file measurement by hashing the fsverity_descriptor excluding the
+ * signature and with the sig_size field set to 0.
+ */
 static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
-				    const struct fsverity_descriptor *desc,
+				    struct fsverity_descriptor *desc,
 				    u8 *measurement)
 {
-	return fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+	__le32 sig_size = desc->sig_size;
+	int err;
+
+	desc->sig_size = 0;
+	err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+	desc->sig_size = sig_size;
+
+	return err;
 }
 
 /*
@@ -141,9 +151,9 @@ static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
  * it.  The signature (if present) is also checked.
  */
 struct fsverity_info *fsverity_create_info(const struct inode *inode,
-					   const void *_desc, size_t desc_size)
+					   void *_desc, size_t desc_size)
 {
-	const struct fsverity_descriptor *desc = _desc;
+	struct fsverity_descriptor *desc = _desc;
 	struct fsverity_info *vi;
 	int err;
 
@@ -159,8 +169,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (desc->sig_size ||
-	    memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+	if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
 		fsverity_err(inode, "Reserved bits set in descriptor");
 		return ERR_PTR(-EINVAL);
 	}
@@ -205,6 +214,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 	pr_debug("Computed file measurement: %s:%*phN\n",
 		 vi->tree_params.hash_alg->name,
 		 vi->tree_params.digest_size, vi->measurement);
+
+	err = fsverity_verify_signature(vi, desc, desc_size);
 out:
 	if (err) {
 		fsverity_free_info(vi);
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
new file mode 100644
index 000000000000..b8e7b7ad6974
--- /dev/null
+++ b/fs/verity/signature.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/signature.c: verification of builtin signatures
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/cred.h>
+#include <linux/key.h>
+#include <linux/verification.h>
+
+/*
+ * /proc/sys/fs/verity/require_signatures
+ * If 1, all verity files must have a valid builtin signature.
+ */
+static int fsverity_require_signatures;
+
+/*
+ * Keyring that contains the trusted X.509 certificates.
+ *
+ * Only root (kuid=0) can modify this.  Also, root may use
+ * keyctl_restrict_keyring() to prevent any more additions.
+ */
+static struct key *fsverity_keyring;
+
+struct verify_arg {
+	const struct fsverity_info *vi;
+	u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+	bool have_measurement;
+};
+
+static int extract_measurement(void *ctx, const void *data, size_t len,
+			       size_t asn1hdrlen)
+{
+	struct verify_arg *arg = ctx;
+	const struct fsverity_info *vi = arg->vi;
+	const struct inode *inode = vi->inode;
+	const struct fsverity_signed_digest *d = data;
+	const struct fsverity_hash_alg *hash_alg;
+
+	if (len < sizeof(*d) || memcmp(d->magic, "FSVerity", 8) != 0) {
+		fsverity_warn(inode,
+			      "Signed file measurement uses unrecognized format");
+		return -EBADMSG;
+	}
+
+	hash_alg = fsverity_get_hash_alg(inode,
+					 le16_to_cpu(d->digest_algorithm));
+	if (IS_ERR(hash_alg))
+		return PTR_ERR(hash_alg);
+
+	if (le16_to_cpu(d->digest_size) != hash_alg->digest_size) {
+		fsverity_warn(inode,
+			      "Wrong digest_size in signed file measurement: wanted %u for algorithm %s, but got %u",
+			      hash_alg->digest_size, hash_alg->name,
+			      le16_to_cpu(d->digest_size));
+		return -EBADMSG;
+	}
+
+	if (len < sizeof(*d) + hash_alg->digest_size) {
+		fsverity_warn(inode, "Signed file measurement is truncated");
+		return -EBADMSG;
+	}
+
+	if (hash_alg != vi->tree_params.hash_alg) {
+		fsverity_warn(inode,
+			      "Signed file measurement uses %s, but file uses %s",
+			      hash_alg->name, vi->tree_params.hash_alg->name);
+		return -EBADMSG;
+	}
+
+	memcpy(arg->measurement, d->digest, hash_alg->digest_size);
+	arg->have_measurement = true;
+	return 0;
+}
+
+/**
+ * fsverity_verify_signature - check a verity file's signature
+ *
+ * Verify a signed fsverity_measurement against the certificates in the
+ * fs-verity keyring.  The signature is given as a PKCS#7 formatted message, and
+ * the signed data is included in the message (not detached).
+ *
+ * Return: 0 on success (signature valid or not required); -errno on failure
+ */
+int fsverity_verify_signature(const struct fsverity_info *vi,
+			      const struct fsverity_descriptor *desc,
+			      size_t desc_size)
+{
+	const struct inode *inode = vi->inode;
+	const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
+	const unsigned int digest_size = hash_alg->digest_size;
+	const u32 sig_size = le32_to_cpu(desc->sig_size);
+	struct verify_arg arg = {
+		.vi = vi,
+		.have_measurement = false,
+	};
+	int err;
+
+	if (sig_size == 0) {
+		if (fsverity_require_signatures) {
+			fsverity_err(inode,
+				     "require_signatures=1, rejecting unsigned file!");
+			return -EBADMSG;
+		}
+		return 0;
+	}
+
+	if (sig_size > desc_size - sizeof(*desc)) {
+		fsverity_err(inode, "Signature overflows verity descriptor");
+		return -EBADMSG;
+	}
+
+	err = verify_pkcs7_signature(NULL, 0, desc->signature, sig_size,
+				     fsverity_keyring,
+				     VERIFYING_UNSPECIFIED_SIGNATURE,
+				     extract_measurement, &arg);
+	if (err) {
+		fsverity_err(inode, "Error %d verifying PKCS#7 signature", err);
+		return err;
+	}
+
+	if (!arg.have_measurement) {
+		fsverity_err(inode, "PKCS#7 message is missing internal data");
+		return -EBADMSG;
+	}
+
+	if (memcmp(arg.measurement, vi->measurement, digest_size) != 0) {
+		fsverity_err(inode,
+			     "FILE CORRUPTED (signed measurement differs from actual measurement): signed %s:%*phN, actual %s:%*phN",
+			     hash_alg->name, digest_size, arg.measurement,
+			     hash_alg->name, digest_size, vi->measurement);
+		return -EBADMSG;
+	}
+
+	pr_debug("Valid signature for measurement: %s:%*phN\n",
+		 hash_alg->name, digest_size, vi->measurement);
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+static int one = 1;
+static struct ctl_table_header *fsverity_sysctl_header;
+
+static const struct ctl_path fsverity_sysctl_path[] = {
+	{ .procname = "fs", },
+	{ .procname = "verity", },
+	{ }
+};
+
+static struct ctl_table fsverity_sysctl_table[] = {
+	{
+		.procname       = "require_signatures",
+		.data           = &fsverity_require_signatures,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = &zero,
+		.extra2         = &one,
+	},
+	{ }
+};
+
+static int __init fsverity_sysctl_init(void)
+{
+	fsverity_sysctl_header = register_sysctl_paths(fsverity_sysctl_path,
+						       fsverity_sysctl_table);
+	if (!fsverity_sysctl_header) {
+		pr_err("sysctl registration failed!\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+#else /* !CONFIG_SYSCTL */
+static inline int fsverity_sysctl_init(void)
+{
+	return 0;
+}
+#endif /* !CONFIG_SYSCTL */
+
+int __init fsverity_init_signature(void)
+{
+	struct key *ring;
+	int err;
+
+	ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
+			     current_cred(), KEY_POS_SEARCH |
+				KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE |
+				KEY_USR_SEARCH | KEY_USR_SETATTR,
+			     KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(ring))
+		return PTR_ERR(ring);
+
+	err = fsverity_sysctl_init();
+	if (err)
+		goto err_put_ring;
+
+	fsverity_keyring = ring;
+	return 0;
+
+err_put_ring:
+	key_put(ring);
+	return err;
+}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 2a0f9e2ebc9f..783f4042b679 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -273,3 +273,9 @@ int __init fsverity_init_workqueue(void)
 		return -ENOMEM;
 	return 0;
 }
+
+void __init fsverity_exit_workqueue(void)
+{
+	destroy_workqueue(fsverity_read_workqueue);
+	fsverity_read_workqueue = NULL;
+}
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 12/16] fs-verity: add SHA-512 support
From: Eric Biggers @ 2019-06-06 15:52 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add SHA-512 support to fs-verity.  This is primarily a demonstration of
the trivial changes needed to support a new hash algorithm in fs-verity;
most users will still use SHA-256, due to the smaller space required to
store the hashes.  But some users may prefer SHA-512.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/fsverity_private.h  | 2 +-
 fs/verity/hash_algs.c         | 5 +++++
 include/uapi/linux/fsverity.h | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index d61ad740027d..693ef030adbe 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -31,7 +31,7 @@ struct ahash_request;
  * Largest digest size among all hash algorithms supported by fs-verity.
  * Currently assumed to be <= size of fsverity_descriptor::root_hash.
  */
-#define FS_VERITY_MAX_DIGEST_SIZE	SHA256_DIGEST_SIZE
+#define FS_VERITY_MAX_DIGEST_SIZE	SHA512_DIGEST_SIZE
 
 /* A hash algorithm supported by fs-verity */
 struct fsverity_hash_alg {
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 46df17094fc2..e0462a010cab 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -17,6 +17,11 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
 		.digest_size = SHA256_DIGEST_SIZE,
 		.block_size = SHA256_BLOCK_SIZE,
 	},
+	[FS_VERITY_HASH_ALG_SHA512] = {
+		.name = "sha512",
+		.digest_size = SHA512_DIGEST_SIZE,
+		.block_size = SHA512_BLOCK_SIZE,
+	},
 };
 
 /**
diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h
index 57d1d7fc0c34..da0daf6c193b 100644
--- a/include/uapi/linux/fsverity.h
+++ b/include/uapi/linux/fsverity.h
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 
 #define FS_VERITY_HASH_ALG_SHA256	1
+#define FS_VERITY_HASH_ALG_SHA512	2
 
 struct fsverity_enable_arg {
 	__u32 version;
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 11/16] fs-verity: implement FS_IOC_MEASURE_VERITY ioctl
From: Eric Biggers @ 2019-06-06 15:52 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add a function for filesystems to call to implement the
FS_IOC_MEASURE_VERITY ioctl.  This ioctl retrieves the file measurement
that fs-verity calculated for the given file and is enforcing for reads;
i.e., reads that don't match this hash will fail.  This ioctl can be
used for authentication or logging of file measurements in userspace.

See the "FS_IOC_MEASURE_VERITY" section of
Documentation/filesystems/fsverity.rst for the documentation.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/Makefile       |  1 +
 fs/verity/measure.c      | 57 ++++++++++++++++++++++++++++++++++++++++
 include/linux/fsverity.h | 11 ++++++++
 3 files changed, 69 insertions(+)
 create mode 100644 fs/verity/measure.c

diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 04b37475fd28..6f7675ae0a31 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -3,5 +3,6 @@
 obj-$(CONFIG_FS_VERITY) += enable.o \
 			   hash_algs.o \
 			   init.o \
+			   measure.o \
 			   open.o \
 			   verify.o
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
new file mode 100644
index 000000000000..05049b68c745
--- /dev/null
+++ b/fs/verity/measure.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/measure.c: ioctl to get a verity file's measurement
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/uaccess.h>
+
+/**
+ * fsverity_ioctl_measure() - get a verity file's measurement
+ *
+ * Retrieve the file measurement that the kernel is enforcing for reads from a
+ * verity file.  See the "FS_IOC_MEASURE_VERITY" section of
+ * Documentation/filesystems/fsverity.rst for the documentation.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
+{
+	const struct inode *inode = file_inode(filp);
+	struct fsverity_digest __user *uarg = _uarg;
+	const struct fsverity_info *vi;
+	const struct fsverity_hash_alg *hash_alg;
+	struct fsverity_digest arg;
+
+	vi = fsverity_get_info(inode);
+	if (!vi)
+		return -ENODATA; /* not a verity file */
+	hash_alg = vi->tree_params.hash_alg;
+
+	/*
+	 * The user specifies the digest_size their buffer has space for; we can
+	 * return the digest if it fits in the available space.  We write back
+	 * the actual size, which may be shorter than the user-specified size.
+	 */
+
+	if (get_user(arg.digest_size, &uarg->digest_size))
+		return -EFAULT;
+	if (arg.digest_size < hash_alg->digest_size)
+		return -EOVERFLOW;
+
+	memset(&arg, 0, sizeof(arg));
+	arg.digest_algorithm = hash_alg - fsverity_hash_algs;
+	arg.digest_size = hash_alg->digest_size;
+
+	if (copy_to_user(uarg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 7ef2ef826534..247359c86b72 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -116,6 +116,10 @@ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
 
 extern int fsverity_ioctl_enable(struct file *filp, const void __user *arg);
 
+/* measure.c */
+
+extern int fsverity_ioctl_measure(struct file *filp, void __user *arg);
+
 /* open.c */
 
 extern int fsverity_file_open(struct inode *inode, struct file *filp);
@@ -143,6 +147,13 @@ static inline int fsverity_ioctl_enable(struct file *filp,
 	return -EOPNOTSUPP;
 }
 
+/* measure.c */
+
+static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
 /* open.c */
 
 static inline int fsverity_file_open(struct inode *inode, struct file *filp)
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 10/16] fs-verity: implement FS_IOC_ENABLE_VERITY ioctl
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add a function for filesystems to call to implement the
FS_IOC_ENABLE_VERITY ioctl.  This ioctl enables fs-verity on a file.

See the "FS_IOC_ENABLE_VERITY" section of
Documentation/filesystems/fsverity.rst for the documentation.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/Makefile       |   3 +-
 fs/verity/enable.c       | 339 +++++++++++++++++++++++++++++++++++++++
 include/linux/fsverity.h |  64 ++++++++
 3 files changed, 405 insertions(+), 1 deletion(-)
 create mode 100644 fs/verity/enable.c

diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 7fa628cd5eba..04b37475fd28 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_FS_VERITY) += hash_algs.o \
+obj-$(CONFIG_FS_VERITY) += enable.o \
+			   hash_algs.o \
 			   init.o \
 			   open.o \
 			   verify.o
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
new file mode 100644
index 000000000000..7e7ef9d3c376
--- /dev/null
+++ b/fs/verity/enable.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/enable.c: ioctl to enable verity on a file
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+
+static int build_merkle_tree_level(struct inode *inode, unsigned int level,
+				   u64 num_blocks_to_hash,
+				   const struct merkle_tree_params *params,
+				   u8 *pending_hashes,
+				   struct ahash_request *req)
+{
+	const struct fsverity_operations *vops = inode->i_sb->s_vop;
+	unsigned int pending_size = 0;
+	u64 dst_block_num;
+	u64 i;
+	int err;
+
+	if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */
+		return -EINVAL;
+
+	if (level < params->num_levels) {
+		dst_block_num = params->level_start[level];
+	} else {
+		if (WARN_ON(num_blocks_to_hash != 1))
+			return -EINVAL;
+		dst_block_num = 0; /* unused */
+	}
+
+	for (i = 0; i < num_blocks_to_hash; i++) {
+		struct page *src_page;
+
+		if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash)
+			pr_debug("Hashing block %llu of %llu for level %u\n",
+				 i + 1, num_blocks_to_hash, level);
+
+		if (level == 0)
+			/* Leaf: hashing a data block */
+			src_page = read_mapping_page(inode->i_mapping, i, NULL);
+		else
+			/* Non-leaf: hashing hash block from level below */
+			src_page = vops->read_merkle_tree_page(inode,
+					params->level_start[level - 1] + i);
+		if (IS_ERR(src_page)) {
+			err = PTR_ERR(src_page);
+			fsverity_err(inode,
+				     "Error %d reading Merkle tree page %llu",
+				     err, params->level_start[level - 1] + i);
+			return err;
+		}
+
+		err = fsverity_hash_page(params, inode, req, src_page,
+					 &pending_hashes[pending_size]);
+		put_page(src_page);
+		if (err)
+			return err;
+		pending_size += params->digest_size;
+
+		if (level == params->num_levels) /* Root hash? */
+			return 0;
+
+		if (pending_size + params->digest_size > params->block_size ||
+		    i + 1 == num_blocks_to_hash) {
+			/* Flush the pending hash block */
+			memset(&pending_hashes[pending_size], 0,
+			       params->block_size - pending_size);
+			err = vops->write_merkle_tree_block(inode,
+					pending_hashes,
+					dst_block_num,
+					params->log_blocksize);
+			if (err) {
+				fsverity_err(inode,
+					     "Error %d writing Merkle tree block %llu",
+					     err, dst_block_num);
+				return err;
+			}
+			dst_block_num++;
+			pending_size = 0;
+		}
+
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		cond_resched();
+	}
+	return 0;
+}
+
+/*
+ * Build the Merkle tree for the given inode using the given parameters, and
+ * return the root hash in @root_hash.
+ *
+ * The tree is written to a filesystem-specific location as determined by the
+ * ->write_merkle_tree_block() method.  However, the blocks that comprise the
+ * tree are the same for all filesystems.
+ */
+static int build_merkle_tree(struct inode *inode,
+			     const struct merkle_tree_params *params,
+			     u8 *root_hash)
+{
+	u8 *pending_hashes;
+	struct ahash_request *req;
+	u64 blocks;
+	unsigned int level;
+	int err = -ENOMEM;
+
+	pending_hashes = kmalloc(params->block_size, GFP_KERNEL);
+	req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL);
+	if (!pending_hashes || !req)
+		goto out;
+
+	/*
+	 * Build each level of the Merkle tree, starting at the leaf level
+	 * (level 0) and ascending to the root node (level 'num_levels - 1').
+	 * Then at the end (level 'num_levels'), calculate the root hash.
+	 */
+	blocks = (params->data_size + params->block_size - 1) >>
+		 params->log_blocksize;
+	for (level = 0; level <= params->num_levels; level++) {
+		err = build_merkle_tree_level(inode, level, blocks, params,
+					      pending_hashes, req);
+		if (err)
+			goto out;
+		blocks = (blocks + params->hashes_per_block - 1) >>
+			 params->log_arity;
+	}
+	memcpy(root_hash, pending_hashes, params->digest_size);
+	err = 0;
+out:
+	kfree(pending_hashes);
+	ahash_request_free(req);
+	return err;
+}
+
+static int enable_verity(struct file *filp,
+			 const struct fsverity_enable_arg *arg)
+{
+	struct inode *inode = file_inode(filp);
+	const struct fsverity_operations *vops = inode->i_sb->s_vop;
+	struct merkle_tree_params params = { };
+	struct fsverity_descriptor *desc;
+	size_t desc_size = sizeof(*desc);
+	struct fsverity_info *vi;
+	int err;
+
+	/* Start initializing the fsverity_descriptor */
+	desc = kzalloc(desc_size, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+	desc->version = 1;
+	desc->hash_algorithm = arg->hash_algorithm;
+	desc->log_blocksize = ilog2(arg->block_size);
+
+	/* Get the salt if the user provided one */
+	if (arg->salt_size &&
+	    copy_from_user(desc->salt,
+			   (const u8 __user *)(uintptr_t)arg->salt_ptr,
+			   arg->salt_size)) {
+		err = -EFAULT;
+		goto out;
+	}
+	desc->salt_size = arg->salt_size;
+
+	desc->data_size = cpu_to_le64(inode->i_size);
+
+	pr_debug("Building Merkle tree...\n");
+
+	/* Prepare the Merkle tree parameters */
+	err = fsverity_init_merkle_tree_params(&params, inode,
+					       arg->hash_algorithm,
+					       desc->log_blocksize,
+					       desc->salt, desc->salt_size);
+	if (err)
+		goto out;
+
+	/* Tell the filesystem that verity is being enabled on the file */
+	err = vops->begin_enable_verity(filp);
+	if (err)
+		goto out;
+
+	/* Build the Merkle tree */
+	BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE);
+	err = build_merkle_tree(inode, &params, desc->root_hash);
+	if (err) {
+		fsverity_err(inode, "Error %d building Merkle tree", err);
+		goto rollback;
+	}
+	pr_debug("Done building Merkle tree.  Root hash is %s:%*phN\n",
+		 params.hash_alg->name, params.digest_size, desc->root_hash);
+
+	/*
+	 * Create the fsverity_info.  Don't bother trying to save work by
+	 * reusing the merkle_tree_params from above.  Instead, just create the
+	 * fsverity_info from the fsverity_descriptor as if it were just loaded
+	 * from disk.  This is simpler, and it serves as an extra check that the
+	 * metadata we're writing is valid before actually enabling verity.
+	 */
+	vi = fsverity_create_info(inode, desc, desc_size);
+	if (IS_ERR(vi)) {
+		err = PTR_ERR(vi);
+		goto rollback;
+	}
+
+	/* Tell the filesystem to finish enabling verity on the file */
+	err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size);
+	if (err) {
+		fsverity_err(inode, "%ps() failed with err %d",
+			     vops->end_enable_verity, err);
+		fsverity_free_info(vi);
+	} else {
+		/* Successfully enabled verity */
+
+		WARN_ON(!IS_VERITY(inode));
+
+		/*
+		 * Readers can start using ->i_verity_info immediately, so it
+		 * can't be rolled back once set.  So don't set it until just
+		 * after the filesystem has successfully enabled verity.
+		 */
+		fsverity_set_info(inode, vi);
+	}
+out:
+	kfree(params.hashstate);
+	kfree(desc);
+	return err;
+
+rollback:
+	(void)vops->end_enable_verity(filp, NULL, 0, params.tree_size);
+	goto out;
+}
+
+/**
+ * fsverity_ioctl_enable() - enable verity on a file
+ *
+ * Enable fs-verity on a file.  See the "FS_IOC_ENABLE_VERITY" section of
+ * Documentation/filesystems/fsverity.rst for the documentation.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
+{
+	struct inode *inode = file_inode(filp);
+	struct fsverity_enable_arg arg;
+	int err;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.version != 1)
+		return -EINVAL;
+
+	if (arg.__reserved1 ||
+	    memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2)))
+		return -EINVAL;
+
+	if (arg.block_size != PAGE_SIZE)
+		return -EINVAL;
+
+	if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt))
+		return -EMSGSIZE;
+
+	if (arg.sig_size)
+		return -EINVAL;
+
+	/*
+	 * Require a regular file with write access.  But the actual fd must
+	 * still be readonly so that we can lock out all writers.  This is
+	 * needed to guarantee that no writable fds exist to the file once it
+	 * has verity enabled, and to stabilize the data being hashed.
+	 */
+
+	err = inode_permission(inode, MAY_WRITE);
+	if (err)
+		return err;
+
+	if (IS_APPEND(inode))
+		return -EPERM;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	err = mnt_want_write_file(filp);
+	if (err) /* -EROFS */
+		return err;
+
+	err = deny_write_access(filp);
+	if (err) /* -ETXTBSY */
+		goto out_drop_write;
+
+	inode_lock(inode);
+
+	if (IS_VERITY(inode)) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	if (inode->i_size <= 0) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	err = enable_verity(filp, &arg);
+	if (err)
+		goto out_unlock;
+
+	/*
+	 * Some pages of the file may have been evicted from pagecache after
+	 * being used in the Merkle tree construction, then read into pagecache
+	 * again by another process reading from the file concurrently.  Since
+	 * these pages didn't undergo verification against the file measurement
+	 * which fs-verity now claims to be enforcing, we have to wipe the
+	 * pagecache to ensure that all future reads are verified.
+	 */
+	filemap_write_and_wait(inode->i_mapping);
+	truncate_inode_pages(inode->i_mapping, 0);
+
+	/*
+	 * allow_write_access() is needed to pair with deny_write_access().
+	 * Regardless, the filesystem won't allow writing to verity files.
+	 */
+out_unlock:
+	inode_unlock(inode);
+	allow_write_access(filp);
+out_drop_write:
+	mnt_drop_write_file(filp);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_enable);
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index ecd47e748c7f..7ef2ef826534 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -17,6 +17,42 @@
 /* Verity operations for filesystems */
 struct fsverity_operations {
 
+	/**
+	 * Begin enabling verity on the given file.
+	 *
+	 * @filp: a readonly file descriptor for the file
+	 *
+	 * The filesystem must do any needed filesystem-specific preparations
+	 * for enabling verity, e.g. evicting inline data.
+	 *
+	 * i_rwsem is held for write.
+	 *
+	 * Return: 0 on success, -errno on failure
+	 */
+	int (*begin_enable_verity)(struct file *filp);
+
+	/**
+	 * End enabling verity on the given file.
+	 *
+	 * @filp: a readonly file descriptor for the file
+	 * @desc: the verity descriptor to write, or NULL on failure
+	 * @desc_size: size of verity descriptor, or 0 on failure
+	 * @merkle_tree_size: total bytes the Merkle tree took up
+	 *
+	 * If desc == NULL, then enabling verity failed and the filesystem only
+	 * must do any necessary cleanups.  Else, it must also store the given
+	 * verity descriptor to a fs-specific location associated with the inode
+	 * and do any fs-specific actions needed to mark the inode as a verity
+	 * inode, e.g. setting a bit in the on-disk inode.  The filesystem is
+	 * also responsible for setting the S_VERITY flag in the VFS inode.
+	 *
+	 * i_rwsem is held for write.
+	 *
+	 * Return: 0 on success, -errno on failure
+	 */
+	int (*end_enable_verity)(struct file *filp, const void *desc,
+				 size_t desc_size, u64 merkle_tree_size);
+
 	/**
 	 * Get the verity descriptor of the given inode.
 	 *
@@ -50,6 +86,22 @@ struct fsverity_operations {
 	 */
 	struct page *(*read_merkle_tree_page)(struct inode *inode,
 					      pgoff_t index);
+
+	/**
+	 * Write a Merkle tree block to the given inode.
+	 *
+	 * @inode: the inode for which the Merkle tree is being built
+	 * @buf: block to write
+	 * @index: 0-based index of the block within the Merkle tree
+	 * @log_blocksize: log base 2 of the Merkle tree block size
+	 *
+	 * This is only called between ->begin_enable_verity() and
+	 * ->end_enable_verity().  i_rwsem is held for write.
+	 *
+	 * Return: 0 on success, -errno on failure
+	 */
+	int (*write_merkle_tree_block)(struct inode *inode, const void *buf,
+				       u64 index, int log_blocksize);
 };
 
 #ifdef CONFIG_FS_VERITY
@@ -60,6 +112,10 @@ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
 	return READ_ONCE(inode->i_verity_info);
 }
 
+/* enable.c */
+
+extern int fsverity_ioctl_enable(struct file *filp, const void __user *arg);
+
 /* open.c */
 
 extern int fsverity_file_open(struct inode *inode, struct file *filp);
@@ -79,6 +135,14 @@ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
 	return NULL;
 }
 
+/* enable.c */
+
+static inline int fsverity_ioctl_enable(struct file *filp,
+					const void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
 /* open.c */
 
 static inline int fsverity_file_open(struct inode *inode, struct file *filp)
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 09/16] fs-verity: add data verification hooks for ->readpages()
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add functions that verify data pages that have been read from a
fs-verity file, against that file's Merkle tree.  These will be called
from filesystems' ->readpage() and ->readpages() methods.

Since data verification can block, a workqueue is provided for these
methods to enqueue verification work from their bio completion callback.

See the "Verifying data" section of
Documentation/filesystems/fsverity.rst for more information.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/Makefile           |   3 +-
 fs/verity/fsverity_private.h |   5 +
 fs/verity/init.c             |   8 +
 fs/verity/open.c             |   6 +
 fs/verity/verify.c           | 275 +++++++++++++++++++++++++++++++++++
 include/linux/fsverity.h     |  56 +++++++
 6 files changed, 352 insertions(+), 1 deletion(-)
 create mode 100644 fs/verity/verify.c

diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index e6a8951c493a..7fa628cd5eba 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -2,4 +2,5 @@
 
 obj-$(CONFIG_FS_VERITY) += hash_algs.o \
 			   init.o \
-			   open.o
+			   open.o \
+			   verify.o
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index c7fc3c7fc714..d61ad740027d 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -137,5 +137,10 @@ void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
 void fsverity_free_info(struct fsverity_info *vi);
 
 int __init fsverity_init_info_cache(void);
+void __init fsverity_exit_info_cache(void);
+
+/* verify.c */
+
+int __init fsverity_init_workqueue(void);
 
 #endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/init.c b/fs/verity/init.c
index fff1fd634335..b593805aafcc 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -41,7 +41,15 @@ static int __init fsverity_init(void)
 	if (err)
 		return err;
 
+	err = fsverity_init_workqueue();
+	if (err)
+		goto err_exit_info_cache;
+
 	pr_debug("Initialized fs-verity\n");
 	return 0;
+
+err_exit_info_cache:
+	fsverity_exit_info_cache();
+	return err;
 }
 late_initcall(fsverity_init)
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 91f7e2d05b54..731c7e810719 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -344,3 +344,9 @@ int __init fsverity_init_info_cache(void)
 		return -ENOMEM;
 	return 0;
 }
+
+void __init fsverity_exit_info_cache(void)
+{
+	kmem_cache_destroy(fsverity_info_cachep);
+	fsverity_info_cachep = NULL;
+}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
new file mode 100644
index 000000000000..2a0f9e2ebc9f
--- /dev/null
+++ b/fs/verity/verify.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/bio.h>
+#include <linux/ratelimit.h>
+
+static struct workqueue_struct *fsverity_read_workqueue;
+
+/**
+ * hash_at_level() - compute the location of the block's hash at the given level
+ *
+ * @params:	(in) the Merkle tree parameters
+ * @dindex:	(in) the index of the data block being verified
+ * @level:	(in) the level of hash we want (0 is leaf level)
+ * @hindex:	(out) the index of the hash block containing the wanted hash
+ * @hoffset:	(out) the byte offset to the wanted hash within the hash block
+ */
+static void hash_at_level(const struct merkle_tree_params *params,
+			  pgoff_t dindex, unsigned int level, pgoff_t *hindex,
+			  unsigned int *hoffset)
+{
+	pgoff_t position;
+
+	/* Offset of the hash within the level's region, in hashes */
+	position = dindex >> (level * params->log_arity);
+
+	/* Index of the hash block in the tree overall */
+	*hindex = params->level_start[level] + (position >> params->log_arity);
+
+	/* Offset of the wanted hash (in bytes) within the hash block */
+	*hoffset = (position & ((1 << params->log_arity) - 1)) <<
+		   (params->log_blocksize - params->log_arity);
+}
+
+/* Extract a hash from a hash page */
+static void extract_hash(struct page *hpage, unsigned int hoffset,
+			 unsigned int hsize, u8 *out)
+{
+	void *virt = kmap_atomic(hpage);
+
+	memcpy(out, virt + hoffset, hsize);
+	kunmap_atomic(virt);
+}
+
+static inline int cmp_hashes(const struct fsverity_info *vi,
+			     const u8 *want_hash, const u8 *real_hash,
+			     pgoff_t index, int level)
+{
+	const unsigned int hsize = vi->tree_params.digest_size;
+
+	if (memcmp(want_hash, real_hash, hsize) == 0)
+		return 0;
+
+	fsverity_err(vi->inode,
+		     "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+		     index, level,
+		     vi->tree_params.hash_alg->name, hsize, want_hash,
+		     vi->tree_params.hash_alg->name, hsize, real_hash);
+	return -EBADMSG;
+}
+
+/*
+ * Verify a single data page against the file's Merkle tree.
+ *
+ * In principle, we need to verify the entire path to the root node.  However,
+ * for efficiency the filesystem may cache the hash pages.  Therefore we need
+ * only ascend the tree until an already-verified page is seen, as indicated by
+ * the PageChecked bit being set; then verify the path to that page.
+ *
+ * This code currently only supports the case where the verity block size is
+ * equal to PAGE_SIZE.  Doing otherwise would be possible but tricky, since we
+ * wouldn't be able to use the PageChecked bit.
+ *
+ * Note that multiple processes may race to verify a hash page and mark it
+ * Checked, but it doesn't matter; the result will be the same either way.
+ *
+ * Return: true if the page is valid, else false.
+ */
+static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
+			struct ahash_request *req, struct page *data_page)
+{
+	const struct merkle_tree_params *params = &vi->tree_params;
+	const unsigned int hsize = params->digest_size;
+	const pgoff_t index = data_page->index;
+	int level;
+	u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	const u8 *want_hash;
+	u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	struct page *hpages[FS_VERITY_MAX_LEVELS];
+	unsigned int hoffsets[FS_VERITY_MAX_LEVELS];
+	int err;
+
+	if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page)))
+		return false;
+
+	pr_debug_ratelimited("Verifying data page %lu...\n", index);
+
+	/*
+	 * Starting at the leaf level, ascend the tree saving hash pages along
+	 * the way until we find a verified hash page, indicated by PageChecked;
+	 * or until we reach the root.
+	 */
+	for (level = 0; level < params->num_levels; level++) {
+		pgoff_t hindex;
+		unsigned int hoffset;
+		struct page *hpage;
+
+		hash_at_level(params, index, level, &hindex, &hoffset);
+
+		pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n",
+				     level, hindex, hoffset);
+
+		hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
+								  hindex);
+		if (IS_ERR(hpage)) {
+			err = PTR_ERR(hpage);
+			fsverity_err(inode,
+				     "Error %d reading Merkle tree page %lu",
+				     err, hindex);
+			goto out;
+		}
+
+		if (PageChecked(hpage)) {
+			extract_hash(hpage, hoffset, hsize, _want_hash);
+			want_hash = _want_hash;
+			put_page(hpage);
+			pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
+					     params->hash_alg->name,
+					     hsize, want_hash);
+			goto descend;
+		}
+		pr_debug_ratelimited("Hash page not yet checked\n");
+		hpages[level] = hpage;
+		hoffsets[level] = hoffset;
+	}
+
+	want_hash = vi->root_hash;
+	pr_debug("Want root hash: %s:%*phN\n",
+		 params->hash_alg->name, hsize, want_hash);
+descend:
+	/* Descend the tree verifying hash pages */
+	for (; level > 0; level--) {
+		struct page *hpage = hpages[level - 1];
+		unsigned int hoffset = hoffsets[level - 1];
+
+		err = fsverity_hash_page(params, inode, req, hpage, real_hash);
+		if (err)
+			goto out;
+		err = cmp_hashes(vi, want_hash, real_hash, index, level - 1);
+		if (err)
+			goto out;
+		SetPageChecked(hpage);
+		extract_hash(hpage, hoffset, hsize, _want_hash);
+		want_hash = _want_hash;
+		put_page(hpage);
+		pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
+			 level - 1, params->hash_alg->name, hsize, want_hash);
+	}
+
+	/* Finally, verify the data page */
+	err = fsverity_hash_page(params, inode, req, data_page, real_hash);
+	if (err)
+		goto out;
+	err = cmp_hashes(vi, want_hash, real_hash, index, -1);
+out:
+	for (; level > 0; level--)
+		put_page(hpages[level - 1]);
+
+	return err == 0;
+}
+
+/**
+ * fsverity_verify_page - verify a data page
+ *
+ * Verify a page that has just been read from a verity file.  The page must be a
+ * pagecache page that is still locked and not yet uptodate.
+ *
+ * Return: true if the page is valid, else false.
+ */
+bool fsverity_verify_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	const struct fsverity_info *vi = inode->i_verity_info;
+	struct ahash_request *req;
+	bool valid;
+
+	req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
+	if (unlikely(!req))
+		return false;
+
+	valid = verify_page(inode, vi, req, page);
+
+	ahash_request_free(req);
+
+	return valid;
+}
+EXPORT_SYMBOL_GPL(fsverity_verify_page);
+
+#ifdef CONFIG_BLOCK
+/**
+ * fsverity_verify_bio - verify a 'read' bio that has just completed
+ *
+ * Verify a set of pages that have just been read from a verity file.  The pages
+ * must be pagecache pages that are still locked and not yet uptodate.  Pages
+ * that fail verification are set to the Error state.  Verification is skipped
+ * for pages already in the Error state, e.g. due to fscrypt decryption failure.
+ *
+ * This is a helper function for use by the ->readpages() method of filesystems
+ * that issue bios to read data directly into the page cache.  Filesystems that
+ * populate the page cache without issuing bios (e.g. non block-based
+ * filesystems) must instead call fsverity_verify_page() directly on each page.
+ * All filesystems must also call fsverity_verify_page() on holes.
+ */
+void fsverity_verify_bio(struct bio *bio)
+{
+	struct inode *inode = bio_first_page_all(bio)->mapping->host;
+	const struct fsverity_info *vi = inode->i_verity_info;
+	struct ahash_request *req;
+	struct bio_vec *bv;
+	struct bvec_iter_all iter_all;
+
+	req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
+	if (unlikely(!req)) {
+		bio_for_each_segment_all(bv, bio, iter_all)
+			SetPageError(bv->bv_page);
+		return;
+	}
+
+	bio_for_each_segment_all(bv, bio, iter_all) {
+		struct page *page = bv->bv_page;
+
+		if (!PageError(page) && !verify_page(inode, vi, req, page))
+			SetPageError(page);
+	}
+
+	ahash_request_free(req);
+}
+EXPORT_SYMBOL_GPL(fsverity_verify_bio);
+#endif /* CONFIG_BLOCK */
+
+/**
+ * fsverity_enqueue_verify_work - enqueue work on the fs-verity workqueue
+ *
+ * Enqueue verification work for asynchronous processing.
+ */
+void fsverity_enqueue_verify_work(struct work_struct *work)
+{
+	queue_work(fsverity_read_workqueue, work);
+}
+EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
+
+int __init fsverity_init_workqueue(void)
+{
+	/*
+	 * Use an unbound workqueue to allow bios to be verified in parallel
+	 * even when they happen to complete on the same CPU.  This sacrifices
+	 * locality, but it's worthwhile since hashing is CPU-intensive.
+	 *
+	 * Also use a high-priority workqueue to prioritize verification work,
+	 * which blocks reads from completing, over regular application tasks.
+	 */
+	fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
+						  WQ_UNBOUND | WQ_HIGHPRI,
+						  num_online_cpus());
+	if (!fsverity_read_workqueue)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index cbcc358d0736..ecd47e748c7f 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -33,6 +33,23 @@ struct fsverity_operations {
 	 */
 	int (*get_verity_descriptor)(struct inode *inode, void *buf,
 				     size_t bufsize);
+
+	/**
+	 * Read a Merkle tree page of the given inode.
+	 *
+	 * @inode: the inode
+	 * @index: 0-based index of the page within the Merkle tree
+	 *
+	 * This can be called at any time on an open verity file, as well as
+	 * between ->begin_enable_verity() and ->end_enable_verity().  It may be
+	 * called by multiple processes concurrently, even with the same page.
+	 *
+	 * Note that this must retrieve a *page*, not necessarily a *block*.
+	 *
+	 * Return: the page on success, ERR_PTR() on failure
+	 */
+	struct page *(*read_merkle_tree_page)(struct inode *inode,
+					      pgoff_t index);
 };
 
 #ifdef CONFIG_FS_VERITY
@@ -49,6 +66,12 @@ extern int fsverity_file_open(struct inode *inode, struct file *filp);
 extern int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
 extern void fsverity_cleanup_inode(struct inode *inode);
 
+/* verify.c */
+
+extern bool fsverity_verify_page(struct page *page);
+extern void fsverity_verify_bio(struct bio *bio);
+extern void fsverity_enqueue_verify_work(struct work_struct *work);
+
 #else /* !CONFIG_FS_VERITY */
 
 static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
@@ -73,6 +96,39 @@ static inline void fsverity_cleanup_inode(struct inode *inode)
 {
 }
 
+/* verify.c */
+
+static inline bool fsverity_verify_page(struct page *page)
+{
+	WARN_ON(1);
+	return false;
+}
+
+static inline void fsverity_verify_bio(struct bio *bio)
+{
+	WARN_ON(1);
+}
+
+static inline void fsverity_enqueue_verify_work(struct work_struct *work)
+{
+	WARN_ON(1);
+}
+
 #endif	/* !CONFIG_FS_VERITY */
 
+/**
+ * fsverity_active() - do reads from the inode need to go through fs-verity?
+ *
+ * This checks whether ->i_verity_info has been set.
+ *
+ * Filesystems call this from ->readpages() to check whether the pages need to
+ * be verified or not.  Don't use IS_VERITY() for this purpose; it's subject to
+ * a race condition where the file is being read concurrently with
+ * FS_IOC_ENABLE_VERITY completing.  (S_VERITY is set before ->i_verity_info.)
+ */
+static inline bool fsverity_active(const struct inode *inode)
+{
+	return fsverity_get_info(inode) != NULL;
+}
+
 #endif	/* _LINUX_FSVERITY_H */
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 08/16] fs-verity: add the hook for file ->setattr()
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add a function fsverity_prepare_setattr() which filesystems that support
fs-verity must call to deny truncates of verity files.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/open.c         | 21 +++++++++++++++++++++
 include/linux/fsverity.h |  7 +++++++
 2 files changed, 28 insertions(+)

diff --git a/fs/verity/open.c b/fs/verity/open.c
index a8a1261f0e30..91f7e2d05b54 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -302,6 +302,27 @@ int fsverity_file_open(struct inode *inode, struct file *filp)
 }
 EXPORT_SYMBOL_GPL(fsverity_file_open);
 
+/**
+ * fsverity_prepare_setattr - prepare to change a verity inode's attributes
+ * @dentry: dentry through which the inode is being changed
+ * @attr: attributes to change
+ *
+ * Verity files are immutable, so deny truncates.  This isn't covered by the
+ * open-time check because sys_truncate() takes a path, not a file descriptor.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) {
+		pr_debug("Denying truncate of verity file (ino %lu)\n",
+			 d_inode(dentry)->i_ino);
+		return -EPERM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_prepare_setattr);
+
 /**
  * fsverity_cleanup_inode - free the inode's verity info, if present
  *
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 1372c236c877..cbcc358d0736 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -46,6 +46,7 @@ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
 /* open.c */
 
 extern int fsverity_file_open(struct inode *inode, struct file *filp);
+extern int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
 extern void fsverity_cleanup_inode(struct inode *inode);
 
 #else /* !CONFIG_FS_VERITY */
@@ -62,6 +63,12 @@ static inline int fsverity_file_open(struct inode *inode, struct file *filp)
 	return IS_VERITY(inode) ? -EOPNOTSUPP : 0;
 }
 
+static inline int fsverity_prepare_setattr(struct dentry *dentry,
+					   struct iattr *attr)
+{
+	return IS_VERITY(d_inode(dentry)) ? -EOPNOTSUPP : 0;
+}
+
 static inline void fsverity_cleanup_inode(struct inode *inode)
 {
 }
-- 
2.21.0

^ permalink raw reply related

* [PATCH v4 07/16] fs-verity: add the hook for file ->open()
From: Eric Biggers @ 2019-06-06 15:51 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Y . Ts'o, Darrick J . Wong, linux-api, Dave Chinner,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, linux-integrity,
	linux-ext4, Linus Torvalds, Christoph Hellwig, Victor Hsieh
In-Reply-To: <20190606155205.2872-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add the fsverity_file_open() function, which prepares an fs-verity file
to be read from.  If not already done, it loads the fs-verity descriptor
from the filesystem and sets up an fsverity_info structure for the inode
which describes the Merkle tree and contains the file measurement.  It
also denies all attempts to open verity files for writing.

This commit also begins the include/linux/fsverity.h header, which
declares the interface between fs/verity/ and filesystems.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/Makefile           |   3 +-
 fs/verity/fsverity_private.h |  54 +++++-
 fs/verity/init.c             |   6 +
 fs/verity/open.c             | 325 +++++++++++++++++++++++++++++++++++
 include/linux/fsverity.h     |  71 ++++++++
 5 files changed, 456 insertions(+), 3 deletions(-)
 create mode 100644 fs/verity/open.c
 create mode 100644 include/linux/fsverity.h

diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 398f3f85fa18..e6a8951c493a 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_FS_VERITY) += hash_algs.o \
-			   init.o
+			   init.o \
+			   open.o
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index c879189bfd61..c7fc3c7fc714 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -15,8 +15,7 @@
 #define pr_fmt(fmt) "fs-verity: " fmt
 
 #include <crypto/sha.h>
-#include <linux/fs.h>
-#include <uapi/linux/fsverity.h>
+#include <linux/fsverity.h>
 
 struct ahash_request;
 
@@ -62,6 +61,40 @@ struct merkle_tree_params {
 	u64 level_start[FS_VERITY_MAX_LEVELS];
 };
 
+/**
+ * fsverity_info - cached verity metadata for an inode
+ *
+ * When a verity file is first opened, an instance of this struct is allocated
+ * and stored in ->i_verity_info; it remains until the inode is evicted.  It
+ * caches information about the Merkle tree that's needed to efficiently verify
+ * data read from the file.  It also caches the file measurement.  The Merkle
+ * tree pages themselves are not cached here, but the filesystem may cache them.
+ */
+struct fsverity_info {
+	struct merkle_tree_params tree_params;
+	u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+	const struct inode *inode;
+};
+
+/*
+ * Merkle tree properties.  The file measurement is the hash of this structure.
+ */
+struct fsverity_descriptor {
+	__u8 version;		/* must be 1 */
+	__u8 hash_algorithm;	/* Merkle tree hash algorithm */
+	__u8 log_blocksize;	/* log2 of size of data and tree blocks */
+	__u8 salt_size;		/* size of salt in bytes; 0 if none */
+	__le32 sig_size;	/* reserved, must be 0 */
+	__le64 data_size;	/* size of file the Merkle tree is built over */
+	__u8 root_hash[64];	/* Merkle tree root hash */
+	__u8 salt[32];		/* salt prepended to each hashed block */
+	__u8 __reserved[144];	/* must be 0's */
+};
+
+/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
+#define FS_VERITY_MAX_DESCRIPTOR_SIZE	16384
+
 /* hash_algs.c */
 
 extern struct fsverity_hash_alg fsverity_hash_algs[];
@@ -88,4 +121,21 @@ fsverity_msg(const struct inode *inode, const char *level,
 #define fsverity_err(inode, fmt, ...)		\
 	fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
 
+/* open.c */
+
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+				     const struct inode *inode,
+				     unsigned int hash_algorithm,
+				     unsigned int log_blocksize,
+				     const u8 *salt, size_t salt_size);
+
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+					   const void *desc, size_t desc_size);
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
+
+void fsverity_free_info(struct fsverity_info *vi);
+
+int __init fsverity_init_info_cache(void);
+
 #endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/init.c b/fs/verity/init.c
index 40076bbe452a..fff1fd634335 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -33,8 +33,14 @@ void fsverity_msg(const struct inode *inode, const char *level,
 
 static int __init fsverity_init(void)
 {
+	int err;
+
 	fsverity_check_hash_algs();
 
+	err = fsverity_init_info_cache();
+	if (err)
+		return err;
+
 	pr_debug("Initialized fs-verity\n");
 	return 0;
 }
diff --git a/fs/verity/open.c b/fs/verity/open.c
new file mode 100644
index 000000000000..a8a1261f0e30
--- /dev/null
+++ b/fs/verity/open.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/open.c: opening fs-verity files
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/slab.h>
+
+static struct kmem_cache *fsverity_info_cachep;
+
+/**
+ * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters
+ * @params: the parameters struct to initialize
+ * @inode: the inode for which the Merkle tree is being built
+ * @hash_algorithm: number of hash algorithm to use
+ * @log_blocksize: log base 2 of block size to use
+ * @salt: pointer to salt (optional)
+ * @salt_size: size of salt, possibly 0
+ *
+ * Validate the hash algorithm and block size, then compute the tree topology
+ * (num levels, num blocks in each level, etc.) and initialize @params.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+				     const struct inode *inode,
+				     unsigned int hash_algorithm,
+				     unsigned int log_blocksize,
+				     const u8 *salt, size_t salt_size)
+{
+	const struct fsverity_hash_alg *hash_alg;
+	int err;
+	u64 blocks;
+	u64 offset;
+	int level;
+
+	memset(params, 0, sizeof(*params));
+
+	if (inode->i_size <= 0) {
+		fsverity_warn(inode, "File is empty.  This is not supported.");
+		return -EINVAL;
+	}
+
+	hash_alg = fsverity_get_hash_alg(inode, hash_algorithm);
+	if (IS_ERR(hash_alg))
+		return PTR_ERR(hash_alg);
+	params->hash_alg = hash_alg;
+	params->digest_size = hash_alg->digest_size;
+
+	params->hashstate = fsverity_prepare_hash_state(hash_alg, salt,
+							salt_size);
+	if (IS_ERR(params->hashstate)) {
+		err = PTR_ERR(params->hashstate);
+		params->hashstate = NULL;
+		fsverity_err(inode, "Error %d preparing hash state", err);
+		goto out_err;
+	}
+
+	if (log_blocksize != PAGE_SHIFT) {
+		fsverity_warn(inode, "Unsupported log_blocksize: %u",
+			      log_blocksize);
+		err = -EINVAL;
+		goto out_err;
+	}
+	params->log_blocksize = log_blocksize;
+	params->block_size = 1 << log_blocksize;
+
+	if (WARN_ON(!is_power_of_2(params->digest_size))) {
+		err = -EINVAL;
+		goto out_err;
+	}
+	if (params->block_size < 2 * params->digest_size) {
+		fsverity_warn(inode,
+			      "Merkle tree block size (%u) too small for hash algorithm \"%s\"",
+			      params->block_size, hash_alg->name);
+		err = -EINVAL;
+		goto out_err;
+	}
+	params->log_arity = params->log_blocksize - ilog2(params->digest_size);
+	params->hashes_per_block = 1 << params->log_arity;
+
+	pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n",
+		 hash_alg->name, params->block_size, params->hashes_per_block,
+		 (int)salt_size, salt);
+
+	/*
+	 * Compute the number of levels in the Merkle tree and create a map from
+	 * level to the starting block of that level.  Level 'num_levels - 1' is
+	 * the root and is stored first.  Level 0 is the level directly "above"
+	 * the data blocks and is stored last.
+	 */
+
+	/* Compute number of levels and the number of blocks in each level */
+	blocks = (inode->i_size + params->block_size - 1) >> log_blocksize;
+	pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
+	while (blocks > 1) {
+		if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
+			fsverity_err(inode, "Too many levels in Merkle tree");
+			err = -EINVAL;
+			goto out_err;
+		}
+		blocks = (blocks + params->hashes_per_block - 1) >>
+			 params->log_arity;
+		/* temporarily using level_start[] to store blocks in level */
+		params->level_start[params->num_levels++] = blocks;
+	}
+
+	/* Compute the starting block of each level */
+	offset = 0;
+	for (level = (int)params->num_levels - 1; level >= 0; level--) {
+		blocks = params->level_start[level];
+		params->level_start[level] = offset;
+		pr_debug("Level %d is %llu blocks starting at index %llu\n",
+			 level, blocks, offset);
+		offset += blocks;
+	}
+
+	params->data_size = inode->i_size;
+	params->tree_size = offset << log_blocksize;
+	return 0;
+
+out_err:
+	kfree(params->hashstate);
+	memset(params, 0, sizeof(*params));
+	return err;
+}
+
+/* Compute the file measurement by hashing the fsverity_descriptor. */
+static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
+				    const struct fsverity_descriptor *desc,
+				    u8 *measurement)
+{
+	return fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+}
+
+/*
+ * Validate the given fsverity_descriptor and create a new fsverity_info from
+ * it.  The signature (if present) is also checked.
+ */
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+					   const void *_desc, size_t desc_size)
+{
+	const struct fsverity_descriptor *desc = _desc;
+	struct fsverity_info *vi;
+	int err;
+
+	if (desc_size < sizeof(*desc)) {
+		fsverity_err(inode, "Unrecognized descriptor size (%zu)",
+			     desc_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->version != 1) {
+		fsverity_err(inode, "Unrecognized descriptor version: %u",
+			     desc->version);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->sig_size ||
+	    memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+		fsverity_err(inode, "Reserved bits set in descriptor");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->salt_size > sizeof(desc->salt)) {
+		fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (le64_to_cpu(desc->data_size) != inode->i_size) {
+		fsverity_err(inode,
+			     "Wrong data_size: %llu (desc) != %lld (inode)",
+			     le64_to_cpu(desc->data_size), inode->i_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
+	if (!vi)
+		return ERR_PTR(-ENOMEM);
+	vi->inode = inode;
+
+	err = fsverity_init_merkle_tree_params(&vi->tree_params, inode,
+					       desc->hash_algorithm,
+					       desc->log_blocksize,
+					       desc->salt, desc->salt_size);
+	if (err) {
+		fsverity_err(inode,
+			     "Error %d initializing Merkle tree parameters",
+			     err);
+		goto out;
+	}
+
+	memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
+
+	err = compute_file_measurement(vi->tree_params.hash_alg, desc,
+				       vi->measurement);
+	if (err) {
+		fsverity_err(vi->inode, "Error %d computing file measurement",
+			     err);
+		goto out;
+	}
+	pr_debug("Computed file measurement: %s:%*phN\n",
+		 vi->tree_params.hash_alg->name,
+		 vi->tree_params.digest_size, vi->measurement);
+out:
+	if (err) {
+		fsverity_free_info(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
+{
+	/*
+	 * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
+	 * This pairs with the READ_ONCE() in fsverity_get_info().
+	 */
+	if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL)
+		fsverity_free_info(vi);
+}
+
+void fsverity_free_info(struct fsverity_info *vi)
+{
+	if (!vi)
+		return;
+	kfree(vi->tree_params.hashstate);
+	kmem_cache_free(fsverity_info_cachep, vi);
+}
+
+/* Ensure the inode has an ->i_verity_info */
+static int ensure_verity_info(struct inode *inode)
+{
+	struct fsverity_info *vi = fsverity_get_info(inode);
+	struct fsverity_descriptor *desc;
+	int res;
+
+	if (vi)
+		return 0;
+
+	res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
+	if (res < 0) {
+		fsverity_err(inode,
+			     "Error %d getting verity descriptor size", res);
+		return res;
+	}
+	if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+		fsverity_err(inode, "Verity descriptor is too large (%d bytes)",
+			     res);
+		return -EMSGSIZE;
+	}
+	desc = kmalloc(res, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+	res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
+	if (res < 0) {
+		fsverity_err(inode, "Error %d reading verity descriptor", res);
+		goto out_free_desc;
+	}
+
+	vi = fsverity_create_info(inode, desc, res);
+	if (IS_ERR(vi)) {
+		res = PTR_ERR(vi);
+		goto out_free_desc;
+	}
+
+	fsverity_set_info(inode, vi);
+	res = 0;
+out_free_desc:
+	kfree(desc);
+	return res;
+}
+
+/**
+ * fsverity_file_open - prepare to open a verity file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * When opening a verity file, deny the open if it is for writing.  Otherwise,
+ * set up the inode's ->i_verity_info if not already done.
+ *
+ * When combined with fscrypt, this must be called after fscrypt_file_open().
+ * Otherwise, we won't have the key set up to decrypt the verity metadata.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	if (!IS_VERITY(inode))
+		return 0;
+
+	if (filp->f_mode & FMODE_WRITE) {
+		pr_debug("Denying opening verity file (ino %lu) for write\n",
+			 inode->i_ino);
+		return -EPERM;
+	}
+
+	return ensure_verity_info(inode);
+}
+EXPORT_SYMBOL_GPL(fsverity_file_open);
+
+/**
+ * fsverity_cleanup_inode - free the inode's verity info, if present
+ *
+ * Filesystems must call this on inode eviction to free ->i_verity_info.
+ */
+void fsverity_cleanup_inode(struct inode *inode)
+{
+	fsverity_free_info(inode->i_verity_info);
+	inode->i_verity_info = NULL;
+}
+EXPORT_SYMBOL_GPL(fsverity_cleanup_inode);
+
+int __init fsverity_init_info_cache(void)
+{
+	fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
+						   SLAB_RECLAIM_ACCOUNT,
+						   measurement);
+	if (!fsverity_info_cachep)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
new file mode 100644
index 000000000000..1372c236c877
--- /dev/null
+++ b/include/linux/fsverity.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs-verity: read-only file-based authenticity protection
+ *
+ * This header declares the interface between the fs/verity/ support layer and
+ * filesystems that support fs-verity.
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef _LINUX_FSVERITY_H
+#define _LINUX_FSVERITY_H
+
+#include <linux/fs.h>
+#include <uapi/linux/fsverity.h>
+
+/* Verity operations for filesystems */
+struct fsverity_operations {
+
+	/**
+	 * Get the verity descriptor of the given inode.
+	 *
+	 * @inode: an inode with the S_VERITY flag set
+	 * @buf: buffer in which to place the verity descriptor
+	 * @bufsize: size of @buf, or 0 to retrieve the size only
+	 *
+	 * If bufsize == 0, then the size of the verity descriptor is returned.
+	 * Otherwise the verity descriptor is written to 'buf' and its actual
+	 * size is returned; -ERANGE is returned if it's too large.  This may be
+	 * called by multiple processes concurrently on the same inode.
+	 *
+	 * Return: the size on success, -errno on failure
+	 */
+	int (*get_verity_descriptor)(struct inode *inode, void *buf,
+				     size_t bufsize);
+};
+
+#ifdef CONFIG_FS_VERITY
+
+static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
+{
+	/* pairs with the cmpxchg_release() in fsverity_set_info() */
+	return READ_ONCE(inode->i_verity_info);
+}
+
+/* open.c */
+
+extern int fsverity_file_open(struct inode *inode, struct file *filp);
+extern void fsverity_cleanup_inode(struct inode *inode);
+
+#else /* !CONFIG_FS_VERITY */
+
+static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
+{
+	return NULL;
+}
+
+/* open.c */
+
+static inline int fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	return IS_VERITY(inode) ? -EOPNOTSUPP : 0;
+}
+
+static inline void fsverity_cleanup_inode(struct inode *inode)
+{
+}
+
+#endif	/* !CONFIG_FS_VERITY */
+
+#endif	/* _LINUX_FSVERITY_H */
-- 
2.21.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox