From: ezemtsov@google.com
To: linux-fsdevel@vger.kernel.org
Cc: tytso@mit.edu, Eugene Zemtsov <ezemtsov@google.com>
Subject: [PATCH 2/6] incfs: Backing file format
Date: Wed, 1 May 2019 21:03:27 -0700 [thread overview]
Message-ID: <20190502040331.81196-3-ezemtsov@google.com> (raw)
In-Reply-To: <20190502040331.81196-1-ezemtsov@google.com>
From: Eugene Zemtsov <ezemtsov@google.com>
- Read and write logic for ondisk backing file format (aka incfs image)
- Add format.c and format.h
- Utils in internal.h
Signed-off-by: Eugene Zemtsov <ezemtsov@google.com>
---
fs/incfs/Makefile | 2 +-
fs/incfs/format.c | 554 ++++++++++++++++++++++++++++++++++++++++++++
fs/incfs/format.h | 294 +++++++++++++++++++++++
fs/incfs/internal.h | 31 +++
4 files changed, 880 insertions(+), 1 deletion(-)
create mode 100644 fs/incfs/format.c
create mode 100644 fs/incfs/format.h
create mode 100644 fs/incfs/internal.h
diff --git a/fs/incfs/Makefile b/fs/incfs/Makefile
index 7892196c634f..cdea18c7213e 100644
--- a/fs/incfs/Makefile
+++ b/fs/incfs/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_INCREMENTAL_FS) += incrementalfs.o
-incrementalfs-y := main.o vfs.o
\ No newline at end of file
+incrementalfs-y := main.o vfs.o format.o
diff --git a/fs/incfs/format.c b/fs/incfs/format.c
new file mode 100644
index 000000000000..a0e6ecec09d3
--- /dev/null
+++ b/fs/incfs/format.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+
+#include "format.h"
+
+struct backing_file_context *incfs_alloc_bfc(struct file *backing_file)
+{
+ struct backing_file_context *result = NULL;
+
+ result = kzalloc(sizeof(*result), GFP_NOFS);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+
+ result->bc_file = backing_file;
+ mutex_init(&result->bc_mutex);
+ return result;
+}
+
+void incfs_free_bfc(struct backing_file_context *bfc)
+{
+ if (!bfc)
+ return;
+
+ if (bfc->bc_file)
+ fput(bfc->bc_file);
+
+ mutex_destroy(&bfc->bc_mutex);
+ kfree(bfc);
+}
+
+loff_t incfs_get_end_offset(struct file *f)
+{
+ /*
+ * This function assumes that file size and the end-offset
+ * are the same. This is not always true.
+ */
+ return i_size_read(file_inode(f));
+}
+
+/*
+ * Truncate the tail of the file to the given length.
+ * Used to rollback partially successful multistep writes.
+ */
+static int truncate_backing_file(struct backing_file_context *bfc,
+ loff_t new_end)
+{
+ struct inode *inode = NULL;
+ struct dentry *dentry = NULL;
+ loff_t old_end = 0;
+ struct iattr attr;
+ int result = 0;
+
+ if (!bfc)
+ return -EFAULT;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ if (!bfc->bc_file)
+ return -EFAULT;
+
+ old_end = incfs_get_end_offset(bfc->bc_file);
+ if (old_end == new_end)
+ return 0;
+ if (old_end < new_end)
+ return -EINVAL;
+
+ inode = bfc->bc_file->f_inode;
+ dentry = bfc->bc_file->f_path.dentry;
+
+ attr.ia_size = new_end;
+ attr.ia_valid = ATTR_SIZE;
+
+ inode_lock(inode);
+ result = notify_change(dentry, &attr, NULL);
+ inode_unlock(inode);
+
+ return result;
+}
+
+/* Append a given number of zero bytes to the end of the backing file. */
+static int append_zeros(struct backing_file_context *bfc, size_t len)
+{
+ loff_t file_size = 0;
+ loff_t new_last_byte_offset = 0;
+ int res = 0;
+
+ if (!bfc)
+ return -EFAULT;
+
+ if (len == 0)
+ return -EINVAL;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ /*
+ * Allocate only one byte at the new desired end of the file.
+ * It will increase file size and create a zeroed area of
+ * a given size.
+ */
+ file_size = incfs_get_end_offset(bfc->bc_file);
+ new_last_byte_offset = file_size + len - 1;
+ res = vfs_fallocate(bfc->bc_file, 0, new_last_byte_offset, 1);
+ if (res)
+ return res;
+
+ res = vfs_fsync_range(bfc->bc_file, file_size, file_size + len, 1);
+ return res;
+}
+
+static int write_to_bf(struct backing_file_context *bfc, const void *buf,
+ size_t count, loff_t pos, bool sync)
+{
+ ssize_t res = 0;
+ loff_t p = pos;
+
+ res = kernel_write(bfc->bc_file, buf, count, &p);
+ if (res < 0)
+ return res;
+ if (res != count)
+ return -EIO;
+
+ if (sync)
+ return vfs_fsync_range(bfc->bc_file, pos, pos + count, 1);
+
+ return 0;
+}
+
+static u32 calc_md_crc(struct incfs_md_header *record)
+{
+ u32 result = 0;
+ __le32 saved_crc = record->h_record_crc;
+ __le64 saved_md_offset = record->h_next_md_offset;
+ size_t record_size = min_t(size_t, le16_to_cpu(record->h_record_size),
+ INCFS_MAX_METADATA_RECORD_SIZE);
+
+ /* Zero fields which needs to be excluded from CRC calculation. */
+ record->h_record_crc = 0;
+ record->h_next_md_offset = 0;
+ result = crc32(0, record, record_size);
+
+ /* Restore excluded fields. */
+ record->h_record_crc = saved_crc;
+ record->h_next_md_offset = saved_md_offset;
+
+ return result;
+}
+
+/*
+ * Append a given metadata record to the backing file and update a previous
+ * record to add the new record the the metadata list.
+ */
+static int append_md_to_backing_file(struct backing_file_context *bfc,
+ struct incfs_md_header *record)
+{
+ int result = 0;
+ loff_t record_offset;
+ loff_t file_pos;
+ __le64 new_md_offset;
+ size_t record_size;
+
+ if (!bfc || !record)
+ return -EFAULT;
+
+ if (bfc->bc_last_md_record_offset < 0)
+ return -EINVAL;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ record_size = le16_to_cpu(record->h_record_size);
+ file_pos = incfs_get_end_offset(bfc->bc_file);
+ record->h_prev_md_offset = bfc->bc_last_md_record_offset;
+ record->h_next_md_offset = 0;
+ record->h_record_crc = cpu_to_le32(calc_md_crc(record));
+
+ /* Write the metadata record to the end of the backing file */
+ record_offset = file_pos;
+ new_md_offset = cpu_to_le64(record_offset);
+ result = write_to_bf(bfc, record, record_size, file_pos, true);
+ if (result)
+ return result;
+
+ /* Update next metadata offset in a previous record or a superblock. */
+ if (bfc->bc_last_md_record_offset) {
+ /*
+ * Find a place in the previous md record where new record's
+ * offset needs to be saved.
+ */
+ file_pos = bfc->bc_last_md_record_offset +
+ offsetof(struct incfs_md_header, h_next_md_offset);
+ } else {
+ /* No metadata yet, file a place to update in the superblock. */
+ file_pos = offsetof(struct incfs_super_block,
+ s_first_md_offset);
+ }
+ result = write_to_bf(bfc, &new_md_offset, sizeof(new_md_offset),
+ file_pos, true);
+ if (result)
+ return result;
+
+ bfc->bc_last_md_record_offset = record_offset;
+ return result;
+}
+
+/* Append incfs_inode metadata record to the backing file. */
+int incfs_write_inode_to_backing_file(struct backing_file_context *bfc, u64 ino,
+ u64 size, u16 mode)
+{
+ struct incfs_inode disk_inode = {};
+
+ if (!bfc)
+ return -EFAULT;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+ disk_inode.i_header.h_md_entry_type = INCFS_MD_INODE;
+ disk_inode.i_header.h_record_size = cpu_to_le16(sizeof(disk_inode));
+ disk_inode.i_header.h_next_md_offset = cpu_to_le64(0);
+ disk_inode.i_no = cpu_to_le64(ino);
+ disk_inode.i_size = cpu_to_le64(size);
+ disk_inode.i_mode = cpu_to_le16(mode);
+ disk_inode.i_flags = cpu_to_le32(0);
+
+ return append_md_to_backing_file(bfc, &disk_inode.i_header);
+}
+
+/* Append incfs_dir_action metadata record to the backing file. */
+int incfs_write_dir_action(struct backing_file_context *bfc, u64 dir_ino,
+ u64 dentry_ino, enum incfs_dir_action_type type,
+ struct mem_range name)
+{
+ struct incfs_dir_action action = {};
+ u8 name_len = min_t(u8, INCFS_MAX_NAME_LEN, name.len);
+
+ if (!bfc)
+ return -EFAULT;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+ action.da_header.h_md_entry_type = INCFS_MD_DIR_ACTION;
+ action.da_header.h_record_size = cpu_to_le16(sizeof(action));
+ action.da_header.h_next_md_offset = cpu_to_le64(0);
+ action.da_dir_inode = cpu_to_le64(dir_ino);
+ action.da_entry_inode = cpu_to_le64(dentry_ino);
+ action.da_type = (__u8)type;
+ action.da_name_len = name_len;
+ memcpy(action.da_name, name.data, name_len);
+
+ return append_md_to_backing_file(bfc, &action.da_header);
+}
+
+/*
+ * Reserve 0-filled space for the blockmap body, and append
+ * incfs_blockmap metadata record pointing to it.
+ */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+ u64 ino, u32 block_count, loff_t *map_base_off)
+{
+ struct incfs_blockmap blockmap = {};
+ int result = 0;
+ loff_t file_end = 0;
+ size_t map_size = block_count * sizeof(struct incfs_blockmap_entry);
+
+ if (!bfc)
+ return -EFAULT;
+
+ blockmap.m_header.h_md_entry_type = INCFS_MD_BLOCK_MAP;
+ blockmap.m_header.h_record_size = cpu_to_le16(sizeof(blockmap));
+ blockmap.m_header.h_next_md_offset = cpu_to_le64(0);
+ blockmap.m_inode = cpu_to_le64(ino);
+ blockmap.m_block_count = cpu_to_le32(block_count);
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ /* Reserve 0-filled space for the blockmap body in the backing file. */
+ file_end = incfs_get_end_offset(bfc->bc_file);
+ result = append_zeros(bfc, map_size);
+ if (result)
+ return result;
+
+ /* Write blockmap metadata record pointing to the body written above. */
+ blockmap.m_base_offset = cpu_to_le64(file_end);
+ result = append_md_to_backing_file(bfc, &blockmap.m_header);
+ if (result) {
+ /* Error, rollback file changes */
+ truncate_backing_file(bfc, file_end);
+ } else if (map_base_off) {
+ *map_base_off = file_end;
+ }
+
+ return result;
+}
+
+/*
+ * Write a backing file header (superblock).
+ * It should always be called only on empty file.
+ * incfs_super_block.s_first_md_offset is 0 for now, but will be updated
+ * once first metadata record is added.
+ */
+int incfs_write_sb_to_backing_file(struct backing_file_context *bfc)
+{
+ struct incfs_super_block sb = {};
+ loff_t file_pos = 0;
+
+ if (!bfc)
+ return -EFAULT;
+
+ sb.s_magic = cpu_to_le64(INCFS_MAGIC_NUMBER);
+ sb.s_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER);
+ sb.s_super_block_size = cpu_to_le16(sizeof(sb));
+ sb.s_first_md_offset = cpu_to_le64(0);
+ sb.s_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ file_pos = incfs_get_end_offset(bfc->bc_file);
+ if (file_pos != 0)
+ return -EEXIST;
+
+ return write_to_bf(bfc, &sb, sizeof(sb), file_pos, true);
+}
+
+/* Write a given data block and update file's blockmap to point it. */
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block, int block_index,
+ loff_t bm_base_off, u16 flags, u32 crc)
+{
+ struct incfs_blockmap_entry bm_entry = {};
+ int result = 0;
+ loff_t data_offset = 0;
+ loff_t bm_entry_off =
+ bm_base_off + sizeof(struct incfs_blockmap_entry) * block_index;
+
+ if (!bfc)
+ return -EFAULT;
+
+ if (block.len >= (1 << 16) || block_index < 0)
+ return -EINVAL;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ data_offset = incfs_get_end_offset(bfc->bc_file);
+ if (data_offset <= bm_entry_off) {
+ /* Blockmap entry is beyond the file's end. It is not normal. */
+ return -EINVAL;
+ }
+
+ /* Write the block data at the end of the backing file. */
+ result = write_to_bf(bfc, block.data, block.len, data_offset, false);
+ if (result)
+ return result;
+
+ /* Update the blockmap to point to the newly written data. */
+ bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset);
+ bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32));
+ bm_entry.me_data_size = cpu_to_le16((u16)block.len);
+ bm_entry.me_flags = cpu_to_le16(flags);
+ bm_entry.me_data_crc = cpu_to_le32(crc);
+
+ result = write_to_bf(bfc, &bm_entry, sizeof(bm_entry),
+ bm_entry_off, false);
+
+ return result;
+}
+
+/* Initialize a new image in a given backing file. */
+int incfs_make_empty_backing_file(struct backing_file_context *bfc)
+{
+ int result = 0;
+
+ if (!bfc || !bfc->bc_file)
+ return -EFAULT;
+
+ result = mutex_lock_interruptible(&bfc->bc_mutex);
+ if (result)
+ goto out;
+
+ result = truncate_backing_file(bfc, 0);
+ if (result)
+ goto out;
+
+ result = incfs_write_sb_to_backing_file(bfc);
+out:
+ mutex_unlock(&bfc->bc_mutex);
+ return result;
+}
+
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+ loff_t bm_base_off,
+ struct incfs_blockmap_entry *bm_entry)
+{
+ loff_t bm_entry_off =
+ bm_base_off + sizeof(struct incfs_blockmap_entry) * block_index;
+ const size_t bytes_to_read = sizeof(struct incfs_blockmap_entry);
+ int result = 0;
+
+ if (!bfc || !bm_entry)
+ return -EFAULT;
+
+ if (block_index < 0 || bm_base_off <= 0)
+ return -ENODATA;
+
+ result = kernel_read(bfc->bc_file, bm_entry, bytes_to_read,
+ &bm_entry_off);
+ if (result < 0)
+ return result;
+ if (result < bytes_to_read)
+ return -EIO;
+ return 0;
+}
+
+int incfs_read_superblock(struct backing_file_context *bfc,
+ loff_t *first_md_off)
+{
+ loff_t pos = 0;
+ ssize_t bytes_read = 0;
+ struct incfs_super_block sb = {};
+
+ if (!bfc || !first_md_off)
+ return -EFAULT;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+ bytes_read = kernel_read(bfc->bc_file, &sb, sizeof(sb), &pos);
+ if (bytes_read < 0)
+ return bytes_read;
+
+ if (bytes_read < sizeof(sb))
+ return -EBADMSG;
+
+ if (le64_to_cpu(sb.s_magic) != INCFS_MAGIC_NUMBER)
+ return -EILSEQ;
+
+ if (le64_to_cpu(sb.s_version) > INCFS_FORMAT_CURRENT_VER)
+ return -EILSEQ;
+
+ if (le16_to_cpu(sb.s_data_block_size) != INCFS_DATA_FILE_BLOCK_SIZE)
+ return -EILSEQ;
+
+ if (le16_to_cpu(sb.s_super_block_size) > sizeof(sb))
+ return -EILSEQ;
+
+ *first_md_off = le64_to_cpu(sb.s_first_md_offset);
+ return 0;
+}
+
+/*
+ * Read through metadata records from the backing file one by one
+ * and call provided metadata handlers.
+ */
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+ struct metadata_handler *handler)
+{
+ loff_t pos = 0;
+ const ssize_t max_md_size = INCFS_MAX_METADATA_RECORD_SIZE;
+ ssize_t bytes_read = 0;
+ size_t md_record_size = 0;
+ loff_t next_record = 0;
+ loff_t prev_record = 0;
+ int res = 0;
+ struct incfs_md_header *md_hdr = NULL;
+
+ if (!bfc || !handler)
+ return -EFAULT;
+
+ LOCK_REQUIRED(bfc->bc_mutex);
+
+ if (handler->md_record_offset == 0)
+ return -EPERM;
+
+ memset(&handler->md_buffer, 0, max_md_size);
+ pos = handler->md_record_offset;
+ bytes_read = kernel_read(bfc->bc_file, (u8 *)&handler->md_buffer,
+ max_md_size, &pos);
+ if (bytes_read < 0)
+ return bytes_read;
+ if (bytes_read < sizeof(*md_hdr))
+ return -EBADMSG;
+
+ md_hdr = &handler->md_buffer.md_header;
+ next_record = le64_to_cpu(md_hdr->h_next_md_offset);
+ prev_record = le64_to_cpu(md_hdr->h_prev_md_offset);
+ md_record_size = le16_to_cpu(md_hdr->h_record_size);
+
+ if (md_record_size > max_md_size) {
+ pr_warn("incfs: The record is too large. Size: %ld",
+ md_record_size);
+ return -EBADMSG;
+ }
+
+ if (bytes_read < md_record_size) {
+ pr_warn("incfs: The record hasn't been fully read.");
+ return -EBADMSG;
+ }
+
+ if (next_record <= handler->md_record_offset && next_record != 0) {
+ pr_warn("incfs: Next record (%lld) points back in file.",
+ next_record);
+ return -EBADMSG;
+ }
+
+ if (prev_record != handler->md_prev_record_offset) {
+ pr_warn("incfs: Metadata chain has been corrupted.");
+ return -EBADMSG;
+ }
+
+ if (le32_to_cpu(md_hdr->h_record_crc) != calc_md_crc(md_hdr)) {
+ pr_warn("incfs: Metadata CRC mismatch.");
+ return -EBADMSG;
+ }
+
+ switch (md_hdr->h_md_entry_type) {
+ case INCFS_MD_NONE:
+ break;
+ case INCFS_MD_INODE:
+ if (handler->handle_inode)
+ res = handler->handle_inode(&handler->md_buffer.inode,
+ handler);
+ break;
+ case INCFS_MD_BLOCK_MAP:
+ if (handler->handle_blockmap)
+ res = handler->handle_blockmap(
+ &handler->md_buffer.blockmap, handler);
+ break;
+ case INCFS_MD_DIR_ACTION:
+ if (handler->handle_dir_action)
+ res = handler->handle_dir_action(
+ &handler->md_buffer.dir_action, handler);
+ break;
+ default:
+ res = -ENOTSUPP;
+ break;
+ }
+
+ if (!res) {
+ if (next_record == 0) {
+ /*
+ * Zero offset for the next record means that the last
+ * metadata record has just been processed.
+ */
+ bfc->bc_last_md_record_offset =
+ handler->md_record_offset;
+ }
+ handler->md_prev_record_offset = handler->md_record_offset;
+ handler->md_record_offset = next_record;
+ }
+ return res;
+}
diff --git a/fs/incfs/format.h b/fs/incfs/format.h
new file mode 100644
index 000000000000..2c2114bdd08f
--- /dev/null
+++ b/fs/incfs/format.h
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Overview
+ * --------
+ * The backbone of the incremental-fs ondisk format is an append only linked
+ * list of metadata blocks. Each metadata block contains an offset of the next
+ * one. These blocks describe files and directories on the
+ * file system. They also represent actions of adding and removing file names
+ * (hard links).
+ *
+ * Every time incremental-fs instance is mounted, it reads through this list
+ * to recreate filesystem's state in memory. An offset of the first record in
+ * the metadata list is stored in the superblock at the beginning of the backing
+ * file.
+ *
+ * Most of the backing file is taken by data areas and blockmaps.
+ * Since data blocks can be compressed and have different sizes,
+ * single per-file data area can't be pre-allocated. That's why blockmaps are
+ * needed in order to find a location and size of each data block in
+ * the backing file. Each time a file is created, a corresponding block map is
+ * allocated to store future offsets of data blocks.
+ *
+ * Whenever a data block is given by data loader to incremental-fs:
+ * - A data area with the given block is appended to the end of
+ * the backing file.
+ * - A record in the blockmap for the given block index is updated to reflect
+ * its location, size, and compression algorithm.
+
+ * Metadata records
+ * ----------------
+ * incfs_inode - metadata record to declare a file or a directory.
+ * incfs_inode.i_mode determents if it is a file
+ * or a directory.
+ * incfs_blockmap_entry - metadata record that specifies size and location
+ * of a blockmap area for a given file. This area
+ * contains an array of incfs_blockmap_entry-s.
+ * incfs_dir_action - metadata record that specifies changes made to a
+ * to a directory structure, e.g. add or remove a hardlink.
+ *
+ * Metadata header
+ * ---------------
+ * incfs_md_header - header of a metadata record. It's always a part
+ * of other structures and served purpose of metadata
+ * bookkeeping.
+ *
+ * +-----------------------------------------------+ ^
+ * | incfs_md_header | |
+ * | 1. type of body(INODE, BLOCKMAP, DIR ACTION..)| |
+ * | 2. size of the whole record header + body | |
+ * | 3. CRC the whole record header + body | |
+ * | 4. offset of the previous md record |]------+
+ * | 5. offset of the next md record (md link) |]---+
+ * +-----------------------------------------------+ |
+ * | Metadata record body with useful data | |
+ * +-----------------------------------------------+ |
+ * +--->
+ *
+ * Other ondisk structures
+ * -----------------------
+ * incfs_super_block - backing file header
+ * incfs_blockmap_entry - a record in a blockmap area that describes size
+ * and location of a data block.
+ * Data blocks dont have any particular structure, they are written to the
+ * backing file in a raw form as they come from a data loader.
+ *
+ * Backing file layout
+ * -------------------
+ *
+ *
+ * +-------------------------------------------+
+ * | incfs_super_block |]---+
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_inode |]---+
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ | metadata
+ * +------->| blockmap area | | list links
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | +--[| [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | +-------------------------------------------+ |
+ * | | ......................... |
+ * | | +-------------------------------------------+ |
+ * | | | metadata |<---+
+ * +----|--[| incfs_blockmap |]---+
+ * | +-------------------------------------------+ |
+ * | ......................... |
+ * | +-------------------------------------------+ |
+ * +-->| data block | |
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_dir_action |
+ * +-------------------------------------------+
+ */
+#ifndef _INCFS_FORMAT_H
+#define _INCFS_FORMAT_H
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+
+#define INCFS_MAX_NAME_LEN 255
+#define INCFS_FORMAT_V1 1
+#define INCFS_FORMAT_CURRENT_VER INCFS_FORMAT_V1
+
+enum incfs_metadata_type {
+ INCFS_MD_NONE = 0,
+ INCFS_MD_INODE = 1,
+ INCFS_MD_BLOCK_MAP = 2,
+ INCFS_MD_DIR_ACTION = 3
+};
+
+/* Header included at the beginning of all metadata records on the disk. */
+struct incfs_md_header {
+ __u8 h_md_entry_type;
+
+ /*
+ * Size of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le16 h_record_size;
+
+ /*
+ * CRC32 of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le32 h_record_crc;
+
+ /* Offset of the next metadata entry if any */
+ __le64 h_next_md_offset;
+
+ /* Offset of the previous metadata entry if any */
+ __le64 h_prev_md_offset;
+
+} __packed;
+
+/* Backing file header */
+struct incfs_super_block {
+ __le64 s_magic; /* Magic signature: INCFS_MAGIC_NUMBER */
+ __le64 s_version; /* Format version: INCFS_FORMAT_CURRENT_VER */
+ __le16 s_super_block_size; /* sizeof(incfs_super_block) */
+ __le32 s_flags; /* Reserved for future use. */
+ __le64 s_first_md_offset; /* Offset of the first metadata record */
+ __le16 s_data_block_size; /* INCFS_DATA_FILE_BLOCK_SIZE */
+} __packed;
+
+/* Metadata record for files and directories. Type = INCFS_MD_INODE */
+struct incfs_inode {
+ struct incfs_md_header i_header;
+ __le64 i_no; /* inode number */
+ __le64 i_size; /* Full size of the file's content */
+ __le16 i_mode; /* File mode */
+ __le32 i_flags; /* Reserved for future use. */
+} __packed;
+
+enum incfs_block_map_entry_flags {
+ INCFS_BLOCK_COMPRESSED_LZ4 = (1 << 0),
+};
+
+/* Block map entry pointing to an actual location of the data block. */
+struct incfs_blockmap_entry {
+ /* Offset of the actual data block. Lower 32 bits */
+ __le32 me_data_offset_lo;
+
+ /* Offset of the actual data block. Higher 16 bits */
+ __le16 me_data_offset_hi;
+
+ /* How many bytes the data actually occupies in the backing file */
+ __le16 me_data_size;
+
+ /* Block flags from incfs_block_map_entry_flags */
+ __u16 me_flags;
+
+ /* CRC32 of the block's data */
+ __le32 me_data_crc;
+} __packed;
+
+/* Metadata record for locations of file blocks. Type = INCFS_MD_BLOCK_MAP */
+struct incfs_blockmap {
+ struct incfs_md_header m_header;
+ /* inode of a file this map belongs to */
+ __le64 m_inode;
+
+ /* Base offset of the array of incfs_blockmap_entry */
+ __le64 m_base_offset;
+
+ /* Size of the map entry array in blocks */
+ __le32 m_block_count;
+} __packed;
+
+enum incfs_dir_action_type {
+ INCFS_DIRA_NONE = 0,
+ INCFS_DIRA_ADD_ENTRY = 1,
+ INCFS_DIRA_REMOVE_ENTRY = 2,
+};
+
+/* Metadata record of directory content change. Type = INCFS_MD_DIR_ACTION */
+struct incfs_dir_action {
+ struct incfs_md_header da_header;
+ __le64 da_dir_inode; /* Parent directory inode number */
+ __le64 da_entry_inode; /* File/subdirectory inode number */
+ __u8 da_type; /* One of enums incfs_dir_action_type */
+ __u8 da_name_len; /* Name length */
+ char da_name[INCFS_MAX_NAME_LEN]; /* File name */
+} __packed;
+
+/* State of the backing file. */
+struct backing_file_context {
+ /* Protects writes to bc_file */
+ struct mutex bc_mutex;
+
+ /* File object to read data from */
+ struct file *bc_file;
+
+ /*
+ * Offset of the last known metadata record in the backing file.
+ * 0 means there are no metadata records.
+ */
+ loff_t bc_last_md_record_offset;
+};
+
+struct metadata_handler {
+ loff_t md_record_offset;
+ loff_t md_prev_record_offset;
+ void *context;
+
+ union {
+ struct incfs_md_header md_header;
+ struct incfs_inode inode;
+ struct incfs_blockmap blockmap;
+ struct incfs_dir_action dir_action;
+ } md_buffer;
+
+ int (*handle_inode)(struct incfs_inode *inode,
+ struct metadata_handler *handler);
+ int (*handle_blockmap)(struct incfs_blockmap *bm,
+ struct metadata_handler *handler);
+ int (*handle_dir_action)(struct incfs_dir_action *da,
+ struct metadata_handler *handler);
+};
+#define INCFS_MAX_METADATA_RECORD_SIZE \
+ FIELD_SIZEOF(struct metadata_handler, md_buffer)
+
+loff_t incfs_get_end_offset(struct file *f);
+
+/* Backing file context management */
+struct backing_file_context *incfs_alloc_bfc(struct file *backing_file);
+
+void incfs_free_bfc(struct backing_file_context *bfc);
+
+/* Writing stuff */
+int incfs_write_inode_to_backing_file(struct backing_file_context *bfc, u64 ino,
+ u64 size, u16 mode);
+
+int incfs_write_dir_action(struct backing_file_context *bfc, u64 dir_ino,
+ u64 dentry_ino, enum incfs_dir_action_type type,
+ struct mem_range name);
+
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+ u64 ino, u32 block_count,
+ loff_t *map_base_off);
+
+int incfs_write_sb_to_backing_file(struct backing_file_context *bfc);
+
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block,
+ int block_index, loff_t bm_base_off,
+ u16 flags, u32 crc);
+
+int incfs_make_empty_backing_file(struct backing_file_context *bfc);
+
+/* Reading stuff */
+int incfs_read_superblock(struct backing_file_context *bfc,
+ loff_t *first_md_off);
+
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+ loff_t bm_base_off,
+ struct incfs_blockmap_entry *bm_entry);
+
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+ struct metadata_handler *handler);
+
+#endif /* _INCFS_FORMAT_H */
diff --git a/fs/incfs/internal.h b/fs/incfs/internal.h
new file mode 100644
index 000000000000..de8b6240e347
--- /dev/null
+++ b/fs/incfs/internal.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+#ifndef _INCFS_INTERNAL_H
+#define _INCFS_INTERNAL_H
+#include <linux/types.h>
+
+struct mem_range {
+ u8 *data;
+ size_t len;
+};
+
+static inline struct mem_range range(u8 *data, size_t len)
+{
+ return (struct mem_range){ .data = data, .len = len };
+}
+
+#ifdef DEBUG
+#define LOCK_REQUIRED(lock) \
+ do { \
+ if (!mutex_is_locked(&(lock))) { \
+ pr_err(#lock " must be taken"); \
+ panic("Lock not taken."); \
+ } \
+ } while (0)
+#else
+#define LOCK_REQUIRED(lock)
+#endif
+
+#endif /* _INCFS_INTERNAL_H */
--
2.21.0.593.g511ec345e18-goog
next prev parent reply other threads:[~2019-05-02 4:03 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-05-02 4:03 Initial patches for Incremental FS ezemtsov
2019-05-02 4:03 ` [PATCH 1/6] incfs: Add first files of incrementalfs ezemtsov
2019-05-02 19:06 ` Miklos Szeredi
2019-05-02 20:41 ` Randy Dunlap
2019-05-07 15:57 ` Jann Horn
2019-05-07 17:13 ` Greg KH
2019-05-07 17:18 ` Greg KH
2019-05-02 4:03 ` ezemtsov [this message]
2019-05-02 4:03 ` [PATCH 3/6] incfs: Management of in-memory FS data structures ezemtsov
2019-05-02 4:03 ` [PATCH 4/6] incfs: Integration with VFS layer ezemtsov
2019-05-02 4:03 ` [PATCH 6/6] incfs: Integration tests for incremental-fs ezemtsov
2019-05-02 11:19 ` Initial patches for Incremental FS Amir Goldstein
2019-05-02 13:10 ` Theodore Ts'o
2019-05-02 13:26 ` Al Viro
2019-05-03 4:23 ` Eugene Zemtsov
2019-05-03 5:19 ` Amir Goldstein
2019-05-08 20:09 ` Eugene Zemtsov
2019-05-09 8:15 ` Amir Goldstein
[not found] ` <CAK8JDrEQnXTcCtAPkb+S4r4hORiKh_yX=0A0A=LYSVKUo_n4OA@mail.gmail.com>
2019-05-21 1:32 ` Yurii Zubrytskyi
2019-05-22 8:32 ` Miklos Szeredi
2019-05-22 17:25 ` Yurii Zubrytskyi
2019-05-23 4:25 ` Miklos Szeredi
2019-05-29 21:06 ` Yurii Zubrytskyi
2019-05-30 9:22 ` Miklos Szeredi
2019-05-30 22:45 ` Yurii Zubrytskyi
2019-05-31 9:02 ` Miklos Szeredi
2019-05-22 10:54 ` Amir Goldstein
2019-05-03 7:23 ` Richard Weinberger
2019-05-03 10:22 ` Miklos Szeredi
2019-05-02 13:46 ` Amir Goldstein
2019-05-02 18:16 ` Richard Weinberger
2019-05-02 18:33 ` Richard Weinberger
2019-05-02 13:47 ` J. R. Okajima
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190502040331.81196-3-ezemtsov@google.com \
--to=ezemtsov@google.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.