All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH 9/9]ext4: online defrag command
@ 2008-10-24 10:10 Akira Fujita
  0 siblings, 0 replies; only message in thread
From: Akira Fujita @ 2008-10-24 10:10 UTC (permalink / raw)
  To: linux-ext4, Theodore Tso, Mingming Cao; +Cc: linux-fsdevel

ext4: online defrag -- Online defrag command

From: Akira Fujita <a-fujita@rs.jp.nec.com>

- The defrag command. Usage is as follows:
  - Put the multiple files closer together.
    # e4defrag -r directory-name
  - Defrag for free space fragmentation.
    # e4defrag -f file-name
  - Defrag for a single file.
    # e4defrag file-name
  - Defrag for all files on ext4.
    # e4defrag device-name

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
---
/*
 * e4defrag.c - ext4 filesystem defragmenter
 *
 * Copyright (C) 2008 NEC Software Tohoku, Ltd.
 *
 * Author: Akira Fujita	<a-fujita@rs.jp.nec.com>
 *         Takashi Sato	<t-sato@yk.jp.nec.com>
 */

#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
#endif

#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif

#define _XOPEN_SOURCE	500
#define _GNU_SOURCE
#include <ftw.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <dirent.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/statfs.h>
#include <sys/vfs.h>
#include <sys/ioctl.h>
#include <mntent.h>
#include <linux/fs.h>
#include <ctype.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <ext2fs/ext2_types.h>
#include <endian.h>

/* Ioctl command */
#define EXT4_IOC_DEFRAG		_IOW('f', 15, struct ext4_ext_defrag_data)
#define EXT4_IOC_SUPER_INFO	_IOR('f', 16, struct ext4_super_block)
#define EXT4_IOC_FREE_BLOCKS_INFO _IOWR('f', 17, struct ext4_extents_info)
#define EXT4_IOC_FIEMAP_INO	_IOWR('f', 18, struct fiemap_ino)
#define EXT4_IOC_MOVE_VICTIM	_IOW('f', 19, struct ext4_extents_info)
#define FS_IOC_FIEMAP		_IOWR('f', 11, struct fiemap)

/* Macro functions */
#define PRINT_ERR_MSG(msg)	fprintf(stderr, "%s\n", (msg));
#define IN_FTW_PRINT_ERR_MSG(msg)	\
	fprintf(stderr, "\t%s\t\t[ NG ]\n", (msg));
#define PRINT_FILE_NAME(file)	fprintf(stderr, " \"%s\"\n", (file));
#define PRINT_ERR_MSG_WITH_ERRNO(msg)	\
	fprintf(stderr, "\t%s:%s\t[ NG ]\n", (msg), strerror(errno));
#define min(x, y) (((x) > (y)) ? (y) : (x))
/* Wrap up the free function */
#define FREE(tmp)				\
	do {					\
		if (tmp)			\
			free(tmp);		\
	} while (0)				\
/* Insert list2 after list1 */
#define insert(list1, list2)			\
	do {					\
		list2->next = list1->next;	\
		list1->next->prev = list2;	\
		list2->prev = list1;		\
		list1->next = list2;		\
	} while (0)

#ifndef __NR_fadvise64
#define __NR_fadvise64		250
#endif

#ifndef __NR_sync_file_range
#define __NR_sync_file_range	314
#endif

#ifndef POSIX_FADV_DONTNEED
#if defined(__s390x__)
#define POSIX_FADV_DONTNEED	6 /* Don't need these pages */
#else
#define POSIX_FADV_DONTNEED	4 /* Don't need these pages */
#endif
#endif

#ifndef SYNC_FILE_RANGE_WAIT_BEFORE
#define SYNC_FILE_RANGE_WAIT_BEFORE	1
#endif
#ifndef SYNC_FILE_RANGE_WRITE
#define SYNC_FILE_RANGE_WRITE		2
#endif
#ifndef SYNC_FILE_RANGE_WAIT_AFTER
#define SYNC_FILE_RANGE_WAIT_AFTER	4
#endif

#define DEVNAME			0
#define DIRNAME			1
#define FILENAME		2

#define RETURN_OK		0
#define RETURN_NG		-1
#define FTW_CONT		0
#define FTW_OPEN_FD		2000

#define FS_EXT4			"ext4"
#define ROOT_UID		0
#define CHECK_FRAG_COUNT	1

/* Extent status which are used in extent_t */
#define EXT4_EXT_USE		0
#define EXT4_EXT_FREE		1

/* The maximum number of extents for exchanging between
 * kernel-space and user-space per ioctl
 */
#define DEFRAG_MAX_ENT	32

/* The phase of force defrag mode */
#define DEFRAG_FORCE_TRY	1
#define DEFRAG_FORCE_GATHER	3

/* Magic number for ext4 */
#define EXT4_SUPER_MAGIC	0xEF53

/* Defrag size(64MB) per ioctl(not including force mode) */
#define DEFRAG_SIZE		((unsigned long)1 << 26)

/* Force defrag mode: Max file size in bytes (128MB) */
#define	MAX_FILE_SIZE		((unsigned long)1 << 27)


/* The following four macros are used for ioctl FS_IOC_FIEMAP
 * FIEMAP_FLAG_SYNC:	sync file data before map.
 * FIEMAP_EXTENT_LAST:	last extent in file.
 * FIEMAP_MAX_OFFSET:	max file offset.
 * EXTENT_MAX_COUNT:	the maximum number of extents for exchanging between
			kernel-space and user-space per ioctl
 */
#define FIEMAP_FLAG_SYNC	0x00000001
#define FIEMAP_EXTENT_LAST	0x00000001
#define FIEMAP_MAX_OFFSET	(~0ULL)
#define EXTENT_MAX_COUNT	512

/* The following macros are error message */
#define MSG_USAGE		\
"Usage : e4defrag [-v] file...| directory...| device...\n\
      : e4defrag -f file [blocknr] \n\
      : e4defrag -r directory... | device... \n"

#define MSG_R_OPTION		" with regional block allocation mode.\n"
#define NGMSG_MTAB		"Can not access /etc/mtab"
#define NGMSG_UNMOUNT		"FS is not mounted"
#define NGMSG_EXT4		"FS is not ext4 File System"
#define NGMSG_FS_INFO		"Get FSInfo fail"
#define NGMSG_FILE_INFO		"Get FileInfo fail"
#define NGMSG_FILE_OPEN		"Open fail"
#define NGMSG_FILE_SYNC		"Sync(fsync) fail"
#define NGMSG_FILE_DEFRAG	"Defrag fail"
#define NGMSG_FILE_BLOCKSIZE	"Can't get blocksize"
#define NGMSG_FILE_SUPER	"Can't get super block info"
#define NGMSG_FILE_UNREG	"File is not regular file"
#define NGMSG_VICTIM_UID	"Victim file is not current user's file"

#define NGMSG_FILE_LARGE	\
	"Defrag size is larger than FileSystem's free space"

#define NGMSG_FILE_PRIORITY	\
"File is not current user's file or current user is not root"

#define NGMSG_FILE_LOCK		"File is locked"
#define NGMSG_FILE_BLANK	"File size is 0"
#define NGMSG_GET_LCKINFO	"Get LockInfo fail"
#define NGMSG_TYPE		\
	"Can not process %s in regional mode.\n"
#define NGMSG_LOST_FOUND	"Can not process \"lost+found\""
#define NGMSG_REALPATH		"Can not get full path"
#define NGMSG_FILE_MAP		"Get file map fail"
#define NGMSG_FILE_DROP_BUFFER	"Free page fail"
#define NGMSG_FADVISE_SYSCALL	"\tFadvise fail"
#define NGMSG_FORCE_DEFRAG_SIZE	\
"Cannot specify a file larger than %luMB in force defrag mode"
#define NGMSG_ENDIAN		"Endian type is not big/little endian"

/* Data type for filesystem-wide blocks number */
typedef unsigned long long ext4_fsblk_t;

/* Data type for file logical block number */
typedef unsigned int ext4_lblk_t;

/* Data type for block offset of block group */
typedef int ext4_grpblk_t;

struct ext4_extent_data {
	ext4_lblk_t  block;		/* start logical block number */
	ext4_fsblk_t start;		/* start physical block number */
	int len;			/* blocks count */
};

/* Used for defrag */
struct ext4_ext_defrag_data {
	ext4_lblk_t start_offset;	/* start offset to defrag in blocks */
	ext4_lblk_t defrag_size;	/* size of defrag in blocks */
	ext4_fsblk_t goal;		/* block offset for allocation */
	int flag;			/* free space mode flag */
	struct ext4_extent_data ext;
};

typedef __u16 __le16;
typedef __u32 __le32;
typedef __u64 __le64;

/*
 * Structure of the super block
 */
struct ext4_super_block {
/*00*/	__le32	s_inodes_count;		/* Inodes count */
	__le32	s_blocks_count_lo;	/* Blocks count */
	__le32	s_r_blocks_count_lo;	/* Reserved blocks count */
	__le32	s_free_blocks_count_lo;	/* Free blocks count */
/*10*/	__le32	s_free_inodes_count;	/* Free inodes count */
	__le32	s_first_data_block;	/* First Data Block */
	__le32	s_log_block_size;	/* Block size */
	__le32	s_obso_log_frag_size;	/* Obsoleted fragment size */
/*20*/	__le32	s_blocks_per_group;	/* # Blocks per group */
	__le32	s_obso_frags_per_group;	/* Obsoleted fragments per group */
	__le32	s_inodes_per_group;	/* # Inodes per group */
	__le32	s_mtime;		/* Mount time */
/*30*/	__le32	s_wtime;		/* Write time */
	__le16	s_mnt_count;		/* Mount count */
	__le16	s_max_mnt_count;	/* Maximal mount count */
	__le16	s_magic;		/* Magic signature */
	__le16	s_state;		/* File system state */
	__le16	s_errors;		/* Behaviour when detecting errors */
	__le16	s_minor_rev_level;	/* minor revision level */
/*40*/	__le32	s_lastcheck;		/* time of last check */
	__le32	s_checkinterval;	/* max. time between checks */
	__le32	s_creator_os;		/* OS */
	__le32	s_rev_level;		/* Revision level */
/*50*/	__le16	s_def_resuid;		/* Default uid for reserved blocks */
	__le16	s_def_resgid;		/* Default gid for reserved blocks */
	/*
	 * These fields are for EXT4_DYNAMIC_REV superblocks only.
	 *
	 * Note: the difference between the compatible feature set and
	 * the incompatible feature set is that if there is a bit set
	 * in the incompatible feature set that the kernel doesn't
	 * know about, it should refuse to mount the filesystem.
	 *
	 * e2fsck's requirements are more strict; if it doesn't know
	 * about a feature in either the compatible or incompatible
	 * feature set, it must abort and not try to meddle with
	 * things it doesn't understand...
	 */
	__le32	s_first_ino;		/* First non-reserved inode */
	__le16  s_inode_size;		/* size of inode structure */
	__le16	s_block_group_nr;	/* block group # of this superblock */
	__le32	s_feature_compat;	/* compatible feature set */
/*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
/*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
/*78*/	char	s_volume_name[16];	/* volume name */
/*88*/	char	s_last_mounted[64];	/* directory where last mounted */
/*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
	/*
	 * Performance hints.  Directory preallocation should only
	 * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
	 */
	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
	__le16	s_reserved_gdt_blocks;	/* Per group desc for online growth */
	/*
	 * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
	 */
/*D0*/	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
/*E0*/	__le32	s_journal_inum;		/* inode number of journal file */
	__le32	s_journal_dev;		/* device number of journal file */
	__le32	s_last_orphan;		/* start of list of inodes to delete */
	__le32	s_hash_seed[4];		/* HTREE hash seed */
	__u8	s_def_hash_version;	/* Default hash version to use */
	__u8	s_reserved_char_pad;
	__le16  s_desc_size;		/* size of group descriptor */
/*100*/	__le32	s_default_mount_opts;
	__le32	s_first_meta_bg;	/* First metablock block group */
	__le32	s_mkfs_time;		/* When the filesystem was created */
	__le32	s_jnl_blocks[17];	/* Backup of the journal inode */
	/* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
/*150*/	__le32	s_blocks_count_hi;	/* Blocks count */
	__le32	s_r_blocks_count_hi;	/* Reserved blocks count */
	__le32	s_free_blocks_count_hi;	/* Free blocks count */
	__le16	s_min_extra_isize;	/* All inodes have at least # bytes */
	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
	__le32	s_flags;		/* Miscellaneous flags */
	__le16  s_raid_stride;		/* RAID stride */
	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
	__le64  s_mmp_block;            /* Block for multi-mount protection */
	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
	__u8	s_reserved_char_pad2;
	__le16  s_reserved_pad;
	__u32   s_reserved[162];        /* Padding to the end of the block */
};

struct ext4_extents_info {
	unsigned long long ino;		/* inode number */
	int max_entries;		/* maximum extents count */
	int entries; 	 		/* extent number/count */
	ext4_lblk_t  f_offset;		/* file offset */
	ext4_grpblk_t g_offset;		/* group offset */
	ext4_fsblk_t goal;
	struct ext4_extent_data ext[DEFRAG_MAX_ENT];
};

typedef struct extent {
	struct extent *prev;
	unsigned long tag;		/* extent status */
	unsigned long ino;		/* file's inode number */
	struct ext4_extent_data data;	/* extent belong to file */
	struct extent *next;
} extent_t;

typedef struct exts_group {
	struct exts_group *prev;
	extent_t *start;		/* start ext */
	extent_t *end;			/* end ext */
	int len;			/* length of this continuous region */
	struct exts_group *next;
} exts_group_t;

struct fiemap_extent {
	__u64 fe_logical;	/* logical offset in bytes for the start of
				* the extent from the beginning of the file */
	__u64 fe_physical;	/* physical offset in bytes for the start
				* of the extent from the beginning
				* of the disk */
	__u64 fe_length;	/* length in bytes for this extent */
	__u64 fe_reserved64[2];
	__u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
	__u32 fe_reserved[3];
};

struct fiemap {
	__u64 fm_start;		/* logical offset (inclusive) at
				* which to start mapping (in) */
	__u64 fm_length;	/* logical length of mapping which
				* userspace wants (in) */
	__u32 fm_flags;		/* FIEMAP_FLAG_* flags for request (in/out) */
	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
	__u32 fm_extent_count;	/* size of fm_extents array (in) */
	__u32 fm_reserved;
	struct fiemap_extent fm_extents[0];/* array of mapped extents (out) */
};

struct fiemap_ino {
	__u64 ino;
	struct fiemap fiemap;
};

int	force_flag;
int	detail_flag;
int	regional_flag;
int	extents_before_defrag;
int	extents_after_defrag;
unsigned long	succeed_cnt;
unsigned long	regular_count;
unsigned long	total_count;
unsigned long	frag_files_before_defrag;
unsigned long	frag_files_after_defrag;
unsigned long	defraged_file_count;
char	lost_found_dir[PATH_MAX + 1];
ext4_fsblk_t	goal;
ext4_fsblk_t	fgoal = -1;

/*
 * ext2fs_swab32() -	Change endian.
 *
 * @val:		the entry used for change.
 */
__u32 ext2fs_swab32(__u32 val)
{
#if BYTE_ORDER == BIG_ENDIAN
	return (val>>24) | ((val>>8)&0xFF00) |
		((val<<8)&0xFF0000) | (val<<24);
#else
	return val;
#endif
}

/*
 * fadvise() -		Give advice about file access.
 *
 * @fd:			the file's descriptor.
 * @offset:		file offset.
 * @len:		area length.
 * @advise:		process flag.
 */
int fadvise(int fd, loff_t offset, size_t len, int advise)
{
	return syscall(__NR_fadvise64, fd, offset, len, advise);
}

/*
 * sync_file_range() -	Sync file region.
 *
 * @fd:			the file's descriptor.
 * @offset:		file offset.
 * @length:		area length.
 * @flag:		process flag.
 */
int sync_file_range(int fd, loff_t offset, loff_t length, unsigned int flag)
{
	return syscall(__NR_sync_file_range, fd, offset, length, flag);
}

/*
 * page_in_core() -	Get information on whether pages are in core.
 *
 * @fd:			the file's descriptor.
 * @defrag_data:	data used for defrag.
 * @vec:		page state array.
 * @page_num:		page number.
 */
int page_in_core(int fd, struct ext4_ext_defrag_data defrag_data,
		 unsigned char **vec, unsigned long *page_num)
{
	int blocksize;
	int pagesize = getpagesize();
	void *page = NULL;
	loff_t offset, end_offset, length;

	if (vec == NULL || *vec != NULL)
		return RETURN_NG;

	if (ioctl(fd, FIGETBSZ, &blocksize) < 0)
		return RETURN_NG;

	/* In mmap, offset should be a multiple of the page size */
	offset = defrag_data.start_offset * blocksize;
	length = defrag_data.defrag_size * blocksize;
	end_offset = offset + length;
	/* Round the offset down to the nearest multiple of pagesize */
	offset = (offset / pagesize) * pagesize;
	length = end_offset - offset;

	page = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, offset);
	if (page == MAP_FAILED)
		return RETURN_NG;

	*page_num = 0;
	*page_num = (length + pagesize - 1) / pagesize;
	*vec = (unsigned char *)calloc(*page_num, 1);
	if (*vec == NULL)
		return RETURN_NG;

	/* Get information on whether pages are in core */
	if (mincore(page, (size_t)length, *vec) == -1 ||
	    munmap(page, length) == -1) {
		FREE(*vec);
		return RETURN_NG;
	}

	return RETURN_OK;
}

/*
 * defrag_fadvise() -	Predeclare an access pattern for file data.
 *
 * @fd:			the file's descriptor.
 * @defrag_data:	data used for defrag.
 * @vec:		page state array.
 * @page_num:		page number.
 */
int defrag_fadvise(int fd, struct ext4_ext_defrag_data defrag_data,
		   unsigned char *vec, unsigned long page_num)
{
	int flag = 1;
	int blocksize;
	int pagesize = getpagesize();
	int fadvise_flag = POSIX_FADV_DONTNEED;
	int sync_flag = SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE|
			SYNC_FILE_RANGE_WAIT_AFTER;
	unsigned long i;
	loff_t offset;

	if (ioctl(fd, FIGETBSZ, &blocksize) < 0)
		return RETURN_NG;

	offset = (loff_t)defrag_data.start_offset * blocksize;
	offset = (offset / pagesize) * pagesize;

	/* Sync file for fadvise process */
	if (sync_file_range(fd, offset,
		(loff_t)pagesize*page_num, sync_flag) != 0)
		return RETURN_NG;

	/* Try to release buffer cache which this process used,
	 * then other process can use the released buffer
	 */
	for (i = 0; i < page_num; i++) {
		if ((vec[i] & 0x1) == 0) {
			offset += pagesize;
			continue;
		}
		if (fadvise(fd, offset, pagesize, fadvise_flag) != 0) {
			if (detail_flag && flag) {
				perror(NGMSG_FADVISE_SYSCALL);
				flag = 0;
			}
		}
		offset += pagesize;
	}

	return RETURN_OK;
}

/*
 * check_free_size() -	Check if there's enough disk space.
 *
 * @fd:			the file's descriptor.
 * @file_name		file name.
 * @buf:		the pointer of the struct stat64.
 */
int check_free_size(int fd, const char *file_name, const struct stat64 *buf)
{
	off64_t	size = 0;
	off64_t	free_size = 0;
	struct statfs	fsbuf;

	/* Target file size */
	size = buf->st_size;

	if (fstatfs(fd, &fsbuf) < 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file_name);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FS_INFO);
		}
		return RETURN_NG;
	}

	/* Compute free space for root and normal user separately */
	if (getuid() == ROOT_UID)
		free_size = (off64_t)fsbuf.f_bsize * fsbuf.f_bfree;
	else
		free_size = (off64_t)fsbuf.f_bsize * fsbuf.f_bavail;

	if (free_size >= size)
		return RETURN_OK;

	if (detail_flag) {
		PRINT_FILE_NAME(file_name);
		IN_FTW_PRINT_ERR_MSG(NGMSG_FILE_LARGE);
	}
	return RETURN_NG;
}

int file_check(int fd, const struct stat64 *buf, const char *file_name);
int force_defrag(int fd, const char *file_name,
		const struct stat64 *buf, int blocksize);

/*
 * file_frag_count() -  Get file fragment count.
 *
 * @fd:			the file's descriptor.
 */
long long file_frag_count(int fd)
{
	int ret = RETURN_NG;
	struct fiemap file_extent_map;

	/* When fm_extent_count is 0,
	 * ioctl just get file fragment count.
	 */
	memset(&file_extent_map, 0, sizeof(struct fiemap));
	file_extent_map.fm_start = 0;
	file_extent_map.fm_length = FIEMAP_MAX_OFFSET;
	file_extent_map.fm_flags |= FIEMAP_FLAG_SYNC;

	ret = ioctl(fd, FS_IOC_FIEMAP, &file_extent_map);
	if (ret < 0)
		return RETURN_NG;

	return file_extent_map.fm_mapped_extents;
}

/*
 * ftw_fn() -           Check file attributes and ioctl call to avoid.
 * 			illegal operations.
 *
 * @file:		the file's name.
 * @buf:		the pointer of the struct stat64.
 * @flag:		file type.
 * @ftwbuf:		the pointer of a struct FTW.
 */
int ftw_fn(const char *file, const struct stat64 *buf, int flag,
	   struct FTW *ftwbuf)
{
	int	fd;
	int	blocksize;
	int	percent = 0;
	int	defrag_fail = 0;
	int	defraged_size = 0;
	int	ret = RETURN_NG;
	long long	file_frags_start, file_frags_end;
	unsigned long	page_num;
	unsigned char	*vec = NULL;
	loff_t	start = 0;
	struct ext4_ext_defrag_data	df_data;

	defraged_file_count++;

	if (detail_flag) {
		printf("[%lu/%lu]", defraged_file_count, total_count);
		fflush(stdout);
	}

	if (lost_found_dir[0] != '\0' &&
	    !memcmp(file, lost_found_dir, strnlen(lost_found_dir, PATH_MAX))) {
		if (detail_flag) {
			PRINT_FILE_NAME(file);
			IN_FTW_PRINT_ERR_MSG(NGMSG_LOST_FOUND);
		}
		return FTW_CONT;
	}

	if (flag != FTW_F) {
		if (detail_flag) {
			PRINT_FILE_NAME(file);
			IN_FTW_PRINT_ERR_MSG(NGMSG_FILE_UNREG);
		}
		return FTW_CONT;
	}

	fd = open64(file, O_RDONLY);
	if (fd < 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_OPEN);
		}
		return FTW_CONT;
	}

	if (file_check(fd, buf, file) == RETURN_NG)
		goto out;

	if (fsync(fd) < 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_SYNC);
		}
		goto out;
	}
	/* Get blocksize */
	if (ioctl(fd, FIGETBSZ, &blocksize) < 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_BLOCKSIZE);
		}
		goto out;
	}
	/* Ioctl call does the actual defragment job */
	df_data.start_offset = 0;
	df_data.goal = goal;
	df_data.ext.len = 0;
	df_data.flag = force_flag;

	/* Count file fragments before defrag */
	if (detail_flag) {
		file_frags_start = file_frag_count(fd);
		if (file_frags_start == RETURN_NG) {
			PRINT_FILE_NAME(file);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_INFO);
			goto out;
		}

		if (file_frags_start != 1)
			frag_files_before_defrag++;

		extents_before_defrag += file_frags_start;
	}

	/* Print process progress */
	percent = (start * 100) / buf->st_size;
	printf("\033[79;0H\033[K[%lu/%lu]%s:\t%3d%%",
		defraged_file_count, total_count, file, min(percent, 100));
	fflush(stdout);

	while (1) {
		df_data.defrag_size = (min((buf->st_size - start),
			DEFRAG_SIZE) + blocksize - 1) / blocksize;
		ret = page_in_core(fd, df_data, &vec, &page_num);
		if (ret == RETURN_NG) {
			if (detail_flag) {
				printf("\n");
				PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_MAP);
			} else {
				printf("\t[ NG ]\n");
			}
			defrag_fail = 1;
			break;
		}

		/* EXT4_IOC_DEFRAG */
		defraged_size = ioctl(fd, EXT4_IOC_DEFRAG, &df_data);

		/* Free pages */
		ret = defrag_fadvise(fd, df_data, vec, page_num);
		if (vec) {
			free(vec);
			vec = NULL;
		}
		if (ret == RETURN_NG) {
			if (detail_flag) {
				printf("\n");
				PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_DROP_BUFFER
					);
			} else {
				printf("\t[ NG ]\n");
			}
			defrag_fail = 1;
			break;
		}

		/* Go into force defrag mode */
		if (defraged_size < 0 && force_flag == DEFRAG_FORCE_TRY
					&& errno == ENOSPC) {
			/* File size is larger than max size of
			 * force defrag mode
			 */
			if (buf->st_size > MAX_FILE_SIZE) {
				if (detail_flag) {
					fprintf(stderr, "\n\t");
					fprintf(stderr, NGMSG_FORCE_DEFRAG_SIZE,
						(MAX_FILE_SIZE) / (1024 * 1024)
						);
					fprintf(stderr, "\t[ NG ]\n");
				} else {
					printf("\t[ NG ]\n");
				}
				defrag_fail = 1;
				break;
			}

			printf("\033[79;0H\033[K");
			defraged_size = force_defrag(fd, file, buf, blocksize);
			if (defraged_size * blocksize >= buf->st_size)
				/* Whole file is enforcedly defraged */
				break;
			else {
				if (!detail_flag) {
					printf("\033[79;0H\033[K[%lu/%lu]%s\t"
						"0%%\t[ NG ]\n",
						defraged_file_count,
						total_count, file);
				}
				defrag_fail = 1;
				break;
			}
		}
		if (defraged_size < 0) {
			if (detail_flag) {
				printf("\n");
				PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_DEFRAG);
			} else {
				printf("\t[ NG ]\n");
			}
			defrag_fail = 1;
			break;
		}
		df_data.start_offset += defraged_size;
		start = (long long)df_data.start_offset * blocksize;

		/* Print process progress */
		percent = (start * 100) / buf->st_size;

		/* Disk space file used is bigger than logical size */
		printf("\033[79;0H\033[K[%lu/%lu]%s:\t%3d%%",
			defraged_file_count, total_count, file,
				min(percent, 100));
		fflush(stdout);

		/* End of file */
		if (start >= buf->st_size)
			break;
	}

	/* Count file fragments after defrag and print extents info */
	if (detail_flag) {
		file_frags_end = file_frag_count(fd);
		if (file_frags_end == RETURN_NG) {
			printf("\n");
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_INFO);
			goto out;
		}

		if (file_frags_end != 1)
			frag_files_after_defrag++;

		extents_after_defrag += file_frags_end;

		if (defrag_fail)
			goto out;

		printf("  extents: %lld -> %lld",
			file_frags_start, file_frags_end);
		fflush(stdout);
	}

	if (defrag_fail)
		goto out;

	printf("\t[ OK ]\n");
	fflush(stdout);
	succeed_cnt++;

out:
	close(fd);
	return FTW_CONT;
}

/*
 * file_check() -       Check file's attributes.
 *
 * @fd:			the file's descriptor.
 * @buf:		a pointer of the struct stat64.
 * @file_name:		the file's name.
 */
int file_check(int fd, const struct stat64 *buf, const char *file_name)
{
	struct flock	lock;

	/* Write-lock check is more reliable */
	lock.l_type = F_WRLCK;
	lock.l_start = 0;
	lock.l_whence = SEEK_SET;
	lock.l_len = 0;

	/* Regular file */
	if (S_ISREG(buf->st_mode) == 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file_name);
			IN_FTW_PRINT_ERR_MSG(NGMSG_FILE_UNREG);
		}
		return RETURN_NG;
	}

	/* Free space */
	if (check_free_size(fd, file_name, buf) == RETURN_NG)
		return RETURN_NG;

	/* Priority */
	if (getuid() != ROOT_UID &&
		buf->st_uid != getuid()) {
		if (detail_flag) {
			PRINT_FILE_NAME(file_name);
			IN_FTW_PRINT_ERR_MSG(NGMSG_FILE_PRIORITY);
		}
		return RETURN_NG;
	}

	/* Lock status */
	if (fcntl(fd, F_GETLK, &lock) < 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file_name);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_GET_LCKINFO);
		}
		return RETURN_NG;
	} else if (lock.l_type != F_UNLCK) {
		if (detail_flag) {
			PRINT_FILE_NAME(file_name);
			IN_FTW_PRINT_ERR_MSG(NGMSG_FILE_LOCK);
		}
		return RETURN_NG;
	}

	/* Empty file */
	if (buf->st_size == 0) {
		if (detail_flag) {
			PRINT_FILE_NAME(file_name);
			IN_FTW_PRINT_ERR_MSG(NGMSG_FILE_BLANK);
		}
		return RETURN_NG;
	}

	return RETURN_OK;
}

/*
 * is_ext4() -		Whether on an ext4 filesystem.
 *
 * @filename:		the file's name.
 */
int is_ext4(const char *filename)
{
	int 	maxlen, len, ret;
	FILE	*fp = NULL;
	char	*mnt_type = NULL;
	/* Refer to /etc/mtab */
	char	*mtab = MOUNTED;
	char	file_path[PATH_MAX + 1];
	struct mntent	*mnt = NULL;
	struct statfs	buffs;

	/* Get full path */
	if (realpath(filename, file_path) == NULL) {
		perror(NGMSG_REALPATH);
		PRINT_FILE_NAME(filename);
		return RETURN_NG;
	}

	if (statfs(file_path, &buffs) < 0) {
		perror(NGMSG_FS_INFO);
		PRINT_FILE_NAME(filename);
		return RETURN_NG;
	}

	if (buffs.f_type != EXT4_SUPER_MAGIC) {
		PRINT_ERR_MSG(NGMSG_EXT4);
		return RETURN_NG;
	}

	fp = setmntent(mtab, "r");
	if (fp == NULL) {
		perror(NGMSG_MTAB);
		return RETURN_NG;
	}

	maxlen = 0;
	while ((mnt = getmntent(fp)) != NULL) {
		len = strlen(mnt->mnt_dir);
		ret = memcmp(file_path, mnt->mnt_dir, len);
		if (ret != 0)
			continue;

		if (maxlen >= len)
			continue;

		maxlen = len;

		mnt_type = realloc(mnt_type, strlen(mnt->mnt_type) + 1);
		if (!mnt_type) {
			endmntent(fp);
			return RETURN_NG;
		}
		strcpy(mnt_type, mnt->mnt_type);
		strncpy(lost_found_dir, mnt->mnt_dir, PATH_MAX);
	}

	endmntent(fp);
	if (strcmp(mnt_type, FS_EXT4) == 0) {
		FREE(mnt_type);
		return RETURN_OK;
	} else {
		FREE(mnt_type);
		PRINT_ERR_MSG(NGMSG_EXT4);
		return RETURN_NG;
	}
}

/*
 * calc_entry_counts -	calculate file counts
 *
 * @file:		file name.
 * @buf:		file info.
 * @flag:		file type.
 * @ftwbuf:		the pointer of a struct FTW.
 */
int calc_entry_counts(const char *file, const struct stat64 *buf, int flag,
			   struct FTW *ftwbuf)
{

	if (flag == FTW_F)
		regular_count++;

	total_count++;

	return FTW_CONT;
}

/*
 * get_mount_point() -	Get device's mount point.
 *
 * @devname:		the device's name.
 * @mount_point:	the mount point.
 * @dir_path_len:	the length of directory.
 */
int get_mount_point(const char *devname, char *mount_point,
		int dir_path_len)
{
	/* Refer to /etc/mtab */
	char	*mtab = MOUNTED;
	FILE	*fp = NULL;
	struct mntent	*mnt = NULL;

	fp = setmntent(mtab, "r");
	if (fp == NULL) {
		perror(NGMSG_MTAB);
		return RETURN_NG;
	}

	while ((mnt = getmntent(fp)) != NULL) {
		if (strcmp(devname, mnt->mnt_fsname) != 0)
			continue;

		endmntent(fp);
		if (strcmp(mnt->mnt_type, FS_EXT4) == 0) {
			strncpy(mount_point, mnt->mnt_dir,
				dir_path_len);
			return RETURN_OK;
		}
		PRINT_ERR_MSG(NGMSG_EXT4);
		return RETURN_NG;
	}
	endmntent(fp);
	PRINT_ERR_MSG(NGMSG_UNMOUNT);
	return RETURN_NG;
}

/*
 * main() -		ext4 online defrag.
 *
 * @argc:		the number of parameter.
 * @argv[]:		the pointer array of parameter.
 */
int main(int argc, char *argv[])
{
	int	fd;
	int	opt;
	int	i, flags;
	int	arg_type;
	int	detail_tmp;
	int	success_flag;
	char	dir_name[PATH_MAX + 1];
	struct stat64	buf;

	i = 1;
	flags = 0;
	arg_type = -1;
	detail_tmp = -1;
	success_flag = 0;
	/* Do not follow symlink */
	flags |= FTW_PHYS;
	/* Stay within the same filesystem */
	flags |= FTW_MOUNT;

	/* Parse arguments */
	if (argc == 1 || (argc == 2 && argv[1][0] == '-'))
		goto out;

	while ((opt = getopt(argc, argv, "rvf")) != EOF) {
		switch (opt) {
		case 'r':
			regional_flag = 1;
			i = 2;
			break;
		case 'v':
			detail_flag = 1;
			i = 2;
			break;
		case 'f':
			force_flag = DEFRAG_FORCE_TRY;
			i = 2;

			if (argc == 3)
				break;

			if (argc > 4) {
				printf("Illegal argument\n");
				goto out;
			}

			/* Now the agrc must be 4(e4defrag -f file blocknr) */
			int res_strlen;
			res_strlen = strlen(argv[3]);
			for (res_strlen -= 1; res_strlen >= 0; res_strlen--) {
				if (!isdigit(argv[3][res_strlen])) {
					printf("Illegal argument\n");
					goto out;
				}
			}

			fgoal = strtoul(argv[3], NULL, 0);
			if (errno) {
				printf("block num shold be < 4294967295"
					"(max num of unsigned long 32bit)\n");
				exit(1);
			}
			if (!fgoal)
				fgoal = -1;
			break;
		default:
			goto out;
		}
	}

	/* Main process */
	for (; i < argc; i++) {
		succeed_cnt = 0;
		regular_count = 0;
		total_count = 0;
		frag_files_before_defrag = 0;
		frag_files_after_defrag = 0;
		extents_before_defrag = 0;
		extents_after_defrag = 0;
		defraged_file_count = 0;

		memset(dir_name, 0, PATH_MAX + 1);
		memset(lost_found_dir, 0, PATH_MAX + 1);

		if (force_flag && i == 3)
			break;

		#if BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN
			PRINT_ERR_MSG(NGMSG_ENDIAN);
			PRINT_FILE_NAME(argv[i]);
			continue;
		#endif

		if (lstat64(argv[i], &buf) < 0) {
			perror(NGMSG_FILE_INFO);
			PRINT_FILE_NAME(argv[i]);
			continue;
		}

		/* Only regular file is acceptalbe with force defrag mode */
		if (force_flag && !S_ISREG(buf.st_mode)) {
			printf("Inappropriate file type\n");
			goto out;
		}

		if (S_ISBLK(buf.st_mode)) {
			/* Block device */
			if (get_mount_point(argv[i], dir_name, PATH_MAX)
							== RETURN_NG)
				continue;
			arg_type = DEVNAME;
			printf("ext4 defragmentation for device(%s)\n",
				argv[i]);
		} else if (S_ISDIR(buf.st_mode)) {
			/* Directory */
			if (access(argv[i], R_OK) < 0) {
				perror(argv[i]);
				continue;
			}
			arg_type = DIRNAME;
			strcpy(dir_name, argv[i]);
		} else if (S_ISREG(buf.st_mode)) {
			/* Regular file */
			arg_type = FILENAME;
		} else {
			/* Irregular file */
			PRINT_ERR_MSG(NGMSG_FILE_UNREG);
			PRINT_FILE_NAME(argv[i]);
			continue;
		}

		/* For device case,
		 * filesystem type checked in get_mount_point()
		 */
		if (arg_type == FILENAME || arg_type == DIRNAME) {
			if (is_ext4(argv[i]) == RETURN_NG)
				continue;
			if (realpath(argv[i], dir_name) == NULL) {
				perror(NGMSG_REALPATH);
				PRINT_FILE_NAME(argv[i]);
				continue;
			}
		}

		switch (arg_type) {
		case DIRNAME:
			printf("ext4 defragmentation for directory(%s)\n",
				argv[i]);

			int mount_dir_len = 0;
			mount_dir_len = strnlen(lost_found_dir, PATH_MAX);

			strncat(lost_found_dir, "/lost+found",
				PATH_MAX - strnlen(lost_found_dir, PATH_MAX));

			/* Not the case("e4defrag  mount_piont_dir") */
			if (dir_name[mount_dir_len] != '\0') {
				/* "e4defrag mount_piont_dir/lost+found"
				 * or "e4defrag mount_piont_dir/lost+found/"
				 */
				if (strncmp(lost_found_dir, dir_name,
					    strnlen(lost_found_dir,
						    PATH_MAX)) == 0 &&
				    (dir_name[strnlen(lost_found_dir,
						      PATH_MAX)] == '\0' ||
				     dir_name[strnlen(lost_found_dir,
						      PATH_MAX)] == '/')) {
					PRINT_ERR_MSG(NGMSG_LOST_FOUND);
					PRINT_FILE_NAME(argv[i]);
					continue;
				}

				/* "e4defrag mount_piont_dir/else_dir" */
				memset(lost_found_dir, 0, PATH_MAX + 1);
			}
		case DEVNAME:
			if (arg_type == DEVNAME) {
				strncpy(lost_found_dir, dir_name,
					strnlen(dir_name, PATH_MAX));
				strncat(lost_found_dir, "/lost+found/",
					PATH_MAX - strnlen(lost_found_dir,
							   PATH_MAX));
			}

			/* Regional block allocation */
			if (regional_flag) {
				unsigned int bg_num = 0;
				int ret = 0;
				struct ext4_super_block sb;

				printf(MSG_R_OPTION);

				fd = open64(dir_name, O_RDONLY);
				if (fd < 0) {
					if (detail_flag) {
						perror(NGMSG_FILE_OPEN);
						PRINT_FILE_NAME(dir_name);
					}
					continue;
				}

				memset(&sb, 0, sizeof(struct ext4_super_block));

				ret = ioctl(fd, EXT4_IOC_SUPER_INFO, &sb);
				if (ret < 0) {
					perror(NGMSG_FILE_SUPER);
					PRINT_FILE_NAME(dir_name);
					close(fd);
					continue;
				}
				close(fd);

				sb.s_blocks_per_group =
					ext2fs_swab32(sb.s_blocks_per_group);
				sb.s_inodes_per_group =
					ext2fs_swab32(sb.s_inodes_per_group);
				sb.s_first_data_block =
					ext2fs_swab32(sb.s_first_data_block);

				bg_num = (buf.st_ino - 1) /
							sb.s_inodes_per_group;
				goal = bg_num * sb.s_blocks_per_group +
							sb.s_first_data_block;
			}
			nftw64(dir_name, calc_entry_counts, FTW_OPEN_FD, flags);

			/* File tree walk */
			nftw64(dir_name, ftw_fn, FTW_OPEN_FD, flags);
			printf("\n\tSuccess:\t\t\t[ %lu/%lu ]\n", succeed_cnt,
				total_count);
			printf("\tFailure:\t\t\t[ %lu/%lu ]\n",
				total_count - succeed_cnt, total_count);
			if (detail_flag) {
				printf("\tTotal extents:\t\t\t%4d->%d\n",
					extents_before_defrag,
					extents_after_defrag);
				printf("\tFragmented percentage:\t\t"
					"%3llu%%->%llu%%\n",
					!regular_count ? 0 :
					((unsigned long long)
					frag_files_before_defrag * 100) /
					regular_count,
					!regular_count ? 0 :
					((unsigned long long)
					frag_files_after_defrag * 100) /
					regular_count);
			}
			break;
		case FILENAME:
			total_count = 1;
			strncat(lost_found_dir, "/lost+found/",
				PATH_MAX - strnlen(lost_found_dir,
						   PATH_MAX));
			if (strncmp(lost_found_dir, dir_name,
				    strnlen(lost_found_dir,
					    PATH_MAX)) == 0) {
				PRINT_ERR_MSG(NGMSG_LOST_FOUND);
				PRINT_FILE_NAME(argv[i]);
				continue;
			}

			if (regional_flag) {
				fprintf(stderr, NGMSG_TYPE, argv[i]);
				continue;
			}
			printf("ext4 defragmentation for %s\n", argv[i]);
			/* Defrag single file process */
			ftw_fn(argv[i], &buf, FTW_F, NULL);
			if (succeed_cnt != 0)
				printf(" Success:\t\t\t[1/1]\n");
			else
				printf(" Success:\t\t\t[0/1]\n");

			break;
		}

		if (succeed_cnt != 0)
			success_flag = 1;
	}

	if (success_flag)
		return RETURN_OK;

	exit(1);

out:
	printf(MSG_USAGE);
	exit(1);
}

/*
 * insert_extent() -	Sequentially insert extent by physical block number.
 *
 * @extlist_head:	the head of an extent list.
 * @ext:		the extent element which will be inserted.
 */
int insert_extent(extent_t **extlist_head, extent_t *ext)
{
	extent_t	*ext_tmp = *extlist_head;

	if (ext == NULL)
		return RETURN_NG;

	/* First element */
	if (*extlist_head == NULL) {
		(*extlist_head) = ext;
		(*extlist_head)->prev = *extlist_head;
		(*extlist_head)->next = *extlist_head;
		return RETURN_OK;
	}

	if (ext->data.start <= ext_tmp->data.start) {
		/* Insert before head */
		if (ext_tmp->data.start < ext->data.start + ext->data.len)
			/* Overlap */
			return RETURN_NG;
		/* Adjust head */
		*extlist_head = ext;
	} else {
		/* Insert into the middle or last of the list */
		do {
			if (ext->data.start < ext_tmp->data.start)
				break;
			ext_tmp = ext_tmp->next;
		} while (ext_tmp != (*extlist_head));
		if (ext->data.start <
		    ext_tmp->prev->data.start + ext_tmp->prev->data.len)
			/* Overlap */
			return RETURN_NG;

		if (ext_tmp != *extlist_head &&
		    ext_tmp->data.start < ext->data.start + ext->data.len)
			/* Overlap */
			return RETURN_NG;
	}
	ext_tmp = ext_tmp->prev;
	/* Insert "ext" after "ext_tmp" */
	insert(ext_tmp, ext);
	return RETURN_OK;
}

/*
 * insert_exts_group() -	Insert a exts_group in decreasing order
 *				of length.
 *
 * @exts_group_list_head:	the head of a exts_group list.
 * @exts_group:			the exts_group element which will be inserted.
 */
int insert_exts_group(exts_group_t **exts_group_list_head,
		      exts_group_t *exts_group)
{
	exts_group_t	*exts_group_tmp = NULL;

	if (exts_group == NULL)
		return RETURN_NG;

	/* Initialize list */
	if (*exts_group_list_head == NULL) {
		(*exts_group_list_head) = exts_group;
		(*exts_group_list_head)->prev = *exts_group_list_head;
		(*exts_group_list_head)->next = *exts_group_list_head;
		return RETURN_OK;
	}

	if (exts_group->len >= (*exts_group_list_head)->len) {
		/* Insert before exts_group_list_head */
		exts_group_tmp = (*exts_group_list_head)->prev;
		insert(exts_group_tmp, exts_group);
		*exts_group_list_head = exts_group;
		return RETURN_OK;
	}

	/* Find insertion positon */
	for (exts_group_tmp = (*exts_group_list_head)->next;
	     exts_group_tmp != *exts_group_list_head;
	     exts_group_tmp = exts_group_tmp->next) {
		if (exts_group_tmp->len < exts_group->len)
			break;
	}
	exts_group_tmp = exts_group_tmp->prev;
	insert(exts_group_tmp, exts_group);

	return RETURN_OK;
}

/*
 * get_exts_group() -		Get element from the exts_group list.
 *
 * @exts_group_list_head:	the head of a exts_group list.
 * @exts_group:			the exts_group element which will be got.
 */
exts_group_t *get_exts_group(exts_group_t **exts_group_list_head,
			      exts_group_t *exts_group)
{
	if (exts_group == NULL || *exts_group_list_head == NULL)
		return NULL;

	/* Keep "exts_group_list_head" point to the largest extent group */
	if (exts_group == *exts_group_list_head)
		*exts_group_list_head = exts_group->next;

	if (*exts_group_list_head == (*exts_group_list_head)->next &&
	    exts_group == *exts_group_list_head)
		/* Delete the last element in the list */
		*exts_group_list_head = NULL;

	exts_group->prev->next = exts_group->next;
	exts_group->next->prev = exts_group->prev;
	return exts_group;
}

/*
 * free_exts_group() -		Free the exts_group.
 *
 * @*exts_group_list_head:	the exts_group list head which will be free.
 */
 void free_exts_group(exts_group_t *exts_group_list_head)
{
	exts_group_t *exts_group_tmp = NULL;

	if (exts_group_list_head == NULL)
		return;

	while (exts_group_list_head->next != exts_group_list_head) {
		exts_group_tmp = exts_group_list_head;
		exts_group_list_head->prev->next = exts_group_list_head->next;
		exts_group_list_head->next->prev = exts_group_list_head->prev;
		exts_group_list_head = exts_group_list_head->next;
		free(exts_group_tmp);
	}
	free(exts_group_list_head);
}

/*
 * free_ext() -		Free the extent list.
 *
 * @extent_list_head:	the extent list head of which will be free.
 */
void free_ext(extent_t *extent_list_head)
{
	extent_t *extent_tmp = NULL;

	if (extent_list_head == NULL)
		return;

	while (extent_list_head->next != extent_list_head) {
		extent_tmp = extent_list_head;
		extent_list_head->prev->next = extent_list_head->next;
		extent_list_head->next->prev = extent_list_head->prev;
		extent_list_head = extent_list_head->next;
		free(extent_tmp);
	}
	free(extent_list_head);
}

/*
 * do_defrag() -	Execute the defrag program in force mode.
 *
 * @fd:			the file's descriptor.
 * @exts_group:		the exts_group which will be defraged.
 * @defrag_data:	the data which will be defraged.
 */
int do_defrag(int fd, exts_group_t *exts_group,
	      struct ext4_ext_defrag_data defrag_data)
{
	int defraged_size = 0;
	int fadvise_ret = 0;
	unsigned long page_num;
	unsigned char *vec = NULL;

	/* Init defrag_data for defrag */
	defrag_data.ext.start = exts_group->start->data.start;
	defrag_data.ext.len = exts_group->len;
	defrag_data.ext.block = 0;
	defrag_data.defrag_size = exts_group->len;
	/* Move victim extents to make sufficient space */
	defrag_data.flag = DEFRAG_FORCE_GATHER;
	defrag_data.goal = exts_group->start->data.start;

	if (page_in_core(fd, defrag_data, &vec, &page_num) == RETURN_NG)
		return RETURN_NG;

	defraged_size = ioctl(fd, EXT4_IOC_DEFRAG, &defrag_data);

	/* Free pages */
	fadvise_ret = defrag_fadvise(fd, defrag_data, vec, page_num);
	FREE(vec);

	if (fadvise_ret == RETURN_NG || defraged_size < 0)
		return RETURN_NG;

	return defraged_size;
}

/*
 * get_used_extent() -	Get used extent in the block group.
 *
 * @fd:			the file's descriptor.
 * @ext_list_head:	the head of the extent list.
 * @istart:		start inode in the block group.
 * @iend:		end inode in the block group.
 * @bstart:		start block in the block group.
 * @bend:		end block in the block group.
 */
int get_used_extent(int fd, extent_t **ext_list_head,
		    unsigned long long istart, unsigned long long iend,
		    ext4_fsblk_t bstart, ext4_fsblk_t bend)
{
	int	ret = 0;
	int	blocksize;
	int	extents_buffer_size, fiemap_ino_size;
	__u64	pos = 0;
	__u64   group_start, group_end;
	unsigned long long	ino;
	struct	fiemap_extent *extents_buf = NULL;
	struct	fiemap_ino *fiemap_ino_buf = NULL;

	ret = ioctl(fd, FIGETBSZ, &blocksize);
	if (ret < 0) {
		if (detail_flag)
			perror(NGMSG_FILE_BLOCKSIZE);

		return RETURN_NG;
	}

	/* Convert units, in bytes.
	 * Be careful : now, physical block number in extent is 48bit,
	 * and the maximum blocksize for ext4 is 4K(12bit),
	 * so there is no overflow, but in future it may be changed.
	 */
	group_start = bstart * blocksize;
	group_end = bend * blocksize;

	/* Alloc space for fiemap_ino */
	fiemap_ino_size = sizeof(struct fiemap_ino);
	extents_buffer_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent);

	fiemap_ino_buf = malloc(fiemap_ino_size + extents_buffer_size);
	if (!fiemap_ino_buf)
		return RETURN_NG;

	extents_buf = fiemap_ino_buf->fiemap.fm_extents;
	memset(fiemap_ino_buf, 0, fiemap_ino_size);
	fiemap_ino_buf->fiemap.fm_length = FIEMAP_MAX_OFFSET;
	fiemap_ino_buf->fiemap.fm_flags |= FIEMAP_FLAG_SYNC;
	fiemap_ino_buf->fiemap.fm_extent_count = EXTENT_MAX_COUNT;

	for (ino = istart; ino <= iend; ino++) {
		fiemap_ino_buf->ino = ino;
		pos = 0;
		do {
			int i;
			fiemap_ino_buf->fiemap.fm_start = pos;
			memset(extents_buf, 0, extents_buffer_size);

			ret = ioctl(fd, EXT4_IOC_FIEMAP_INO, fiemap_ino_buf);
			if (ret < 0) {
				/* Skip non-regular file and unused inode */
				if (errno == EOPNOTSUPP || errno == ESTALE
					|| errno == ENOENT || errno == EACCES)
					continue;
				goto out;
			}

			for (i = 0; i < fiemap_ino_buf->
					fiemap.fm_mapped_extents; i++) {
				extent_t        *extent = NULL;

				/* Is this extent in current block group ? */
				if (extents_buf[i].fe_physical < group_start ||
					extents_buf[i].fe_physical > group_end)
					continue;

				extent = malloc(sizeof(extent_t));
				if (extent == NULL)
					goto out;

				extent->data.start = extents_buf[i].fe_physical
							/ blocksize;
				extent->data.block = extents_buf[i].fe_logical
							/ blocksize;
				extent->data.len = extents_buf[i].fe_length
							/ blocksize;
				extent->ino = ino;
				extent->tag = EXT4_EXT_USE;

				if (insert_extent(ext_list_head, extent) < 0) {
					FREE(extent);
					goto out;
				}
			}

			/* Record file's logical offset this time */
			pos = extents_buf[EXTENT_MAX_COUNT-1].fe_logical +
				extents_buf[EXTENT_MAX_COUNT-1].fe_length;
		/* If fm_extents array has been filled and
		 * there are extents left, continue to cycle.
		 */
		} while (fiemap_ino_buf->fiemap.fm_mapped_extents
						== EXTENT_MAX_COUNT &&
			!(extents_buf[EXTENT_MAX_COUNT-1].fe_flags
						& FIEMAP_EXTENT_LAST));
	}

	FREE(fiemap_ino_buf);
	return RETURN_OK;
out:
	FREE(fiemap_ino_buf);
	return RETURN_NG;
}

/*
 * get_free_extent() -	Get used extent in the block group.
 *
 * @fd:			the file's descriptor.
 * @ino:		inode number from struct stat64.
 * @blocks_per_group:	the block number of each block group.
 * @ext_list_head:	the head of the extent list.
 */
int get_free_extent(int fd, unsigned long long ino,
		    int blocks_per_group, extent_t **ext_list_head)
{
	ext4_grpblk_t	pos = 0;
	struct ext4_extents_info	extents_info;

	memset(&extents_info, 0, sizeof(struct ext4_extents_info));
	extents_info.ino = ino;
	extents_info.max_entries = DEFRAG_MAX_ENT;
	while (pos < blocks_per_group) {
		int	i;
		if (ioctl(fd, EXT4_IOC_FREE_BLOCKS_INFO, &extents_info) < 0)
			return RETURN_NG;

		for (i = 0;
		     extents_info.ext[i].len != 0 && i < DEFRAG_MAX_ENT; i++) {
			/* Alloc space for extent */
			extent_t	*extent = NULL;

			extent = malloc(sizeof(extent_t));
			if (extent == NULL)
				return RETURN_NG;
			memset(extent, 0, sizeof(extent_t));
			memcpy(&(extent->data), &(extents_info.ext[i]),
			       sizeof(struct ext4_extent_data));
			/* Free extent */
			extent->tag = EXT4_EXT_FREE;
			if (insert_extent(ext_list_head, extent) < 0) {
				FREE(extent);
				return RETURN_NG;
			}
		}

		/* No free extent after the logical block number "pos".
		 * In other word, offset this time equals to prev recursion.
		 */
		if (pos == extents_info.g_offset)
			break;
		if (i < DEFRAG_MAX_ENT)
			break;
		/* Record the offset of logical block number this time */
		pos = extents_info.g_offset;
		memset(extents_info.ext, 0,
		       sizeof(struct ext4_extent_data) * DEFRAG_MAX_ENT);
	}

	return RETURN_OK;
}

/*
 * join_extents() -		Find continuous region(exts_group).
 *
 * @ext_list_head:		the head of the extent list.
 * @target_exts_group_list_head:the head of the target exts_group list.
 * @exts_group_list_head:	the head of the original exts_group list.
 * @filesize:			the file's descriptor.
 * @max:			the max size of free space.
 */
int join_extents(extent_t *ext_list_head,
		 exts_group_t **target_exts_group_list_head,
		 exts_group_t **exts_group_list_head,
		 unsigned long filesize, int *max)
{
	int len;
	extent_t *ext_start, *extent_tmp;

	ext_start = ext_list_head;
	extent_tmp = ext_list_head;
	*max = 0;
	len = ext_list_head->data.len;
	extent_tmp = extent_tmp->next;
	do {
		exts_group_t    *exts_group_tmp = NULL;

		if (len >= filesize) {
			/* Hit on the way,
			 * one extent group is enough for defrag, so return.
			 */
			exts_group_tmp = malloc(sizeof(exts_group_t));
			if (!exts_group_tmp)
				return RETURN_NG;

			exts_group_tmp->prev = NULL;
			exts_group_tmp->next = NULL;
			exts_group_tmp->start = ext_start;
			exts_group_tmp->end = extent_tmp->prev;
			exts_group_tmp->len = len;
			if (insert_exts_group(target_exts_group_list_head,
					      exts_group_tmp) < 0) {
				FREE(exts_group_tmp);
				return RETURN_NG;
			}
			return CHECK_FRAG_COUNT;
		}

		/* This extent and previous extent are not continuous,
		 * so, all previous extents are treated as an extent group.
		 */
		if ((extent_tmp->prev->data.start + extent_tmp->prev->data.len)
					    != extent_tmp->data.start) {
			exts_group_tmp = malloc(sizeof(exts_group_t));
			if (exts_group_tmp == NULL)
				return RETURN_NG;

			memset(exts_group_tmp, 0, sizeof(exts_group_t));
			exts_group_tmp->len = len;
			exts_group_tmp->start = ext_start;
			exts_group_tmp->end = extent_tmp->prev;

			if (insert_exts_group(exts_group_list_head,
					      exts_group_tmp) < 0) {
				FREE(exts_group_tmp);
				return RETURN_NG;
			}
			*max += len;
			ext_start = extent_tmp;
			len = extent_tmp->data.len;
			extent_tmp = extent_tmp->next;
			continue;
		}

		/* This extent and previous extent are continuous,
		 * so, they belong to the same extent group, and we check
		 * if the next extent belongs to the same extent group.
		 */
		len += extent_tmp->data.len;
		extent_tmp = extent_tmp->next;
	} while (extent_tmp != ext_list_head->next);

	return RETURN_OK;
}

/*
 * find_exts_group() -			Find target exts_group.
 *
 * @ext_count:				the number of extents.
 * @filesize:				the file's size.
 * @exts_group_list_head:		the head of the original exts_group list
 * @target_exts_group_list_head:	the head of the target exts_group list.
 */
int find_exts_group(int	*ext_count, unsigned long filesize,
		    exts_group_t **exts_group_list_head,
		    exts_group_t **target_exts_group_list_head)
{
	/* Blocks we found for target file */
	int len = 0;

	if (!(*exts_group_list_head))
		return RETURN_NG;

	while (*exts_group_list_head) {
		exts_group_t	*exts_group_tmp = NULL;

		/* Even add the current largest extent group,
		 * the sapce we found is still not enough for the file,
		 * so just add the current largest extent group,
		 * and do the next loop.
		 */
		if ((*exts_group_list_head)->len + len < filesize) {
			len += (*exts_group_list_head)->len;
			exts_group_tmp = get_exts_group(exts_group_list_head,
				*exts_group_list_head);
			if (insert_exts_group(target_exts_group_list_head,
				      exts_group_tmp) < 0) {
				FREE(exts_group_tmp);
				return RETURN_NG;
			}
			(*ext_count)++;
			continue;
		}

		/* Now, if add the current largest extent group,
		 * the space we found is enough,
		 * So, search from the smallest extent group
		 * to avoid waste of space
		 */
		exts_group_tmp = (*exts_group_list_head)->prev;
		do {
			if (exts_group_tmp->len + len >= filesize) {
				len += exts_group_tmp->len;
				exts_group_tmp =
				get_exts_group(exts_group_list_head,
					       exts_group_tmp);
				if (insert_exts_group
					(target_exts_group_list_head,
					 exts_group_tmp) < 0) {
					FREE(exts_group_tmp);
					return RETURN_NG;
				}
				(*ext_count)++;
				/* The only entry go out normally */
				return RETURN_OK;
			}
			exts_group_tmp = exts_group_tmp->prev;
		} while (exts_group_tmp !=
			 (*exts_group_list_head)->prev);

		/* If we come here, there must be something wrong */
		return RETURN_NG;
	}

	/* If we come here, there must be something wrong(space not enough) */
	return RETURN_NG;
}

/*
 * check_frag_count() -		Check file frag.
 *
 * @fd:				the file's discriptor.
 * @extent_count:		the number of extents.
 */
int check_frag_count(int fd, int extent_count)
{
	long long file_extent_count = RETURN_NG;

	file_extent_count = file_frag_count(fd);
	if (file_extent_count == RETURN_NG)
		return RETURN_NG;

	if (extent_count >= file_extent_count) {
		/* No improvment */
		errno = ENOSPC;
		return RETURN_NG;
	}

	return RETURN_OK;
}

/*
 * defrag_proc() -		Reserve extent group and execute
 *				the defrag program.
 *
 * @fd:				the file's discriptor.
 * @file_name			file name.
 * @target_exts_group_head:	the head of the original exts_group list.
 * @ino:			inode number from struct stat64.
 */
int defrag_proc(int fd, const char *file_name,
		exts_group_t *target_exts_group_head, unsigned long long ino)
{
	int ret = 0;
	int flag = 0;
	int count = 0;
	int percent = 0;
	int blocksize = 0;
	struct stat64	buf;
	exts_group_t 			*exts_group = NULL;
	extent_t			*extent = NULL;
	struct ext4_extents_info	extents_info;
	struct ext4_ext_defrag_data	defrag_data;

	if (!target_exts_group_head)
		return RETURN_NG;

	/* Get file size */
	memset(&buf, 0, sizeof(struct stat64));
	ret = fstat64(fd, &buf);
	if (ret < 0) {
		if (detail_flag) {
			printf("\033[79;0H\033[K[%lu/%lu]%s\n",
					defraged_file_count,
					total_count, file_name);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_INFO);
		}
		return RETURN_NG;
	}

	/* Get block size */
	ret = ioctl(fd, FIGETBSZ, &blocksize);
	if (ret < 0) {
		if (detail_flag) {
			printf("\033[79;0H\033[K[%lu/%lu]%s\n",
					defraged_file_count,
					total_count, file_name);
			PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_BLOCKSIZE);
		}
		return RETURN_NG;
	}

	memset(&extents_info, 0, sizeof(extents_info));
	memset(&defrag_data, 0, sizeof(struct ext4_ext_defrag_data));
	extents_info.ino = 0;
	exts_group = target_exts_group_head;
	extents_info.max_entries = DEFRAG_MAX_ENT;
	extents_info.ino = ino;
	defrag_data.start_offset = 0;

	do {
		/* Move victim extents to make sufficient space */
		extent = exts_group->start;
		int flush_count = 0;
		do {
			if (extent->tag != EXT4_EXT_USE) {
				extent = extent->next;
				continue;
			}
			extents_info.ino = extent->ino;
			extents_info.goal = fgoal;
			memcpy(extents_info.ext, &extent->data,
			       sizeof(struct ext4_extent_data));

			extent = extent->next;
			extents_info.entries = 1;
			ret = ioctl(fd, EXT4_IOC_MOVE_VICTIM, &extents_info);
			if (ret < 0) {
				if (errno == EACCES && detail_flag) {
					printf("\033[79;0H\033[K[%lu/%lu]%s\n",
					defraged_file_count,
					total_count, file_name);
					PRINT_ERR_MSG_WITH_ERRNO
						(NGMSG_VICTIM_UID);
				}
				return ret;
			}
			count++;
			if (detail_flag) {
				if (!flag) {
					printf(" Move victim extents to");
					printf(" the other block group\n");
					flag = 1;
				}
				/* moved extent info */
				if (!flush_count) {
					/* flush the defrag process info */
					printf("\033[79;0H\033[K ext%-3d",
						count);
					printf(" phys:\t%8llu logical:\t%8u "
						"length:\t%8d\n",
						(extents_info.ext[0]).start,
						(extents_info.ext[0]).block,
						(extents_info.ext[0]).len);
					flush_count = 1;
				} else {
					printf(" ext%-3d phys:\t%8llu logical:"
						"\t%8u length:\t%8d\n", count,
						(extents_info.ext[0]).start,
						(extents_info.ext[0]).block,
						(extents_info.ext[0]).len);
				}
			}
		} while (extent != exts_group->end->next);

		if (fsync(fd) < 0) {
			if (detail_flag) {
				printf("\033[79;0H\033[K[%lu/%lu]%s\n",
					defraged_file_count,
					total_count, file_name);
				PRINT_ERR_MSG_WITH_ERRNO(NGMSG_FILE_SYNC);
			}
			return ret;
		}

		/* Init extents_info for ioctl (RESERVE) */
		extents_info.entries = 1;
		extents_info.ext[0].block = exts_group->start->data.block;
		extents_info.ext[0].start = exts_group->start->data.start;
		extents_info.ext[0].len = exts_group->len;

		/* Defrag */
		ret = do_defrag(fd, exts_group, defrag_data);
		if (ret < 0) {
			if (detail_flag) {
				printf("\033[79;0H\033[K[%lu/%lu]%s\n",
					defraged_file_count,
					total_count, file_name);
				printf("\tDEFRAG_ERROR ret = %d\t[ NG ]\n",
					ret);
			}
			return ret;
		}
		/* Defrag success, so update offset */
		defrag_data.start_offset += ret;
		ret = defrag_data.start_offset;

		/* Print process progress */
		{
			percent = ((long long)ret * blocksize * 100) /
				  buf.st_size;
			printf("\033[79;0H\033[K[%lu/%lu]%s:\t%d%%",
				defraged_file_count, total_count,
				file_name, min(percent, 100));
			fflush(stdout);
		}

		exts_group = exts_group->next;
	} while (exts_group != target_exts_group_head);
	return ret;
}

/*
 * force_defrag() -	Execute the defrag program in force defrag mode.
 *
 * @fd:			the file's descriptor.
 * @file_name		file name.
 * @buf:		a pointer of the struct stat64.
 * @blocksize:		block size in byte.
 */
int force_defrag(int fd, const char *file_name,
		const struct stat64 *buf, int blocksize)
{
	int     ret = 0;
	int     exts = 0;
	int     maxlen = 0;
	unsigned int    gnumber;
	unsigned long   filesize;
	unsigned long long	istart, iend;
	ext4_fsblk_t	bstart, bend;
	extent_t	*extlist_head = NULL;
	exts_group_t	*exts_group_list_head, *exts_group_list_target_head;
	struct ext4_super_block sb;

	exts_group_list_head = NULL;
	exts_group_list_target_head = NULL;

	/* Get super block info */
	memset(&sb, 0, sizeof(struct ext4_super_block));

	if (ioctl(fd, EXT4_IOC_SUPER_INFO, &sb) < 0)
		return RETURN_NG;

	sb.s_blocks_per_group = ext2fs_swab32(sb.s_blocks_per_group);
	sb.s_inodes_per_group = ext2fs_swab32(sb.s_inodes_per_group);

	gnumber = (buf->st_ino - 1) / sb.s_inodes_per_group;
	istart = gnumber * sb.s_inodes_per_group;
	iend = istart + sb.s_inodes_per_group - 1;
	bstart = gnumber * sb.s_blocks_per_group;
	bend = bstart + sb.s_blocks_per_group - 1;

	/* Compute filesize in block */
	filesize = (buf->st_size + blocksize - 1) / blocksize;

	/* Get used extents in the block group */
	ret = get_used_extent(fd, &extlist_head, istart, iend, bstart, bend);
	if (ret == RETURN_NG)
		goto freelist;

	/* Get free extents in the group */
	ret = get_free_extent(fd, buf->st_ino,
			     sb.s_blocks_per_group, &extlist_head);
	if (ret == RETURN_NG)
		goto freelist;

	/* All space in this group is used by other groups' inodes */
	if (extlist_head == NULL) {
		ret = RETURN_NG;
		goto freelist;
	}

	/* Get continuous region(extents group) */
	ret = join_extents(extlist_head, &exts_group_list_target_head,
				  &exts_group_list_head, filesize, &maxlen);
	if (ret == RETURN_NG)
		goto freelist;
	if (ret == CHECK_FRAG_COUNT) {
		exts = 1;
		goto frag_check;
	}

	if (maxlen < filesize) {
		/* No enough space */
		errno = ENOSPC;
		ret = RETURN_NG;
		goto freelist;
	}

	if (!exts_group_list_head) {
		ret = RETURN_NG;
		goto freelist;
	}

	/* Find target extents group */
	ret = find_exts_group(&exts, filesize, &exts_group_list_head,
				      &exts_group_list_target_head);
	if (ret == RETURN_NG)
		goto freelist;

frag_check:
	/* Check file extent count */
	ret = check_frag_count(fd, exts);
	if (ret == RETURN_NG)
		goto freelist;

	/* Reserve extent group and execute the defrag program */
	ret = defrag_proc(fd, file_name,
			exts_group_list_target_head, buf->st_ino);

freelist:
	free_exts_group(exts_group_list_target_head);
	free_exts_group(exts_group_list_head);
	free_ext(extlist_head);
	return ret;
}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-10-24 10:10 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-24 10:10 [RFC][PATCH 9/9]ext4: online defrag command Akira Fujita

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.