* [PATCH RFC 4/6] ext4: add project id support
From: Konstantin Khlebnikov @ 2015-02-11 15:11 UTC (permalink / raw)
To: Linux FS Devel, linux-ext4, linux-kernel
Cc: Jan Kara, Linux API, containers, Dave Chinner, Andy Lutomirski,
Christoph Hellwig, Dmitry Monakhov, Eric W. Biederman, Li Xi,
Theodore Ts'o, Al Viro
In-Reply-To: <20150211151146.6717.62017.stgit@buzz>
This patch adds a new internal field of ext4 inode to save project identifier.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
fs/ext4/ext4.h | 13 ++++++++++-
fs/ext4/ialloc.c | 6 +++++
fs/ext4/inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/namei.c | 14 ++++++++++++
fs/ext4/super.c | 2 ++
5 files changed, 99 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba6..a3fdbb5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -683,6 +683,7 @@ struct ext4_inode {
__le32 i_crtime; /* File Creation time */
__le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
__le32 i_version_hi; /* high 32 bits for 64-bit version */
+ __le32 i_projid; /* Project ID */
};
struct move_extent {
@@ -938,6 +939,7 @@ struct ext4_inode_info {
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
+ kprojid_t i_projid;
};
/*
@@ -1522,6 +1524,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
* GDT_CSUM bits are mutually exclusive.
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x1000 /* Project ID */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1571,7 +1574,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
- EXT4_FEATURE_RO_COMPAT_QUOTA)
+ EXT4_FEATURE_RO_COMPAT_QUOTA |\
+ EXT4_FEATURE_RO_COMPAT_PROJECT)
/*
* Default values for user and/or group using reserved blocks
@@ -1579,6 +1583,11 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_DEF_RESUID 0
#define EXT4_DEF_RESGID 0
+/*
+ * Default project ID
+ */
+#define EXT4_DEF_PROJID 0
+
#define EXT4_DEF_INODE_READAHEAD_BLKS 32
/*
@@ -2131,6 +2140,8 @@ extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
+extern int ext4_set_projid(struct inode *inode, kprojid_t projid);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ac644c3..d81a30d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -756,6 +756,12 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ ei->i_projid = EXT4_I(dir)->i_projid;
+ else
+ ei->i_projid = KPROJIDT_INIT(EXT4_DEF_PROJID);
+
dquot_initialize(inode);
if (!goal)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa4..0ae2c39 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3863,6 +3863,53 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
EXT4_I(inode)->i_inline_off = 0;
}
+int ext4_get_projid(struct inode *inode, kprojid_t *projid)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ return -EOPNOTSUPP;
+
+ *projid = EXT4_I(inode)->i_projid;
+ return 0;
+}
+
+/* Called with inode->i_mutex locked. */
+int ext4_set_projid(struct inode *inode, kprojid_t projid)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode *raw_inode;
+ struct ext4_iloc iloc;
+ handle_t *handle;
+ int err;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ return -EOPNOTSUPP;
+
+ /* Sanity check */
+ if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE ||
+ !EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), i_projid))
+ return -EOPNOTSUPP;
+
+ if (projid_eq(EXT4_I(inode)->i_projid, projid))
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (!err) {
+ inode->i_ctime = ext4_current_time(inode);
+ EXT4_I(inode)->i_projid = projid;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ ext4_journal_stop(handle);
+
+ return err;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -3874,6 +3921,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
int block;
uid_t i_uid;
gid_t i_gid;
+ projid_t i_projid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -3923,12 +3971,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
+ else
+ i_projid = EXT4_DEF_PROJID;
+
if (!(test_opt(inode->i_sb, NO_UID32))) {
i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
+ ei->i_projid = KPROJIDT_INIT(i_projid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -4192,6 +4248,15 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
+ projid_t i_projid;
+
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+ }
+
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923..9337d81 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2938,6 +2938,10 @@ static int ext4_link(struct dentry *old_dentry,
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
+ if (!capable_mix_inode_project(EXT4_I(dir)->i_projid,
+ EXT4_I(inode)->i_projid))
+ return -EXDEV;
+
dquot_initialize(dir);
retry:
@@ -3217,6 +3221,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
int credits;
u8 old_file_type;
+ if (!capable_mix_inode_project(EXT4_I(new.dir)->i_projid,
+ EXT4_I(old.inode)->i_projid))
+ return -EXDEV;
+
dquot_initialize(old.dir);
dquot_initialize(new.dir);
@@ -3395,6 +3403,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
u8 new_file_type;
int retval;
+ if (!capable_mix_inode_project(EXT4_I(new.dir)->i_projid,
+ EXT4_I(old.inode)->i_projid) ||
+ !capable_mix_inode_project(EXT4_I(old.dir)->i_projid,
+ EXT4_I(new.inode)->i_projid))
+ return -EXDEV;
+
dquot_initialize(old.dir);
dquot_initialize(new.dir);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ac64edb..d656269 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1103,6 +1103,8 @@ static const struct super_operations ext4_sops = {
.get_dquots = ext4_get_dquots,
#endif
.bdev_try_to_free_page = bdev_try_to_free_page,
+ .get_projid = ext4_get_projid,
+ .set_projid = ext4_set_projid,
};
static const struct export_operations ext4_export_ops = {
^ permalink raw reply related
* [PATCH RFC 5/6] ext4: adds project quota support
From: Konstantin Khlebnikov @ 2015-02-11 15:11 UTC (permalink / raw)
To: Linux FS Devel, linux-ext4, linux-kernel
Cc: Jan Kara, Linux API, containers, Dave Chinner, Andy Lutomirski,
Christoph Hellwig, Dmitry Monakhov, Eric W. Biederman, Li Xi,
Theodore Ts'o, Al Viro
In-Reply-To: <20150211151146.6717.62017.stgit@buzz>
This patch adds mount options for enabling/disabling project quota
accounting and enforcement. A new specific inode is also used for
project quota accounting.
[ Mostly unchanged patch from Li Xi <lixi@ddn.com> ]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Li Xi <lixi@ddn.com>
---
fs/ext4/ext4.h | 8 ++++++--
fs/ext4/inode.c | 12 +++++++++++-
fs/ext4/super.c | 53 ++++++++++++++++++++++++++++++++++++++++++-----------
3 files changed, 59 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a3fdbb5..da153c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -208,6 +208,7 @@ struct ext4_io_submit {
#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
#define EXT4_JOURNAL_INO 8 /* Journal inode */
+#define EXT4_PRJ_QUOTA_INO 9 /* Project quota inode */
/* First non-reserved inode for old ext4 filesystems */
#define EXT4_GOOD_OLD_FIRST_INO 11
@@ -983,6 +984,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_PRJQUOTA 0x2000000 /* Project quota support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1158,7 +1160,8 @@ struct ext4_super_block {
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
__le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
- __le32 s_reserved[106]; /* Padding to the end of the block */
+ __le32 s_prj_quota_inum; /* inode for tracking project quota */
+ __le32 s_reserved[105]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1173,7 +1176,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
/* Number of quota types we support */
-#define EXT4_MAXQUOTAS 2
+#define EXT4_MAXQUOTAS 3
/*
* fourth extended-fs super-block data in memory
@@ -1365,6 +1368,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino == EXT4_BOOT_LOADER_INO ||
ino == EXT4_JOURNAL_INO ||
ino == EXT4_RESIZE_INO ||
+ ino == EXT4_PRJ_QUOTA_INO ||
(ino >= EXT4_FIRST_INO(sb) &&
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0ae2c39..966bad1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3894,9 +3894,18 @@ int ext4_set_projid(struct inode *inode, kprojid_t projid)
if (projid_eq(EXT4_I(inode)->i_projid, projid))
return 0;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ dquot_initialize(inode);
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1 +
+ EXT4_QUOTA_INIT_BLOCKS(sb) +
+ EXT4_QUOTA_DEL_BLOCKS(sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
+
+ err = dquot_transfer_project(inode, projid);
+ if (err)
+ goto out;
+
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (!err) {
inode->i_ctime = ext4_current_time(inode);
@@ -3905,6 +3914,7 @@ int ext4_set_projid(struct inode *inode, kprojid_t projid)
}
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+out:
ext4_journal_stop(handle);
return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d656269..3637eef 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1036,8 +1036,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
}
#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
-#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+static char *quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
@@ -1123,10 +1123,11 @@ enum {
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_usrjquota, Opt_grpjquota, Opt_prjjquota,
+ Opt_offusrjquota, Opt_offgrpjquota, Opt_offprjjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1178,6 +1179,8 @@ static const match_table_t tokens = {
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
{Opt_grpjquota, "grpjquota=%s"},
+ {Opt_offprjjquota, "prjjquota="},
+ {Opt_prjjquota, "prjjquota=%s"},
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
@@ -1185,6 +1188,7 @@ static const match_table_t tokens = {
{Opt_noquota, "noquota"},
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
+ {Opt_prjquota, "prjquota"},
{Opt_barrier, "barrier=%u"},
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
@@ -1399,12 +1403,17 @@ static const struct mount_opts {
MOPT_SET | MOPT_Q},
{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
MOPT_SET | MOPT_Q},
+ {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
+ MOPT_SET | MOPT_Q},
{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
- EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+ EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
+ MOPT_CLEAR | MOPT_Q},
{Opt_usrjquota, 0, MOPT_Q},
{Opt_grpjquota, 0, MOPT_Q},
+ {Opt_prjjquota, 0, MOPT_Q},
{Opt_offusrjquota, 0, MOPT_Q},
{Opt_offgrpjquota, 0, MOPT_Q},
+ {Opt_offprjjquota, 0, MOPT_Q},
{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
@@ -1427,10 +1436,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return set_qf_name(sb, USRQUOTA, &args[0]);
else if (token == Opt_grpjquota)
return set_qf_name(sb, GRPQUOTA, &args[0]);
+ else if (token == Opt_prjjquota)
+ return set_qf_name(sb, PRJQUOTA, &args[0]);
else if (token == Opt_offusrjquota)
return clear_qf_name(sb, USRQUOTA);
else if (token == Opt_offgrpjquota)
return clear_qf_name(sb, GRPQUOTA);
+ else if (token == Opt_offprjjquota)
+ return clear_qf_name(sb, PRJQUOTA);
#endif
switch (token) {
case Opt_noacl:
@@ -1656,19 +1669,28 @@ static int parse_options(char *options, struct super_block *sb,
}
#ifdef CONFIG_QUOTA
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+ (test_opt(sb, USRQUOTA) ||
+ test_opt(sb, GRPQUOTA) ||
+ test_opt(sb, PRJQUOTA))) {
ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
"feature is enabled");
return 0;
}
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ if (sbi->s_qf_names[USRQUOTA] ||
+ sbi->s_qf_names[GRPQUOTA] ||
+ sbi->s_qf_names[PRJQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
clear_opt(sb, GRPQUOTA);
- if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
+ if (test_opt(sb, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA])
+ clear_opt(sb, PRJQUOTA);
+
+ if (test_opt(sb, GRPQUOTA) ||
+ test_opt(sb, USRQUOTA) ||
+ test_opt(sb, PRJQUOTA)) {
ext4_msg(sb, KERN_ERR, "old and new quota "
"format mixing");
return 0;
@@ -1728,6 +1750,9 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+ if (sbi->s_qf_names[PRJQUOTA])
+ seq_printf(seq, ",prjjquota=%s", sbi->s_qf_names[PRJQUOTA]);
#endif
}
@@ -3928,6 +3953,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
else
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ sb->s_quota_types = QTYPE_MASK_PRJ;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
@@ -5145,7 +5172,9 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot)
/* Are we journaling quotas? */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
- sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ sbi->s_qf_names[USRQUOTA] ||
+ sbi->s_qf_names[GRPQUOTA] ||
+ sbi->s_qf_names[PRJQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
@@ -5229,7 +5258,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
struct inode *qf_inode;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
- le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
@@ -5257,7 +5287,8 @@ static int ext4_enable_quotas(struct super_block *sb)
int type, err = 0;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
- le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
^ permalink raw reply related
* [PATCH RFC 6/6] tools/quota/project_quota: sample tool for early adopters
From: Konstantin Khlebnikov @ 2015-02-11 15:11 UTC (permalink / raw)
To: Linux FS Devel, linux-ext4, linux-kernel
Cc: Jan Kara, Linux API, containers, Dave Chinner, Andy Lutomirski,
Christoph Hellwig, Dmitry Monakhov, Eric W. Biederman, Li Xi,
Theodore Ts'o, Al Viro
In-Reply-To: <20150211151146.6717.62017.stgit@buzz>
Usage: ./project_quota <command> <path> [args]...
Commands:
init <path> initialize quota file
on <path> turn on
off <path> turn off
info <path> show project, usage and limits
project <path> [<id>] get / set project id
limit <path> [<bytes>] get / set space limit
ilimit <path> [<inodes>] get / set inodes limit
How to enable feature using debugfs tool:
# debugfs
debugfs: open -w <disk>
debugfs: feature +FEATURE_R12
debugfs: quit
# mount ...
# project_quota init <mountpoint>
# project_quota on <mountpoint>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
tools/quota/.gitignore | 1
tools/quota/Makefile | 6 +
tools/quota/project_quota.c | 324 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 331 insertions(+)
create mode 100644 tools/quota/.gitignore
create mode 100644 tools/quota/Makefile
create mode 100644 tools/quota/project_quota.c
diff --git a/tools/quota/.gitignore b/tools/quota/.gitignore
new file mode 100644
index 0000000..4aacefc
--- /dev/null
+++ b/tools/quota/.gitignore
@@ -0,0 +1 @@
+project_quota
diff --git a/tools/quota/Makefile b/tools/quota/Makefile
new file mode 100644
index 0000000..0c3daef
--- /dev/null
+++ b/tools/quota/Makefile
@@ -0,0 +1,6 @@
+CFLAGS=-Wall -W
+
+project_quota:
+
+clean:
+ rm project_quota
diff --git a/tools/quota/project_quota.c b/tools/quota/project_quota.c
new file mode 100644
index 0000000..ca7f49a
--- /dev/null
+++ b/tools/quota/project_quota.c
@@ -0,0 +1,324 @@
+/*
+ * project_quota: Tool for project disk quota manipulations
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should find a copy of v2 of the GNU General Public License somewhere on
+ * your Linux system; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2015 Yandex LLC
+ *
+ * Authors: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+ */
+
+#define _FILE_OFFSET_BITS 64
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <err.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/quota.h>
+#include <sys/quota.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+
+#ifndef F_GET_PROJECT
+#define F_GET_PROJECT (F_LINUX_SPECIFIC_BASE + 11)
+#define F_SET_PROJECT (F_LINUX_SPECIFIC_BASE + 12)
+#endif
+
+#ifndef PRJQUOTA
+#define PRJQUOTA 2
+#endif
+
+/* First generic header */
+struct v2_disk_dqheader {
+ __le32 dqh_magic; /* Magic number identifying file */
+ __le32 dqh_version; /* File version */
+};
+
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+ __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
+ __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
+ __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
+ __le32 dqi_blocks; /* Number of blocks in file */
+ __le32 dqi_free_blk; /* Number of first free block in the list */
+ __le32 dqi_free_entry; /* Number of block with at least one free entry */
+};
+
+#define PROJECT_QUOTA_FILE "quota.project"
+#define PROJECT_QUOTA_MAGIC 0xd9c03f14
+
+static int find_mountpoint(const char *path, struct stat *path_st,
+ char **device, char **fstype, char **root_path)
+{
+ struct stat dev_st;
+ char *buf = NULL, *ptr, *real_device;
+ unsigned major, minor;
+ size_t len;
+ FILE *file;
+
+ if (stat(path, path_st))
+ return -1;
+
+ *root_path = malloc(PATH_MAX + 1);
+
+ /* since v2.6.26 */
+ file = fopen("/proc/self/mountinfo", "r");
+ if (!file)
+ goto parse_mounts;
+ while (getline(&buf, &len, file) > 0) {
+ sscanf(buf, "%*d %*d %u:%u %*s %s", &major, &minor, *root_path);
+ if (makedev(major, minor) != path_st->st_dev)
+ continue;
+ ptr = strstr(buf, " - ") + 3;
+ *fstype = strdup(strsep(&ptr, " "));
+ *device = strdup(strsep(&ptr, " "));
+ goto found;
+ }
+
+parse_mounts:
+ /* for older versions */
+ file = fopen("/proc/mounts", "r");
+ if (!file)
+ goto not_found;
+ while (getline(&buf, &len, file) > 0) {
+ ptr = buf;
+ strsep(&ptr, " ");
+ if (*buf != '/' || stat(buf, &dev_st) ||
+ dev_st.st_rdev != path_st->st_dev)
+ continue;
+ strcpy(*root_path, strsep(&ptr, " "));
+ *fstype = strdup(strsep(&ptr, " "));
+ *device = strdup(buf);
+ goto found;
+ }
+not_found:
+ free(*root_path);
+ errno = ENODEV;
+ return -1;
+
+found:
+ real_device = realpath(*device, NULL);
+ if (real_device) {
+ free(*device);
+ *device = real_device;
+ }
+ return 0;
+}
+
+static int init_project_quota(const char *quota_path)
+{
+ struct {
+ struct v2_disk_dqheader header;
+ struct v2_disk_dqinfo info;
+ char zero[1024 * 2 - 8 * 4];
+ } quota_init = {
+ .header = {
+ .dqh_magic = PROJECT_QUOTA_MAGIC,
+ .dqh_version = 1,
+ },
+ .info = {
+ .dqi_bgrace = 7 * 24 * 60 * 60,
+ .dqi_igrace = 7 * 24 * 60 * 60,
+ .dqi_flags = 0,
+ .dqi_blocks = 2, /* header and root */
+ .dqi_free_blk = 0,
+ .dqi_free_entry = 0,
+ },
+ .zero = {0, },
+ };
+ int fd;
+
+ fd = open(quota_path, O_CREAT|O_RDWR|O_EXCL, 0600);
+ if (fd < 0)
+ return fd;
+ write(fd, "a_init, sizeof(quota_init));
+ fsync(fd);
+ close(fd);
+ return 0;
+}
+
+static int get_project_id(const char *path, unsigned *project_id)
+{
+ int fd, ret;
+
+ fd = open(path, O_PATH);
+ if (fd < 0)
+ return fd;
+ ret = fcntl(fd, F_GET_PROJECT, project_id);
+ close(fd);
+ return ret;
+}
+
+static int set_project_id(const char *path, unsigned project_id)
+{
+ int fd, ret;
+
+ fd = open(path, O_PATH);
+ if (fd < 0)
+ return fd;
+ ret = fcntl(fd, F_SET_PROJECT, project_id);
+ close(fd);
+ return ret;
+}
+
+static void get_project_quota(const char *device, unsigned project_id,
+ struct if_dqblk *quota)
+{
+ if (quotactl(QCMD(Q_GETQUOTA, PRJQUOTA), device,
+ project_id, (caddr_t)quota))
+ err(2, "cannot get project quota \"%u\" at \"%s\"",
+ project_id, device);
+}
+
+static void set_project_quota(const char *device, unsigned project_id,
+ struct if_dqblk *quota)
+{
+ if (quotactl(QCMD(Q_SETQUOTA, PRJQUOTA),
+ device, project_id, (caddr_t)quota))
+ err(2, "cannot set project quota \"%u\" at \"%s\"",
+ project_id, device);
+}
+
+int main (int argc, char **argv) {
+ char *cmd, *path, *device, *fstype, *root_path;
+ struct if_dqblk quota;
+ struct stat path_st;
+ unsigned project_id;
+
+ if (argc < 3)
+ goto usage;
+
+ cmd = argv[1];
+ path = argv[2];
+ if (find_mountpoint(path, &path_st, &device, &fstype, &root_path))
+ err(2, "cannot find mountpoint for \"%s\"", path);
+
+ if (!strcmp(cmd, "limit") || !strcmp(cmd, "ilimit") ||
+ !strcmp(cmd, "info") || !strcmp(cmd, "parent")) {
+ if (get_project_id(path, &project_id))
+ err(2, "cannot get project id for \"%s\"", path);
+ }
+
+ if (!strcmp(cmd, "init")) {
+ if (S_ISDIR(path_st.st_mode))
+ asprintf(&path, "%s/%s", path, PROJECT_QUOTA_FILE);
+
+ if (init_project_quota(path))
+ err(2, "cannot init project quota file \"%s\"", path);
+
+ } else if (!strcmp(cmd, "on")) {
+ struct v2_disk_dqheader header;
+ int fd, format;
+
+ if (S_ISDIR(path_st.st_mode))
+ asprintf(&path, "%s/%s", path, PROJECT_QUOTA_FILE);
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ err(2, "cannot open quota file \"%s\"", path);
+ if (read(fd, &header, sizeof(header)) != sizeof(header))
+ err(2, "cannot read quota file \"%s\"", path);
+ close(fd);
+
+ if (header.dqh_magic != PROJECT_QUOTA_MAGIC)
+ errx(2, "wrong quota file magic");
+
+ if (header.dqh_version == 1)
+ format = QFMT_VFS_V1;
+ else
+ errx(2, "unsupported quota file version");
+
+ if (mount(NULL, root_path, NULL, MS_REMOUNT, "prjquota"))
+ err(2, "cannot remount \"%s\"", root_path);
+
+ if (quotactl(QCMD(Q_QUOTAON, PRJQUOTA), device,
+ format, (caddr_t)path))
+ err(2, "cannot turn on project quota for %s", device);
+
+ } else if (!strcmp(cmd, "off")) {
+
+ if (quotactl(QCMD(Q_QUOTAOFF, PRJQUOTA), device, 0, NULL))
+ err(2, "cannot turn off project quota for %s", device);
+
+ } else if (!strcmp(cmd, "project")) {
+ if (argc < 4) {
+ if (get_project_id(path, &project_id))
+ err(2, "cannot get project id for \"%s\"", path);
+ printf("%u\n", project_id);
+ } else {
+ project_id = atoi(argv[3]);
+ if (set_project_id(path, project_id))
+ err(2, "cannot set project id for \"%s\"", path);
+ }
+ } else if (!strcmp(cmd, "limit")) {
+ if (argc < 4) {
+ get_project_quota(device, project_id, "a);
+ printf("%lld\n", quota.dqb_bhardlimit * QIF_DQBLKSIZE);
+ } else {
+ quota.dqb_bhardlimit = atoll(argv[3]) / QIF_DQBLKSIZE;
+ quota.dqb_bsoftlimit = 0;
+ quota.dqb_valid = QIF_BLIMITS;
+ set_project_quota(device, project_id, "a);
+ }
+ } else if (!strcmp(cmd, "ilimit")) {
+ if (argc < 4) {
+ get_project_quota(device, project_id, "a);
+ printf("%lld\n", quota.dqb_ihardlimit);
+ } else {
+ quota.dqb_ihardlimit = atoll(argv[3]);
+ quota.dqb_isoftlimit = 0;
+ quota.dqb_valid = QIF_ILIMITS;
+ set_project_quota(device, project_id, "a);
+ }
+ } else if (!strcmp(cmd, "info")) {
+ get_project_quota(device, project_id, "a);
+ printf("project %u\n", project_id);
+ printf("usage %llu\n", quota.dqb_curspace);
+ printf("limit %llu\n", quota.dqb_bhardlimit * QIF_DQBLKSIZE);
+ printf("inodes %llu\n", quota.dqb_curinodes);
+ printf("ilimit %llu\n", quota.dqb_ihardlimit);
+ } else {
+ warnx("Unknown command \"%s\"", cmd);
+ goto usage;
+ }
+
+ free(device);
+ free(fstype);
+ free(root_path);
+
+ return 0;
+
+usage:
+ fprintf(stderr, "Usage: %s <command> <path> [args]...\n"
+ "Commands:\n"
+ " init <path> initialize quota file\n"
+ " on <path> turn on\n"
+ " off <path> turn off\n"
+ " info <path> show project, usage and limits\n"
+ " project <path> [<id>] get / set project id\n"
+ " limit <path> [<bytes>] get / set space limit\n"
+ " ilimit <path> [<inodes>] get / set inodes limit\n",
+ argv[0]);
+ return 2;
+}
^ permalink raw reply related
* Re: [PATCHv3 8/8] cgroup: Add documentation for cgroup namespaces
From: Serge E. Hallyn @ 2015-02-11 16:00 UTC (permalink / raw)
To: Tejun Heo
Cc: Eric W. Biederman, Serge E. Hallyn, Richard Weinberger, Linux API,
Linux Containers, Serge Hallyn,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Andy Lutomirski, cgroups mailinglist, Ingo Molnar
In-Reply-To: <20150211051704.GB24897-qYNAdHglDFBN0TnZuCh8vA@public.gmane.org>
Quoting Tejun Heo (tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org):
> Hey,
>
> On Tue, Feb 10, 2015 at 11:02:40PM -0600, Eric W. Biederman wrote:
> > A slightly off topic comment, for where this thread has gone but
> > relevant if we are talking about cgroup namespaces.
> >
> > If don't implement compatibility with existing userspace, they get a
> > nack. A backwards-incompatible change should figure out how to remove
> > the need for any namespaces.
> >
> > Because that is what namespaces are about backwards compatibility.
>
> Are you claiming that namespaces are soley about backwards
> compatibility? ie. to trick userland into scoping without letting it
> notice? That's a very restricted view and namespaces do provide
> further isolation capabilties in addition to what can be achieved
> otherwise and it is logical to collect simliar funtionalities there.
We absolutely would love to use cgroup namespaces to run older
userspace in containers. I don't know that it's actually possible
to do both that and use unified hierarchy at the same time though,
which is unfortunate. So an Ubuntu 12.04 container will never, afaics,
be able to run inside an ubuntu 16.04 host that is using unified
hierarchy, without using backported newer versions of lxc (etc) in
the container.
^ permalink raw reply
* Re: [PATCHv3 8/8] cgroup: Add documentation for cgroup namespaces
From: Tejun Heo @ 2015-02-11 16:03 UTC (permalink / raw)
To: Serge E. Hallyn
Cc: Eric W. Biederman, Richard Weinberger, Linux API,
Linux Containers, Serge Hallyn,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Andy Lutomirski, cgroups mailinglist, Ingo Molnar
In-Reply-To: <20150211160023.GA1579-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
On Wed, Feb 11, 2015 at 05:00:23PM +0100, Serge E. Hallyn wrote:
> We absolutely would love to use cgroup namespaces to run older
> userspace in containers. I don't know that it's actually possible
> to do both that and use unified hierarchy at the same time though,
> which is unfortunate. So an Ubuntu 12.04 container will never, afaics,
> be able to run inside an ubuntu 16.04 host that is using unified
> hierarchy, without using backported newer versions of lxc (etc) in
> the container.
So, the constraint there are the controllers. A controller can't be
attached to two hierarchies at the same time for obvious reasons, so
regardless of NS, you can't use the same controller on a unified
hierarchy *and* a traditional hierarchy. NS doesn't adds or
substracts from the situation. If you decide to attach a controller
to a traditional hierarchy, that's where it's gonna be available. If
you attach it to the unified hierarchy, the same story.
Thanks.
--
tejun
^ permalink raw reply
* Re: [PATCH v5] perf: Use monotonic clock as a source for timestamps
From: Peter Zijlstra @ 2015-02-11 16:12 UTC (permalink / raw)
To: Pawel Moll
Cc: ajh mls, Richard Cochran, Steven Rostedt, Ingo Molnar,
Paul Mackerras, Arnaldo Carvalho de Melo, John Stultz,
Masami Hiramatsu, Christopher Covington, Namhyung Kim,
David Ahern, Thomas Gleixner, Tomeu Vizoso,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
adrian.hunter-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org
In-Reply-To: <1422955245.4944.26.camel-5wv7dgnIgG8@public.gmane.org>
How about something like the below? I _think_ it should mostly work for
x86, where the tsc is a 64bit wide cycle counter.
I suppose we should extend the perf userpage time data with
time_last_cycle and time_mask if/when we want to make this work on
something with a short counter.
Of course, at that time we also need to somehow deal with that counter
wrapping, its hardly practical to go iterate all possible userpg
instances from a timer handler.
---
Documentation/kernel-parameters.txt | 9 +++++++
arch/x86/kernel/cpu/perf_event.c | 44 ++++++++++++++++++++++++---------
include/linux/perf_event.h | 6 +++++
kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++---
kernel/time/timekeeping.c | 30 +++++++++++++++++++++++
5 files changed, 123 insertions(+), 15 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 176d4fe4f076..52255676b6e2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -91,6 +91,7 @@ the beginning of each description states the restrictions within which a
NUMA NUMA support is enabled.
NFS Appropriate NFS support is enabled.
OSS OSS sound support is enabled.
+ PERF Performance events and counters support is enabled.
PV_OPS A paravirtualized kernel is enabled.
PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
PARISC The PA-RISC architecture is enabled.
@@ -2796,6 +2797,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
allocator. This parameter is primarily for debugging
and performance comparison.
+ perf_use_local_clock
+ [PERF]
+ Use local_clock() as a source for perf timestamps
+ generation. This was be the default behaviour and
+ this parameter can be used to maintain backward
+ compatibility or on older hardware with expensive
+ monotonic clock source.
+
pf. [PARIDE]
See Documentation/blockdev/paride.txt.
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..436a66632f76 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1952,6 +1952,35 @@ static struct pmu pmu = {
.flush_branch_stack = x86_pmu_flush_branch_stack,
};
+static void local_clock_user_time(struct perf_event_mmap_page *userpg, u64 now)
+{
+ data = cyc2ns_read_begin();
+
+ userpg->cap_user_time = 1;
+ userpg->time_mult = data->cyc2ns_mul;
+ userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_offset = data->cyc2ns_offset - now;
+
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+
+ cyc2ns_read_end(data);
+}
+
+extern void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift);
+
+static void ktime_fast_mono_user_time(struct perf_event_mmap_page *userpg, u64 now)
+{
+ userpg->cap_user_time = 1;
+ userpg->cap_user_time_zero = 1;
+
+ __ktime_get_mono_fast(&userpg->time_zero,
+ &userpg->time_mult,
+ &userpg->time_shift);
+
+ userpg->offset = userpg->time_zero - now;
+}
+
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
@@ -1966,17 +1995,10 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!sched_clock_stable())
return;
- data = cyc2ns_read_begin();
-
- userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
- userpg->time_offset = data->cyc2ns_offset - now;
-
- userpg->cap_user_time_zero = 1;
- userpg->time_zero = data->cyc2ns_offset;
-
- cyc2ns_read_end(data);
+ if (static_key_false(&perf_use_local_clock_key))
+ local_clock_user_time(userpg, now);
+ else
+ ktime_fast_mono_user_time(userpg, now);
}
/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 33262004c310..1d61f968113a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -561,6 +561,12 @@ extern void perf_pmu_enable(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
extern int perf_event_refresh(struct perf_event *event, int refresh);
+
+extern struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE;
+extern void __weak
+arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now);
+
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 13209a90b751..7bad385103ea 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
+#include <linux/sysctl.h>
+#include <linux/jump_label.h>
#include "internal.h"
@@ -322,9 +324,43 @@ extern __weak const char *perf_pmu_name(void)
return "pmu";
}
+struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE;
+static bool perf_use_local_clock_param __initdata;
+static int __init perf_use_local_clock_setup(char *__unused)
+{
+ perf_use_local_clock_param = true;
+ return 1;
+}
+__setup("perf_use_local_clock", perf_use_local_clock_setup);
+
+static int sysctl_perf_sample_time_clk_id = CLOCK_MONOTONIC;
+
+static struct ctl_table perf_sample_time_kern_table[] = {
+ {
+ .procname = "perf_sample_time_clk_id",
+ .data = &sysctl_perf_sample_time_clk_id,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static struct ctl_table perf_sample_time_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = perf_sample_time_kern_table,
+ },
+ {}
+};
+
static inline u64 perf_clock(void)
{
- return local_clock();
+ if (static_key_false(&perf_use_local_clock_key))
+ return local_clock();
+ else
+ return ktime_get_mono_fast_ns();
}
static inline struct perf_cpu_context *
@@ -4101,8 +4137,8 @@ static void perf_event_init_userpage(struct perf_event *event)
rcu_read_unlock();
}
-void __weak arch_perf_update_userpage(
- struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -4487,7 +4523,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
- rb = rb_alloc(nr_pages,
+ rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
@@ -8516,6 +8552,11 @@ void __init perf_event_init(void)
*/
BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
!= 1024);
+
+ if (perf_use_local_clock_param)
+ static_key_slow_inc(&perf_use_local_clock_key);
+ else
+ register_sysctl_table(perf_sample_time_root_table);
}
static int __init perf_event_sysfs_init(void)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b124af259800..37bed5931a91 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -334,6 +334,36 @@ u64 notrace ktime_get_mono_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ cycle_t cycle_now, delta;
+ u64 nsecs, now;
+
+ do {
+ seq = raw_read_seqcount(&tk_fast_mono.seq);
+ tkr = tk_fast_mono.base + (seq & 0x01);
+
+ cycle_now = tkr->read(tkr->clock);
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
+ nsec += arch_gettimeoffset();
+
+ now = ktime_to_ns(tkr->base_mono) + nsec;
+
+ *mult = tkr->mult;
+ *shift = tkr->shift;
+
+ nsec = mul_u64_u32_shr(cycle_now, tkr->mult, tkr->shift);
+
+ *offset = now - nsec;
+
+ } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+}
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
static inline void update_vsyscall(struct timekeeper *tk)
^ permalink raw reply related
* Re: [PATCHv3 8/8] cgroup: Add documentation for cgroup namespaces
From: Serge E. Hallyn @ 2015-02-11 16:18 UTC (permalink / raw)
To: Tejun Heo
Cc: Serge E. Hallyn, Eric W. Biederman, Richard Weinberger, Linux API,
Linux Containers, Serge Hallyn, linux-kernel@vger.kernel.org,
Andy Lutomirski, cgroups mailinglist, Ingo Molnar
In-Reply-To: <20150211160347.GE21356@htj.duckdns.org>
Quoting Tejun Heo (tj@kernel.org):
> On Wed, Feb 11, 2015 at 05:00:23PM +0100, Serge E. Hallyn wrote:
> > We absolutely would love to use cgroup namespaces to run older
> > userspace in containers. I don't know that it's actually possible
> > to do both that and use unified hierarchy at the same time though,
> > which is unfortunate. So an Ubuntu 12.04 container will never, afaics,
> > be able to run inside an ubuntu 16.04 host that is using unified
> > hierarchy, without using backported newer versions of lxc (etc) in
> > the container.
>
> So, the constraint there are the controllers. A controller can't be
> attached to two hierarchies at the same time for obvious reasons, so
> regardless of NS, you can't use the same controller on a unified
> hierarchy *and* a traditional hierarchy. NS doesn't adds or
> substracts from the situation. If you decide to attach a controller
> to a traditional hierarchy, that's where it's gonna be available. If
> you attach it to the unified hierarchy, the same story.
Right, exactly.
thanks,
-serge
^ permalink raw reply
* Re: [PATCH] tpm, tpm_tis: fix TPM 2.0 probing
From: Stefan Berger @ 2015-02-11 18:47 UTC (permalink / raw)
To: Jarkko Sakkinen
Cc: Peter Hüwe, Ashley Lai, Marcel Selhorst,
tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, josh-iaAMLnmF4UmaiuxdJuQwMA,
christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
linux-api-u79uwXL29TY76Z2rM5mHXA,
trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <20150210125037.GB4313-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
On 02/10/2015 07:50 AM, Jarkko Sakkinen wrote:
> On Tue, Feb 10, 2015 at 07:16:32AM -0500, Stefan Berger wrote:
>> On 02/09/2015 03:39 AM, Jarkko Sakkinen wrote:
>>> On Mon, Feb 09, 2015 at 12:08:46AM +0100, Peter Hüwe wrote:
>>>> Am Mittwoch, 4. Februar 2015, 15:21:09 schrieb Jarkko Sakkinen:
>>>>> If during transmission system error was returned, the logic was to
>>>>> incorrectly deduce that chip is a TPM 1.x chip. This patch fixes this
>>>>> issue. Also, this patch changes probing so that message tag is used as the
>>>>> measure for TPM 2.x, which should be much more stable.
>>>> Is it aware that some TPMs may respond with 0x00C1 as TAG for TPM1.2 commands?
>>> I guess none of the TPM 1.2 command answer with the tag 0x8002?
>>
>> FYI: pdf page 26 , section 6.1 explains the predictable return value for a
>> TPM1.2 command seen by a TPM2
>>
>> http://www.trustedcomputinggroup.org/files/static_page_files/8C68ADA8-1A4B-B294-D0FC06D3773F7DAA/TPM%20Rev%202.0%20Part%203%20-%20Commands%2001.16-code.pdf
>>
>> Following this:
>>
>> Sending a TPM1.2 command to a TPM2 should return a TPM1.2 header (tag =
>> 0xc4) and error code (TPM_BADTAG = 0x1e)
>>
>> Sending a TPM 2 command to a TPM 2 will give a TPM 2 tag in the header.
>> Sending a TPM 2 command to a TPM 1.2 will give a TPM 1.2 tag in the header
>> and an error code.
> Thank you for the information. Do you think that for some reason
> tpm2_probe() shoould instead check that value is not this error
> instead of checking that tag is 0x80002?
Following your path, you are checking for TPM2_ST_NO_SESSION (0x8001),
which looks correct to me. A TPM1.2 would never send this tag back.
Stefan
^ permalink raw reply
* Re: [RFC] simple_char: New infrastructure to simplify chardev management
From: Andy Lutomirski @ 2015-02-11 20:04 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: Arnd Bergmann, Jiri Kosina,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Linux API
In-Reply-To: <20150211004459.GA30746-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
On Tue, Feb 10, 2015 at 4:44 PM, Greg Kroah-Hartman
<gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org> wrote:
> On Tue, Feb 10, 2015 at 03:44:05PM -0800, Andy Lutomirski wrote:
>> This isn't adequately tested, and I don't have a demonstration (yet).
>> It's here for review for whether it's a good idea in the first place
>> and for weather the fully_dynamic mechanism is a good idea.
>>
>> The current character device interfaces are IMO awful.
>
> That's a total understatement. Redoing the char interface has been in
> my todo list for a decade now. It's the complexity that happens to be
> used by just a handful of drivers that have prevented me from doing the
> rework in the past. Creating a "new" interface that we then port code
> to is a very good idea, as it can happen over time in a much more
> orderly way.
>
> And we can throw the kernel-janitors people at it once it's working, to
> convert the rest of the tree, providing them a useful outlet for their
> need for patch cleanups :)
>
> So yes, I'm all for this, thanks so much for looking into this. I'm at
> a conference this week, but will go over it on the plane home and give
> you some review comments.
It would be nice to make the reference counting cleaner, perhaps by
tying a chardev minor more directly to a struct device, but I wasn't
sure how to do that usefully.
--Andy
>
> greg k-h
--
Andy Lutomirski
AMA Capital Management, LLC
^ permalink raw reply
* Re: [PATCH v17 1/7] mm: support madvise(MADV_FREE)
From: Shaohua Li @ 2015-02-12 0:14 UTC (permalink / raw)
To: Minchan Kim
Cc: Michael Kerrisk (man-pages), Michal Hocko, Andrew Morton,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
Hugh Dickins, Johannes Weiner, Rik van Riel, KOSAKI Motohiro,
Mel Gorman, Jason Evans, zhangyanfei-BthXqXjhjHXQFUHtdCDX3A,
Kirill A. Shutemov, Kirill A. Shutemov
In-Reply-To: <20150211005620.GA4078@blaptop>
On Wed, Feb 11, 2015 at 09:56:20AM +0900, Minchan Kim wrote:
> Hi Shaohua,
>
> On Tue, Feb 10, 2015 at 02:38:26PM -0800, Shaohua Li wrote:
> > On Mon, Feb 09, 2015 at 04:15:53PM +0900, Minchan Kim wrote:
> > > On Fri, Feb 06, 2015 at 10:29:18AM -0800, Shaohua Li wrote:
> > > > On Fri, Feb 06, 2015 at 02:51:03PM +0900, Minchan Kim wrote:
> > > > > Hi Shaohua,
> > > > >
> > > > > On Thu, Feb 05, 2015 at 04:33:11PM -0800, Shaohua Li wrote:
> > > > > >
> > > > > > Hi Minchan,
> > > > > >
> > > > > > Sorry to jump in this thread so later, and if some issues are discussed before.
> > > > > > I'm interesting in this patch, so tried it here. I use a simple test with
> > > > >
> > > > > No problem at all. Interest is always win over ignorance.
> > > > >
> > > > > > jemalloc. Obviously this can improve performance when there is no memory
> > > > > > pressure. Did you try setup with memory pressure?
> > > > >
> > > > > Sure but it was not a huge memory system like yours.
> > > >
> > > > Yes, I'd like to check the symptom in memory pressure, so choose such test.
> > > >
> > > > > > In my test, jemalloc will map 61G vma, and use about 32G memory without
> > > > > > MADV_FREE. If MADV_FREE is enabled, jemalloc will use whole 61G memory because
> > > > > > madvise doesn't reclaim the unused memory. If I disable swap (tweak your patch
> > > > >
> > > > > Yes, IIUC, jemalloc replaces MADV_DONTNEED with MADV_FREE completely.
> > > >
> > > > right.
> > > > > > slightly to make it work without swap), I got oom. If swap is enabled, my
> > > > >
> > > > > You mean you modified anon aging logic so it works although there is no swap?
> > > > > If so, I have no idea why OOM happens. I guess it should free all of freeable
> > > > > pages during the aging so although system stall happens more, I don't expect
> > > > > OOM. Anyway, with MADV_FREE with no swap, we should consider more things
> > > > > about anonymous aging.
> > > >
> > > > In the patch, MADV_FREE will be disabled and fallback to DONTNEED if no swap is
> > > > enabled. Our production environment doesn't enable swap, so I tried to delete
> > > > the 'no swap' check and make MADV_FREE always enabled regardless if swap is
> > > > enabled. I didn't change anything else. With such change, I saw oom
> > > > immediately. So definitely we have aging issue, the pages aren't reclaimed
> > > > fast.
> > >
> > > In current VM implementation, it doesn't age anonymous LRU list if we have no
> > > swap. That's the reason to drop freeing pages instantly.
> > > I think it could be enhanced later.
> > > http://lists.infradead.org/pipermail/linux-arm-kernel/2014-December/311591.html
> > >
> > > >
> > > > > > system is totally stalled because of swap activity. Without the MADV_FREE,
> > > > > > everything is ok. Considering we definitely don't want to waste too much
> > > > > > memory, a system with memory pressure is normal, so sounds MADV_FREE will
> > > > > > introduce big trouble here.
> > > > > >
> > > > > > Did you think about move the MADV_FREE pages to the head of inactive LRU, so
> > > > > > they can be reclaimed easily?
> > > > >
> > > > > I think it's desirable if the page lived in active LRU.
> > > > > The reason I didn't that was caused by volatile ranges system call which
> > > > > was motivaion for MADV_FREE in my mind.
> > > > > In last LSF/MM, there was concern about data's hotness.
> > > > > Some of users want to keep that as it is in LRU position, others want to
> > > > > handle that as cold(tail of inactive list)/warm(head of inactive list)/
> > > > > hot(head of active list), for example.
> > > > > The vrange syscall was just about volatiltiy, not depends on page hotness
> > > > > so the decision on my head was not to change LRU order and let's make new
> > > > > hotness advise if we need it later.
> > > > >
> > > > > However, MADV_FREE's main customer is allocators and afaik, they want
> > > > > to replace MADV_DONTNEED with MADV_FREE so I think it is really cold,
> > > > > but we couldn't make sure so head of inactive is good compromise.
> > > > > Another concern about tail of inactive list is that there could be
> > > > > plenty of pages in there, which was asynchromos write-backed in
> > > > > previous reclaim path, not-yet reclaimed because of not being able
> > > > > to free the in softirq context of writeback. It means we ends up
> > > > > freeing more potential pages to become workingset in advance
> > > > > than pages VM already decided to evict.
> > > >
> > > > Yes, they are definitely cold pages. I thought We should make sure the
> > > > MADV_FREE pages are reclaimed first before other pages, at least in the anon
> > > > LRU list, though there might be difficult to determine if we should reclaim
> > > > writeback pages first or MADV_FREE pages first.
> > >
> > > Frankly speaking, the issue with writeback page is just hurdle of
> > > implementation, not design so if we could fix it, we might move
> > > cold pages into tail of the inactive LRU. I tried it but don't have
> > > time slot to continue these days. Hope to get a time to look soon.
> > > https://lkml.org/lkml/2014/7/1/628
> > > Even, it wouldn't be critical problem although we couldn't fix
> > > the problem of writeback pages because they are already all
> > > cold pages so it might be not important to keep order in LRU so
> > > we could save working set and effort of VM to reclaim them
> > > at the cost of moving all of hinting pages into tail of the LRU
> > > whenever the syscall is called.
> > >
> > > However, significant problem from my mind is we couldn't make
> > > sure they are really cold pages. It would be true for allocators
> > > but it's cache-friendly pages so it might be better to discard
> > > tail pages of inactive LRU, which are really cold.
> > > In addition, we couldn't expect all of usecase for MADV_FREE
> > > so some of users might want to treat them as warm, not cold.
> > >
> > > With moving them into inactive list's head, if we still see
> > > a lot stall, I think it's a sign to add other logic, for example,
> > > we could drop MADV_FREEed pages instantly if the zone is below
> > > low min watermark when the syscall is called. Because everybody
> > > doesn't like direct reclaim.
> >
> > So I tried move the MADV_FREE pages to inactive list head or tail. It helps a
> > little. But there are still stalls/oom. kswapd isn't fast enough to free the
> > pages, App enters direct reclaim frequently. In one machine, no swap trigger,
> > but MADV_FREE is 5x slower than MADV_DONTNEED. In another machine, MADV_FREE
>
> It's expected. MADV_DONTNEED and MADV_FREE is really different.
> MADV_DONTNEED is self-sacrificy for others in the system while MADV_FREE is
> greedy approach for itself because random process asking the memory could
> enter direct reclaim.
> However, as I said earlier, we could mitigate the problem by checking
> min_free_kbytes. If memory in the system is under min_free_kbytes, it is
> pointless to impose reclaim overhead for hinted pages because we alreay
> know the hint is "please free when you are trouble with memory" and we got
> know it already.
>
> When I test below patch on my 3G machine + 12 CPU + 8G swap with below test
> test: 12 processes(each process does 5 iteration: mmap 512M + memset + madvise),
>
> 1. MADV_DONTNEED : 41.884sec, sys:3m4.552
> 2. MADV_FREE : 1m28sec, sys: 5m23
> 3. MADV_FREE + below patch : 37.188s, sys: 2m20
>
> Could you test?
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 6d0fcb8..da15f8f 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -523,7 +523,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
> * XXX: In this implementation, MADV_FREE works like
> * MADV_DONTNEED on swapless system or full swap.
> */
> - if (get_nr_swap_pages() > 0)
> + if (get_nr_swap_pages() > 0 && min_free_kbytes < nr_free_pages())
> return madvise_free(vma, prev, start, end);
> /* passthrough */
> case MADV_DONTNEED:
The throttling makes a lot of sense, definitely should be included in the
patch. At least my jemalloc test has similar performance result with/without
the patch in memory pressure case. So overall I'm pretty happy with it.
However, this only solves half of the problem. pages which are MADV_FREE before
watermark is hit are still hard to be reclaimed later if there are other
allocations. I'm not sure how severe this issue is. My jemalloc test frequently
does madvise (fallback to DONTNEED with above change), so itself can free a lot
of memory in memory pressure. If application uses MADV_FREE before watermark is
hit, but don't use it after watermark is hit, we will have trouble.
Thanks,
Shaohua
^ permalink raw reply
* Re: [PATCH 1/2 v2] xfs: introduce a generic shutdown ioctl
From: Jaegeuk Kim @ 2015-02-12 1:40 UTC (permalink / raw)
To: Dave Chinner, Jeff Layton, J. Bruce Fields
Cc: linux-fsdevel, linux-api, linux-kernel, xfs
In-Reply-To: <1420796076-82847-1-git-send-email-jaegeuk@kernel.org>
Change log from v1:
o modify the xfs changes merged toward 3.20-rc1
-- >8 --
This patch introduces a generic ioctl for fs shutdown, which was used by xfs.
If this shutdown is triggered, filesystem stops any further IOs according to the
following options.
1. FS_GOING_DOWN_FULLSYNC
: this will flush all the data and dentry blocks, and do checkpoint before
shutdown.
2. FS_GOING_DOWN_METASYNC
: this will do checkpoint before shutdown.
3. FS_GOING_DOWN_NOSYNC
: this will trigger shutdown as is.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
fs/xfs/libxfs/xfs_fs.h | 8 ++++----
include/uapi/linux/fs.h | 8 ++++++++
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721..fe0eeee 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -484,9 +484,9 @@ typedef struct xfs_swapext
/*
* Flags for going down operation
*/
-#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
-#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
-#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+#define XFS_FSOP_GOING_FLAGS_DEFAULT FS_GOING_DOWN_FULLSYNC
+#define XFS_FSOP_GOING_FLAGS_LOGFLUSH FS_GOING_DOWN_METASYNC
+#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH FS_GOING_DOWN_NOSYNC
/*
* ioctl commands that are used by Linux filesystems
@@ -555,7 +555,7 @@ typedef struct xfs_swapext
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
-#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t)
+#define XFS_IOC_GOINGDOWN FS_IOC_SHUTDOWN
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..a4e4be5 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -157,6 +157,7 @@ struct inodes_stat_t {
#define FIFREEZE _IOWR('X', 119, int) /* Freeze */
#define FITHAW _IOWR('X', 120, int) /* Thaw */
#define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */
+#define FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */
#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
@@ -205,4 +206,11 @@ struct inodes_stat_t {
#define SYNC_FILE_RANGE_WRITE 2
#define SYNC_FILE_RANGE_WAIT_AFTER 4
+/*
+ * Flags for going down operation used by FS_IOC_GOINGDOWN
+ */
+#define FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
+#define FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
+#define FS_GOING_DOWN_NOSYNC 0x2 /* going down */
+
#endif /* _UAPI_LINUX_FS_H */
--
2.1.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related
* Re: [Patch v4] firmware: dmi-sysfs: add SMBIOS entry point area attribute
From: Ivan Khoronzhuk @ 2015-02-12 2:33 UTC (permalink / raw)
To: Matt Fleming
Cc: Grant Likely, Ard Biesheuvel,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-api-u79uwXL29TY76Z2rM5mHXA,
linux-doc-u79uwXL29TY76Z2rM5mHXA, Leif Lindholm, Mark Salter
In-Reply-To: <20150211144321.GB4665-mF/unelCI9GS6iBeEJttW/XRex20P6io@public.gmane.org>
[-- Attachment #1: Type: text/plain, Size: 1123 bytes --]
On 02/11/2015 04:43 PM, Matt Fleming wrote:
> On Wed, 11 Feb, at 02:17:03PM, Matt Fleming wrote:
>> On Tue, 10 Feb, at 11:51:44AM, Ivan Khoronzhuk wrote:
>>> If you are Ok with this patch, could you please pickup it?
>> Applied, thanks Ivan!
> Btw this patch doesn't apply cleanly, the reject looks like this,
>
> --- drivers/firmware/dmi_scan.c
> +++ drivers/firmware/dmi_scan.c
> @@ -537,6 +543,8 @@
> dmi_ver &= 0xFFFFFF;
The problem is in above string.
I used linux next, but I had one patch before.
Sorry... just forgot about it.
I've attached the same patch but on top of linux_next,
it can be applied cleanly.
Sorry once again and thanks!
> dmi_len = get_unaligned_le32(buf + 12);
> dmi_base = get_unaligned_le64(buf + 16);
> + smbios_header_size = buf[6];
> + memcpy(smbios_header, buf, smbios_header_size);
>
> /*
> * The 64-bit SMBIOS 3.0 entry point no longer has a field
>
> What version of the kernel did you base this patch on? The conflict is
> trivial to fixup and I've done so and pushed it out on the EFI 'next'
> branch, but I wanted to call out this conflict explicitly.
>
[-- Attachment #2: 0001-firmware-dmi-sysfs-add-SMBIOS-entry-point-area-attri.patch --]
[-- Type: text/x-patch, Size: 6728 bytes --]
>From b64185ddfb0b704f83324d8b0c6be7f7ee951581 Mon Sep 17 00:00:00 2001
From: Ivan Khoronzhuk <ivan.khoronzhuk-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
Date: Tue, 30 Dec 2014 02:58:12 +0200
Subject: [PATCH] firmware: dmi-sysfs: add SMBIOS entry point area attribute
Some utils, like dmidecode and smbios, needs to access SMBIOS entry
table area in order to get information like SMBIOS version, size, etc.
Currently it's done via /dev/mem. But for situation when /dev/mem
usage is disabled, the utils have to use dmi sysfs instead, which
doesn't represent SMBIOS entry. So this patch adds SMBIOS area to
dmi-sysfs in order to allow utils in question to work correctly with
dmi sysfs interface.
Reviewed-by: Ard Biesheuvel <ard.biesheuvel-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
Documentation/ABI/testing/sysfs-firmware-dmi | 10 +++++++
drivers/firmware/dmi-sysfs.c | 42 ++++++++++++++++++++++++++++
drivers/firmware/dmi_scan.c | 26 +++++++++++++++++
include/linux/dmi.h | 3 ++
4 files changed, 81 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi b/Documentation/ABI/testing/sysfs-firmware-dmi
index c78f9ab..3a9ffe8 100644
--- a/Documentation/ABI/testing/sysfs-firmware-dmi
+++ b/Documentation/ABI/testing/sysfs-firmware-dmi
@@ -12,6 +12,16 @@ Description:
cannot ensure that the data as exported to userland is
without error either.
+ The firmware provides DMI structures as a packed list of
+ data referenced by a SMBIOS table entry point. The SMBIOS
+ entry point contains general information, like SMBIOS
+ version, DMI table size, etc. The structure, content and
+ size of SMBIOS entry point is dependent on SMBIOS version.
+ That's why SMBIOS entry point is represented in dmi sysfs
+ like a raw attribute and is accessible via
+ /sys/firmware/dmi/smbios_raw_header. The format of SMBIOS
+ entry point header can be read in SMBIOS specification.
+
DMI is structured as a large table of entries, where
each entry has a common header indicating the type and
length of the entry, as well as a firmware-provided
diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
index e0f1cb3..9b396d7 100644
--- a/drivers/firmware/dmi-sysfs.c
+++ b/drivers/firmware/dmi-sysfs.c
@@ -29,6 +29,8 @@
#define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
the top entry type is only 8 bits */
+static const u8 *smbios_raw_header;
+
struct dmi_sysfs_entry {
struct dmi_header dh;
struct kobject kobj;
@@ -646,9 +648,37 @@ static void cleanup_entry_list(void)
}
}
+static ssize_t smbios_entry_area_raw_read(struct file *filp,
+ struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ ssize_t size;
+
+ size = bin_attr->size;
+
+ if (size > pos)
+ size -= pos;
+ else
+ return 0;
+
+ if (count < size)
+ size = count;
+
+ memcpy(buf, &smbios_raw_header[pos], size);
+
+ return size;
+}
+
+static struct bin_attribute smbios_raw_area_attr = {
+ .read = smbios_entry_area_raw_read,
+ .attr = {.name = "smbios_raw_header", .mode = 0400},
+};
+
static int __init dmi_sysfs_init(void)
{
int error = -ENOMEM;
+ int size;
int val;
/* Set up our directory */
@@ -669,6 +699,18 @@ static int __init dmi_sysfs_init(void)
goto err;
}
+ smbios_raw_header = dmi_get_smbios_entry_area(&size);
+ if (!smbios_raw_header) {
+ pr_debug("dmi-sysfs: SMBIOS raw data is not available.\n");
+ error = -EINVAL;
+ goto err;
+ }
+
+ /* Create the raw binary file to access the entry area */
+ smbios_raw_area_attr.size = size;
+ if (sysfs_create_bin_file(dmi_kobj, &smbios_raw_area_attr))
+ goto err;
+
pr_debug("dmi-sysfs: loaded.\n");
return 0;
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index c5f7b4e..d55c712 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -113,6 +113,8 @@ static void dmi_table(u8 *buf, int len, int num,
}
}
+static u8 smbios_header[32];
+static int smbios_header_size;
static phys_addr_t dmi_base;
static u16 dmi_len;
static u16 dmi_num;
@@ -474,6 +476,8 @@ static int __init dmi_present(const u8 *buf)
if (memcmp(buf, "_SM_", 4) == 0 &&
buf[5] < 32 && dmi_checksum(buf, buf[5])) {
smbios_ver = get_unaligned_be16(buf + 6);
+ smbios_header_size = buf[5];
+ memcpy(smbios_header, buf, smbios_header_size);
/* Some BIOS report weird SMBIOS version, fix that up */
switch (smbios_ver) {
@@ -505,6 +509,8 @@ static int __init dmi_present(const u8 *buf)
pr_info("SMBIOS %d.%d present.\n",
dmi_ver >> 8, dmi_ver & 0xFF);
} else {
+ smbios_header_size = 15;
+ memcpy(smbios_header, buf, smbios_header_size);
dmi_ver = (buf[14] & 0xF0) << 4 |
(buf[14] & 0x0F);
pr_info("Legacy DMI %d.%d present.\n",
@@ -530,6 +536,8 @@ static int __init dmi_smbios3_present(const u8 *buf)
dmi_ver = get_unaligned_be16(buf + 7);
dmi_len = get_unaligned_le32(buf + 12);
dmi_base = get_unaligned_le64(buf + 16);
+ smbios_header_size = buf[6];
+ memcpy(smbios_header, buf, smbios_header_size);
/*
* The 64-bit SMBIOS 3.0 entry point no longer has a field
@@ -941,3 +949,21 @@ void dmi_memdev_name(u16 handle, const char **bank, const char **device)
}
}
EXPORT_SYMBOL_GPL(dmi_memdev_name);
+
+/**
+ * dmi_get_smbios_entry_area - copy SMBIOS entry point area to array.
+ * @size - pointer to assign actual size of SMBIOS entry point area.
+ *
+ * returns NULL if table is not available, otherwise returns pointer on
+ * SMBIOS entry point area array.
+ */
+const u8 *dmi_get_smbios_entry_area(int *size)
+{
+ if (!smbios_header_size || !dmi_available)
+ return NULL;
+
+ *size = smbios_header_size;
+
+ return smbios_header;
+}
+EXPORT_SYMBOL_GPL(dmi_get_smbios_entry_area);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index f820f0a..8e1a28d 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -109,6 +109,7 @@ extern int dmi_walk(void (*decode)(const struct dmi_header *, void *),
void *private_data);
extern bool dmi_match(enum dmi_field f, const char *str);
extern void dmi_memdev_name(u16 handle, const char **bank, const char **device);
+const u8 *dmi_get_smbios_entry_area(int *size);
#else
@@ -140,6 +141,8 @@ static inline void dmi_memdev_name(u16 handle, const char **bank,
const char **device) { }
static inline const struct dmi_system_id *
dmi_first_match(const struct dmi_system_id *list) { return NULL; }
+static inline const u8 *dmi_get_smbios_entry_area(int *size)
+ { return NULL; }
#endif
--
1.9.1
^ permalink raw reply related
* Re: [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Hekuang @ 2015-02-12 4:58 UTC (permalink / raw)
To: Alexei Starovoitov, Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, Linux API, Network Development, LKML,
Linus Torvalds, Peter Zijlstra, Eric W. Biederman,
wangnan0-hv44wF8Li93QT0dZR+AlfA
In-Reply-To: <CAMEtUuzY_Po=WtFEFg1aqzJ8dEF4rHGcWDsaS44KYgACMNPPgA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
>> eBPF is very flexible, which means it is bound to have someone use it
>> in a way you never dreamed of, and that will be what bites you in the
>> end (pun intended).
> understood :)
> let's start slow then with bpf+syscall and bpf+kprobe only.
I think BPF + system calls/kprobes can meet our use case
(https://lkml.org/lkml/2015/2/6/44), but there're some issues to be
improved.
I suggest that you can improve bpf+kprobes when attached to function
headers(or TRACE_MARKERS), make it converts pt-regs to bpf_ctx->arg1,
arg2.., then top models and architectures can be separated by bpf.
BPF bytecode is cross-platform, but what we can get by using bpf+kprobes
is a 'regs->rdx' kind of information, such information is both
architecture and kernel version related.
We hope to establish some models for describing kernel procedures such
as IO and network, which requires that it does not rely on architecture
and does not rely to a specific kernel version as much as possible.
^ permalink raw reply
* Re: [PATCH] tpm, tpm_tis: fix TPM 2.0 probing
From: Jarkko Sakkinen @ 2015-02-12 5:25 UTC (permalink / raw)
To: Stefan Berger
Cc: Peter Hüwe, Ashley Lai, Marcel Selhorst,
tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, josh-iaAMLnmF4UmaiuxdJuQwMA,
christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
linux-api-u79uwXL29TY76Z2rM5mHXA,
trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <54DBA3A5.7090306-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
On Wed, Feb 11, 2015 at 01:47:01PM -0500, Stefan Berger wrote:
> On 02/10/2015 07:50 AM, Jarkko Sakkinen wrote:
> >On Tue, Feb 10, 2015 at 07:16:32AM -0500, Stefan Berger wrote:
> >>On 02/09/2015 03:39 AM, Jarkko Sakkinen wrote:
> >>>On Mon, Feb 09, 2015 at 12:08:46AM +0100, Peter Hüwe wrote:
> >>>>Am Mittwoch, 4. Februar 2015, 15:21:09 schrieb Jarkko Sakkinen:
> >>>>>If during transmission system error was returned, the logic was to
> >>>>>incorrectly deduce that chip is a TPM 1.x chip. This patch fixes this
> >>>>>issue. Also, this patch changes probing so that message tag is used as the
> >>>>>measure for TPM 2.x, which should be much more stable.
> >>>>Is it aware that some TPMs may respond with 0x00C1 as TAG for TPM1.2 commands?
> >>>I guess none of the TPM 1.2 command answer with the tag 0x8002?
> >>
> >>FYI: pdf page 26 , section 6.1 explains the predictable return value for a
> >>TPM1.2 command seen by a TPM2
> >>
> >>http://www.trustedcomputinggroup.org/files/static_page_files/8C68ADA8-1A4B-B294-D0FC06D3773F7DAA/TPM%20Rev%202.0%20Part%203%20-%20Commands%2001.16-code.pdf
> >>
> >>Following this:
> >>
> >>Sending a TPM1.2 command to a TPM2 should return a TPM1.2 header (tag =
> >>0xc4) and error code (TPM_BADTAG = 0x1e)
> >>
> >>Sending a TPM 2 command to a TPM 2 will give a TPM 2 tag in the header.
> >>Sending a TPM 2 command to a TPM 1.2 will give a TPM 1.2 tag in the header
> >>and an error code.
> >Thank you for the information. Do you think that for some reason
> >tpm2_probe() shoould instead check that value is not this error
> >instead of checking that tag is 0x80002?
>
> Following your path, you are checking for TPM2_ST_NO_SESSION (0x8001), which
> looks correct to me. A TPM1.2 would never send this tag back.
OK, perfect :)
> Stefan
/Jarkko
^ permalink raw reply
* Re: [PATCH 2/5] Documentation/ABI: Add file describing the sysfs entries for toshiba_acpi
From: Darren Hart @ 2015-02-12 5:38 UTC (permalink / raw)
To: Azael Avalos
Cc: platform-driver-x86-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423637040-6813-3-git-send-email-coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
On Tue, Feb 10, 2015 at 11:43:57PM -0700, Azael Avalos wrote:
> This patch adds a new file describing the sysfs entries for the
> toshiba_acpi driver.
>
+linux-api list
I've queued this for 3.20 through platform-drivers-x86. Any objections?
> Signed-off-by: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
> .../ABI/testing/sysfs-driver-toshiba_acpi | 114 +++++++++++++++++++++
> 1 file changed, 114 insertions(+)
> create mode 100644 Documentation/ABI/testing/sysfs-driver-toshiba_acpi
>
> diff --git a/Documentation/ABI/testing/sysfs-driver-toshiba_acpi b/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
> new file mode 100644
> index 0000000..ca9c71a
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
> @@ -0,0 +1,114 @@
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_backlight_mode
> +Date: June 8, 2014
> +KernelVersion: 3.15
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file controls the keyboard backlight operation mode, valid
> + values are:
> + * 0x1 -> FN-Z
> + * 0x2 -> AUTO (also called TIMER)
> + * 0x8 -> ON
> + * 0x10 -> OFF
> + Note that the kernel 3.16 onwards this file accepts all listed
> + parameters, kernel 3.15 only accepts the first two (FN-Z and
> + AUTO).
> +Users: KToshiba
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_backlight_timeout
> +Date: June 8, 2014
> +KernelVersion: 3.15
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file controls the timeout of the keyboard backlight
> + whenever the operation mode is set to AUTO (or TIMER),
> + valid values range from 0-60.
> + Note that the kernel 3.15 only had support for the first
> + keyboard type, the kernel 3.16 added support for the second
> + type and the range accepted for type 2 is 1-60.
> + See the entry named "kbd_type"
> +Users: KToshiba
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/position
> +Date: June 8, 2014
> +KernelVersion: 3.15
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file shows the absolute position of the built-in
> + accelereometer.
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/touchpad
> +Date: June 8, 2014
> +KernelVersion: 3.15
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This files controls the status of the touchpad and pointing
> + stick (if available), valid values are:
> + * 0 -> OFF
> + * 1 -> ON
> +Users: KToshiba
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/available_kbd_modes
> +Date: August 3, 2014
> +KernelVersion: 3.16
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file shows the supported keyboard backlight modes
> + the system supports, which can be:
> + * 0x1 -> FN-Z
> + * 0x2 -> AUTO (also called TIMER)
> + * 0x8 -> ON
> + * 0x10 -> OFF
> + Note that not all keyboard types support the listed modes.
> + See the entry named "available_kbd_modes"
> +Users: KToshiba
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_type
> +Date: August 3, 2014
> +KernelVersion: 3.16
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file shows the current keyboard backlight type,
> + which can be:
> + * 1 -> Type 1, supporting modes FN-Z and AUTO
> + * 2 -> Type 2, supporting modes TIMER, ON and OFF
> +Users: KToshiba
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/version
> +Date: February, 2015
> +KernelVersion: 3.20
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file shows the current version of the driver
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/fan
> +Date: February, 2015
> +KernelVersion: 3.20
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file controls the state of the internal fan, valid
> + values are:
> + * 0 -> OFF
> + * 1 -> ON
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_function_keys
> +Date: February, 2015
> +KernelVersion: 3.20
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file controls the Special Functions (hotkeys) operation
> + mode, valid values are:
> + * 0 -> Normal Operation
> + * 1 -> Special Functions
> + In the "Normal Operation" mode, the F{1-12} keys are as usual
> + and the hotkeys are accessed via FN-F{1-12}.
> + In the "Special Functions" mode, the F{1-12} keys trigger the
> + hotkey and the F{1-12} keys are accessed via FN-F{1-12}.
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/panel_power_on
> +Date: February, 2015
> +KernelVersion: 3.20
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file controls whether the laptop should turn ON whenever
> + the LID is opened, valid values are:
> + * 0 -> Disabled
> + * 1 -> Enabled
> +
> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/usb_three
> +Date: February, 2015
> +KernelVersion: 3.20
> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> +Description: This file controls whether the USB 3 functionality, valid
> + values are:
> + * 0 -> Disabled (Acts as a regular USB 2)
> + * 1 -> Enabled (Full USB 3 functionality)
> --
> 2.2.2
>
>
--
Darren Hart
Intel Open Source Technology Center
^ permalink raw reply
* Re: [PATCH 2/5] Documentation/ABI: Add file describing the sysfs entries for toshiba_acpi
From: Azael Avalos @ 2015-02-12 5:42 UTC (permalink / raw)
To: Darren Hart
Cc: platform-driver-x86-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150212053807.GD30527-JIrPihikzLEQaXB9iyTzyw@public.gmane.org>
Hi Darren,
2015-02-11 22:38 GMT-07:00 Darren Hart <dvhart-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>:
> On Tue, Feb 10, 2015 at 11:43:57PM -0700, Azael Avalos wrote:
>> This patch adds a new file describing the sysfs entries for the
>> toshiba_acpi driver.
>>
>
> +linux-api list
Just CC me, because I'm not subscribed to that list, in case something
needs to be changed.
>
> I've queued this for 3.20 through platform-drivers-x86. Any objections?
Not at all, and given that this patch is managed on another list,
let's see if there are any problems with this file.
Cheers
Azael
>
>
>> Signed-off-by: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> ---
>> .../ABI/testing/sysfs-driver-toshiba_acpi | 114 +++++++++++++++++++++
>> 1 file changed, 114 insertions(+)
>> create mode 100644 Documentation/ABI/testing/sysfs-driver-toshiba_acpi
>>
>> diff --git a/Documentation/ABI/testing/sysfs-driver-toshiba_acpi b/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
>> new file mode 100644
>> index 0000000..ca9c71a
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
>> @@ -0,0 +1,114 @@
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_backlight_mode
>> +Date: June 8, 2014
>> +KernelVersion: 3.15
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file controls the keyboard backlight operation mode, valid
>> + values are:
>> + * 0x1 -> FN-Z
>> + * 0x2 -> AUTO (also called TIMER)
>> + * 0x8 -> ON
>> + * 0x10 -> OFF
>> + Note that the kernel 3.16 onwards this file accepts all listed
>> + parameters, kernel 3.15 only accepts the first two (FN-Z and
>> + AUTO).
>> +Users: KToshiba
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_backlight_timeout
>> +Date: June 8, 2014
>> +KernelVersion: 3.15
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file controls the timeout of the keyboard backlight
>> + whenever the operation mode is set to AUTO (or TIMER),
>> + valid values range from 0-60.
>> + Note that the kernel 3.15 only had support for the first
>> + keyboard type, the kernel 3.16 added support for the second
>> + type and the range accepted for type 2 is 1-60.
>> + See the entry named "kbd_type"
>> +Users: KToshiba
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/position
>> +Date: June 8, 2014
>> +KernelVersion: 3.15
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file shows the absolute position of the built-in
>> + accelereometer.
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/touchpad
>> +Date: June 8, 2014
>> +KernelVersion: 3.15
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This files controls the status of the touchpad and pointing
>> + stick (if available), valid values are:
>> + * 0 -> OFF
>> + * 1 -> ON
>> +Users: KToshiba
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/available_kbd_modes
>> +Date: August 3, 2014
>> +KernelVersion: 3.16
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file shows the supported keyboard backlight modes
>> + the system supports, which can be:
>> + * 0x1 -> FN-Z
>> + * 0x2 -> AUTO (also called TIMER)
>> + * 0x8 -> ON
>> + * 0x10 -> OFF
>> + Note that not all keyboard types support the listed modes.
>> + See the entry named "available_kbd_modes"
>> +Users: KToshiba
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_type
>> +Date: August 3, 2014
>> +KernelVersion: 3.16
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file shows the current keyboard backlight type,
>> + which can be:
>> + * 1 -> Type 1, supporting modes FN-Z and AUTO
>> + * 2 -> Type 2, supporting modes TIMER, ON and OFF
>> +Users: KToshiba
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/version
>> +Date: February, 2015
>> +KernelVersion: 3.20
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file shows the current version of the driver
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/fan
>> +Date: February, 2015
>> +KernelVersion: 3.20
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file controls the state of the internal fan, valid
>> + values are:
>> + * 0 -> OFF
>> + * 1 -> ON
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_function_keys
>> +Date: February, 2015
>> +KernelVersion: 3.20
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file controls the Special Functions (hotkeys) operation
>> + mode, valid values are:
>> + * 0 -> Normal Operation
>> + * 1 -> Special Functions
>> + In the "Normal Operation" mode, the F{1-12} keys are as usual
>> + and the hotkeys are accessed via FN-F{1-12}.
>> + In the "Special Functions" mode, the F{1-12} keys trigger the
>> + hotkey and the F{1-12} keys are accessed via FN-F{1-12}.
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/panel_power_on
>> +Date: February, 2015
>> +KernelVersion: 3.20
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file controls whether the laptop should turn ON whenever
>> + the LID is opened, valid values are:
>> + * 0 -> Disabled
>> + * 1 -> Enabled
>> +
>> +What: /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/usb_three
>> +Date: February, 2015
>> +KernelVersion: 3.20
>> +Contact: Azael Avalos <coproscefalo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> +Description: This file controls whether the USB 3 functionality, valid
>> + values are:
>> + * 0 -> Disabled (Acts as a regular USB 2)
>> + * 1 -> Enabled (Full USB 3 functionality)
>> --
>> 2.2.2
>>
>>
>
> --
> Darren Hart
> Intel Open Source Technology Center
--
-- El mundo apesta y vosotros apestais tambien --
^ permalink raw reply
* Re: [PATCH v5] perf: Use monotonic clock as a source for timestamps
From: Adrian Hunter @ 2015-02-12 10:04 UTC (permalink / raw)
To: Peter Zijlstra, Pawel Moll
Cc: ajh mls, Richard Cochran, Steven Rostedt, Ingo Molnar,
Paul Mackerras, Arnaldo Carvalho de Melo, John Stultz,
Masami Hiramatsu, Christopher Covington, Namhyung Kim,
David Ahern, Thomas Gleixner, Tomeu Vizoso,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20150211161256.GH2896-IIpfhp3q70z/8w/KjCw3T+5/BudmfyzbbVWyRVo5IupeoWH0uzbU5w@public.gmane.org>
On 11/02/15 18:12, Peter Zijlstra wrote:
>
> How about something like the below? I _think_ it should mostly work for
> x86, where the tsc is a 64bit wide cycle counter.
It would have to be based on CLOCK_MONOTONIC_RAW not CLOCK_MONOTONIC and you
would have to check the clocksource is TSC.
Why is CLOCK_MONOTONIC preferred anyway - I would have thought any
adjustment would skew performance timings?
>
> I suppose we should extend the perf userpage time data with
> time_last_cycle and time_mask if/when we want to make this work on
> something with a short counter.
>
> Of course, at that time we also need to somehow deal with that counter
> wrapping, its hardly practical to go iterate all possible userpg
> instances from a timer handler.
>
>
> ---
> Documentation/kernel-parameters.txt | 9 +++++++
> arch/x86/kernel/cpu/perf_event.c | 44 ++++++++++++++++++++++++---------
> include/linux/perf_event.h | 6 +++++
> kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++---
> kernel/time/timekeeping.c | 30 +++++++++++++++++++++++
> 5 files changed, 123 insertions(+), 15 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 176d4fe4f076..52255676b6e2 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -91,6 +91,7 @@ the beginning of each description states the restrictions within which a
> NUMA NUMA support is enabled.
> NFS Appropriate NFS support is enabled.
> OSS OSS sound support is enabled.
> + PERF Performance events and counters support is enabled.
> PV_OPS A paravirtualized kernel is enabled.
> PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
> PARISC The PA-RISC architecture is enabled.
> @@ -2796,6 +2797,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> allocator. This parameter is primarily for debugging
> and performance comparison.
>
> + perf_use_local_clock
> + [PERF]
> + Use local_clock() as a source for perf timestamps
> + generation. This was be the default behaviour and
> + this parameter can be used to maintain backward
> + compatibility or on older hardware with expensive
> + monotonic clock source.
> +
> pf. [PARIDE]
> See Documentation/blockdev/paride.txt.
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index b71a7f86d68a..436a66632f76 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -1952,6 +1952,35 @@ static struct pmu pmu = {
> .flush_branch_stack = x86_pmu_flush_branch_stack,
> };
>
> +static void local_clock_user_time(struct perf_event_mmap_page *userpg, u64 now)
> +{
> + data = cyc2ns_read_begin();
> +
> + userpg->cap_user_time = 1;
> + userpg->time_mult = data->cyc2ns_mul;
> + userpg->time_shift = data->cyc2ns_shift;
> + userpg->time_offset = data->cyc2ns_offset - now;
> +
> + userpg->cap_user_time_zero = 1;
> + userpg->time_zero = data->cyc2ns_offset;
> +
> + cyc2ns_read_end(data);
> +}
> +
> +extern void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift);
> +
> +static void ktime_fast_mono_user_time(struct perf_event_mmap_page *userpg, u64 now)
> +{
> + userpg->cap_user_time = 1;
> + userpg->cap_user_time_zero = 1;
> +
> + __ktime_get_mono_fast(&userpg->time_zero,
> + &userpg->time_mult,
> + &userpg->time_shift);
> +
> + userpg->offset = userpg->time_zero - now;
> +}
> +
> void arch_perf_update_userpage(struct perf_event *event,
> struct perf_event_mmap_page *userpg, u64 now)
> {
> @@ -1966,17 +1995,10 @@ void arch_perf_update_userpage(struct perf_event *event,
> if (!sched_clock_stable())
> return;
>
> - data = cyc2ns_read_begin();
> -
> - userpg->cap_user_time = 1;
> - userpg->time_mult = data->cyc2ns_mul;
> - userpg->time_shift = data->cyc2ns_shift;
> - userpg->time_offset = data->cyc2ns_offset - now;
> -
> - userpg->cap_user_time_zero = 1;
> - userpg->time_zero = data->cyc2ns_offset;
> -
> - cyc2ns_read_end(data);
> + if (static_key_false(&perf_use_local_clock_key))
> + local_clock_user_time(userpg, now);
> + else
> + ktime_fast_mono_user_time(userpg, now);
> }
>
> /*
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 33262004c310..1d61f968113a 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -561,6 +561,12 @@ extern void perf_pmu_enable(struct pmu *pmu);
> extern int perf_event_task_disable(void);
> extern int perf_event_task_enable(void);
> extern int perf_event_refresh(struct perf_event *event, int refresh);
> +
> +extern struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE;
> +extern void __weak
> +arch_perf_update_userpage(struct perf_event *event,
> + struct perf_event_mmap_page *userpg, u64 now);
> +
> extern void perf_event_update_userpage(struct perf_event *event);
> extern int perf_event_release_kernel(struct perf_event *event);
> extern struct perf_event *
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 13209a90b751..7bad385103ea 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -42,6 +42,8 @@
> #include <linux/module.h>
> #include <linux/mman.h>
> #include <linux/compat.h>
> +#include <linux/sysctl.h>
> +#include <linux/jump_label.h>
>
> #include "internal.h"
>
> @@ -322,9 +324,43 @@ extern __weak const char *perf_pmu_name(void)
> return "pmu";
> }
>
> +struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE;
> +static bool perf_use_local_clock_param __initdata;
> +static int __init perf_use_local_clock_setup(char *__unused)
> +{
> + perf_use_local_clock_param = true;
> + return 1;
> +}
> +__setup("perf_use_local_clock", perf_use_local_clock_setup);
> +
> +static int sysctl_perf_sample_time_clk_id = CLOCK_MONOTONIC;
> +
> +static struct ctl_table perf_sample_time_kern_table[] = {
> + {
> + .procname = "perf_sample_time_clk_id",
> + .data = &sysctl_perf_sample_time_clk_id,
> + .maxlen = sizeof(int),
> + .mode = 0444,
> + .proc_handler = proc_dointvec,
> + },
> + {}
> +};
> +
> +static struct ctl_table perf_sample_time_root_table[] = {
> + {
> + .procname = "kernel",
> + .mode = 0555,
> + .child = perf_sample_time_kern_table,
> + },
> + {}
> +};
> +
> static inline u64 perf_clock(void)
> {
> - return local_clock();
> + if (static_key_false(&perf_use_local_clock_key))
> + return local_clock();
> + else
> + return ktime_get_mono_fast_ns();
> }
>
> static inline struct perf_cpu_context *
> @@ -4101,8 +4137,8 @@ static void perf_event_init_userpage(struct perf_event *event)
> rcu_read_unlock();
> }
>
> -void __weak arch_perf_update_userpage(
> - struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
> +void __weak arch_perf_update_userpage(struct perf_event *event,
> + struct perf_event_mmap_page *userpg, u64 now)
> {
> }
>
> @@ -4487,7 +4523,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> if (vma->vm_flags & VM_WRITE)
> flags |= RING_BUFFER_WRITABLE;
>
> - rb = rb_alloc(nr_pages,
> + rb = rb_alloc(nr_pages,
> event->attr.watermark ? event->attr.wakeup_watermark : 0,
> event->cpu, flags);
>
> @@ -8516,6 +8552,11 @@ void __init perf_event_init(void)
> */
> BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
> != 1024);
> +
> + if (perf_use_local_clock_param)
> + static_key_slow_inc(&perf_use_local_clock_key);
> + else
> + register_sysctl_table(perf_sample_time_root_table);
> }
>
> static int __init perf_event_sysfs_init(void)
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index b124af259800..37bed5931a91 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -334,6 +334,36 @@ u64 notrace ktime_get_mono_fast_ns(void)
> }
> EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
>
> +void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift)
> +{
> + struct tk_read_base *tkr;
> + unsigned int seq;
> + cycle_t cycle_now, delta;
> + u64 nsecs, now;
> +
> + do {
> + seq = raw_read_seqcount(&tk_fast_mono.seq);
> + tkr = tk_fast_mono.base + (seq & 0x01);
> +
> + cycle_now = tkr->read(tkr->clock);
> + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
> +
> + nsec = delta * tkr->mult + tkr->xtime_nsec;
> + nsec >>= tkr->shift;
> + nsec += arch_gettimeoffset();
> +
> + now = ktime_to_ns(tkr->base_mono) + nsec;
> +
> + *mult = tkr->mult;
> + *shift = tkr->shift;
> +
> + nsec = mul_u64_u32_shr(cycle_now, tkr->mult, tkr->shift);
> +
> + *offset = now - nsec;
> +
> + } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
> +}
> +
> #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
>
> static inline void update_vsyscall(struct timekeeper *tk)
>
>
^ permalink raw reply
* Re: [PATCH 2/2] user_namespaces.7: Update the documention to reflect the fixes for negative groups
From: Michael Kerrisk (man-pages) @ 2015-02-12 10:11 UTC (permalink / raw)
To: Eric W. Biederman
Cc: mtk.manpages, Linux Containers, Josh Triplett, Andrew Morton,
Kees Cook, Linux API, linux-man, linux-kernel@vger.kernel.org,
LSM, Casey Schaufler, Serge E. Hallyn, Richard Weinberger,
Kenton Varda, stable, Andy Lutomirski
In-Reply-To: <87egpwk0n3.fsf@x220.int.ebiederm.org>
On 02/11/2015 03:01 PM, Eric W. Biederman wrote:
> "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com> writes:
>
>> Hi Eric,
>>
>> Ping!
>>
>> Cheers,
>>
>> Michael
>>
>>
>> On 2 February 2015 at 16:37, Michael Kerrisk (man-pages)
>> <mtk.manpages@gmail.com> wrote:
>>> Hi Eric,
>>>
>>> Thanks for writing this up!
>>>
>>> On 12/12/2014 10:54 PM, Eric W. Biederman wrote:
>>>>
>>>> Files with access permissions such as ---rwx---rwx give fewer
>>>> permissions to their group then they do to everyone else. Which means
>>>> dropping groups with setgroups(0, NULL) actually grants a process
>>>> privileges.
>>>>
>>>> The uprivileged setting of gid_map turned out not to be safe after
> ^^^^^^^^^^^
> unprivileged -- typo fix
Thanks for confirming.
>>>> this change. Privilege setting of gid_map can be interpreted as
>>>> meaning yes it is ok to drop groups.
>>>
>>> I had trouble to parse that sentence (and I'd like to make sure that
>>> the right sentence ends up in the commit message). Did you mean:
>>>
>>> "*Unprivileged* setting of gid_map can be interpreted as meaning
>>> yes it is ok to drop groups"
>>> ?
>>>
>>> Or something else?
>
>
> I meant: Setting of gid_map with privilege has been clarified to mean
> that dropping groups is ok. This allows existing programs that set
> gid_map with privilege to work without changes. That is newgidmap
> continues to work unchanged.
Thanks. I added that text to the changelog message.
Cheers,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
^ permalink raw reply
* Re: [PATCH v5] perf: Use monotonic clock as a source for timestamps
From: Peter Zijlstra @ 2015-02-12 10:28 UTC (permalink / raw)
To: Adrian Hunter
Cc: Pawel Moll, ajh mls, Richard Cochran, Steven Rostedt, Ingo Molnar,
Paul Mackerras, Arnaldo Carvalho de Melo, John Stultz,
Masami Hiramatsu, Christopher Covington, Namhyung Kim,
David Ahern, Thomas Gleixner, Tomeu Vizoso,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <54DC7AC6.5010605-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
On Thu, Feb 12, 2015 at 12:04:54PM +0200, Adrian Hunter wrote:
> On 11/02/15 18:12, Peter Zijlstra wrote:
> >
> > How about something like the below? I _think_ it should mostly work for
> > x86, where the tsc is a 64bit wide cycle counter.
>
> It would have to be based on CLOCK_MONOTONIC_RAW not CLOCK_MONOTONIC
Why?
> and you would have to check the clocksource is TSC.
It implicitly does that; it has that sched_clock_stable() thing, but
yeah I suppose someone could change the clocksource even though the tsc
is stable.
Not using TSC when its available is quite crazy though.. but sure.
> Why is CLOCK_MONOTONIC preferred anyway - I would have thought any
> adjustment would skew performance timings?
Because you can do inter-machine stuff with MONOTONIC and that's
entirely impossible with MONO_RAW.
^ permalink raw reply
* Re: [PATCH 1/2] proc.5: Document /proc/[pid]/setgroups
From: Michael Kerrisk (man-pages) @ 2015-02-12 13:53 UTC (permalink / raw)
To: Eric W. Biederman
Cc: linux-man, Kees Cook, Linux API, Linux Containers, Josh Triplett,
stable, linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Kenton Varda, LSM, mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
Richard Weinberger, Casey Schaufler, Andrew Morton,
Andy Lutomirski
In-Reply-To: <8761b8lfoz.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
Hello Eric,
On 02/11/2015 02:51 PM, Eric W. Biederman wrote:
> "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com> writes:
>
>> Hi Eric,
>>
>> Ping!
>>
>> Cheers,
>>
>> Michael
>
> My apologies. You description wasn't wrong but it may be a bit
> misleading, explanation below. You will have to figure out how to work
> that into your proposed text.
>
>> On 2 February 2015 at 16:36, Michael Kerrisk (man-pages)
>> <mtk.manpages@gmail.com> wrote:
>>> [Adding Josh to CC in case he has anything to add.]
>>>
>>> On 12/12/2014 10:54 PM, Eric W. Biederman wrote:
>>>>
>>>> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
>>>> ---
>>>> man5/proc.5 | 15 +++++++++++++++
>>>> 1 file changed, 15 insertions(+)
>>>>
>>>> diff --git a/man5/proc.5 b/man5/proc.5
>>>> index 96077d0dd195..d661e8cfeac9 100644
>>>> --- a/man5/proc.5
>>>> +++ b/man5/proc.5
>>>> @@ -1097,6 +1097,21 @@ are not available if the main thread has already terminated
>>>> .\" Added in 2.6.9
>>>> .\" CONFIG_SCHEDSTATS
>>>> .TP
>>>> +.IR /proc/[pid]/setgroups " (since Linux 3.19-rc1)"
>>>> +This file reports
>>>> +.BR allow
>>>> +if the setgroups system call is allowed in the current user namespace.
>>>> +This file reports
>>>> +.BR deny
>>>> +if the setgroups system call is not allowed in the current user namespace.
>>>> +This file may be written to with values of
>>>> +.BR allow
>>>> +and
>>>> +.BR deny
>>>> +before
>>>> +.IR /proc/[pid]/gid_map
>>>> +is written to (enabling setgroups) in a user namespace.
>>>> +.TP
>>>> .IR /proc/[pid]/smaps " (since Linux 2.6.14)"
>>>> This file shows memory consumption for each of the process's mappings.
>>>> (The
>>>
>>> Hi Eric,
>>>
>>> Thanks for this patch. I applied it, and then tried to work in
>>> quite a few other details gleaned from the source code and commit
>>> message, and Jon Corbet's article at http://lwn.net/Articles/626665/.
>>> Could you please let me know if the following is correct:
>
> It is close but it may be misleading.
>
>>> /proc/[pid]/setgroups (since Linux 3.19)
>>> This file displays the string "allow" if processes in
>>> the user namespace that contains the process pid are
>>> permitted to employ the setgroups(2) system call, and
>>> "deny" if setgroups(2) is not permitted in that user
>>> namespace.
>
> With the caveat that when gid_map is not set that setgroups is also not
> allowed.
Okay -- Iadded that point.
>>> A privileged process (one with the CAP_SYS_ADMIN capa‐
>>> bility in the namespace) may write either of the strings
>>> "allow" or "deny" to this file before writing a group ID
>>> mapping for this user namespace to the file
>>> /proc/[pid]/gid_map. Writing the string "deny" prevents
>>> any process in the user namespace from employing set‐
>>> groups(2).
>
> Or more succintly. You are allowed to write to /proc/[pid]/setgroups
> when calling setgroups is not allowed because gid_map is unset. This
> ensures we do not have any transitions from a state where setgroups
> is allowed to a state where setgroups is denied. There are only
> transitions from setgroups not-allowed to setgroups allowed.
And I've worked in the above point, rewording a bit along the way.
So, how does the following look (only the first two paragraphs have
changed)?
/proc/[pid]/setgroups (since Linux 3.19)
This file displays the string "allow" if processes in
the user namespace that contains the process pid are
permitted to employ the setgroups(2) system call, and
"deny" if setgroups(2) is not permitted in that user
namespace. (Note, however, that calls to setgroups(2)
are also not permitted if /proc/[pid]/gid_map has not
yet been set.)
A privileged process (one with the CAP_SYS_ADMIN capa‐
bility in the namespace) may write either of the strings
"allow" or "deny" to this file before writing a group ID
mapping for this user namespace to the file
/proc/[pid]/gid_map. Writing the string "deny" prevents
any process in the user namespace from employing set‐
groups(2). In other words, it is permitted to write to
/proc/[pid]/setgroups so long as calling setgroups(2) is
not allowed because /proc/[pid]gid_map has not been set.
This ensures that a process cannot transition from a
state where setgroups(2) is allowed to a state where
setgroups(2) is denied; a process can only trabsition
from setgroups(2) being disallowed to setgroups(2) being
allowed.
The default value of this file in the initial user
namespace is "allow".
Once /proc/[pid]/gid_map has been written to (which has
the effect of enabling setgroups(2) in the user names‐
pace), it is no longer possible to deny setgroups(2) by
writing to /proc/[pid]/setgroups.
A child user namespace inherits the /proc/[pid]/gid_map
setting from its parent.
If the setgroups file has the value "deny", then the
setgroups(2) system call can't subsequently be reenabled
(by writing "allow" to the file) in this user namespace.
This restriction also propagates down to all child user
namespaces of this user namespace.
Cheers,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/containers
^ permalink raw reply
* Re: [PATCH v5] perf: Use monotonic clock as a source for timestamps
From: Peter Zijlstra @ 2015-02-12 15:38 UTC (permalink / raw)
To: Adrian Hunter
Cc: Pawel Moll, ajh mls, Richard Cochran, Steven Rostedt, Ingo Molnar,
Paul Mackerras, Arnaldo Carvalho de Melo, John Stultz,
Masami Hiramatsu, Christopher Covington, Namhyung Kim,
David Ahern, Thomas Gleixner, Tomeu Vizoso,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20150212102814.GK2896-IIpfhp3q70z/8w/KjCw3T+5/BudmfyzbbVWyRVo5IupeoWH0uzbU5w@public.gmane.org>
On Thu, Feb 12, 2015 at 11:28:14AM +0100, Peter Zijlstra wrote:
> > and you would have to check the clocksource is TSC.
>
> It implicitly does that; it has that sched_clock_stable() thing, but
> yeah I suppose someone could change the clocksource even though the tsc
> is stable.
>
> Not using TSC when its available is quite crazy though.. but sure.
Something like this on top then.. it might have a few header issues, the
whole asm/tsc.h vs clocksource.h thing looks like pain.
I haven't tried to compile it, maybe we can move cycle_t into types and
fwd declare struct clocksource or whatnot.
Of course, all this is quite horrible on the timekeeping side; it might
be tglx and/or jstutlz are having spasms just reading it :-)
---
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1967,17 +1967,19 @@ static void local_clock_user_time(struct
cyc2ns_read_end(data);
}
-extern void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift);
+extern bool notrace __ktime_get_mono_fast(cycle_t (*read)(struct clocksource *cs),
+ u64 *offset, u32 *mult, u16 *shift);
static void ktime_fast_mono_user_time(struct perf_event_mmap_page *userpg, u64 now)
{
+ if (!__ktime_get_mono_fast(read_tsc, &userpg->time_zero,
+ &userpg->time_mult,
+ &userpg->time_shift))
+ return;
+
userpg->cap_user_time = 1;
userpg->cap_user_time_zero = 1;
- __ktime_get_mono_fast(&userpg->time_zero,
- &userpg->time_mult,
- &userpg->time_shift);
-
userpg->offset = userpg->time_zero - now;
}
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -334,7 +334,8 @@ u64 notrace ktime_get_mono_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
-void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift)
+bool notrace __ktime_get_mono_fast(cycle_t (*read)(struct clocksource *),
+ u64 *offset, u32 *mult, u16 *shift)
{
struct tk_read_base *tkr;
unsigned int seq;
@@ -345,6 +346,9 @@ void notrace __ktime_get_mono_fast(u64 *
seq = raw_read_seqcount(&tk_fast_mono.seq);
tkr = tk_fast_mono.base + (seq & 0x01);
+ if (tkr->read != read)
+ return false;
+
cycle_now = tkr->read(tkr->clock);
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -362,6 +366,8 @@ void notrace __ktime_get_mono_fast(u64 *
*offset = now - nsec;
} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+
+ return true;
}
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0e9cee..68e4039a58ea 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -32,6 +32,8 @@ static inline cycles_t get_cycles(void)
return ret;
}
+extern void cycle_t read_tsc(struct clocksource *);
+
static __always_inline cycles_t vget_cycles(void)
{
/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 505449700e0c..c580998f0160 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -965,7 +965,7 @@ static struct clocksource clocksource_tsc;
* checking the result of read_tsc() - cycle_last for being negative.
* That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
-static cycle_t read_tsc(struct clocksource *cs)
+cycle_t read_tsc(struct clocksource *cs)
{
return (cycle_t)get_cycles();
}
^ permalink raw reply related
* [PATCH 00/14] Add support to STMicroelectronics STM32 family
From: Maxime Coquelin @ 2015-02-12 17:45 UTC (permalink / raw)
To: Jonathan Corbet, Maxime Coquelin, Rob Herring, Pawel Moll,
Mark Rutland, Ian Campbell, Kumar Gala, Philipp Zabel,
Russell King, Daniel Lezcano, Thomas Gleixner, Linus Walleij,
Greg Kroah-Hartman, Jiri Slaby, Arnd Bergmann, Andrew Morton,
David S. Miller, Mauro Carvalho Chehab, Joe Perches,
Antti Palosaari, Tejun Heo, Will Deacon, Nikolay Borisov
This patchset adds basic support for STMicroelectronics STM32 series MCUs.
STM32 MCUs are Cortex-M CPU, used in various applications (consumer
electronics, industrial applications, hobbyists...).
Datasheets, user and programming manuals are publicly available on
STMicroelectronics website.
With this series applied, the STM32F419 Discovery can boot succesfully.
Once this series accepted, next steps will be to add DMA support, as USART,
I2C and SPI IPs don't have any FIFO. Then will come the clock driver, as today
the bootloader has to be patched to enable the needed clocks.
Maxime Coquelin (14):
scripts: link-vmlinux: Don't pass page offset to kallsyms if XIP
Kernel
ARM: ARMv7M: Enlarge vector table to 256 entries
clocksource: Add ARM System timer driver
reset: Add reset_controller_of_init() function
ARM: call reset_controller_of_init from default time_init handler
drivers: reset: Add STM32 reset driver
clockevent: Add STM32 Timer driver
pinctrl: Add pinctrl driver for STM32 MCUs
serial: stm32-usart: Add STM32 USART Driver
ARM: Add STM32 family machine
ARM: dts: Add ARM System timer as clockevent in armv7m
ARM: dts: Introduce STM32F429 MCU
ARM: configs: Add STM32 defconfig
MAINTAINERS: Add entry for STM32 MCUs
Documentation/arm/stm32/overview.txt | 32 +
Documentation/arm/stm32/stm32f429-overview.txt | 22 +
.../devicetree/bindings/arm/system_timer.txt | 15 +
.../devicetree/bindings/pinctrl/pinctrl-stm32.txt | 99 +++
.../devicetree/bindings/reset/st,stm32-reset.txt | 19 +
.../devicetree/bindings/serial/st,stm32-usart.txt | 18 +
.../devicetree/bindings/timer/st,stm32-timer.txt | 19 +
MAINTAINERS | 7 +
arch/arm/Kconfig | 22 +
arch/arm/Makefile | 1 +
arch/arm/boot/dts/Makefile | 1 +
arch/arm/boot/dts/armv7-m.dtsi | 7 +
arch/arm/boot/dts/stm32f429-disco.dts | 41 ++
arch/arm/boot/dts/stm32f429.dtsi | 279 ++++++++
arch/arm/configs/stm32_defconfig | 72 ++
arch/arm/kernel/entry-v7m.S | 8 +-
arch/arm/kernel/time.c | 4 +
arch/arm/mach-stm32/Makefile | 1 +
arch/arm/mach-stm32/Makefile.boot | 0
arch/arm/mach-stm32/board-dt.c | 19 +
drivers/clocksource/Kconfig | 16 +
drivers/clocksource/Makefile | 2 +
drivers/clocksource/arm_system_timer.c | 74 ++
drivers/clocksource/timer-stm32.c | 187 +++++
drivers/pinctrl/Kconfig | 9 +
drivers/pinctrl/Makefile | 1 +
drivers/pinctrl/pinctrl-stm32.c | 779 +++++++++++++++++++++
drivers/reset/Makefile | 1 +
drivers/reset/core.c | 20 +
drivers/reset/reset-stm32.c | 124 ++++
drivers/tty/serial/Kconfig | 17 +
drivers/tty/serial/Makefile | 1 +
drivers/tty/serial/stm32-usart.c | 695 ++++++++++++++++++
include/asm-generic/vmlinux.lds.h | 4 +-
include/dt-bindings/pinctrl/pinctrl-stm32.h | 43 ++
include/linux/reset-controller.h | 6 +
include/uapi/linux/serial_core.h | 3 +
scripts/link-vmlinux.sh | 2 +-
38 files changed, 2664 insertions(+), 6 deletions(-)
create mode 100644 Documentation/arm/stm32/overview.txt
create mode 100644 Documentation/arm/stm32/stm32f429-overview.txt
create mode 100644 Documentation/devicetree/bindings/arm/system_timer.txt
create mode 100644 Documentation/devicetree/bindings/pinctrl/pinctrl-stm32.txt
create mode 100644 Documentation/devicetree/bindings/reset/st,stm32-reset.txt
create mode 100644 Documentation/devicetree/bindings/serial/st,stm32-usart.txt
create mode 100644 Documentation/devicetree/bindings/timer/st,stm32-timer.txt
create mode 100644 arch/arm/boot/dts/stm32f429-disco.dts
create mode 100644 arch/arm/boot/dts/stm32f429.dtsi
create mode 100644 arch/arm/configs/stm32_defconfig
create mode 100644 arch/arm/mach-stm32/Makefile
create mode 100644 arch/arm/mach-stm32/Makefile.boot
create mode 100644 arch/arm/mach-stm32/board-dt.c
create mode 100644 drivers/clocksource/arm_system_timer.c
create mode 100644 drivers/clocksource/timer-stm32.c
create mode 100644 drivers/pinctrl/pinctrl-stm32.c
create mode 100644 drivers/reset/reset-stm32.c
create mode 100644 drivers/tty/serial/stm32-usart.c
create mode 100644 include/dt-bindings/pinctrl/pinctrl-stm32.h
--
1.9.1
^ permalink raw reply
* [PATCH 01/14] scripts: link-vmlinux: Don't pass page offset to kallsyms if XIP Kernel
From: Maxime Coquelin @ 2015-02-12 17:45 UTC (permalink / raw)
To: Jonathan Corbet, Maxime Coquelin, Rob Herring, Pawel Moll,
Mark Rutland, Ian Campbell, Kumar Gala, Philipp Zabel,
Russell King, Daniel Lezcano, Thomas Gleixner, Linus Walleij,
Greg Kroah-Hartman, Jiri Slaby, Arnd Bergmann, Andrew Morton,
David S. Miller, Mauro Carvalho Chehab, Joe Perches,
Antti Palosaari, Tejun Heo, Will Deacon, Nikolay Borisov
In-Reply-To: <1423763164-5606-1-git-send-email-mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
When Kernel is executed in place from ROM, the symbol addresses can be
lower than the page offset.
Signed-off-by: Maxime Coquelin <mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
scripts/link-vmlinux.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 86a4fe7..b055d9d 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -82,7 +82,7 @@ kallsyms()
kallsymopt="${kallsymopt} --all-symbols"
fi
- if [ -n "${CONFIG_ARM}" ] && [ -n "${CONFIG_PAGE_OFFSET}" ]; then
+ if [ -n "${CONFIG_ARM}" ] && [ -z "${CONFIG_XIP_KERNEL}" ] && [ -n "${CONFIG_PAGE_OFFSET}" ]; then
kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET"
fi
--
1.9.1
^ permalink raw reply related
* [PATCH 02/14] ARM: ARMv7M: Enlarge vector table to 256 entries
From: Maxime Coquelin @ 2015-02-12 17:45 UTC (permalink / raw)
To: Jonathan Corbet, Maxime Coquelin, Rob Herring, Pawel Moll,
Mark Rutland, Ian Campbell, Kumar Gala, Philipp Zabel,
Russell King, Daniel Lezcano, Thomas Gleixner, Linus Walleij,
Greg Kroah-Hartman, Jiri Slaby, Arnd Bergmann, Andrew Morton,
David S. Miller, Mauro Carvalho Chehab, Joe Perches,
Antti Palosaari, Tejun Heo, Will Deacon, Nikolay Borisov
In-Reply-To: <1423763164-5606-1-git-send-email-mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>From Cortex-M4 and M7 reference manuals, the nvic supports up to 240
interrupts. So the number of entries in vectors table is 256.
This patch adds the missing entries, and change the alignement, so that
vector_table remains naturally aligned.
Signed-off-by: Maxime Coquelin <mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
arch/arm/kernel/entry-v7m.S | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/arm/kernel/entry-v7m.S b/arch/arm/kernel/entry-v7m.S
index 8944f49..29a461b 100644
--- a/arch/arm/kernel/entry-v7m.S
+++ b/arch/arm/kernel/entry-v7m.S
@@ -117,9 +117,9 @@ ENTRY(__switch_to)
ENDPROC(__switch_to)
.data
- .align 8
+ .align 10
/*
- * Vector table (64 words => 256 bytes natural alignment)
+ * Vector table (256 words => 1024 bytes alignment)
*/
ENTRY(vector_table)
.long 0 @ 0 - Reset stack pointer
@@ -138,6 +138,6 @@ ENTRY(vector_table)
.long __invalid_entry @ 13 - Reserved
.long __pendsv_entry @ 14 - PendSV
.long __invalid_entry @ 15 - SysTick
- .rept 64 - 16
- .long __irq_entry @ 16..64 - External Interrupts
+ .rept 256 - 16
+ .long __irq_entry @ 16..256 - External Interrupts
.endr
--
1.9.1
^ permalink raw reply related
* [PATCH 03/14] clocksource: Add ARM System timer driver
From: Maxime Coquelin @ 2015-02-12 17:45 UTC (permalink / raw)
To: Jonathan Corbet, Maxime Coquelin, Rob Herring, Pawel Moll,
Mark Rutland, Ian Campbell, Kumar Gala, Philipp Zabel,
Russell King, Daniel Lezcano, Thomas Gleixner, Linus Walleij,
Greg Kroah-Hartman, Jiri Slaby, Arnd Bergmann, Andrew Morton,
David S. Miller, Mauro Carvalho Chehab, Joe Perches,
Antti Palosaari, Tejun Heo, Will Deacon, Nikolay Borisov
In-Reply-To: <1423763164-5606-1-git-send-email-mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
This patch adds clocksource support for ARMv7-M's System timer,
also known as SysTick.
Signed-off-by: Maxime Coquelin <mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
.../devicetree/bindings/arm/system_timer.txt | 15 +++++
drivers/clocksource/Kconfig | 7 ++
drivers/clocksource/Makefile | 1 +
drivers/clocksource/arm_system_timer.c | 74 ++++++++++++++++++++++
4 files changed, 97 insertions(+)
create mode 100644 Documentation/devicetree/bindings/arm/system_timer.txt
create mode 100644 drivers/clocksource/arm_system_timer.c
diff --git a/Documentation/devicetree/bindings/arm/system_timer.txt b/Documentation/devicetree/bindings/arm/system_timer.txt
new file mode 100644
index 0000000..35268b7
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/system_timer.txt
@@ -0,0 +1,15 @@
+* ARM System Timer
+
+ARMv7-M includes a system timer, known as SysTick. Current driver only
+implements the clocksource feature.
+
+Required properties:
+- compatible : Should be "arm,armv7m-systick"
+- reg : The address range of the timer
+- clocks : The input clock of the timer
+
+systick: system-timer {
+ compatible = "arm,armv7m-systick";
+ reg = <0xe000e010 0x10>;
+ clocks = <&clk_systick>;
+};
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index fc01ec2..f9fe4ac 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -124,6 +124,13 @@ config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK
help
Use ARM global timer clock source as sched_clock
+config ARM_SYSTEM_TIMER
+ bool
+ select CLKSRC_OF if OF
+ select CLKSRC_MMIO
+ help
+ This options enables support for the ARM system timer unit
+
config ATMEL_PIT
select CLKSRC_OF if OF
def_bool SOC_AT91SAM9 || SOC_SAMA5
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 94d90b2..194400b 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_MTK_TIMER) += mtk_timer.o
obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o
obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o
+obj-$(CONFIG_ARM_SYSTEM_TIMER) += arm_system_timer.o
obj-$(CONFIG_CLKSRC_METAG_GENERIC) += metag_generic.o
obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST) += dummy_timer.o
obj-$(CONFIG_ARCH_KEYSTONE) += timer-keystone.o
diff --git a/drivers/clocksource/arm_system_timer.c b/drivers/clocksource/arm_system_timer.c
new file mode 100644
index 0000000..69e6ef9
--- /dev/null
+++ b/drivers/clocksource/arm_system_timer.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) Maxime Coquelin 2015
+ * Author: Maxime Coquelin <mcoquelin.stm32-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
+ * License terms: GNU General Public License (GPL), version 2
+ */
+
+#include <linux/kernel.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/clk.h>
+#include <linux/bitops.h>
+
+#define SYST_CSR 0x00
+#define SYST_RVR 0x04
+#define SYST_CVR 0x08
+#define SYST_CALIB 0x0c
+
+#define SYST_CSR_ENABLE BIT(0)
+
+#define SYSTICK_LOAD_RELOAD_MASK 0x00FFFFFF
+
+static void __init system_timer_of_register(struct device_node *np)
+{
+ struct clk *clk;
+ void __iomem *base;
+ unsigned long rate;
+ int ret;
+
+ base = of_iomap(np, 0);
+ if (!base) {
+ pr_warn("system-timer: invalid base address\n");
+ return;
+ }
+
+ clk = of_clk_get(np, 0);
+ if (IS_ERR(clk)) {
+ pr_warn("system-timer: clk not found\n");
+ ret = PTR_ERR(clk);
+ goto out_unmap;
+ }
+
+ ret = clk_prepare_enable(clk);
+ if (ret)
+ goto out_clk_put;
+
+ rate = clk_get_rate(clk);
+
+ writel_relaxed(SYSTICK_LOAD_RELOAD_MASK, base + SYST_RVR);
+ writel_relaxed(SYST_CSR_ENABLE, base + SYST_CSR);
+
+ ret = clocksource_mmio_init(base + SYST_CVR, "arm_system_timer", rate,
+ 200, 24, clocksource_mmio_readl_down);
+ if (ret) {
+ pr_err("failed to init clocksource (%d)\n", ret);
+ goto out_clk_disable;
+ }
+
+ pr_info("ARM System timer initialized as clocksource\n");
+
+ return;
+
+out_clk_disable:
+ clk_disable_unprepare(clk);
+out_clk_put:
+ clk_put(clk);
+out_unmap:
+ iounmap(base);
+ WARN(ret, "ARM System timer register failed (%d)\n", ret);
+}
+
+CLOCKSOURCE_OF_DECLARE(arm_systick, "arm,armv7m-systick",
+ system_timer_of_register);
--
1.9.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox