public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch] Additions to .data.read_mostly section
@ 2005-08-24 21:46 Ravikiran G Thirumalai
  2005-08-24 22:38 ` Eric Dumazet
  2005-08-25  7:56 ` Arjan van de Ven
  0 siblings, 2 replies; 19+ messages in thread
From: Ravikiran G Thirumalai @ 2005-08-24 21:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Shai Fultheim (Shai@scalex86.org), Alok Kataria

Following patch moves a few static 'read mostly' variables to the 
.data.read_mostly section.  Typically these are vector - irq tables,
boot_cpu_data, node_maps etc., which are initialized once and read from 
often and rarely written to.  Please include.

Thanks,
Kiran


Patch to mark variables which are usually accessed for reads with __readmostly.

Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>

Index: linux-2.6.13-rc6/arch/i386/kernel/io_apic.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/i386/kernel/io_apic.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/i386/kernel/io_apic.c	2005-08-23 15:09:22.000000000 -0700
@@ -77,7 +77,7 @@
 	int apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 #ifdef CONFIG_PCI_MSI
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
@@ -1127,7 +1127,7 @@
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
 
 int assign_irq_vector(int irq)
 {
@@ -1992,7 +1992,7 @@
  * edge-triggered handler, without risking IRQ storms and other ugly
  * races.
  */
-static struct hw_interrupt_type ioapic_edge_type = {
+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
 	.typename 	= "IO-APIC-edge",
 	.startup 	= startup_edge_ioapic,
 	.shutdown 	= shutdown_edge_ioapic,
@@ -2003,7 +2003,7 @@
 	.set_affinity 	= set_ioapic_affinity,
 };
 
-static struct hw_interrupt_type ioapic_level_type = {
+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
 	.typename 	= "IO-APIC-level",
 	.startup 	= startup_level_ioapic,
 	.shutdown 	= shutdown_level_ioapic,
@@ -2074,7 +2074,7 @@
 
 static void end_lapic_irq (unsigned int i) { /* nothing */ }
 
-static struct hw_interrupt_type lapic_irq_type = {
+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
 	.typename 	= "local-APIC-edge",
 	.startup 	= NULL, /* startup_irq() not used for IRQ0 */
 	.shutdown 	= NULL, /* shutdown_irq() not used for IRQ0 */
Index: linux-2.6.13-rc6/arch/i386/kernel/setup.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/i386/kernel/setup.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/i386/kernel/setup.c	2005-08-23 15:09:22.000000000 -0700
@@ -82,7 +82,7 @@
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 /* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 EXPORT_SYMBOL(boot_cpu_data);
 
 unsigned long mmu_cr4_features;
Index: linux-2.6.13-rc6/arch/i386/kernel/timers/timer_hpet.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/i386/kernel/timers/timer_hpet.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/i386/kernel/timers/timer_hpet.c	2005-08-23 15:09:22.000000000 -0700
@@ -18,8 +18,8 @@
 #include "mach_timer.h"
 #include <asm/hpet.h>
 
-static unsigned long __read_mostly hpet_usec_quotient;	/* convert hpet clks to usec */
-static unsigned long tsc_hpet_quotient;		/* convert tsc to hpet clks */
+static unsigned long hpet_usec_quotient __read_mostly;	/* convert hpet clks to usec */
+static unsigned long tsc_hpet_quotient __read_mostly;		/* convert tsc to hpet clks */
 static unsigned long hpet_last; 	/* hpet counter value at last tick*/
 static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */
 static unsigned long last_tsc_high; 	/* msb 32 bits of Time Stamp Counter */
Index: linux-2.6.13-rc6/arch/i386/mm/discontig.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/i386/mm/discontig.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/i386/mm/discontig.c	2005-08-23 15:09:22.000000000 -0700
@@ -37,7 +37,7 @@
 #include <asm/mmzone.h>
 #include <bios_ebda.h>
 
-struct pglist_data *node_data[MAX_NUMNODES];
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 bootmem_data_t node0_bdata;
 
@@ -49,8 +49,8 @@
  * 2) node_start_pfn   - the starting page frame number for a node
  * 3) node_end_pfn     - the ending page fram number for a node
  */
-unsigned long node_start_pfn[MAX_NUMNODES];
-unsigned long node_end_pfn[MAX_NUMNODES];
+unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
+unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 
 
 #ifdef CONFIG_DISCONTIGMEM
@@ -66,7 +66,7 @@
  *     physnode_map[4-7] = 1;
  *     physnode_map[8- ] = -1;
  */
-s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
 
 void memory_present(int nid, unsigned long start, unsigned long end)
Index: linux-2.6.13-rc6/arch/i386/mm/init.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/i386/mm/init.c	2005-08-23 15:09:22.000000000 -0700
+++ linux-2.6.13-rc6/arch/i386/mm/init.c	2005-08-23 15:09:22.000000000 -0700
@@ -390,7 +390,7 @@
 }
 
 static int disable_nx __initdata = 0;
-u64 __supported_pte_mask = ~_PAGE_NX;
+u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
 
 /*
  * noexec = on|off
Index: linux-2.6.13-rc6/arch/x86_64/kernel/genapic.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/x86_64/kernel/genapic.c	2005-08-23 12:09:44.000000000 -0700
+++ linux-2.6.13-rc6/arch/x86_64/kernel/genapic.c	2005-08-24 09:55:38.000000000 -0700
@@ -25,7 +25,7 @@
 #endif
 
 /* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
Index: linux-2.6.13-rc6/arch/x86_64/kernel/io_apic.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/x86_64/kernel/io_apic.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/x86_64/kernel/io_apic.c	2005-08-23 15:09:22.000000000 -0700
@@ -70,7 +70,7 @@
 	short apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 #ifdef CONFIG_PCI_MSI
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
@@ -655,7 +655,7 @@
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
 
 int assign_irq_vector(int irq)
 {
@@ -1424,7 +1424,7 @@
  * races.
  */
 
-static struct hw_interrupt_type ioapic_edge_type = {
+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
 	.typename = "IO-APIC-edge",
 	.startup 	= startup_edge_ioapic,
 	.shutdown 	= shutdown_edge_ioapic,
@@ -1435,7 +1435,7 @@
 	.set_affinity = set_ioapic_affinity,
 };
 
-static struct hw_interrupt_type ioapic_level_type = {
+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
 	.typename = "IO-APIC-level",
 	.startup 	= startup_level_ioapic,
 	.shutdown 	= shutdown_level_ioapic,
@@ -1506,7 +1506,7 @@
 
 static void end_lapic_irq (unsigned int i) { /* nothing */ }
 
-static struct hw_interrupt_type lapic_irq_type = {
+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
 	.typename = "local-APIC-edge",
 	.startup = NULL, /* startup_irq() not used for IRQ0 */
 	.shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
Index: linux-2.6.13-rc6/arch/x86_64/kernel/setup.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/x86_64/kernel/setup.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/x86_64/kernel/setup.c	2005-08-23 15:09:22.000000000 -0700
@@ -65,7 +65,7 @@
  * Machine setup..
  */
 
-struct cpuinfo_x86 boot_cpu_data;
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
 
 unsigned long mmu_cr4_features;
 
Index: linux-2.6.13-rc6/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/x86_64/kernel/setup64.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/arch/x86_64/kernel/setup64.c	2005-08-23 15:09:22.000000000 -0700
@@ -36,7 +36,7 @@
 
 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
 
-unsigned long __supported_pte_mask = ~0UL;
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
 static int do_not_nx __initdata = 0;
 
 /* noexec=on|off
Index: linux-2.6.13-rc6/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/x86_64/kernel/smpboot.c	2005-08-23 12:10:04.000000000 -0700
+++ linux-2.6.13-rc6/arch/x86_64/kernel/smpboot.c	2005-08-24 09:55:38.000000000 -0700
@@ -62,13 +62,13 @@
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
-u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(phys_proc_id);
 EXPORT_SYMBOL(cpu_core_id);
 
 /* Bitmask of currently online CPUs */
-cpumask_t cpu_online_map;
+cpumask_t cpu_online_map __read_mostly;
 
 EXPORT_SYMBOL(cpu_online_map);
 
@@ -88,8 +88,8 @@
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
-cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
-cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
 /*
Index: linux-2.6.13-rc6/arch/x86_64/mm/numa.c
===================================================================
--- linux-2.6.13-rc6.orig/arch/x86_64/mm/numa.c	2005-08-23 12:09:44.000000000 -0700
+++ linux-2.6.13-rc6/arch/x86_64/mm/numa.c	2005-08-24 09:55:38.000000000 -0700
@@ -22,14 +22,14 @@
 #define Dprintk(x...)
 #endif
 
-struct pglist_data *node_data[MAX_NUMNODES];
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 int memnode_shift;
 u8  memnodemap[NODEMAPSIZE];
 
-unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
-cpumask_t     node_to_cpumask[MAX_NUMNODES];
+unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
+cpumask_t     node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
 
Index: linux-2.6.13-rc6/fs/namespace.c
===================================================================
--- linux-2.6.13-rc6.orig/fs/namespace.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/fs/namespace.c	2005-08-23 15:09:22.000000000 -0700
@@ -40,7 +40,7 @@
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static struct list_head *mount_hashtable;
-static int hash_mask, hash_bits;
+static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache; 
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
Index: linux-2.6.13-rc6/mm/mempolicy.c
===================================================================
--- linux-2.6.13-rc6.orig/mm/mempolicy.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/mm/mempolicy.c	2005-08-23 15:09:22.000000000 -0700
@@ -88,7 +88,7 @@
    policied. */
 static int policy_zone;
 
-static struct mempolicy default_policy = {
+static struct mempolicy default_policy __read_mostly = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
 	.policy = MPOL_DEFAULT,
 };
Index: linux-2.6.13-rc6/mm/mmap.c
===================================================================
--- linux-2.6.13-rc6.orig/mm/mmap.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/mm/mmap.c	2005-08-23 15:09:22.000000000 -0700
@@ -61,7 +61,7 @@
 
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;	/* default is 50% */
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 atomic_t vm_committed_space = ATOMIC_INIT(0);
 
 /*
Index: linux-2.6.13-rc6/mm/page_alloc.c
===================================================================
--- linux-2.6.13-rc6.orig/mm/page_alloc.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/mm/page_alloc.c	2005-08-23 15:09:22.000000000 -0700
@@ -42,13 +42,13 @@
  * MCD - HACK: Find somewhere to initialize this EARLY, or make this
  * initializer cleaner
  */
-nodemask_t node_online_map = { { [0] = 1UL } };
+nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
-nodemask_t node_possible_map = NODE_MASK_ALL;
+nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list;
-unsigned long totalram_pages;
-unsigned long totalhigh_pages;
+struct pglist_data *pgdat_list __read_mostly;
+unsigned long totalram_pages __read_mostly;
+unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 
 /*
@@ -68,7 +68,7 @@
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << ZONETABLE_SHIFT];
+struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
Index: linux-2.6.13-rc6/mm/shmem.c
===================================================================
--- linux-2.6.13-rc6.orig/mm/shmem.c	2005-08-23 15:05:49.000000000 -0700
+++ linux-2.6.13-rc6/mm/shmem.c	2005-08-23 15:09:22.000000000 -0700
@@ -182,7 +182,7 @@
 static struct inode_operations shmem_special_inode_operations;
 static struct vm_operations_struct shmem_vm_ops;
 
-static struct backing_dev_info shmem_backing_dev_info = {
+static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 	.unplug_io_fn	= default_unplug_io_fn,

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Additions to .data.read_mostly section
  2005-08-24 21:46 [patch] Additions to .data.read_mostly section Ravikiran G Thirumalai
@ 2005-08-24 22:38 ` Eric Dumazet
  2005-08-25  7:56 ` Arjan van de Ven
  1 sibling, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2005-08-24 22:38 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, linux-kernel, Shai Fultheim (Shai@scalex86.org),
	Alok Kataria

Ravikiran G Thirumalai a écrit :
> Following patch moves a few static 'read mostly' variables to the 
> .data.read_mostly section.  Typically these are vector - irq tables,
> boot_cpu_data, node_maps etc., which are initialized once and read from 
> often and rarely written to.  Please include.
> 

Good candidates for read_mostly are all the 'kmem_cache_t *xxx_cache'

slab was carefuly designed to eliminate cache line ping pongs on SMP, but if 
the initial pointer to slab sits in a heavily modified cache line, we loose.

Eric

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Additions to .data.read_mostly section
  2005-08-24 21:46 [patch] Additions to .data.read_mostly section Ravikiran G Thirumalai
  2005-08-24 22:38 ` Eric Dumazet
@ 2005-08-25  7:56 ` Arjan van de Ven
  2005-08-25  8:45   ` [PATCH] removes filp_count_lock and changes nr_files type to atomic_t Eric Dumazet
  1 sibling, 1 reply; 19+ messages in thread
From: Arjan van de Ven @ 2005-08-25  7:56 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, linux-kernel, Shai Fultheim (Shai@scalex86.org),
	Alok Kataria

On Wed, 2005-08-24 at 14:46 -0700, Ravikiran G Thirumalai wrote:
> Following patch moves a few static 'read mostly' variables to the 
> .data.read_mostly section.  Typically these are vector - irq tables,
> boot_cpu_data, node_maps etc., which are initialized once and read from 
> often and rarely written to.  Please include.

it almost sounds like a "read_only_once_booted" is useful, esp for those
who modify their kernel to make such sections truely read only ;)



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  7:56 ` Arjan van de Ven
@ 2005-08-25  8:45   ` Eric Dumazet
  2005-08-25  9:05     ` Arjan van de Ven
  2005-08-25  9:08     ` Christoph Hellwig
  0 siblings, 2 replies; 19+ messages in thread
From: Eric Dumazet @ 2005-08-25  8:45 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 350 bytes --]

This patch removes filp_count_lock spinlock, used to protect files_stat.nr_files.

Just use atomic_t type and atomic_inc()/atomic_dec() operations.

This patch assumes that atomic_read() is a plain {return v->counter;} on all 
architectures. (keywords : sysctl, /proc/sys/fs/file-nr, proc_dointvec)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: patch_filp_count_lock --]
[-- Type: text/plain, Size: 3005 bytes --]

diff -Nru linux-2.6.13-rc7/fs/file_table.c linux-2.6.13-rc7-ed/fs/file_table.c
--- linux-2.6.13-rc7/fs/file_table.c	2005-08-24 05:39:14.000000000 +0200
+++ linux-2.6.13-rc7-ed/fs/file_table.c	2005-08-25 10:41:00.000000000 +0200
@@ -28,29 +28,21 @@
 /* public. Not pretty! */
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
-static DEFINE_SPINLOCK(filp_count_lock);
 
 /* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
+ * context and must be fully threaded
  */
 void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags)
 {
 	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
+		atomic_inc(&files_stat.nr_files);
 	}
 }
 
 void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	atomic_dec(&files_stat.nr_files);
 }
 
 static inline void file_free(struct file *f)
@@ -70,7 +62,7 @@
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (files_stat.nr_files >= files_stat.max_files &&
+	if (atomic_read(&files_stat.nr_files) >= files_stat.max_files &&
 				!capable(CAP_SYS_ADMIN))
 		goto over;
 
@@ -94,10 +86,10 @@
 
 over:
 	/* Ran out of filps - report that */
-	if (files_stat.nr_files > old_max) {
+	if (atomic_read(&files_stat.nr_files) > old_max) {
 		printk(KERN_INFO "VFS: file-max limit %d reached\n",
 					files_stat.max_files);
-		old_max = files_stat.nr_files;
+		old_max = atomic_read(&files_stat.nr_files);
 	}
 	goto fail;
 
diff -Nru linux-2.6.13-rc7/fs/xfs/linux-2.6/xfs_linux.h linux-2.6.13-rc7-ed/fs/xfs/linux-2.6/xfs_linux.h
--- linux-2.6.13-rc7/fs/xfs/linux-2.6/xfs_linux.h	2005-08-24 05:39:14.000000000 +0200
+++ linux-2.6.13-rc7-ed/fs/xfs/linux-2.6/xfs_linux.h	2005-08-25 10:41:00.000000000 +0200
@@ -242,7 +242,7 @@
 
 /* IRIX uses the current size of the name cache to guess a good value */
 /* - this isn't the same but is a good enough starting point for now. */
-#define DQUOT_HASH_HEURISTIC	files_stat.nr_files
+#define DQUOT_HASH_HEURISTIC	atomic_read(&files_stat.nr_files)
 
 /* IRIX inodes maintain the project ID also, zero this field on Linux */
 #define DEFAULT_PROJID	0
diff -Nru linux-2.6.13-rc7/include/linux/fs.h linux-2.6.13-rc7-ed/include/linux/fs.h
--- linux-2.6.13-rc7/include/linux/fs.h	2005-08-24 05:39:14.000000000 +0200
+++ linux-2.6.13-rc7-ed/include/linux/fs.h	2005-08-25 10:41:00.000000000 +0200
@@ -30,7 +30,7 @@
 
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
-	int nr_files;		/* read only */
+	atomic_t nr_files;		/* read only */
 	int nr_free_files;	/* read only */
 	int max_files;		/* tunable */
 };

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  8:45   ` [PATCH] removes filp_count_lock and changes nr_files type to atomic_t Eric Dumazet
@ 2005-08-25  9:05     ` Arjan van de Ven
  2005-08-25  9:20       ` Eric Dumazet
  2005-08-25  9:08     ` Christoph Hellwig
  1 sibling, 1 reply; 19+ messages in thread
From: Arjan van de Ven @ 2005-08-25  9:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, linux-kernel

On Thu, 2005-08-25 at 10:45 +0200, Eric Dumazet wrote:
> This patch removes filp_count_lock spinlock, used to protect files_stat.nr_files.
> 
> Just use atomic_t type and atomic_inc()/atomic_dec() operations.
> 
> This patch assumes that atomic_read() is a plain {return v->counter;} on all 
> architectures. (keywords : sysctl, /proc/sys/fs/file-nr, proc_dointvec)
> 

this patch adds atomic ops where there were none before
>  static inline void file_free(struct file *f)
> @@ -70,7 +62,7 @@
>  	/*
>  	 * Privileged users can go above max_files
>  	 */
> -	if (files_stat.nr_files >= files_stat.max_files &&
> +	if (atomic_read(&files_stat.nr_files) >= files_stat.max_files &&
>  				!capable(CAP_SYS_ADMIN))
>  		goto over;
>  

here 

> @@ -94,10 +86,10 @@
>  
>  over:
>  	/* Ran out of filps - report that */
> -	if (files_stat.nr_files > old_max) {
> +	if (atomic_read(&files_stat.nr_files) > old_max) {
>  		printk(KERN_INFO "VFS: file-max limit %d reached\n",
>  					files_stat.max_files);
> -		old_max = files_stat.nr_files;
> +		old_max = atomic_read(&files_stat.nr_files);
>  	}
>  	goto fail;

and here

for those architectures that need atomics for read (parisc? arm?)

however.. wouldn't it be better to make this a per cpu variable for
write, and for read iterate or do something smart otherwise?



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  8:45   ` [PATCH] removes filp_count_lock and changes nr_files type to atomic_t Eric Dumazet
  2005-08-25  9:05     ` Arjan van de Ven
@ 2005-08-25  9:08     ` Christoph Hellwig
  2005-08-25  9:17       ` Eric Dumazet
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2005-08-25  9:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, linux-kernel

On Thu, Aug 25, 2005 at 10:45:12AM +0200, Eric Dumazet wrote:
> This patch assumes that atomic_read() is a plain {return v->counter;} on 
> all architectures. (keywords : sysctl, /proc/sys/fs/file-nr, proc_dointvec)

But that's not true.  You need to write you own sysctl handler for it,
probably worth adding a generic atomic_t sysctl handler while you're
at it.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  9:08     ` Christoph Hellwig
@ 2005-08-25  9:17       ` Eric Dumazet
  2005-08-25  9:23         ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2005-08-25  9:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Andrew Morton, linux-kernel

Christoph Hellwig a écrit :
> On Thu, Aug 25, 2005 at 10:45:12AM +0200, Eric Dumazet wrote:
> 
>>This patch assumes that atomic_read() is a plain {return v->counter;} on 
>>all architectures. (keywords : sysctl, /proc/sys/fs/file-nr, proc_dointvec)
> 
> 
> But that's not true.  You need to write you own sysctl handler for it,
> probably worth adding a generic atomic_t sysctl handler while you're
> at it.
> 

I checked linux-2.6.13-rc7 tree, and atomic_read() is just a wrapper to read 
v->counter.

In the ancient times, yes, atomic_read() was different on some archs, but not 
today.

Eric


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  9:05     ` Arjan van de Ven
@ 2005-08-25  9:20       ` Eric Dumazet
  2005-08-25  9:31         ` Arjan van de Ven
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2005-08-25  9:20 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Andrew Morton, linux-kernel

Arjan van de Ven a écrit :
> On Thu, 2005-08-25 at 10:45 +0200, Eric Dumazet wrote:
> 
>>This patch removes filp_count_lock spinlock, used to protect files_stat.nr_files.
>>
>>Just use atomic_t type and atomic_inc()/atomic_dec() operations.
>>
>>This patch assumes that atomic_read() is a plain {return v->counter;} on all 
>>architectures. (keywords : sysctl, /proc/sys/fs/file-nr, proc_dointvec)
>>
> 
> 
> this patch adds atomic ops where there were none before

nope... a spinlock/spinunlock contains atomic ops.

> for those architectures that need atomics for read (parisc? arm?)

not today. No atomic needed for read.

> 
> however.. wouldn't it be better to make this a per cpu variable for
> write, and for read iterate or do something smart otherwise?

So on a machine with 256 CPUS, you want to iterate 256 counters ?
nr_files is not heavily touched, no need to expand it.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  9:17       ` Eric Dumazet
@ 2005-08-25  9:23         ` Christoph Hellwig
  2005-08-25 10:41           ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2005-08-25  9:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, linux-kernel

On Thu, Aug 25, 2005 at 11:17:23AM +0200, Eric Dumazet wrote:
> >But that's not true.  You need to write you own sysctl handler for it,
> >probably worth adding a generic atomic_t sysctl handler while you're
> >at it.
> >
> 
> I checked linux-2.6.13-rc7 tree, and atomic_read() is just a wrapper to 
> read v->counter.

That doesn't matter.  atomic_t is an opaqueue type and you must use the
atomic_* interfaces to access it.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  9:20       ` Eric Dumazet
@ 2005-08-25  9:31         ` Arjan van de Ven
  0 siblings, 0 replies; 19+ messages in thread
From: Arjan van de Ven @ 2005-08-25  9:31 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, linux-kernel

> > 
> > this patch adds atomic ops where there were none before
> 
> nope... a spinlock/spinunlock contains atomic ops.

unlock usually doesn't but ok

> 
> > for those architectures that need atomics for read (parisc? arm?)
> 
> not today. No atomic needed for read.
> 
> > 
> > however.. wouldn't it be better to make this a per cpu variable for
> > write, and for read iterate or do something smart otherwise?
> 
> So on a machine with 256 CPUS, you want to iterate 256 counters ?

you can be smart about this, similar to what ext3 and others do for
similar counters



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25  9:23         ` Christoph Hellwig
@ 2005-08-25 10:41           ` Eric Dumazet
  2005-08-25 11:11             ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2005-08-25 10:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Andrew Morton, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 996 bytes --]

Christoph Hellwig a écrit :
> On Thu, Aug 25, 2005 at 11:17:23AM +0200, Eric Dumazet wrote:
> 
>>>But that's not true.  You need to write you own sysctl handler for it,
>>>probably worth adding a generic atomic_t sysctl handler while you're
>>>at it.
>>>
>>
>>I checked linux-2.6.13-rc7 tree, and atomic_read() is just a wrapper to 
>>read v->counter.
> 
> 
> That doesn't matter.  atomic_t is an opaqueue type and you must use the
> atomic_* interfaces to access it.

OK, here is a new clean patch that address this problem (nothing assumed about 
atomics)

This patch removes filp_count_lock spinlock, used to protect files_stat.nr_files.

Introduce an atomic_t atomic_nr_files to keep the exact count, and mirror its 
value into nr_files.

Forcing atomic_nr_files to be in the same cache line than nr_files makes sure 
we dont dirty two cache lines.

There is still a locked memory operation on SMP, but it saves an sti/cli pair.

Thank you

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


[-- Attachment #2: patch_filp_count_lock --]
[-- Type: text/plain, Size: 2589 bytes --]

diff -Nru linux-2.6.13-rc7/fs/file_table.c linux-2.6.13-rc7-ed/fs/file_table.c
--- linux-2.6.13-rc7/fs/file_table.c	2005-08-24 05:39:14.000000000 +0200
+++ linux-2.6.13-rc7-ed/fs/file_table.c	2005-08-25 12:18:02.000000000 +0200
@@ -19,38 +19,28 @@
 #include <linux/fsnotify.h>
 
 /* sysctl tunables... */
-struct files_stat_struct files_stat = {
-	.max_files = NR_FILE
-};
+struct files_stat_struct files_stat;
 
 EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
 
 /* public. Not pretty! */
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
-static DEFINE_SPINLOCK(filp_count_lock);
 
 /* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
+ * context and must be fully threaded
  */
 void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags)
 {
 	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
+		files_stat.nr_files = atomic_add_return(1, &files_stat.atomic_nr_files);
 	}
 }
 
 void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	files_stat.nr_files = atomic_sub_return(1, &files_stat.atomic_nr_files);
 }
 
 static inline void file_free(struct file *f)
@@ -258,4 +248,5 @@
 	files_stat.max_files = n; 
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
+	atomic_set(&files_stat.atomic_nr_files, 0);
 } 
diff -Nru linux-2.6.13-rc7/include/linux/fs.h linux-2.6.13-rc7-ed/include/linux/fs.h
--- linux-2.6.13-rc7/include/linux/fs.h	2005-08-24 05:39:14.000000000 +0200
+++ linux-2.6.13-rc7-ed/include/linux/fs.h	2005-08-25 12:39:07.000000000 +0200
@@ -9,6 +9,8 @@
 #include <linux/config.h>
 #include <linux/limits.h>
 #include <linux/ioctl.h>
+#include <linux/cache.h>
+#include <asm/atomic.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -30,10 +32,12 @@
 
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
-	int nr_files;		/* read only */
+	int nr_files;		/* mirrors atomic_nr_files value */
 	int nr_free_files;	/* read only */
 	int max_files;		/* tunable */
-};
+
+	atomic_t atomic_nr_files;
+} ____cacheline_aligned;
 extern struct files_stat_struct files_stat;
 
 struct inodes_stat_t {

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 10:41           ` Eric Dumazet
@ 2005-08-25 11:11             ` Nick Piggin
  2005-08-25 12:26               ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2005-08-25 11:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Christoph Hellwig, Andrew Morton, lkml

On Thu, 2005-08-25 at 12:41 +0200, Eric Dumazet wrote:

> OK, here is a new clean patch that address this problem (nothing assumed about 
> atomics)
> 

Would you just be able to add the atomic sysctl handler that
Christoph suggested?

This introduces lost update problems. 2 CPUs may store to nr_files
in the opposite order that they incremented atomic_nr_files.

It is not terribly bad, because the drift is not cumulative and the
field can't go negative... but its just ugly to add this hack
because there is no atomic sysctl handler.

Eliminating the cli/sti is a good idea though, I think.

-- 
SUSE Labs, Novell Inc.



Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 11:11             ` Nick Piggin
@ 2005-08-25 12:26               ` Eric Dumazet
  2005-08-25 14:51                 ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2005-08-25 12:26 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Christoph Hellwig, Andrew Morton, lkml

Nick Piggin a écrit :
> On Thu, 2005-08-25 at 12:41 +0200, Eric Dumazet wrote:
> 
> 
>>OK, here is a new clean patch that address this problem (nothing assumed about 
>>atomics)
>>
> 
> 
> Would you just be able to add the atomic sysctl handler that
> Christoph suggested?
> 

Quite a lot of work indeed, and it would force to convert 3 int (nr_files, 
nr_free_files, max_files) to 3 atomic_t. I feel bad introducing a lot of 
sysctl rework for a tiny change (removing filp_count_lock)

> This introduces lost update problems. 2 CPUs may store to nr_files
> in the opposite order that they incremented atomic_nr_files.
> 

That's true, and the difference can be relatively important in case of preemption.

Each time the true and correct value  (atomic_nr_files) is updated, a copy is 
done on nr_files : as nr_files is only used to be a guard value against too 
many file allocations, a somewhat 'lazy' value has no impact at all.

> It is not terribly bad, because the drift is not cumulative and the
> field can't go negative... but its just ugly to add this hack
> because there is no atomic sysctl handler.
> 
> Eliminating the cli/sti is a good idea though, I think.
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 12:26               ` Eric Dumazet
@ 2005-08-25 14:51                 ` Nick Piggin
  2005-08-25 14:56                   ` Christoph Hellwig
  2005-08-25 15:13                   ` Eric Dumazet
  0 siblings, 2 replies; 19+ messages in thread
From: Nick Piggin @ 2005-08-25 14:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Christoph Hellwig, Andrew Morton, lkml

Eric Dumazet wrote:
> Nick Piggin a écrit :
> 

>> Would you just be able to add the atomic sysctl handler that
>> Christoph suggested?
>>
> 
> Quite a lot of work indeed, and it would force to convert 3 int 
> (nr_files, nr_free_files, max_files) to 3 atomic_t. I feel bad 
> introducing a lot of sysctl rework for a tiny change (removing 
> filp_count_lock)
> 

True, I didn't notice that.

>> This introduces lost update problems. 2 CPUs may store to nr_files
>> in the opposite order that they incremented atomic_nr_files.
>>
> 
> That's true, and the difference can be relatively important in case of 
> preemption.
> 
> Each time the true and correct value  (atomic_nr_files) is updated, a 
> copy is done on nr_files : as nr_files is only used to be a guard value 
> against too many file allocations, a somewhat 'lazy' value has no impact 
> at all.
> 

OK, well I would prefer you do the proper atomic operations throughout
where it "really matters" in file_table.c, and do your lazy synchronize
with just the sysctl exported value.

Unless the fs people had a problem with that.

And you may as well get rid of the atomic_inc_return which can be more
expensive on some platforms and doesn't buy you much.
   atomic_inc;
   atomic_read;
Should be enough if you don't care about lost updates here, yeah?

Nick

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 14:51                 ` Nick Piggin
@ 2005-08-25 14:56                   ` Christoph Hellwig
  2005-08-25 15:13                   ` Eric Dumazet
  1 sibling, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2005-08-25 14:56 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Eric Dumazet, Christoph Hellwig, Andrew Morton, lkml

On Fri, Aug 26, 2005 at 12:51:30AM +1000, Nick Piggin wrote:
> Eric Dumazet wrote:
> >Nick Piggin a ?crit :
> >
> 
> >>Would you just be able to add the atomic sysctl handler that
> >>Christoph suggested?
> >>
> >
> >Quite a lot of work indeed, and it would force to convert 3 int 
> >(nr_files, nr_free_files, max_files) to 3 atomic_t. I feel bad 
> >introducing a lot of sysctl rework for a tiny change (removing 
> >filp_count_lock)
> >
> 
> True, I didn't notice that.

Well, it would with a generic atomic_t handler.  With a special
handler for this situation it's not needed.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 14:51                 ` Nick Piggin
  2005-08-25 14:56                   ` Christoph Hellwig
@ 2005-08-25 15:13                   ` Eric Dumazet
  2005-08-25 18:19                     ` Dipankar Sarma
  2005-08-26  1:16                     ` Nick Piggin
  1 sibling, 2 replies; 19+ messages in thread
From: Eric Dumazet @ 2005-08-25 15:13 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Christoph Hellwig, Andrew Morton, lkml

Nick Piggin a écrit :

> OK, well I would prefer you do the proper atomic operations throughout
> where it "really matters" in file_table.c, and do your lazy synchronize
> with just the sysctl exported value.
> 

But... I got complains about atomic_read(&counter) being 'an atomic op' 
(untrue), so my second patch just doesnt touch the path where nr_files was read.

Furthermore, a lazy sync would mean to change sysctl proc_handler for 
"file-nr" to perform a synchronize before calling proc_dointvec, this would be 
really obscure.

> Unless the fs people had a problem with that.
> 
> And you may as well get rid of the atomic_inc_return which can be more
> expensive on some platforms and doesn't buy you much.
>   atomic_inc;
>   atomic_read;
> Should be enough if you don't care about lost updates here, yeah?
> 

You mean :

atomic_inc(&counter);
lazeyvalue = atomic_read(&counter);

instead of

lazeyvalue = atomic_inc_return(&counter);

In fact I couldnt find one architecture where the later would be more expensive.

> Nick
> 

Eric

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 15:13                   ` Eric Dumazet
@ 2005-08-25 18:19                     ` Dipankar Sarma
  2005-08-25 18:36                       ` Christoph Hellwig
  2005-08-26  1:16                     ` Nick Piggin
  1 sibling, 1 reply; 19+ messages in thread
From: Dipankar Sarma @ 2005-08-25 18:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Nick Piggin, Christoph Hellwig, Andrew Morton, lkml

On Thu, Aug 25, 2005 at 05:13:05PM +0200, Eric Dumazet wrote:
> Nick Piggin a écrit :
> 
> >OK, well I would prefer you do the proper atomic operations throughout
> >where it "really matters" in file_table.c, and do your lazy synchronize
> >with just the sysctl exported value.
> >
> 
> But... I got complains about atomic_read(&counter) being 'an atomic op' 
> (untrue), so my second patch just doesnt touch the path where nr_files was 
> read.
> 

Here is a patch that I had done some time ago that uses atomic_t,
yet retains the sysctl handler. Eric, you earlier patch is incorrect
exactly for that reason.

One other thing - the claim that it removes filp_count_lock
from fast path is bogus. The slab constructor/destructors are
called only when we return the free file structs to the page
allocator. That we don't do very often and therefore we
don't acquire the lock - atleast not for every filp open
and close.

This is not to say we don't want a better reference counter like
a per-cpu counter, but there is some difficult stuff there and
the returns need to justify that. I would appreciate if you
or anyone can demonstrate this to be a problem.

The patch below was meant for debugging some suspected problems
with -mm.

Thanks
Dipankar



This patch changes the file counting by removing the filp_count_lock.
Instead we use a separate atomic_t, nr_files, for now and all
accesses to it are through get_nr_files() api. In the sysctl
handler for nr_files, we populate files_stat.nr_files before returning
to user.

Counting files as an when they are created and destroyed (as opposed
to inside slab) allows us to correctly count open files with RCU.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
---


 fs/dcache.c                  |    2 -
 fs/file_table.c              |   71 ++++++++++++++++++++++++++-----------------
 fs/xfs/linux-2.6/xfs_linux.h |    3 +
 include/linux/file.h         |    2 -
 include/linux/fs.h           |    2 +
 kernel/sysctl.c              |    5 ++-
 net/unix/af_unix.c           |    2 -
 7 files changed, 54 insertions(+), 33 deletions(-)

diff -puN fs/file_table.c~files-scale-file-counting fs/file_table.c
--- linux-2.6.13-rc3-mm3-fixes/fs/file_table.c~files-scale-file-counting	2005-08-09 12:24:27.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/fs/file_table.c	2005-08-09 16:42:27.000000000 +0530
@@ -5,6 +5,7 @@
  *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
  */
 
+#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -18,52 +19,67 @@
 #include <linux/mount.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
+#include <linux/sysctl.h>
+#include <asm/atomic.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
-
 /* public. Not pretty! */
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
-static DEFINE_SPINLOCK(filp_count_lock);
+static atomic_t nr_files __cacheline_aligned_in_smp;
 
-/* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
- */
-void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags)
+static inline void file_free_rcu(struct rcu_head *head)
 {
-	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
-	}
+	struct file *f =  container_of(head, struct file, f_rcuhead);
+	kmem_cache_free(filp_cachep, f);
 }
 
-void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
+static inline void file_free(struct file *f)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	atomic_dec(&nr_files);
+	call_rcu(&f->f_rcuhead, file_free_rcu);
 }
 
-static inline void file_free_rcu(struct rcu_head *head)
+/*
+ * Return the total number of open files in the system
+ */
+int get_nr_files(void)
 {
-	struct file *f =  container_of(head, struct file, f_rcuhead);
-	kmem_cache_free(filp_cachep, f);
+	return atomic_read(&nr_files);
 }
 
-static inline void file_free(struct file *f)
+/*
+ * Return the maximum number of open files in the system
+ */
+int get_max_files(void)
 {
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	return files_stat.max_files;
+}
+
+EXPORT_SYMBOL(get_nr_files);
+EXPORT_SYMBOL(get_max_files);
+
+/*
+ * Handle nr_files sysctl
+ */
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	files_stat.nr_files = get_nr_files();
+	proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+#else
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
 }
+#endif
 
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
@@ -77,7 +93,7 @@ struct file *get_empty_filp(void)
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (files_stat.nr_files >= files_stat.max_files &&
+	if (get_nr_files() >= files_stat.max_files &&
 				!capable(CAP_SYS_ADMIN))
 		goto over;
 
@@ -97,11 +113,12 @@ struct file *get_empty_filp(void)
 	/* f->f_version: 0 */
 	INIT_LIST_HEAD(&f->f_list);
 	f->f_maxcount = INT_MAX;
+	atomic_inc(&nr_files);
 	return f;
 
 over:
 	/* Ran out of filps - report that */
-	if (files_stat.nr_files > old_max) {
+	if (get_nr_files() > old_max) {
 		printk(KERN_INFO "VFS: file-max limit %d reached\n",
 					files_stat.max_files);
 		old_max = files_stat.nr_files;
diff -puN include/linux/fs.h~files-scale-file-counting include/linux/fs.h
--- linux-2.6.13-rc3-mm3-fixes/include/linux/fs.h~files-scale-file-counting	2005-08-09 15:52:14.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/include/linux/fs.h	2005-08-09 16:59:05.000000000 +0530
@@ -36,6 +36,8 @@ struct files_stat_struct {
 	int max_files;		/* tunable */
 };
 extern struct files_stat_struct files_stat;
+extern int get_nr_files(void);
+extern int get_max_files(void);
 
 struct inodes_stat_t {
 	int nr_inodes;
diff -puN fs/xfs/linux-2.6/xfs_linux.h~files-scale-file-counting fs/xfs/linux-2.6/xfs_linux.h
--- linux-2.6.13-rc3-mm3-fixes/fs/xfs/linux-2.6/xfs_linux.h~files-scale-file-counting	2005-08-09 15:58:51.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/fs/xfs/linux-2.6/xfs_linux.h	2005-08-09 15:59:36.000000000 +0530
@@ -88,6 +88,7 @@
 #include <linux/list.h>
 #include <linux/proc_fs.h>
 #include <linux/sort.h>
+#include <linux/fs.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -241,7 +242,7 @@ static inline void set_buffer_unwritten_
 
 /* IRIX uses the current size of the name cache to guess a good value */
 /* - this isn't the same but is a good enough starting point for now. */
-#define DQUOT_HASH_HEURISTIC	files_stat.nr_files
+#define DQUOT_HASH_HEURISTIC	get_nr_files()
 
 /* IRIX inodes maintain the project ID also, zero this field on Linux */
 #define DEFAULT_PROJID	0
diff -puN net/unix/af_unix.c~files-scale-file-counting net/unix/af_unix.c
--- linux-2.6.13-rc3-mm3-fixes/net/unix/af_unix.c~files-scale-file-counting	2005-08-09 15:59:57.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/net/unix/af_unix.c	2005-08-09 16:00:26.000000000 +0530
@@ -547,7 +547,7 @@ static struct sock * unix_create1(struct
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
+	if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
 		goto out;
 
 	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
diff -puN fs/dcache.c~files-scale-file-counting fs/dcache.c
--- linux-2.6.13-rc3-mm3-fixes/fs/dcache.c~files-scale-file-counting	2005-08-09 16:00:50.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/fs/dcache.c	2005-08-09 16:01:28.000000000 +0530
@@ -1895,7 +1895,7 @@ void __init vfs_caches_init(unsigned lon
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	dcache_init(mempages);
 	inode_init(mempages);
diff -puN include/linux/file.h~files-scale-file-counting include/linux/file.h
--- linux-2.6.13-rc3-mm3-fixes/include/linux/file.h~files-scale-file-counting	2005-08-09 16:06:46.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/include/linux/file.h	2005-08-09 16:07:33.000000000 +0530
@@ -60,8 +60,6 @@ extern void put_filp(struct file *);
 extern int get_unused_fd(void);
 extern void FASTCALL(put_unused_fd(unsigned int fd));
 struct kmem_cache_s;
-extern void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags);
-extern void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags);
 
 extern struct file ** alloc_fd_array(int);
 extern void free_fd_array(struct file **, int);
diff -puN kernel/sysctl.c~files-scale-file-counting kernel/sysctl.c
--- linux-2.6.13-rc3-mm3-fixes/kernel/sysctl.c~files-scale-file-counting	2005-08-09 16:21:27.000000000 +0530
+++ linux-2.6.13-rc3-mm3-fixes-dipankar/kernel/sysctl.c	2005-08-09 16:26:16.000000000 +0530
@@ -49,6 +49,9 @@
 #include <linux/nfs_fs.h>
 #endif
 
+extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -881,7 +884,7 @@ static ctl_table fs_table[] = {
 		.data		= &files_stat,
 		.maxlen		= 3*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_nr_files,
 	},
 	{
 		.ctl_name	= FS_MAXFILE,

_

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 18:19                     ` Dipankar Sarma
@ 2005-08-25 18:36                       ` Christoph Hellwig
  0 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2005-08-25 18:36 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: Eric Dumazet, Nick Piggin, Andrew Morton, lkml

On Thu, Aug 25, 2005 at 11:49:35PM +0530, Dipankar Sarma wrote:
> On Thu, Aug 25, 2005 at 05:13:05PM +0200, Eric Dumazet wrote:
> > Nick Piggin a ?crit :
> > 
> > >OK, well I would prefer you do the proper atomic operations throughout
> > >where it "really matters" in file_table.c, and do your lazy synchronize
> > >with just the sysctl exported value.
> > >
> > 
> > But... I got complains about atomic_read(&counter) being 'an atomic op' 
> > (untrue), so my second patch just doesnt touch the path where nr_files was 
> > read.
> > 
> 
> Here is a patch that I had done some time ago that uses atomic_t,
> yet retains the sysctl handler. Eric, you earlier patch is incorrect
> exactly for that reason.
> 
> One other thing - the claim that it removes filp_count_lock
> from fast path is bogus. The slab constructor/destructors are
> called only when we return the free file structs to the page
> allocator. That we don't do very often and therefore we
> don't acquire the lock - atleast not for every filp open
> and close.
> 
> This is not to say we don't want a better reference counter like
> a per-cpu counter, but there is some difficult stuff there and
> the returns need to justify that. I would appreciate if you
> or anyone can demonstrate this to be a problem.
> 
> The patch below was meant for debugging some suspected problems
> with -mm.

As mentioned when we last discussed it the nr_files usage in XFS
is boguss and should go away first so we don't need the accessor
and export.  Hopefully we can get rid of the max_files usage
in af_unix aswell, can you ping davem on it?


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] removes filp_count_lock and changes nr_files type to atomic_t
  2005-08-25 15:13                   ` Eric Dumazet
  2005-08-25 18:19                     ` Dipankar Sarma
@ 2005-08-26  1:16                     ` Nick Piggin
  1 sibling, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2005-08-26  1:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Christoph Hellwig, Andrew Morton, lkml

Eric Dumazet wrote:

> Furthermore, a lazy sync would mean to change sysctl proc_handler for 
> "file-nr" to perform a synchronize before calling proc_dointvec, this 
> would be really obscure.
>  

I was only using your terminology (ie. the 'lazy' synch after the
atomic is updated).

Actually, a better idea would be to make a specific sysctl handler
like Christoph said.

Unless you can show some improvement, it would better not to introduce
the racy hack (even if it is mostly harmless).

>> Unless the fs people had a problem with that.
>>
>> And you may as well get rid of the atomic_inc_return which can be more
>> expensive on some platforms and doesn't buy you much.
>>   atomic_inc;
>>   atomic_read;
>> Should be enough if you don't care about lost updates here, yeah?
>>
> 
> You mean :
> 
> atomic_inc(&counter);
> lazeyvalue = atomic_read(&counter);
> 
> instead of
> 
> lazeyvalue = atomic_inc_return(&counter);
> 

Yes.

> In fact I couldnt find one architecture where the later would be more 
> expensive.
> 

atomic_inc_return guarantees a memory barrier, while the former
statements do not. I'm fairly sure it will be more expensive on
a POWER5.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2005-08-26  1:16 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-08-24 21:46 [patch] Additions to .data.read_mostly section Ravikiran G Thirumalai
2005-08-24 22:38 ` Eric Dumazet
2005-08-25  7:56 ` Arjan van de Ven
2005-08-25  8:45   ` [PATCH] removes filp_count_lock and changes nr_files type to atomic_t Eric Dumazet
2005-08-25  9:05     ` Arjan van de Ven
2005-08-25  9:20       ` Eric Dumazet
2005-08-25  9:31         ` Arjan van de Ven
2005-08-25  9:08     ` Christoph Hellwig
2005-08-25  9:17       ` Eric Dumazet
2005-08-25  9:23         ` Christoph Hellwig
2005-08-25 10:41           ` Eric Dumazet
2005-08-25 11:11             ` Nick Piggin
2005-08-25 12:26               ` Eric Dumazet
2005-08-25 14:51                 ` Nick Piggin
2005-08-25 14:56                   ` Christoph Hellwig
2005-08-25 15:13                   ` Eric Dumazet
2005-08-25 18:19                     ` Dipankar Sarma
2005-08-25 18:36                       ` Christoph Hellwig
2005-08-26  1:16                     ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox