* [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
@ 2002-07-27 13:41 Anton Altaparmakov
2002-07-27 17:23 ` Andrew Morton
0 siblings, 1 reply; 29+ messages in thread
From: Anton Altaparmakov @ 2002-07-27 13:41 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Linux Kernel
Linus,
This patch introduces 64-bit versions of PAGE_{CACHE_,}MASK and
PAGE_{CACHE_,}ALIGN:
PAGE_{CACHE_,}MASK_LL and PAGE_{CACHE_,}ALIGN_LL.
These are needed when 64-bit values are worked with on 32-bit
architectures, otherwise the high 32-bits are destroyed.
For example:
my64bitval &= PAGE_CACHE_MASK;
is broken on 32-bit architectures... As is:
my64bitval = PAGE_ALIGN(other64bitval);
Jes mentioned that on MIPS32 with HIGHMEM something like this is
also needed due to 64-bit physical addresses.
These are also needed by the XFS patch so they would want to
introduce these anyway. In fact I borrowed the _LL naming from XFS.
This patch together with the two follow up patches fixing bugs I
found can also be taken from bkbits.net:
bk pull http://linux-ntfs.bkbits.net/linux-2.5-pm
Comments?
Best regards,
Anton
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/, http://www-stu.christs.cam.ac.uk/~aia21/
===================================================================
This will update the following files:
include/asm-alpha/page.h | 2 ++
include/asm-arm/page.h | 2 ++
include/asm-cris/page.h | 2 ++
include/asm-i386/page.h | 2 ++
include/asm-ia64/page.h | 2 ++
include/asm-m68k/page.h | 2 ++
include/asm-mips/page.h | 2 ++
include/asm-mips64/page.h | 2 ++
include/asm-parisc/page.h | 2 ++
include/asm-ppc/page.h | 2 ++
include/asm-ppc64/page.h | 2 ++
include/asm-s390/page.h | 2 ++
include/asm-s390x/page.h | 2 ++
include/asm-sh/page.h | 2 ++
include/asm-sparc/page.h | 2 ++
include/asm-sparc64/page.h | 2 ++
include/asm-x86_64/page.h | 2 ++
include/linux/pagemap.h | 3 +++
18 files changed, 37 insertions(+)
through these ChangeSets:
<aia21@cantab.net> (02/07/27 1.477)
Introduce 64-bit versions of PAGE_{CACHE_,}MASK and PAGE_{CACHE_,}ALIGN:
PAGE_{CACHE_,}MASK_LL and PAGE_{CACHE_,}ALIGN_LL.
These are needed when 64-bit values are worked with on 32-bit
architectures, otherwise the high 32-bits are destroyed.
Jes tells me, on MIPS32 with HIGHMEM these are also needed due to
64-bit physical addresses.
These are also needed by the XFS patch so they would want to
introduce these anyway. In fact I borrowed the _LL naming from XFS.
diff -Nru a/include/asm-alpha/page.h b/include/asm-alpha/page.h
--- a/include/asm-alpha/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-alpha/page.h Sat Jul 27 14:20:05 2002
@@ -7,6 +7,7 @@
#define PAGE_SHIFT 13
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -85,6 +86,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
#ifdef USE_48_BIT_KSEG
#define PAGE_OFFSET 0xffff800000000000
diff -Nru a/include/asm-arm/page.h b/include/asm-arm/page.h
--- a/include/asm-arm/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-arm/page.h Sat Jul 27 14:20:05 2002
@@ -89,9 +89,11 @@
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
diff -Nru a/include/asm-cris/page.h b/include/asm-cris/page.h
--- a/include/asm-cris/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-cris/page.h Sat Jul 27 14:20:05 2002
@@ -8,6 +8,7 @@
#define PAGE_SHIFT 13
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -61,6 +62,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/* This handles the memory map.. */
diff -Nru a/include/asm-i386/page.h b/include/asm-i386/page.h
--- a/include/asm-i386/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-i386/page.h Sat Jul 27 14:20:05 2002
@@ -5,6 +5,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
@@ -67,6 +68,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/*
* This handles the memory map.. We could make this a config
diff -Nru a/include/asm-ia64/page.h b/include/asm-ia64/page.h
--- a/include/asm-ia64/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-ia64/page.h Sat Jul 27 14:20:05 2002
@@ -28,7 +28,9 @@
#define PAGE_SIZE (__IA64_UL_CONST(1) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE - 1))
#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK_LL)
#ifdef __ASSEMBLY__
# define __pa(x) ((x) - PAGE_OFFSET)
diff -Nru a/include/asm-m68k/page.h b/include/asm-m68k/page.h
--- a/include/asm-m68k/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-m68k/page.h Sat Jul 27 14:20:05 2002
@@ -12,6 +12,7 @@
#define PAGE_SIZE (8192)
#endif
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -99,6 +100,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/* Pure 2^n version of get_order */
extern __inline__ int get_order(unsigned long size)
diff -Nru a/include/asm-mips/page.h b/include/asm-mips/page.h
--- a/include/asm-mips/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-mips/page.h Sat Jul 27 14:20:05 2002
@@ -15,6 +15,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -67,6 +68,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/*
* This handles the memory map.
diff -Nru a/include/asm-mips64/page.h b/include/asm-mips64/page.h
--- a/include/asm-mips64/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-mips64/page.h Sat Jul 27 14:20:05 2002
@@ -15,6 +15,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -53,6 +54,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/*
* This handles the memory map.
diff -Nru a/include/asm-parisc/page.h b/include/asm-parisc/page.h
--- a/include/asm-parisc/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-parisc/page.h Sat Jul 27 14:20:05 2002
@@ -5,6 +5,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
@@ -51,6 +52,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/*
* Tell the user there is some problem. Beep too, so we can
diff -Nru a/include/asm-ppc/page.h b/include/asm-ppc/page.h
--- a/include/asm-ppc/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-ppc/page.h Sat Jul 27 14:20:05 2002
@@ -8,6 +8,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
#include <linux/config.h>
@@ -84,6 +85,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
struct page;
extern void clear_page(void *page);
diff -Nru a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
--- a/include/asm-ppc64/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-ppc64/page.h Sat Jul 27 14:20:05 2002
@@ -16,6 +16,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#define PAGE_OFFSET_MASK (PAGE_SIZE-1)
#define SID_SHIFT 28
@@ -149,6 +150,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE)
+#define PAGE_ALIGN_LL(addr) _ALIGN(addr, (u64)PAGE_SIZE)
#ifdef MODULE
#define __page_aligned __attribute__((__aligned__(PAGE_SIZE)))
diff -Nru a/include/asm-s390/page.h b/include/asm-s390/page.h
--- a/include/asm-s390/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-s390/page.h Sat Jul 27 14:20:05 2002
@@ -16,6 +16,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
@@ -112,6 +113,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
#define __PAGE_OFFSET 0x0UL
#define PAGE_OFFSET 0x0UL
diff -Nru a/include/asm-s390x/page.h b/include/asm-s390x/page.h
--- a/include/asm-s390x/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-s390x/page.h Sat Jul 27 14:20:05 2002
@@ -15,6 +15,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
@@ -109,6 +110,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
#define __PAGE_OFFSET 0x0UL
#define PAGE_OFFSET 0x0UL
diff -Nru a/include/asm-sh/page.h b/include/asm-sh/page.h
--- a/include/asm-sh/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-sh/page.h Sat Jul 27 14:20:05 2002
@@ -19,6 +19,7 @@
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#define PTE_MASK PAGE_MASK
#ifdef __KERNEL__
@@ -59,6 +60,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/*
* IF YOU CHANGE THIS, PLEASE ALSO CHANGE
diff -Nru a/include/asm-sparc/page.h b/include/asm-sparc/page.h
--- a/include/asm-sparc/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-sparc/page.h Sat Jul 27 14:20:05 2002
@@ -21,6 +21,7 @@
#define PAGE_SIZE (1 << PAGE_SHIFT)
#endif
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -172,6 +173,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
#define PAGE_OFFSET 0xf0000000
#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
diff -Nru a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
--- a/include/asm-sparc64/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-sparc64/page.h Sat Jul 27 14:20:05 2002
@@ -12,6 +12,7 @@
#endif
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#ifdef __KERNEL__
@@ -106,6 +107,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/* We used to stick this into a hard-coded global register (%g4)
* but that does not make sense anymore.
diff -Nru a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
--- a/include/asm-x86_64/page.h Sat Jul 27 14:20:05 2002
+++ b/include/asm-x86_64/page.h Sat Jul 27 14:20:05 2002
@@ -9,6 +9,7 @@
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#endif
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & (__PHYSICAL_MASK << PAGE_SHIFT))
#define THREAD_SIZE (2*PAGE_SIZE)
#define CURRENT_MASK (~(THREAD_SIZE-1))
@@ -49,6 +50,7 @@
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+#define PAGE_ALIGN_LL(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK_LL)
/* See Documentation/x86_64/mm.txt for a description of the layout. */
#define __START_KERNEL 0xffffffff80100000
diff -Nru a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h Sat Jul 27 14:20:05 2002
+++ b/include/linux/pagemap.h Sat Jul 27 14:20:05 2002
@@ -20,7 +20,10 @@
#define PAGE_CACHE_SHIFT PAGE_SHIFT
#define PAGE_CACHE_SIZE PAGE_SIZE
#define PAGE_CACHE_MASK PAGE_MASK
+#define PAGE_CACHE_MASK_LL PAGE_MASK_LL
#define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
+#define PAGE_CACHE_ALIGN_LL(addr) \
+ (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK_LL)
#define page_cache_get(x) get_page(x)
extern void FASTCALL(page_cache_release(struct page *));
===================================================================
This BitKeeper patch contains the following changesets:
aia21@cantab.net|ChangeSet|20020727131954|59561
## Wrapped with gzip_uu ##
begin 664 bkpatch21816
M'XL(``6>0CT``]V;;6_;R!&`/YN_8H$`!QN)Z7U?KH$42G.YQ+VD#2X]H"@*
M!"MR91*62(&DXKAE^]L[)&V)%B52E*V@E?P"B1*'P]EG9V=F1R_0U<^7)WF2
M?C/3(!N9/)PFL9NG)LYF-C>NG\R*MZ&)K^T7FQ<48PH_@BB&A2R(Q%P5/@D(
M,9S8`%/N2>Z\0+]G-KT\,9&A!%Y]2++\\L0W<6[&;FQS./1;DL"ABT667F2I
M?S&-XL7W<^J*\_G,@;<_F]P/T3>;9I<GQ&7+(_G=W%Z>_/;N_>\?W_SF.*]?
MHZ5NZ/5KYWEO8VX6TT4VRLQL;-PDO5X7H*C"C$E*"TRDQL[/B+A<*83I!587
M5"'"+XF^%/PE)I<8H\H<HY49T$OBH7/L_!$]K]YO'1]=Q7F:!`O?(LG/QU%>
MV3)*X@PE$_3YS?MW7__U]LW;#^^^OOKWIS=??D4F#M8.O_EX]?[/ER#J_M$^
MZ>O'C]O.@[=<.!5^_QK:S"*36A1;&]@`W88V7BIEI@N;5>_>)NE-^6Z4ARB)
M$:/E!^!\D_IAE%L_7Z0V>X62/+3I;00BX0D*H^OP_J.UE,!F<-]W-KB_^I]`
M>FZGTPS-[*M2[J>KSU\8K2_SX>K]AT_O/I62[E4TTRQYT#-8P#42D'&OZSR\
MRR+?3)$)`E`ELUGK#INGC^\J#?_VRQ<TK]"%M^#`'=SH8@KW"134XJ/E2-WK
M$=_=FCL71A!-C)^C*S1.TC2Y!9FEP-+HL9E%\36:I,FLO(#K_(J$%I(XGU?3
MP3D?^'`<;+#S![A^GL2C;#&WJ;U.W-,XB>U9$<7^=!'8"Y/-SN=S7_*+N;FV
M;GB/).$$_I@L!%6$%$);QL<BH,;`-/%,"_T>>3"S"".:BX(0H@5HU3U#FM+,
M=!Z:Q]I5$Z:41I7'6,$Y];6@A&)_,O%(>V+VR%MI!S(D&Z3=3'HW&Y6#V4P\
M3(NQ]#Q?C*4V5OC8\&[E6N)6NF&BF03=*OKZ%<N8QM_;F@E-<,&8AWDA&:5!
MP(F@.)CX$]NM65M>0S6)O6%FBY@G-YJ-PYAR)8K)F)D`:Z,)&$V/>\:T):ZA
M&Y"L!NF6S<%'K4^(6CV!"TXYHT4PYI)(7]MR?DC69[L-$AM30C%!AD$7S;,M
M"L(-ZU*H$()S'E@1!)[B..C!KBUPI9\47,A!^H$#V#BXNF!8"590ABV=4*F$
MPL0HK]>=;+.<P+KDSMR,9@O?#>RC\[Y[\FO+KS&*014&\$M-"R$U8YH1AC6U
MN,>M;9#7\!R*#IP"Y83:.(2RP%@`8YXV`(;O^TIC;L0.\W/K]%3:VUFW*G2K
M),W,?'T$P7=0`3$*,,$F)&#,UTIZM,.K;12WTDTHR7;7K?+?Z6RCV6A!&*!?
M\`#[&IN)8A;NF^N>U6!-6@-[S/0>CF,C^`IFI=2JX!,#X\O'`<-CR['<P6UL
M&U/%/;K[:N"G4;9A,9`<)B.1GH*I1)0U,@BH#4!+OUNSEKB58HSK@?ZV=#Y;
MU@*8")H4AGOCB?08U<2(<9^[:(EK&(TR,0PV&($HVSBF0A74`],5'J7"5SX>
M>^!I->GQ96UYC;F@P1T.(R[<J!J$1+"*XF+B"Z8D)R;@$D]LGP<)M^D%G`P;
MTLAL69XX\$$U*V`6@+_T"!/,>ACW:-82UXR*/"ZKQ'&KJRX3R>=:(787)"$<
MQ1R#'R<:(JTJF13KJ2076U-)>JA,,GJ>3!)RE'K-^PLZ3V^K7\@Y/F\?ACT2
MF"M"$'%>!'82Q;96XCY//3G]S^E"\K/3ZN"7J[^_.R=G9\Z5:)WPD+Z>EFG>
MV<GI:?WD9?/$GYJBSUHT-?.9"J8#9%0/0I>UB6YQDL!_IBC$"K!*U:4*,-91
MX56GBAUX-<VR%UW>0+I(+U[UZ^K%*U3)6(IH<]5814NL#K64.X'Y%@6S43B?
MNN%\!WG@N2"PIE50J^0Q>JXZ2.E`JV&4O<C"`\F2[`!^JYFR#P!L>.7`F9DT
MCV(W\\/;"&XBN[D;!19&R$YWJ"-(K#'&I,PJF)9>Q5NKZ/K_S5M5'^G`K6F4
MO7A30SU9>V5]!N":I;7^\OW3BGP]7FV#P*5;(PR"LWK)Q,?%65V_[`"M:99]
M0-,#.?-:8#X'9LN<?2!D`RL'3CJ[&4VF,#PNG.I6E8PR-G,7-YUR8:0A)R1E
ME$<54T=)6ET=Z2)M:92].-L:^&\C3?,#D-:H*P]#;7!]NTPLLT5FUS/+EB!)
M/`*Q'N3X$A+5>S<FCPNNJF3?P5;#*/O`-72QE"VO]QQHF4<YY0%K*P\K91Y=
MV]3=LEZVI`HJL.2P8$*JRF6=!>CCPJPJ'75A9IZ47K*M24`;-'2.*M38KF$9
M>HD>GXQ^0MW`-;86AP$W>(O3N;8VS4=U#TAY=JN@L6&;LRR5,5(603U*CC'G
MK+9O.VAKV&2O%*"U_O6E`/@0*4"C^CZ0LJ&[`#T)0%O>,OZ7,+ZL0HP=%V+5
M]D878BN;_)`L\R`+YZ.]ZN&,#=PWWX&R5J7W@3.A/"R.D;.Z(Z`'M*>59H>B
M)L0A"O_-W;IAJ`W?..PA;9/`%6GEDV,DK=X2[=H#:)KE1R0#XA"5VE6+RT#*
M!C;:M%M@NZ1)"BDF@Z=EMP&O4TWON/BJ.XBZ]YA^Y#Z`)P^T#[`77H-;E(9N
M!&S;!^`0YQWE/D#9B-6S#_!#-S3;6<-S\!;N1]NP9I:=M[-:8BE`1KV"*ZWE
M42Z:5;].%V;A$R"C0WLRY"'RS&9OW$#,!G?IE9&9G8WB19ZY<13?F!TZ]639
MH@%Q'BND8)S7WR4Y+LRJ#L0NRAI&V0NT5J3U\"65;=Y,'<2;/6K>WH.U@:WD
M.]*VWA"TY`T85NHX&X*J1OD^XIZ6=VZKHFUE#K?6W"<RM]8PO3MP>S5N.^9F
M/AO],ZH23M<L>OJVB28>*9L:N2=D':)1.H`R]K]/6=V1OH6R-:OLY=;H.C"U
M!@^K:).,R@G239]>@^P?S@D\'J-6?_`1<(^N=+;Z8J8?6O\F6\Q>6VSL1!GI
*_!<6L\'C1SH`````
`
end
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-27 13:41 [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN} Anton Altaparmakov
@ 2002-07-27 17:23 ` Andrew Morton
2002-07-28 17:53 ` Eric W. Biederman
0 siblings, 1 reply; 29+ messages in thread
From: Andrew Morton @ 2002-07-27 17:23 UTC (permalink / raw)
To: Anton Altaparmakov; +Cc: Linus Torvalds, Linux Kernel
Anton Altaparmakov wrote:
>
> Linus,
>
> This patch introduces 64-bit versions of PAGE_{CACHE_,}MASK and
> PAGE_{CACHE_,}ALIGN:
> PAGE_{CACHE_,}MASK_LL and PAGE_{CACHE_,}ALIGN_LL.
>
> These are needed when 64-bit values are worked with on 32-bit
> architectures, otherwise the high 32-bits are destroyed.
>
> ...
> #define PAGE_SIZE (1UL << PAGE_SHIFT)
> #define PAGE_MASK (~(PAGE_SIZE-1))
> +#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
The problem here is that we've explicitly forced the
PAGE_foo type to unsigned long.
If we instead take the "UL" out of PAGE_SIZE altogether,
the compiler can then promote the type of PAGE_SIZE and PAGE_MASK
to the widest type being used in the expression (ie: long long)
and everything should work.
Which seems to be a much cleaner solution, if it works.
Will it work?
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-27 17:23 ` Andrew Morton
@ 2002-07-28 17:53 ` Eric W. Biederman
2002-07-28 18:54 ` Anton Altaparmakov
0 siblings, 1 reply; 29+ messages in thread
From: Eric W. Biederman @ 2002-07-28 17:53 UTC (permalink / raw)
To: Andrew Morton; +Cc: Anton Altaparmakov, Linus Torvalds, Linux Kernel
Andrew Morton <akpm@zip.com.au> writes:
> Anton Altaparmakov wrote:
> >
> > Linus,
> >
> > This patch introduces 64-bit versions of PAGE_{CACHE_,}MASK and
> > PAGE_{CACHE_,}ALIGN:
> > PAGE_{CACHE_,}MASK_LL and PAGE_{CACHE_,}ALIGN_LL.
> >
> > These are needed when 64-bit values are worked with on 32-bit
> > architectures, otherwise the high 32-bits are destroyed.
> >
> > ...
> > #define PAGE_SIZE (1UL << PAGE_SHIFT)
> > #define PAGE_MASK (~(PAGE_SIZE-1))
> > +#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
>
> The problem here is that we've explicitly forced the
> PAGE_foo type to unsigned long.
>
> If we instead take the "UL" out of PAGE_SIZE altogether,
> the compiler can then promote the type of PAGE_SIZE and PAGE_MASK
> to the widest type being used in the expression (ie: long long)
> and everything should work.
>
> Which seems to be a much cleaner solution, if it works.
>
> Will it work?
I don't quite see the point of this work.
There is exactly one operation that must be done in 64bit.
if (my64bitval > max) {
return -E2BIG;
}
After that the value can be broken into, an index/offset pair.
Which is how the data is used in the page cache.
Eric
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-28 17:53 ` Eric W. Biederman
@ 2002-07-28 18:54 ` Anton Altaparmakov
2002-07-28 20:12 ` Eric W. Biederman
2002-07-28 23:26 ` Linus Torvalds
0 siblings, 2 replies; 29+ messages in thread
From: Anton Altaparmakov @ 2002-07-28 18:54 UTC (permalink / raw)
To: Eric W. Biederman; +Cc: Andrew Morton, Linus Torvalds, Linux Kernel
At 18:53 28/07/02, Eric W. Biederman wrote:
>Andrew Morton <akpm@zip.com.au> writes:
> > Anton Altaparmakov wrote:
> > >
> > > Linus,
> > >
> > > This patch introduces 64-bit versions of PAGE_{CACHE_,}MASK and
> > > PAGE_{CACHE_,}ALIGN:
> > > PAGE_{CACHE_,}MASK_LL and PAGE_{CACHE_,}ALIGN_LL.
> > >
> > > These are needed when 64-bit values are worked with on 32-bit
> > > architectures, otherwise the high 32-bits are destroyed.
> > >
> > > ...
> > > #define PAGE_SIZE (1UL << PAGE_SHIFT)
> > > #define PAGE_MASK (~(PAGE_SIZE-1))
> > > +#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
> >
> > The problem here is that we've explicitly forced the
> > PAGE_foo type to unsigned long.
> >
> > If we instead take the "UL" out of PAGE_SIZE altogether,
> > the compiler can then promote the type of PAGE_SIZE and PAGE_MASK
> > to the widest type being used in the expression (ie: long long)
> > and everything should work.
> >
> > Which seems to be a much cleaner solution, if it works.
> >
> > Will it work?
I will reply to that point later, I want to do some experiments with gcc
first... I think it may work due to sign extension, but that implies the
value must be signed, which is of course implied by leaving out the "UL"...
I will try it and report results...
>I don't quite see the point of this work.
>
>There is exactly one operation that must be done in 64bit.
>if (my64bitval > max) {
> return -E2BIG;
>}
>After that the value can be broken into, an index/offset pair.
>Which is how the data is used in the page cache.
Why should I need to bother with index/offset? It is much more natural to
work with bytes. Also ntfs has to convert back and forth to bytes (internal
NTFS storage for sizes is s64 in units of bytes in many places), ntfs
clusters, pages, and buffer heads which are all different sizes so your
approach would be a complete code mess.
Also the page cache limit of 32-bit index is IMO not good and needs to be
removed. The code needs to be able to cope with true 64-bits. We already
have sector_t that can be defined to 64-bit. Once it is used everywhere it
will be relatively easy to do something similar for struct page. Of course
people are going to scream so it will just be a compile time option. Or
even just an out of tree patch but still I consider 64-bit support on
32-bit architectures very important in the future and I believe I am not
alone seeing Matt Domsch (sp?)'s comments for example... I guess it boils
down to how quickly the 64-bit cpus will become standard commodity hardware
vs how quick available storage will blow the 32-bit page cache limit...
Best regards,
Anton
--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS Maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-28 18:54 ` Anton Altaparmakov
@ 2002-07-28 20:12 ` Eric W. Biederman
2002-07-28 23:26 ` Linus Torvalds
1 sibling, 0 replies; 29+ messages in thread
From: Eric W. Biederman @ 2002-07-28 20:12 UTC (permalink / raw)
To: Anton Altaparmakov; +Cc: Andrew Morton, Linus Torvalds, Linux Kernel
Anton Altaparmakov <aia21@cantab.net> writes:
> At 18:53 28/07/02, Eric W. Biederman wrote:
> >Andrew Morton <akpm@zip.com.au> writes:
> > > Anton Altaparmakov wrote:
> > > >
> > > > Linus,
> > > >
> > > > This patch introduces 64-bit versions of PAGE_{CACHE_,}MASK and
> > > > PAGE_{CACHE_,}ALIGN:
> > > > PAGE_{CACHE_,}MASK_LL and PAGE_{CACHE_,}ALIGN_LL.
> > > >
> > > > These are needed when 64-bit values are worked with on 32-bit
> > > > architectures, otherwise the high 32-bits are destroyed.
> > > >
> > > > ...
> > > > #define PAGE_SIZE (1UL << PAGE_SHIFT)
> > > > #define PAGE_MASK (~(PAGE_SIZE-1))
> > > > +#define PAGE_MASK_LL (~(u64)(PAGE_SIZE-1))
> > >
> > > The problem here is that we've explicitly forced the
> > > PAGE_foo type to unsigned long.
> > >
> > > If we instead take the "UL" out of PAGE_SIZE altogether,
> > > the compiler can then promote the type of PAGE_SIZE and PAGE_MASK
> > > to the widest type being used in the expression (ie: long long)
> > > and everything should work.
> > >
> > > Which seems to be a much cleaner solution, if it works.
> > >
> > > Will it work?
With the current set of macros I will agree that it becomes error
prone, to use large offsets. So perhaps we should just provide
the larger type in the MASK and ALIGN functions. Having to track
if you are using a 64bit type to closely is problematic. An alternative
is to ignore the ALIGN macro and provide PAGE_OFFSET instead of PAGE_MASK,
which keeps the supplied values small.
> I will reply to that point later, I want to do some experiments with gcc
> first... I think it may work due to signextension but that implies the value
> must be signed which is of course implied by leaving out the "UL"... I will try
> it and report results...
It would also be interesting to see if the value was ULL if it would make the
code worse.
>
> >I don't quite see the point of this work.
> >
> >There is exactly one operation that must be done in 64bit.
> >if (my64bitval > max) {
> > return -E2BIG;
> >}
> >After that the value can be broken into, an index/offset pair.
> >Which is how the data is used in the page cache.
>
> Why should I need to bother with index/offset? It is much more natural to work
> with bytes. Also ntfs has to convert back and forth to bytes (internal NTFS
> storage for sizes is s64 in units of bytes in many places), ntfs clusters,
> pages, and buffer heads which are all different sizes so your approach would be
> a complete code mess.
For the internals of ntfs, and similar systems I will concede that a 64bit value
may be a more natural intermediate type. This doesn't mean we need to do
weird things in the generic code.
> Also the page cache limit of 32-bit index is IMO not good and needs to be
> removed. The code needs to be able to cope with true 64-bits. We already have
> sector_t that can be defined to 64-bit. Once it is used everywhere it will be
> relatively easy to do something simillar for struct page. Of course people are
> going to scream so it will just be a compile time option. Or even just an out of
>
> tree patch but still I consider 64-bit support on 32-bit architectures very
> important in the future and I belive I am not alone seeing Matt Domsch (sp?)'s
> comments for example... I guess it boils down to how quickly the 64-bit cpus
> will become standard comodity hardware vs how quick available storage will blow
> the 32-bit page cache limit...
The page cache limit is currently 44bits/16TB on x86. Drives are
currently running at about 37-38 bits. Which means it takes about
64 drives to blow the current page cache. So we probably have 2 or 3
years before this becomes a concern with commodity hardware. And we
should have commodity 64bit cpus by then. We can afford to
hold off a little longer.
For disk sizes we need the larger sector_t simply because drives
really are exceeding it, today.
Eric
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-28 18:54 ` Anton Altaparmakov
2002-07-28 20:12 ` Eric W. Biederman
@ 2002-07-28 23:26 ` Linus Torvalds
2002-07-29 0:10 ` Andrew Morton
1 sibling, 1 reply; 29+ messages in thread
From: Linus Torvalds @ 2002-07-28 23:26 UTC (permalink / raw)
To: Anton Altaparmakov; +Cc: Eric W. Biederman, Andrew Morton, Linux Kernel
On Sun, 28 Jul 2002, Anton Altaparmakov wrote:
>
> Why should I need to bother with index/offset? It is much more natural to
> work with bytes.
Two major reasons:
- the page cache works with index/offset, and that should be your first
priority, since the page cache is all that matters from a performance
standpoint.
- gcc is known to be broken with 64-bit stuff on 32-bit platforms, and
minimizing the use of "long long" minimizes the risk of hitting bugs.
> Also the page cache limit of 32-bit index is IMO not good and needs to be
> removed.
Dream on. It's good, and it's not getting removed. The "struct page" is
size-critical, and also correctness-critical (see above on gcc issues).
We're not moving to a 64-bit index for the next few years. We're a lot
more likely to make PAGE_SIZE bigger, and generally praying that AMD's
x86-64 succeeds in the market, forcing Intel to make Yamhill their
standard platform. At which point we _could_ make things truly 64 bits
(the size pressure on "struct page" is largely due to HIGHMEM, and gcc
does fine on 64-bit platforms).
But that's certainly years away.
Linus
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-28 23:26 ` Linus Torvalds
@ 2002-07-29 0:10 ` Andrew Morton
2002-07-29 0:43 ` William Lee Irwin III
` (3 more replies)
0 siblings, 4 replies; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 0:10 UTC (permalink / raw)
To: Linux Kernel
Linus Torvalds wrote:
>
> ...
> Dream on. It's good, and it's not getting removed. The "struct page" is
> size-critical, and also correctness-critical (see above on gcc issues).
>
Plan B is to remove page->index.
- Replace ->mapping with a pointer to the page's radix tree
slot. Use address masking to go from page.radix_tree_slot
to the radix tree node.
- Store the base index in the radix tree node, use math to
derive page->index. Gives 64-bit index without increasing
the size of struct page. 4 bytes saved.
- Implement radix_tree_gang_lookup() as previously described. Use
this in truncate_inode_pages, invalidate_inode_pages[2], readahead
and writeback.
- The only thing we now need page.list for is tracking dirty pages.
Implement a 64-bit dirtiness bitmap in radix_tree_node, propagate
that up the radix tree so we can efficiently traverse dirty pages
in a mapping. This also allows writeback to always write in ascending
index order. Remove page->list. 8 bytes saved.
- Few pages use ->private for much. Hash for it. 4(ish) bytes
saved.
- Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
- Remove the rmap chain (I just broke ptep_to_address() anyway). 4 bytes
saved. struct page is now 20 bytes.
There look. In five minutes I shrunk 24 bytes from the page
structure. Who said programming was hard?
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:10 ` Andrew Morton
@ 2002-07-29 0:43 ` William Lee Irwin III
2002-07-29 0:56 ` Andrea Arcangeli
2002-07-29 0:49 ` Andrea Arcangeli
` (2 subsequent siblings)
3 siblings, 1 reply; 29+ messages in thread
From: William Lee Irwin III @ 2002-07-29 0:43 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
Linus Torvalds wrote:
>> Dream on. It's good, and it's not getting removed. The "struct page" is
>> size-critical, and also correctness-critical (see above on gcc issues).
32-bit is a sad, broken, and depressing reality we're going to be
saddled with on mainstream systems for ages. It's stinking up the
kernel like a dead woodchuck under the porch as it is, and the 64GB
abominations on their way out the ass-end of hardware vendor pipelines
are truly vomitous.
On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> Plan B is to remove page->index.
> - Replace ->mapping with a pointer to the page's radix tree
> slot. Use address masking to go from page.radix_tree_slot
> to the radix tree node.
> - Store the base index in the radix tree node, use math to
> derive page->index. Gives 64-bit index without increasing
> the size of struct page. 4 bytes saved.
> - Implement radix_tree_gang_lookup() as previously described. Use
> this in truncate_inode_pages, invalidate_inode_pages[2], readahead
> and writeback.
> - The only thing we now need page.list for is tracking dirty pages.
> Implement a 64-bit dirtiness bitmap in radix_tree_node, propagate
> that up the radix tree so we can efficiently traverse dirty pages
> in a mapping. This also allows writeback to always write in ascending
> index order. Remove page->list. 8 bytes saved.
> - Few pages use ->private for much. Hash for it. 4(ish) bytes
> saved.
> - Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
> - Remove the rmap chain (I just broke ptep_to_address() anyway). 4 bytes
> saved. struct page is now 20 bytes.
> There look. In five minutes I shrunk 24 bytes from the page
> structure. Who said programming was hard?
This is so aggressive I'm obligated to pursue it. The pte_chain will
die shortly if I get my way as it is.
Cheers,
Bill
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:10 ` Andrew Morton
2002-07-29 0:43 ` William Lee Irwin III
@ 2002-07-29 0:49 ` Andrea Arcangeli
2002-07-29 2:05 ` Andrew Morton
2002-07-29 0:56 ` William Lee Irwin III
2002-07-29 9:27 ` Russell King
3 siblings, 1 reply; 29+ messages in thread
From: Andrea Arcangeli @ 2002-07-29 0:49 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> Linus Torvalds wrote:
> >
> > ...
> > Dream on. It's good, and it's not getting removed. The "struct page" is
> > size-critical, and also correctness-critical (see above on gcc issues).
> >
>
> Plan B is to remove page->index.
>
> - Replace ->mapping with a pointer to the page's radix tree
> slot. Use address masking to go from page.radix_tree_slot
> to the radix tree node.
that's not immediate anymore, you've to walk the tree backwards, like
you do for the lookups.
I recall you are benchmarking radix tree with dbench. You've to
benchmark the worst case not dbench with small files. with small files
even the rbtree was nicer than the hashtable. The right benchmark is:
truncate(800GByte)
write(1G)
lseek(800Gbyte)
fsync()
addr = mmap(1G)
benchmark_start()
mlock(1G)
benchmark_stop()
instead of mlock you can also do read(1G) as you prefer, but the
overhead of the copy-user would be certainly more significant than the
overhead of filling the ptes, so an mlock or *even* a page fault should
be lighter to allow us to better benchmark the pagecache performance.
The nocopy hack from Lincol would be fine too, then you could read the
whole thing with one syscall and no pagetable overhead.
(of course you need >1G of ram to avoid to hit the disk during the read,
probably the read pass should be read 1 byte, lseek to the next 1byte,
and you also need the hashtable allocated with the bootmem allocator)
Nobody did that yet AFAIK, so it's not a surprise nobody found any
regression with the radix tree (yet). I expect walking 6/7 cacheline
steps every time is going to be a significant hit (even worse if you
need to do that every time to derive the index with the gang method). Of
course if you work with small files you'll never walk more than a few
steps and the regression doesn't showup, that was true with the rbtree
too two/three years ago.
The other major problems of radix trees are the GFP_ATOMIC allocations
that can lead at the very least to I/O failures, the mempool usage seems
just a band-aid to hide those failures but it works by pure luck it
seems.
On the same GFP_ATOMIC lines we can find in rmap.c:
static void alloc_new_pte_chains()
{
struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC);
int i = PAGE_SIZE / sizeof(struct pte_chain);
if (pte_chain) {
inc_page_state(nr_pte_chain_pages);
for (; i-- > 0; pte_chain++)
pte_chain_push(pte_chain);
} else {
/* Yeah yeah, I'll fix the pte_chain allocation ... */
panic("Fix pte_chain allocation, you lazy bastard!\n");
how will you fix the pte_chain allocation to avoid deadlocks? please
elaborate. none of the callers can handle a failure there, no surprise
there is a panic there.
> - The only thing we now need page.list for is tracking dirty pages.
> Implement a 64-bit dirtiness bitmap in radix_tree_node, propagate
> that up the radix tree so we can efficiently traverse dirty pages
> in a mapping. This also allows writeback to always write in ascending
> index order. Remove page->list. 8 bytes saved.
>
page->list is needed for the freelist, but ok you could now share
freelist and lru since they're mutually exclusive. This seems a nice
idea for the ordering in particular, but again the "find" algorithm will
be slower and walking a tree in order is even a recursive operation that
will require either sane programming and a recursive algorithm that will
overflow the stack with a big radix tree at offset 200T on a 64bit arch,
or dynamic allocations and overcomplex code.
> - Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
note you still have to handle the collisions, it's not like the
waitqueue hash were you avoid handling the collisions by doing a
wake-all. You should handle the collision with kmalloc dyn-alloc
but you have no failure path there.
In short none of these things cames for free, it's not like the page
based writeback that removes overhead. They're not obvious optimizations
to my eyes, but yes you could theoretically shrink the struct page that
way, but I'm pretty much fine to pay with ram if the other option is to
run slower.
The keeping track of dirty pages into a tree and to walk the tree in
ascendent order to flush those dirty pages may actually pay off, in
userspace with a recursive stack, but in kernel even only the fact we
cannot handle a kmalloc failure during dirty flushing is a showstopper
for those algorithms that means OOM deadlock, you cannot just avoid to
flush dirty pages and try again, everybody may be trying to flush dirty
pages to make progress. In userspace that just means "task dies with
-ENOMEM or sigkill from kernel, plug some more ram or add some more swap
and try again", but for an operating system the thing is different.
Andrea
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:43 ` William Lee Irwin III
@ 2002-07-29 0:56 ` Andrea Arcangeli
2002-07-29 1:04 ` William Lee Irwin III
2002-07-29 1:09 ` Rik van Riel
0 siblings, 2 replies; 29+ messages in thread
From: Andrea Arcangeli @ 2002-07-29 0:56 UTC (permalink / raw)
To: William Lee Irwin III, Andrew Morton, Linux Kernel
On Sun, Jul 28, 2002 at 05:43:25PM -0700, William Lee Irwin III wrote:
> Linus Torvalds wrote:
> >> Dream on. It's good, and it's not getting removed. The "struct page" is
> >> size-critical, and also correctness-critical (see above on gcc issues).
>
> 32-bit is a sad, broken, and depressing reality we're going to be
> saddled with on mainstream systems for ages. It's stinking up the
> kernel like a dead woodchuck under the porch as it is, and the 64GB
> abominations on their way out the ass-end of hardware vendor pipelines
> are truly vomitous.
>
>
> On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> > Plan B is to remove page->index.
> > - Replace ->mapping with a pointer to the page's radix tree
> > slot. Use address masking to go from page.radix_tree_slot
> > to the radix tree node.
> > - Store the base index in the radix tree node, use math to
> > derive page->index. Gives 64-bit index without increasing
> > the size of struct page. 4 bytes saved.
> > - Implement radix_tree_gang_lookup() as previously described. Use
> > this in truncate_inode_pages, invalidate_inode_pages[2], readahead
> > and writeback.
> > - The only thing we now need page.list for is tracking dirty pages.
> > Implement a 64-bit dirtiness bitmap in radix_tree_node, propagate
> > that up the radix tree so we can efficiently traverse dirty pages
> > in a mapping. This also allows writeback to always write in ascending
> > index order. Remove page->list. 8 bytes saved.
> > - Few pages use ->private for much. Hash for it. 4(ish) bytes
> > saved.
> > - Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
> > - Remove the rmap chain (I just broke ptep_to_address() anyway). 4 bytes
> > saved. struct page is now 20 bytes.
> > There look. In five minutes I shrunk 24 bytes from the page
> > structure. Who said programming was hard?
>
> This is so aggressive I'm obligated to pursue it. The pte_chain will
> die shortly if I get my way as it is.
if you look at DaveM first full rmap implementation it never had a
pte-chain. He used the same rmap logic we always had in linux since the
first 2.1 kernel I looked at, to handle correctly truncate against
MAP_SHARED. Unfortunately that's not very efficient and requires some
metadata allocation for anonymous pages (that's the address space
pointer, anon pages regularly doesn't have a dedicated address space),
and overhead that we never had w/o full rmap (and for inode backed
mappings we just have this info in the inode, just the shared_lock
locking isn't trivial). Hope you can come up with a better algorithm
(nevertheless also the current rmap implementation adds significant
measurable overhead in the fast paths), Rik told me a few days ago he
also wanted to drop the pte_chain, but I assume you're just in sync with him.
Andrea
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:10 ` Andrew Morton
2002-07-29 0:43 ` William Lee Irwin III
2002-07-29 0:49 ` Andrea Arcangeli
@ 2002-07-29 0:56 ` William Lee Irwin III
2002-07-29 1:36 ` Andrew Morton
2002-07-29 9:27 ` Russell King
3 siblings, 1 reply; 29+ messages in thread
From: William Lee Irwin III @ 2002-07-29 0:56 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> - Few pages use ->private for much. Hash for it. 4(ish) bytes
> saved.
Do you know an approximate reasonable constant of proportionality
for how many pages have ->private attached?
On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> - Remove the rmap chain (I just broke ptep_to_address() anyway). 4 bytes
> saved. struct page is now 20 bytes.
How did ptep_to_address() break? I browsed over your latest changes and
missed the bit where that fell apart. I'll at least take a stab at fixing
it up until the other bits materialize.
Cheers,
Bill
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:56 ` Andrea Arcangeli
@ 2002-07-29 1:04 ` William Lee Irwin III
2002-07-29 1:09 ` Rik van Riel
1 sibling, 0 replies; 29+ messages in thread
From: William Lee Irwin III @ 2002-07-29 1:04 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Andrew Morton, Linux Kernel
On Sun, Jul 28, 2002 at 05:43:25PM -0700, William Lee Irwin III wrote:
>> This is so aggressive I'm obligated to pursue it. The pte_chain will
>> die shortly if I get my way as it is.
On Mon, Jul 29, 2002 at 02:56:12AM +0200, Andrea Arcangeli wrote:
> if you look at DaveM first full rmap implementation it never had a
> pte-chain. He used the same rmap logic we always hand in linux since the
> first 2.1 kernel I looked at, to handle correctly truncate against
> MAP_SHARED. Unfortunately that's not very efficient and requires some
> metadata allocation for anonymous pages (that's the address space
> pointer, anon pages regularly doesn't have a dedicated address space),
> and overhead that we never had w/o full rmap (and for inode backed
> mappings we just have this info in the inode, just the shared_lock
> locking isn't trivial). Hope you can came up with a better algorithm
> (nevertheless also the current rmap implementation adds significant
> measurable overhead in the fast paths), Rik told me a few days ago he
> also wanted to drop the pte_chain, but I assume you're just in sync with him.
I've seen davem's implementation. The anonymous page metadata
allocations, while they are overhead, are likely to be significantly
smaller than per-pte overhead. The rest is a matter of details. You're
welcome to participate with the design and/or implementation.
Cheers,
Bill
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:56 ` Andrea Arcangeli
2002-07-29 1:04 ` William Lee Irwin III
@ 2002-07-29 1:09 ` Rik van Riel
2002-07-29 2:14 ` Andrew Morton
1 sibling, 1 reply; 29+ messages in thread
From: Rik van Riel @ 2002-07-29 1:09 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: William Lee Irwin III, Andrew Morton, Linux Kernel
On Mon, 29 Jul 2002, Andrea Arcangeli wrote:
> if you look at DaveM first full rmap implementation it never had a
> pte-chain. He used the same rmap logic we always hand in linux since the
> first 2.1 kernel I looked at, to handle correctly truncate against
> MAP_SHARED. Unfortunately that's not very efficient and requires some
> metadata allocation for anonymous pages (that's the address space
> pointer, anon pages regularly doesn't have a dedicated address space),
Together with the K42 people we found a way to avoid the
badnesses of an object-based VM.
The space overhead will be a "double wide" radix tree
per anonymous memory object, where each entry has a
copy-on-write count and either the page frame number
or the position in swap.
Added benefits are not having to modify page->count for
most pages on fork(), exec(), etc. and being able to just
throw away page tables.
This scheme doesn't have deep tree walking of memory objects,
doesn't have the disadvantage of leaving 'stale' pages behind
in parent objects after COW and can still do refcounting on an
object basis instead of a page by page basis.
It'll also allow us to drop the usage count for swap entries,
turning those into a simple bitmap (or maybe a better form?).
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:56 ` William Lee Irwin III
@ 2002-07-29 1:36 ` Andrew Morton
2002-07-29 1:37 ` William Lee Irwin III
0 siblings, 1 reply; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 1:36 UTC (permalink / raw)
To: William Lee Irwin III; +Cc: Linux Kernel
William Lee Irwin III wrote:
>
> On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> > - Few pages use ->private for much. Hash for it. 4(ish) bytes
> > saved.
>
> Do you know an approximate reasonable constant of proportionality
> for how many pages have ->private attached?
Well, it depends on what the mapping is using ->private for.
In the case of ext2, ext3 and (soon) reiserfs mappings, ->private
is only used for pagecache pages which were written to with write(2).
But for other filesystems, basically all pagecache pages have
buffers at present, so I exaggerate. But as filesystems migrate
to using direct-to-BIO reads, the situation gets better.
It might be useful to buffer-strip written-to pages as well, if
a clean way of doing that presents itself. Maybe in refill_inactive
or something.
> On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> > - Remove the rmap chain (I just broke ptep_to_address() anyway). 4 bytes
> > saved. struct page is now 20 bytes.
>
> How did ptep_to_address() break? I browsed over your latest changes and
> missed the bit where that fell apart. I'll at least take a stab at fixing
> it up until the other bits materialize.
I broke it in my five-minute thought-coding exercise. By removing
page->index.
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 1:36 ` Andrew Morton
@ 2002-07-29 1:37 ` William Lee Irwin III
0 siblings, 0 replies; 29+ messages in thread
From: William Lee Irwin III @ 2002-07-29 1:37 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
William Lee Irwin III wrote:
>> Do you know an approximate reasonable constant of proportionality
>> for how many pages have ->private attached?
On Sun, Jul 28, 2002 at 06:36:20PM -0700, Andrew Morton wrote:
> Well, it depends on what the mapping is using ->private for.
> In the case of ext2, ext3 and (soon) reiserfs mappings, ->private
> is only used for pagecache pages which were written to with write(2).
> But for other filesystems, basically all pagecache pages have
> buffers at present, so I exaggerate. But as filesystems migrate
> to using direct-to-BIO reads, the situation gets better.
> It might be useful to buffer-strip written-to pages as well, if
> a clean way of doing that presents itself. Maybe in refill_inactive
> or something.
Collecting some numbers might be useful here.
On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
>>> - Remove the rmap chain (I just broke ptep_to_address() anyway). 4 bytes
>>> saved. struct page is now 20 bytes.
William Lee Irwin III wrote:
>> How did ptep_to_address() break? I browsed over your latest changes and
>> missed the bit where that fell apart. I'll at least take a stab at fixing
>> it up until the other bits materialize.
On Sun, Jul 28, 2002 at 06:36:20PM -0700, Andrew Morton wrote:
> I broke it in my five-minute thought-coding exercise. By removing
> page->index.
Sorry, I took fixing up the users as part of the ->index removal. This
isn't a serious issue.
Cheers,
Bill
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:49 ` Andrea Arcangeli
@ 2002-07-29 2:05 ` Andrew Morton
2002-07-29 2:09 ` William Lee Irwin III
2002-07-29 20:52 ` Andrea Arcangeli
0 siblings, 2 replies; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 2:05 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Linux Kernel
Andrea Arcangeli wrote:
>
> On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> > Linus Torvalds wrote:
> > >
> > > ...
> > > Dream on. It's good, and it's not getting removed. The "struct page" is
> > > size-critical, and also correctness-critical (see above on gcc issues).
> > >
> >
> > Plan B is to remove page->index.
> >
> > - Replace ->mapping with a pointer to the page's radix tree
> > slot. Use address masking to go from page.radix_tree_slot
> > to the radix tree node.
>
> that's not immediate anymore, you've to walk the tree backwards, like
> you do for the lookups.
No, it's constant-time. page.radix_tree_slot points to the page's
slot in its radix-tree_node. As long as the radix_tree_nodes are
laid out in memory with a stable alignment, you can go from
page.radix_tree_slot to the radix_tree_node pointer with just some
masking, maybe a modulus.
But yes, all of this is a straight speed/space tradeoff. Probably
some of it should be ifdeffed.
> I recall you are benchmarking radix tree with dbench. You've to
> benchmark the worst case not dbench with small files. with small files
> even the rbtree was nicer than the hashtable. The right benchmark is:
>
> truncate(800GByte)
> write(1G)
> lseek(800Gbyte)
> fsync()
> addr = mmap(1G)
> benchmark_start()
> mlock(1G)
> benchmark_stop()
>
> instead of mlock you can also do read(1G) as you prefer, but the
> overhead of the copy-user would be certainly more significant than the
> overhead of filling the ptes, so an mlock or *even* a page fault should
> be lighter to allow us to better benchmark the pagecache performance.
>
> The nocopy hack from Lincol would be fine too, then you could read the
> whole thing with one syscall and no pagetable overhead.
>
> (of course you need >1G of ram to avoid to hit the disk during the read,
> probably the read pass should be read 1 byte, lseek to the next 1byte,
> and you also need the hashtable allocated with the bootmem allocator)
Yes, there are space concerns with the radix tree, and nobody has
tried to really make it fall over yet.
The situation improved when I halved the ratnode size, but a node
is still 1/15th of a page, and the math is fairly easy to do.
Lame fix is to make the nodes even smaller. Another speed/space
tradeoff.
> Nobody did that yet AFIK, so it's not a surprise nobody found any
> regression with the radix tree (yet). I expect walking 6/7 cacheline
> steps every time is going to be a significant hit
The cost of the tree walk doesn't worry me much - generally we
walk the tree with good locality of reference, so most everything is
in cache anyway.
> (even worse if you
> need to do that every time to derive the index with the gang method).
You don't need to do that.
> Of
> course if you work with small files you'll never walk more than a few
> steps and the regression doesn't showup, that was true with the rbtree
> too two/three years ago.
>
> The other major problems of radix trees are the GFP_ATOMIC allocations
> that can lead at the very least to I/O failures, the mempool usage seems
> just a band-aid to hide those failures but it works by pure luck it
> seems.
I'm not particularly concerned about that. There are two scenarios
in which we allocate ratnodes:
1: Adding pages to swap. Here, failure is OK. As long as we add
just a single page to swapcache before running out of ratnodes,
the system won't deadlock. The mempool helps here.
2: Adding pagecache pages. Here, the ratnode allocation occurs just
a few hundred instructions after we've performed a GFP_HIGHUSER
allocation. For the page itself. So we know that there are
tons of pages available. The only way this can fail is if someone
comes in and allocates a few megabytes of GFP_ATOMIC memory
at interrupt time in that few-hundred instruction window.
_and_ if the ratnode mempool is exhausted as well.
Good luck setting up a testcase which does this ;)
> On the same GFP_ATOMIC lines we can find in rmap.c:
>
> static void alloc_new_pte_chains()
> {
> struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC);
> int i = PAGE_SIZE / sizeof(struct pte_chain);
>
> if (pte_chain) {
> inc_page_state(nr_pte_chain_pages);
> for (; i-- > 0; pte_chain++)
> pte_chain_push(pte_chain);
> } else {
> /* Yeah yeah, I'll fix the pte_chain allocation ... */
> panic("Fix pte_chain allocation, you lazy bastard!\n");
>
> how will you fix the pte_chain allocation to avoid deadlocks? please
> elaborate. none of the callers can handle a failure there, no surprise
> there is a panic there.
I think that code's in redhat kernels, actually. So it's presumably
not occurring on a daily basis. But no, it cannot be allowed
to live. It was replaced with a slab in the patches I sent yesterday,
and Bill is cooking something up for the OOM handling there.
> > - The only thing we now need page.list for is tracking dirty pages.
> > Implement a 64-bit dirtiness bitmap in radix_tree_node, propagate
> > that up the radix tree so we can efficiently traverse dirty pages
> > in a mapping. This also allows writeback to always write in ascending
> > index order. Remove page->list. 8 bytes saved.
> >
>
> page->list is needed for the freelist, but ok you could now share
> freelist and lru since they're mutually exclusive. This seems a nice
> idea for the ordering in particular, but again the "find" algorithm will
> be slower and walking a tree in order is even a recursive operation that
> will require either sane programming and a recursive algorithm that will
> overflow the stack with a big radix tree at offset 200T on a 64bit arch,
> or dynamic allocations and overcomplex code.
No walk for the page_index(page) function.
No recursion either. Note how the current radix tree code uses a
fixed-size (28 byte) local structure for maintaining the path
down the tree.
> > - Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
>
> note you still have to handle the collisions, it's not like the
> waitqueue hash were you avoid handling the collisions by doing a
> wake-all. You should handle the collision with kmalloc dyn-alloc
> but you have no failure path there.
yes. But we really need to get all the fastpath kmaps using
kmap_atomic anyway.
> In short none of these things cames for free, it's not like the page
> based writeback that removes overhead. They're not obvious optimizations
> to my eyes, but yes you could theoretically shrunk the struct page that
> way, but I'm pretty much fine to pay with ram if the other option is to
> run slower.
It's the 32G highmem systems which would be prepared to spend the CPU
cycles to get the ZONE_NORMAL savings. But we'd need appropriate
conditionals so that 64-bit machines didn't need to pay for the
ia32 highmem silliness.
Then again, Andi says that sizeof(struct page) is a problem for
x86-64.
> The keeping track of dirty pages into a tree and to walk the tree in
ascending order to flush those dirty pages may actually pay off, in
> userspace with a recursive stack, but in kernel even only the fact we
> cannot handle a kmalloc failure during dirty flushing is a showstopper
> for those algorithms that means OOM deadlock, you cannot just avoid to
> flush dirty pages and try again, everybody may be trying to flush dirty
> pages to make progress. In userspace that just means "task dies with
> -ENOMEM or sigkill from kernel, plug some more ram or add some more swap
> and try again", but for an operative system the thing is different.
No recursion needed, no allocations needed.
It would have to be coded with some delicacy, yes. For example,
page_index(page) may only be valid when calculated inside
mapping->page_lock. Depends how it's done.
But I don't see any showstoppers here.
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 2:05 ` Andrew Morton
@ 2002-07-29 2:09 ` William Lee Irwin III
2002-07-29 20:52 ` Andrea Arcangeli
1 sibling, 0 replies; 29+ messages in thread
From: William Lee Irwin III @ 2002-07-29 2:09 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andrea Arcangeli, Linux Kernel
Andrea Arcangeli wrote:
>> how will you fix the pte_chain allocation to avoid deadlocks? please
>> elaborate. none of the callers can handle a failure there, no surprise
>> there is a panic there.
On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> I think that code's in redhat kernels, actually. So it's presumably
> not occurring on a daily basis. But no, it cannot be allowed
> to live. It was replaced with a slab in the patches I sent yesterday,
> and Bill is cooking something up for the OOM handling there.
There is a small race for time here; if the OOM handling doesn't
materialize before the algorithm no longer requires per-pte allocation
it may not happen unless an explicit requirement appears.
On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> It's the 32G highmem systems which would be prepared to spend the CPU
> cycles to get the ZONE_NORMAL savings. But we'd need appropriate
> conditionals so that 64-bit machines didn't need to pay for the
> ia32 highmem silliness.
The bad news is this is going to turn into 64GB and soon. Hopefully
these will flop badly enough to silence the users thereof with the
most minimally invasive accommodations.
Cheers,
Bill
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 2:14 ` Andrew Morton
@ 2002-07-29 2:11 ` William Lee Irwin III
2002-07-29 2:18 ` Rik van Riel
1 sibling, 0 replies; 29+ messages in thread
From: William Lee Irwin III @ 2002-07-29 2:11 UTC (permalink / raw)
To: Andrew Morton; +Cc: Rik van Riel, Andrea Arcangeli, Linux Kernel
Rik van Riel wrote:
>> Together with the K42 people we found a way to avoid the
>> badnesses of an object-based VM.
On Sun, Jul 28, 2002 at 07:14:03PM -0700, Andrew Morton wrote:
> eek. Please let's not tie the delivery of the 2.6 kernel to
> the success of this R&D effort. We need reasonable-sized fixes, fast,
> for the current problems so that people who have feature work banked
> up can get going on it.
> Plus, staying close to the 2.4 rmap VM allows us to leverage the
> testing and experience which that has had, yes?
If this is the direction we're headed there are some tasks I won't
be able to get out of. I was ready for double and/or triple duty
anyway, though.
Cheers,
Bill
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 1:09 ` Rik van Riel
@ 2002-07-29 2:14 ` Andrew Morton
2002-07-29 2:11 ` William Lee Irwin III
2002-07-29 2:18 ` Rik van Riel
0 siblings, 2 replies; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 2:14 UTC (permalink / raw)
To: Rik van Riel; +Cc: Andrea Arcangeli, William Lee Irwin III, Linux Kernel
Rik van Riel wrote:
>
> On Mon, 29 Jul 2002, Andrea Arcangeli wrote:
>
> > if you look at DaveM first full rmap implementation it never had a
> > pte-chain. He used the same rmap logic we always hand in linux since the
> > first 2.1 kernel I looked at, to handle correctly truncate against
> > MAP_SHARED. Unfortunately that's not very efficient and requires some
> > metadata allocation for anonymous pages (that's the address space
> > pointer, anon pages regularly doesn't have a dedicated address space),
>
> Together with the K42 people we found a way to avoid the
> badnesses of an object-based VM.
>
eek. Please let's not tie the delivery of the 2.6 kernel to
the success of this R&D effort. We need reasonable-sized fixes, fast,
for the current problems so that people who have feature work banked
up can get going on it.
Plus, staying close to the 2.4 rmap VM allows us to leverage the
testing and experience which that has had, yes?
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 2:14 ` Andrew Morton
2002-07-29 2:11 ` William Lee Irwin III
@ 2002-07-29 2:18 ` Rik van Riel
1 sibling, 0 replies; 29+ messages in thread
From: Rik van Riel @ 2002-07-29 2:18 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andrea Arcangeli, William Lee Irwin III, Linux Kernel
On Sun, 28 Jul 2002, Andrew Morton wrote:
> > Together with the K42 people we found a way to avoid the
> > badnesses of an object-based VM.
>
> eek. Please let's not tie the delivery of the 2.6 kernel to the success
> of this R&D effort. We need reasonable-sized fixes, fast, for the
> current problems so that people who have feature work banked up can get
> going on it.
Fully agreed. We can go with the mechanisms we have now and
should only work on new mechanisms later.
I'm planning to keep the whole K42-style VM thing in design
stage at least until after the feature freeze...
(just so nobody gets tempted to sneak it into the kernel ;))
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 0:10 ` Andrew Morton
` (2 preceding siblings ...)
2002-07-29 0:56 ` William Lee Irwin III
@ 2002-07-29 9:27 ` Russell King
2002-07-29 18:32 ` Andrew Morton
3 siblings, 1 reply; 29+ messages in thread
From: Russell King @ 2002-07-29 9:27 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> - Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
Hmmmmmmm. page_address() is already 5 loads (on ARM) if page->virtual
isn't used. I'm seriously considering changing page_address() to cover
the 3 cases more efficiently:
1. non-discontiguous case (should be around 2 loads + math)
2. discontiguous case (currently 5 loads + lots of math)
3. weirder setups where loading page->virtual is faster
We currently ignore (1) completely, and just assume it's the same as (2).
--
Russell King (rmk@arm.linux.org.uk) The developer of ARM Linux
http://www.arm.linux.org.uk/personal/aboutme.html
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 9:27 ` Russell King
@ 2002-07-29 18:32 ` Andrew Morton
0 siblings, 0 replies; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 18:32 UTC (permalink / raw)
To: Russell King; +Cc: Linux Kernel
Russell King wrote:
>
> On Sun, Jul 28, 2002 at 05:10:48PM -0700, Andrew Morton wrote:
> > - Remove ->virtual, do page_address() via a hash. 4(ish) bytes saved.
>
> Hmmmmmmm. page_address() is already 5 loads (on ARM) if page->virtual
> isn't used. I'm seriously considering changing page_address() to cover
> the 3 cases more efficiently:
Well, one would want to keep the WANT_PAGE_VIRTUAL thing anyway.
btw, the usage of page_address() will quite possibly drop sharply soon
anyway. There's the patch floating about which permits atomic
kmaps to be held across copy_*_user. If that is adopted, things
like the pagecache IO routines won't do page_address() any more.
Said patch speeds up pagecache IO by between 0% and probably 30%.
It's the mystery surrounding this variation which is holding things
up.
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 2:05 ` Andrew Morton
2002-07-29 2:09 ` William Lee Irwin III
@ 2002-07-29 20:52 ` Andrea Arcangeli
2002-07-29 21:01 ` Andrew Morton
1 sibling, 1 reply; 29+ messages in thread
From: Andrea Arcangeli @ 2002-07-29 20:52 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> But yes, all of this is a straight speed/space tradeoff. Probably
> some of it should be ifdeffed.
I would say so. recalculating page_address in cpu core with no cacheline
access is one thing, deriving the index is a different thing.
> The cost of the tree walk doesn't worry me much - generally we
> walk the tree with good locality of reference, so most everything is
> in cache anyway.
well, the rbtree showedup heavily when it started growing more than a
few steps, it has less locality of reference though.
> Good luck setting up a testcase which does this ;)
a gigabit will trigger it in a millisecond. of course nobody tested it
either I guess (I guess not many people tested the 800Gbyte offset
either in the first place).
> Then again, Andi says that sizeof(struct page) is a problem for
> x86-64.
not true.
> No recursion needed, no allocations needed.
the 28 bytes if they're on the stack they're like recursion, just using
an iterative algorithm.
you're done with 28 bytes with a max 7/8 level tree, so 7*4 = 28 (4 size
of pointer/long). On a 32bit arch the max index supported is
2^32, on a 64bit arch the max index supported is
2^(64-PAGE_CACHE_SHIFT), plus each pointer is 8 bytes. You may want to
do the math to verify if you've enough stack to walk the tree in order,
it's not obvious.
> But I don't see any showstoppers here.
on a 32bit arch with 32bit index it seems ok to me too, on 64bit
somebody has to do the math and it's not really obvious to me that's
feasible.
Andrea
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 20:52 ` Andrea Arcangeli
@ 2002-07-29 21:01 ` Andrew Morton
2002-07-29 21:31 ` Andrea Arcangeli
0 siblings, 1 reply; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 21:01 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Linux Kernel
Andrea Arcangeli wrote:
>
> On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> > But yes, all of this is a straight speed/space tradeoff. Probably
> > some of it should be ifdeffed.
>
> I would say so. recalculating page_address in cpu core with no cacheline
> access is one thing, deriving the index is a different thing.
>
> > The cost of the tree walk doesn't worry me much - generally we
> > walk the tree with good locality of reference, so most everything is
> > in cache anyway.
>
> well, the rbtree showedup heavily when it started growing more than a
> few steps, it has less locality of reference though.
>
> > Good luck setting up a testcase which does this ;)
>
> a gigabit will trigger it in a millisecond. of course nobody tested it
> either I guess (I guess not many people tested the 800Gbyte offset
> either in the first place).
There's still the mempool.
We could perform a GFP_KERNEL replenishment of the ratnode mempool
after the page_cache_alloc(), and before taking any locks, if
that's needed.
> > Then again, Andi says that sizeof(struct page) is a problem for
> > x86-64.
>
> not true.
>
> > No recursion needed, no allocations needed.
>
> the 28 bytes if they're on the stack they're like recursion, just using
> an iterative algorithm.
>
> you're done with 28 bytes with a max 7/8 level tree, so 7*4 = 28 (4 size
> of pointer/long). On a 32bit arch the max index supported is
> 2^32, on a 64bit arch the max index supported is
> 2^(64-PAGE_CACHE_SHIFT), plus each pointer is 8 bytes. You may want to
> do the math to verify if you've enough stack to walk the tree in order,
> it's not obvious.
I make that 144 bytes of stack.
-
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 21:01 ` Andrew Morton
@ 2002-07-29 21:31 ` Andrea Arcangeli
2002-07-29 21:46 ` Andrew Morton
0 siblings, 1 reply; 29+ messages in thread
From: Andrea Arcangeli @ 2002-07-29 21:31 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
On Mon, Jul 29, 2002 at 02:01:15PM -0700, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> > > But yes, all of this is a straight speed/space tradeoff. Probably
> > > some of it should be ifdeffed.
> >
> > I would say so. recalculating page_address in cpu core with no cacheline
> > access is one thing, deriving the index is a different thing.
> >
> > > The cost of the tree walk doesn't worry me much - generally we
> > > walk the tree with good locality of reference, so most everything is
> > > in cache anyway.
> >
> > well, the rbtree showedup heavily when it started growing more than a
> > few steps, it has less locality of reference though.
> >
> > > Good luck setting up a testcase which does this ;)
> >
> > a gigabit will trigger it in a millisecond. of course nobody tested it
> > either I guess (I guess not many people tested the 800Gbyte offset
> > either in the first place).
>
> There's still the mempool.
that's hiding the problem at the moment, it's global, it doesn't provide
any real guarantee.
> We could perform a GFP_KERNEL replenishment of the ratnode mempool
> after the page_cache_alloc(), and before taking any locks, if
> that's needed.
one safe way to do it is to take the fail path, try to allocate with
GFP_KERNEL an object on the stack in the fail path, take all the locks
again and try again with the local ram watching if nobody raced. Just
doing a replenishment before entering the critical section it's not
enough, it's still in the "hiding" category if you then consider a
failure if the global mempool is empty at the time you need the atomic
ram.
>
> > > Then again, Andi says that sizeof(struct page) is a problem for
> > > x86-64.
> >
> > not true.
> >
> > > No recursion needed, no allocations needed.
> >
> > the 28 bytes if they're on the stack they're like recursion, just using
> > an iterative algorithm.
> >
> > you're done with 28 bytes with a max 7/8 level tree, so 7*4 = 28 (4 size
> > of pointer/long). On a 32bit arch the max index supported is
> > 2^32, on a 64bit arch the max index supported is
> > 2^(64-PAGE_CACHE_SHIFT), plus each pointer is 8 bytes. You may want to
> > do the math to verify if you've enough stack to walk the tree in order,
> > it's not obvious.
>
> I make that 144 bytes of stack.
so it's not too bad in terms of stack because there's not going to be
more than one walk at time, thanks for doing the math btw. You'd
basically need a second radix tree for the dirty pages (using the same
radix tree is not an option because it would increase pdflush complexity
too much with terabytes of clean pages in the tree).
Andrea
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 21:31 ` Andrea Arcangeli
@ 2002-07-29 21:46 ` Andrew Morton
2002-07-29 22:18 ` Andrea Arcangeli
0 siblings, 1 reply; 29+ messages in thread
From: Andrew Morton @ 2002-07-29 21:46 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Linux Kernel
Andrea Arcangeli wrote:
>
> On Mon, Jul 29, 2002 at 02:01:15PM -0700, Andrew Morton wrote:
> > Andrea Arcangeli wrote:
> > >
> > > On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> > > > But yes, all of this is a straight speed/space tradeoff. Probably
> > > > some of it should be ifdeffed.
> > >
> > > I would say so. recalculating page_address in cpu core with no cacheline
> > > access is one thing, deriving the index is a different thing.
> > >
> > > > The cost of the tree walk doesn't worry me much - generally we
> > > > walk the tree with good locality of reference, so most everything is
> > > > in cache anyway.
> > >
> > > well, the rbtree showedup heavily when it started growing more than a
> > > few steps, it has less locality of reference though.
> > >
> > > > Good luck setting up a testcase which does this ;)
> > >
> > > a gigabit will trigger it in a millisecond. of course nobody tested it
> > > either I guess (I guess not many people tested the 800Gbyte offset
> > > either in the first place).
> >
> > There's still the mempool.
>
> that's hiding the problem at the moment, it's global, it doesn't provide
> any real guarantee.
Sizing the mempool to max_cpus * max tree depth provides a guarantee,
provided you take care of context switches, which is pretty easy.
> ...
>
> so it's not too bad in terms of stack because there's not going to be
> more than one walk at time, thanks for doing the math btw. You'd
> basically need a second radix tree for the dirty pages (using the same
> radix tree is not an option because it would increase pdflush complexity
> too much with terabytes of clean pages in the tree).
Not sure. If each ratnode has a 64-bit bitmap which represents
dirty pages if it's a leaf node, or nodes which have dirty pages
if it's a higher node then the "find the next 16 dirty pages above index
N" is a pretty efficient thing.
Tricky to code though.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-29 21:46 ` Andrew Morton
@ 2002-07-29 22:18 ` Andrea Arcangeli
0 siblings, 0 replies; 29+ messages in thread
From: Andrea Arcangeli @ 2002-07-29 22:18 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel
On Mon, Jul 29, 2002 at 02:46:07PM -0700, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > On Mon, Jul 29, 2002 at 02:01:15PM -0700, Andrew Morton wrote:
> > > Andrea Arcangeli wrote:
> > > >
> > > > On Sun, Jul 28, 2002 at 07:05:19PM -0700, Andrew Morton wrote:
> > > > > But yes, all of this is a straight speed/space tradeoff. Probably
> > > > > some of it should be ifdeffed.
> > > >
> > > > I would say so. recalculating page_address in cpu core with no cacheline
> > > > access is one thing, deriving the index is a different thing.
> > > >
> > > > > The cost of the tree walk doesn't worry me much - generally we
> > > > > walk the tree with good locality of reference, so most everything is
> > > > > in cache anyway.
> > > >
> > > > well, the rbtree showedup heavily when it started growing more than a
> > > > few steps, it has less locality of reference though.
> > > >
> > > > > Good luck setting up a testcase which does this ;)
> > > >
> > > > a gigabit will trigger it in a millisecond. of course nobody tested it
> > > > either I guess (I guess not many people tested the 800Gbyte offset
> > > > either in the first place).
> > >
> > > There's still the mempool.
> >
> > that's hiding the problem at the moment, it's global, it doesn't provide
> > any real guarantee.
>
> Sizing the mempool to max_cpus * max tree depth provides a guarantee,
> provided you take care of context switches, which is pretty easy.
I guess I still prefer the GFP_KERNEL fallback because it avoids to
waste/reserve lots of ram, but I only care about correctness, the
current code isn't correct, doing max_cpus * max tree depth would
satisfy me completely too (saving ram is a lower prio), so it's up to
you as far as it cannot fail unless it's truly oom (i.e. you need a
GFP_KERNEL in your way).
>
> > ...
> >
> > so it's not too bad in terms of stack because there's not going to be
> > more than one walk at time, thanks for doing the math btw. You'd
> > basically need a second radix tree for the dirty pages (using the same
> > radix tree is not an option because it would increase pdflush complexity
> > too much with terabytes of clean pages in the tree).
>
> Not sure. If each ratnode has a 64-bit bitmap which represents
> dirty pages if it's a leaf node, or nodes which have dirty pages
> if it's a higher node then the "find the next 16 dirty pages above index
> N" is a pretty efficient thing.
You will have """only""" 18 layers, but scanning through 2**(6*18)
entries will take too long even if each entry takes only 1 nanosecond to
scan. Of course that's the extreme case, but still it should be too much
in practice. I doubt you can avoid at least an additional infrastructure
that tells you if any of the underlying ratnodes has any dirty page,
which will probably save ram at least because it can be coded as a
bitflag in each node, but that will force you an up-walk of the tree
every time you mark a page dirty (but of course also a second tree would
force you to do some tree work every time you mark a page dirty/clean). The
second tree probably allows you not to go into the radix-tree
implementation details to provide the "underlying node dirty page" info,
and it would be faster if for example only the start of the inode has
dirty pages, that would allow the dirty page flushing to walk only a few
levels instead of potential 18 of them even to reach the first few
pages. But I don't think it's a common case, so probably the best
(but not simpler) approach is to mark each ratnode with a dirty
cumulative information.
Andrea
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
[not found] ` <20020729205211.GB1201@dualathlon.random.suse.lists.linux.kernel>
@ 2002-07-30 13:44 ` Andi Kleen
2002-07-30 14:06 ` Rik van Riel
0 siblings, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2002-07-30 13:44 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: linux-kernel, akpm
Andrea Arcangeli <andrea@suse.de> writes:
> > Then again, Andi says that sizeof(struct page) is a problem for
> > x86-64.
>
> not true.
x86-64 has slightly below 100 bytes struct page
Big struct page eats your cache like crazy for many operations.
In addition it costs a considerable amount of memory.
Of course it is not a showstopper because there is no resource to run
out of too quickly, but still needs attention as an important optimization
(either smaller struct page or bigger softpage size)
Of course longer term bigger softpage size is the best solution - that
would make the >16GB i386 people happy too and avoid overhead on big memory
systems both 32bit and 64bit.. Unfortunately there are some
problems with the ELF alignment and the mmap API with it, which may be
not easy to solve.
-Andi
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN}
2002-07-30 13:44 ` Andi Kleen
@ 2002-07-30 14:06 ` Rik van Riel
0 siblings, 0 replies; 29+ messages in thread
From: Rik van Riel @ 2002-07-30 14:06 UTC (permalink / raw)
To: Andi Kleen; +Cc: Andrea Arcangeli, linux-kernel, akpm
On 30 Jul 2002, Andi Kleen wrote:
> Andrea Arcangeli <andrea@suse.de> writes:
>
> > > Then again, Andi says that sizeof(struct page) is a problem for
> > > x86-64.
> >
> > not true.
>
> x86-64 has slightly below 100 bytes struct page
We really need to look at replacing the page pointers with
page frame numbers and packing the 32-bit variables together
to save some memory on 64-bit architectures.
Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"
http://www.surriel.com/ http://distro.conectiva.com/
^ permalink raw reply [flat|nested] 29+ messages in thread
end of thread, other threads:[~2002-07-30 14:03 UTC | newest]
Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-07-27 13:41 [BK PATCH 2.5] Introduce 64-bit versions of PAGE_{CACHE_,}{MASK,ALIGN} Anton Altaparmakov
2002-07-27 17:23 ` Andrew Morton
2002-07-28 17:53 ` Eric W. Biederman
2002-07-28 18:54 ` Anton Altaparmakov
2002-07-28 20:12 ` Eric W. Biederman
2002-07-28 23:26 ` Linus Torvalds
2002-07-29 0:10 ` Andrew Morton
2002-07-29 0:43 ` William Lee Irwin III
2002-07-29 0:56 ` Andrea Arcangeli
2002-07-29 1:04 ` William Lee Irwin III
2002-07-29 1:09 ` Rik van Riel
2002-07-29 2:14 ` Andrew Morton
2002-07-29 2:11 ` William Lee Irwin III
2002-07-29 2:18 ` Rik van Riel
2002-07-29 0:49 ` Andrea Arcangeli
2002-07-29 2:05 ` Andrew Morton
2002-07-29 2:09 ` William Lee Irwin III
2002-07-29 20:52 ` Andrea Arcangeli
2002-07-29 21:01 ` Andrew Morton
2002-07-29 21:31 ` Andrea Arcangeli
2002-07-29 21:46 ` Andrew Morton
2002-07-29 22:18 ` Andrea Arcangeli
2002-07-29 0:56 ` William Lee Irwin III
2002-07-29 1:36 ` Andrew Morton
2002-07-29 1:37 ` William Lee Irwin III
2002-07-29 9:27 ` Russell King
2002-07-29 18:32 ` Andrew Morton
[not found] <5.1.0.14.2.20020728193528.04336a80@pop.cus.cam.ac.uk.suse.lists.linux.kernel>
[not found] ` <Pine.LNX.4.44.0207281622350.8208-100000@home.transmeta.com.suse.lists.linux.kernel>
[not found] ` <3D448808.CF8D18BA@zip.com.au.suse.lists.linux.kernel>
[not found] ` <20020729004942.GL1201@dualathlon.random.suse.lists.linux.kernel>
[not found] ` <3D44A2DF.F751B564@zip.com.au.suse.lists.linux.kernel>
[not found] ` <20020729205211.GB1201@dualathlon.random.suse.lists.linux.kernel>
2002-07-30 13:44 ` Andi Kleen
2002-07-30 14:06 ` Rik van Riel
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox