[RFC PATCH 1/2] powerpc: Remove duplicate cacheable_memcpy/memzero functions
From: Kyle Moffett
Date: 2011-11-15 2:32 UTC
To: linuxppc-dev
Cc: linux-kernel, benh, galak, scottwood, B04825, paul.gortmaker,
Kyle Moffett, Paul Mackerras, Andrew Morton, Milton Miller,
Mike Frysinger, Oleg Nesterov, Anton Blanchard, David S. Miller,
Ian Campbell, Eric Dumazet, Jeff Kirsher, Jiri Pirko,
linuxppc-dev, netdev
These functions are each used from only one place. If the cacheable_*
versions really are more efficient, then those optimizations should be
migrated into the common code instead.

NOTE: The old routines are flat-out buggy on kernels built to support
hardware with differing cacheline sizes.
Signed-off-by: Kyle Moffett <Kyle.D.Moffett@boeing.com>
---
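To illustrate the NOTE above, here is a rough, hypothetical C sketch of
the removed routines' inner loop (helper names are made up for the
illustration; the real code is the copy_32.S assembly removed below).
The loop stride is baked in at build time, but dcbz always operates on
one hardware cacheline, whatever size the running CPU actually has:

	/* Hypothetical sketch only; alignment prologue and tail elided. */
	static inline void dcbz(void *p)
	{
		/* dcbz zeroes the *hardware* cacheline containing p */
		asm volatile("dcbz 0,%0" : : "r" (p) : "memory");
	}

	static void sketch_cacheable_memzero_lines(char *p, unsigned int nlines)
	{
		while (nlines--) {
			dcbz(p);		/* zeroes the CPU's real line size  */
			p += L1_CACHE_BYTES;	/* advances by the build-time size  */
		}
	}

	/* If the CPU's real cacheline is smaller than L1_CACHE_BYTES, each
	 * iteration leaves a stale, never-zeroed tail behind; if it is
	 * larger, dcbz clobbers bytes outside the intended buffer. */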
arch/powerpc/include/asm/system.h | 2 -
arch/powerpc/kernel/ppc_ksyms.c | 2 -
arch/powerpc/lib/copy_32.S | 127 ----------------------------------
arch/powerpc/mm/ppc_mmu_32.c | 2 +-
drivers/net/ethernet/ibm/emac/core.c | 12 +---
5 files changed, 3 insertions(+), 142 deletions(-)
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index e30a13d..25389d1 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -189,8 +189,6 @@ static inline void flush_spe_to_thread(struct task_struct *t)
#endif
extern int call_rtas(const char *, int, int, unsigned long *, ...);
-extern void cacheable_memzero(void *p, unsigned int nb);
-extern void *cacheable_memcpy(void *, const void *, unsigned int);
extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
extern void bad_page_fault(struct pt_regs *, unsigned long, int);
extern int die(const char *, struct pt_regs *, long);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index d3114a7..acba8ce 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -159,8 +159,6 @@ EXPORT_SYMBOL(screen_info);
#ifdef CONFIG_PPC32
EXPORT_SYMBOL(timer_interrupt);
EXPORT_SYMBOL(tb_ticks_per_jiffy);
-EXPORT_SYMBOL(cacheable_memcpy);
-EXPORT_SYMBOL(cacheable_memzero);
#endif
#ifdef CONFIG_PPC32
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index 55f19f9..6813f80 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -69,54 +69,6 @@ CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
-/*
- * Use dcbz on the complete cache lines in the destination
- * to set them to zero. This requires that the destination
- * area is cacheable. -- paulus
- */
-_GLOBAL(cacheable_memzero)
- mr r5,r4
- li r4,0
- addi r6,r3,-4
- cmplwi 0,r5,4
- blt 7f
- stwu r4,4(r6)
- beqlr
- andi. r0,r6,3
- add r5,r0,r5
- subf r6,r0,r6
- clrlwi r7,r6,32-LG_CACHELINE_BYTES
- add r8,r7,r5
- srwi r9,r8,LG_CACHELINE_BYTES
- addic. r9,r9,-1 /* total number of complete cachelines */
- ble 2f
- xori r0,r7,CACHELINE_MASK & ~3
- srwi. r0,r0,2
- beq 3f
- mtctr r0
-4: stwu r4,4(r6)
- bdnz 4b
-3: mtctr r9
- li r7,4
-10: dcbz r7,r6
- addi r6,r6,CACHELINE_BYTES
- bdnz 10b
- clrlwi r5,r8,32-LG_CACHELINE_BYTES
- addi r5,r5,4
-2: srwi r0,r5,2
- mtctr r0
- bdz 6f
-1: stwu r4,4(r6)
- bdnz 1b
-6: andi. r5,r5,3
-7: cmpwi 0,r5,0
- beqlr
- mtctr r5
- addi r6,r6,3
-8: stbu r4,1(r6)
- bdnz 8b
- blr
-
_GLOBAL(memset)
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
@@ -142,85 +94,6 @@ _GLOBAL(memset)
bdnz 8b
blr
-/*
- * This version uses dcbz on the complete cache lines in the
- * destination area to reduce memory traffic. This requires that
- * the destination area is cacheable.
- * We only use this version if the source and dest don't overlap.
- * -- paulus.
- */
-_GLOBAL(cacheable_memcpy)
- add r7,r3,r5 /* test if the src & dst overlap */
- add r8,r4,r5
- cmplw 0,r4,r7
- cmplw 1,r3,r8
- crand 0,0,4 /* cr0.lt &= cr1.lt */
- blt memcpy /* if regions overlap */
-
- addi r4,r4,-4
- addi r6,r3,-4
- neg r0,r3
- andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
- beq 58f
-
- cmplw 0,r5,r0 /* is this more than total to do? */
- blt 63f /* if not much to do */
- andi. r8,r0,3 /* get it word-aligned first */
- subf r5,r0,r5
- mtctr r8
- beq+ 61f
-70: lbz r9,4(r4) /* do some bytes */
- stb r9,4(r6)
- addi r4,r4,1
- addi r6,r6,1
- bdnz 70b
-61: srwi. r0,r0,2
- mtctr r0
- beq 58f
-72: lwzu r9,4(r4) /* do some words */
- stwu r9,4(r6)
- bdnz 72b
-
-58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
- clrlwi r5,r5,32-LG_CACHELINE_BYTES
- li r11,4
- mtctr r0
- beq 63f
-53:
- dcbz r11,r6
- COPY_16_BYTES
-#if L1_CACHE_BYTES >= 32
- COPY_16_BYTES
-#if L1_CACHE_BYTES >= 64
- COPY_16_BYTES
- COPY_16_BYTES
-#if L1_CACHE_BYTES >= 128
- COPY_16_BYTES
- COPY_16_BYTES
- COPY_16_BYTES
- COPY_16_BYTES
-#endif
-#endif
-#endif
- bdnz 53b
-
-63: srwi. r0,r5,2
- mtctr r0
- beq 64f
-30: lwzu r0,4(r4)
- stwu r0,4(r6)
- bdnz 30b
-
-64: andi. r0,r5,3
- mtctr r0
- beq+ 65f
-40: lbz r0,4(r4)
- stb r0,4(r6)
- addi r4,r4,1
- addi r6,r6,1
- bdnz 40b
-65: blr
-
_GLOBAL(memmove)
cmplw 0,r3,r4
bgt backwards_memcpy
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 11571e1..9f16b9f 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -224,7 +224,7 @@ void __init MMU_init_hw(void)
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
Hash = __va(memblock_alloc(Hash_size, Hash_size));
- cacheable_memzero(Hash, Hash_size);
+ memset(Hash, 0, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index ed79b2d..be214ad 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -77,13 +77,6 @@ MODULE_AUTHOR
("Eugene Surovegin <eugene.surovegin@zultys.com> or <ebs@ebshome.net>");
MODULE_LICENSE("GPL");
-/*
- * PPC64 doesn't (yet) have a cacheable_memcpy
- */
-#ifdef CONFIG_PPC64
-#define cacheable_memcpy(d,s,n) memcpy((d),(s),(n))
-#endif
-
/* minimum number of free TX descriptors required to wake up TX process */
#define EMAC_TX_WAKEUP_THRESH (NUM_TX_BUFF / 4)
@@ -1637,7 +1630,7 @@ static inline int emac_rx_sg_append(struct emac_instance *dev, int slot)
dev_kfree_skb(dev->rx_sg_skb);
dev->rx_sg_skb = NULL;
} else {
- cacheable_memcpy(skb_tail_pointer(dev->rx_sg_skb),
+ memcpy(skb_tail_pointer(dev->rx_sg_skb),
dev->rx_skb[slot]->data, len);
skb_put(dev->rx_sg_skb, len);
emac_recycle_rx_skb(dev, slot, len);
@@ -1694,8 +1687,7 @@ static int emac_poll_rx(void *param, int budget)
goto oom;
skb_reserve(copy_skb, EMAC_RX_SKB_HEADROOM + 2);
- cacheable_memcpy(copy_skb->data - 2, skb->data - 2,
- len + 2);
+ memcpy(copy_skb->data - 2, skb->data - 2, len + 2);
emac_recycle_rx_skb(dev, slot, len);
skb = copy_skb;
} else if (unlikely(emac_alloc_rx_skb(dev, slot, GFP_ATOMIC)))
--
1.7.2.5
Re: [RFC PATCH 1/2] powerpc: Remove duplicate cacheable_memcpy/memzero functions
From: Benjamin Herrenschmidt
Date: 2011-11-15 22:31 UTC
To: Kyle Moffett
Cc: Mike Frysinger, Ian Campbell, Eric Dumazet, Jiri Pirko, netdev,
B04825, linux-kernel, Milton Miller, paul.gortmaker,
Paul Mackerras, Anton Blanchard, Oleg Nesterov, scottwood,
Andrew Morton, linuxppc-dev, David S. Miller, Jeff Kirsher
On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
> These functions are each used from only one place. If the cacheable_*
> versions really are more efficient, then those optimizations should be
> migrated into the common code instead.
>
> NOTE: The old routines are flat-out buggy on kernels built to support
> hardware with differing cacheline sizes.
>
> Signed-off-by: Kyle Moffett <Kyle.D.Moffett@boeing.com>
> ---
Right, considering where those are used, I think we can safely remove
them. Thanks.
Ben.