* [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-19 9:14 ` David Laight
2014-05-19 22:00 ` Rustad, Mark D
2014-05-16 21:43 ` [PATCH 2/8] radeonfb: Out of line errata workarounds Andi Kleen
` (6 subsequent siblings)
7 siblings, 2 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen, netdev, Jeff Kirsher
From: Andi Kleen <ak@linux.intel.com>
ixgbe_read_reg and ixgbe_write_reg are frequently called and are very big
because they have complex error handling code.
Moving them out of line saves ~27k text in the ixgbe driver.
text data bss dec hex filename
14220873 2008072 1507328 17736273 10ea251 vmlinux-before-ixgbe
14193673 2003976 1507328 17704977 10e2811 vmlinux-ixgbe
Cc: netdev@vger.kernel.org
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe_common.h | 22 ++--------------------
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 22 ++++++++++++++++++++++
2 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
index f12c40f..05f094d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
@@ -162,28 +162,10 @@ static inline void writeq(u64 val, void __iomem *addr)
}
#endif
-static inline void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
-{
- u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value);
+u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg);
- if (ixgbe_removed(reg_addr))
- return;
- writeq(value, reg_addr + reg);
-}
#define IXGBE_WRITE_REG64(a, reg, value) ixgbe_write_reg64((a), (reg), (value))
-
-static inline u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
-{
- u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
- u32 value;
-
- if (ixgbe_removed(reg_addr))
- return IXGBE_FAILED_READ_REG;
- value = readl(reg_addr + reg);
- if (unlikely(value == IXGBE_FAILED_READ_REG))
- ixgbe_check_remove(hw, reg);
- return value;
-}
#define IXGBE_READ_REG(a, reg) ixgbe_read_reg((a), (reg))
#define IXGBE_WRITE_REG_ARRAY(a, reg, offset, value) \
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d62e7a2..5f81f62 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -371,6 +371,28 @@ void ixgbe_write_pci_cfg_word(struct ixgbe_hw *hw, u32 reg, u16 value)
pci_write_config_word(adapter->pdev, reg, value);
}
+void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
+{
+ u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+
+ if (ixgbe_removed(reg_addr))
+ return;
+ writeq(value, reg_addr + reg);
+}
+
+u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
+{
+ u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+ u32 value;
+
+ if (ixgbe_removed(reg_addr))
+ return IXGBE_FAILED_READ_REG;
+ value = readl(reg_addr + reg);
+ if (unlikely(value == IXGBE_FAILED_READ_REG))
+ ixgbe_check_remove(hw, reg);
+ return value;
+}
+
static void ixgbe_service_event_complete(struct ixgbe_adapter *adapter)
{
BUG_ON(!test_bit(__IXGBE_SERVICE_SCHED, &adapter->state));
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* RE: [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg
2014-05-16 21:43 ` [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg Andi Kleen
@ 2014-05-19 9:14 ` David Laight
2014-05-19 22:00 ` Rustad, Mark D
1 sibling, 0 replies; 23+ messages in thread
From: David Laight @ 2014-05-19 9:14 UTC (permalink / raw)
To: 'Andi Kleen', linux-kernel@vger.kernel.org
Cc: akpm@linux-foundation.org, Andi Kleen, netdev@vger.kernel.org,
Jeff Kirsher
From: Andi Kleen
> ixgbe_read_reg and ixgbe_write_reg are frequently called and are very big
> because they have complex error handling code.
Have you measured the performance impact?
I suspect that it might me measurable.
Clearly the calls during initialisation don't need to be inline,
but there will be some in the normal tx and rx paths.
David
> Moving them out of line saves ~27k text in the ixgbe driver.
>
> text data bss dec hex filename
> 14220873 2008072 1507328 17736273 10ea251 vmlinux-before-ixgbe
> 14193673 2003976 1507328 17704977 10e2811 vmlinux-ixgbe
>
> Cc: netdev@vger.kernel.org
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
> drivers/net/ethernet/intel/ixgbe/ixgbe_common.h | 22 ++--------------------
> drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 22 ++++++++++++++++++++++
> 2 files changed, 24 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> index f12c40f..05f094d 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> @@ -162,28 +162,10 @@ static inline void writeq(u64 val, void __iomem *addr)
> }
> #endif
>
> -static inline void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
> -{
> - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> +void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value);
> +u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg);
>
> - if (ixgbe_removed(reg_addr))
> - return;
> - writeq(value, reg_addr + reg);
> -}
> #define IXGBE_WRITE_REG64(a, reg, value) ixgbe_write_reg64((a), (reg), (value))
> -
> -static inline u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
> -{
> - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> - u32 value;
> -
> - if (ixgbe_removed(reg_addr))
> - return IXGBE_FAILED_READ_REG;
> - value = readl(reg_addr + reg);
> - if (unlikely(value == IXGBE_FAILED_READ_REG))
> - ixgbe_check_remove(hw, reg);
> - return value;
> -}
> #define IXGBE_READ_REG(a, reg) ixgbe_read_reg((a), (reg))
>
> #define IXGBE_WRITE_REG_ARRAY(a, reg, offset, value) \
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index d62e7a2..5f81f62 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -371,6 +371,28 @@ void ixgbe_write_pci_cfg_word(struct ixgbe_hw *hw, u32 reg, u16 value)
> pci_write_config_word(adapter->pdev, reg, value);
> }
>
> +void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
> +{
> + u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> +
> + if (ixgbe_removed(reg_addr))
> + return;
> + writeq(value, reg_addr + reg);
> +}
> +
> +u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
> +{
> + u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> + u32 value;
> +
> + if (ixgbe_removed(reg_addr))
> + return IXGBE_FAILED_READ_REG;
> + value = readl(reg_addr + reg);
> + if (unlikely(value == IXGBE_FAILED_READ_REG))
> + ixgbe_check_remove(hw, reg);
> + return value;
> +}
> +
> static void ixgbe_service_event_complete(struct ixgbe_adapter *adapter)
> {
> BUG_ON(!test_bit(__IXGBE_SERVICE_SCHED, &adapter->state));
> --
> 1.9.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg
2014-05-16 21:43 ` [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg Andi Kleen
2014-05-19 9:14 ` David Laight
@ 2014-05-19 22:00 ` Rustad, Mark D
2014-05-19 23:25 ` Andi Kleen
1 sibling, 1 reply; 23+ messages in thread
From: Rustad, Mark D @ 2014-05-19 22:00 UTC (permalink / raw)
To: Andi Kleen, Kirsher, Jeffrey T
Cc: <linux-kernel@vger.kernel.org>, akpm@linux-foundation.org,
Andi Kleen, Netdev
On May 16, 2014, at 2:43 PM, Andi Kleen <andi@firstfloor.org> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> ixgbe_read_reg and ixgbe_write_reg are frequently called and are very big
> because they have complex error handling code.
Actually, this patch doesn't do anything to ixgbe_write_reg, which would almost certainly be very bad for performance, but instead changes ixgbe_write_reg64. The latter is not in a performance-sensitive path, but is only called from one site, so there is little reason to take it out-of-line.
I already have a patch in queue to make ixgbe_read_reg out-of-line, because it does have a very costly memory footprint inline, as you have found.
> Moving them out of line saves ~27k text in the ixgbe driver.
>
> text data bss dec hex filename
> 14220873 2008072 1507328 17736273 10ea251 vmlinux-before-ixgbe
> 14193673 2003976 1507328 17704977 10e2811 vmlinux-ixgbe
>
> Cc: netdev@vger.kernel.org
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
> drivers/net/ethernet/intel/ixgbe/ixgbe_common.h | 22 ++--------------------
> drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 22 ++++++++++++++++++++++
> 2 files changed, 24 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> index f12c40f..05f094d 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
> @@ -162,28 +162,10 @@ static inline void writeq(u64 val, void __iomem *addr)
> }
> #endif
>
> -static inline void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
> -{
> - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> +void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value);
> +u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg);
>
> - if (ixgbe_removed(reg_addr))
> - return;
> - writeq(value, reg_addr + reg);
> -}
> #define IXGBE_WRITE_REG64(a, reg, value) ixgbe_write_reg64((a), (reg), (value))
> -
> -static inline u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
> -{
> - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> - u32 value;
> -
> - if (ixgbe_removed(reg_addr))
> - return IXGBE_FAILED_READ_REG;
> - value = readl(reg_addr + reg);
> - if (unlikely(value == IXGBE_FAILED_READ_REG))
> - ixgbe_check_remove(hw, reg);
> - return value;
> -}
> #define IXGBE_READ_REG(a, reg) ixgbe_read_reg((a), (reg))
>
> #define IXGBE_WRITE_REG_ARRAY(a, reg, offset, value) \
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index d62e7a2..5f81f62 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -371,6 +371,28 @@ void ixgbe_write_pci_cfg_word(struct ixgbe_hw *hw, u32 reg, u16 value)
> pci_write_config_word(adapter->pdev, reg, value);
> }
>
> +void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
> +{
> + u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> +
> + if (ixgbe_removed(reg_addr))
> + return;
> + writeq(value, reg_addr + reg);
> +}
> +
> +u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
> +{
> + u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
> + u32 value;
> +
> + if (ixgbe_removed(reg_addr))
> + return IXGBE_FAILED_READ_REG;
> + value = readl(reg_addr + reg);
> + if (unlikely(value == IXGBE_FAILED_READ_REG))
> + ixgbe_check_remove(hw, reg);
> + return value;
> +}
> +
> static void ixgbe_service_event_complete(struct ixgbe_adapter *adapter)
> {
> BUG_ON(!test_bit(__IXGBE_SERVICE_SCHED, &adapter->state));
> --
> 1.9.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Mark Rustad, Networking Division, Intel Corporation
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg
2014-05-19 22:00 ` Rustad, Mark D
@ 2014-05-19 23:25 ` Andi Kleen
2014-05-20 17:06 ` Rustad, Mark D
0 siblings, 1 reply; 23+ messages in thread
From: Andi Kleen @ 2014-05-19 23:25 UTC (permalink / raw)
To: Rustad, Mark D
Cc: Andi Kleen, Kirsher, Jeffrey T,
<linux-kernel@vger.kernel.org>, akpm@linux-foundation.org,
Netdev
On Mon, May 19, 2014 at 10:00:52PM +0000, Rustad, Mark D wrote:
> On May 16, 2014, at 2:43 PM, Andi Kleen <andi@firstfloor.org> wrote:
>
> > From: Andi Kleen <ak@linux.intel.com>
> >
> > ixgbe_read_reg and ixgbe_write_reg are frequently called and are very big
> > because they have complex error handling code.
>
> Actually, this patch doesn't do anything to ixgbe_write_reg, which would almost certainly be very bad for performance, but instead changes ixgbe_write_reg64.
I doubt a few cycles around the write make a lot of difference for MMIO. MMIO is dominated
by other things.
> The latter is not in a performance-sensitive path, but is only called from one site, so there is little reason to take it out-of-line.
True I moved the wrong one.
ixgbe_write_reg 3305 (0.00%) 8 409
> I already have a patch in queue to make ixgbe_read_reg out-of-line, because it does have a very costly memory footprint inline, as you have found.
Please move write_reg too.
-Andi
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg
2014-05-19 23:25 ` Andi Kleen
@ 2014-05-20 17:06 ` Rustad, Mark D
0 siblings, 0 replies; 23+ messages in thread
From: Rustad, Mark D @ 2014-05-20 17:06 UTC (permalink / raw)
To: Andi Kleen
Cc: Andi Kleen, Kirsher, Jeffrey T,
<linux-kernel@vger.kernel.org>, akpm@linux-foundation.org,
Netdev
On May 19, 2014, at 4:25 PM, Andi Kleen <ak@linux.intel.com> wrote:
> On Mon, May 19, 2014 at 10:00:52PM +0000, Rustad, Mark D wrote:
>> On May 16, 2014, at 2:43 PM, Andi Kleen <andi@firstfloor.org> wrote:
>>
>>> From: Andi Kleen <ak@linux.intel.com>
>>>
>>> ixgbe_read_reg and ixgbe_write_reg are frequently called and are very big
>>> because they have complex error handling code.
>>
>> Actually, this patch doesn't do anything to ixgbe_write_reg, which would almost certainly be very bad for performance, but instead changes ixgbe_write_reg64.
>
> I doubt a few cycles around the write make a lot of difference for MMIO. MMIO is dominated
> by other things.
>
>> The latter is not in a performance-sensitive path, but is only called from one site, so there is little reason to take it out-of-line.
>
> True I moved the wrong one.
>
> ixgbe_write_reg 3305 (0.00%) 8 409
>
>
>> I already have a patch in queue to make ixgbe_read_reg out-of-line, because it does have a very costly memory footprint inline, as you have found.
>
> Please move write_reg too.
I will take a look at moving most of them out-of-line. There are just a few in very hot paths that should remain inline.
--
Mark Rustad, Networking Division, Intel Corporation
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 2/8] radeonfb: Out of line errata workarounds
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
2014-05-16 21:43 ` [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-16 21:43 ` [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del Andi Kleen
` (5 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen, Benjamin Herrenschmidt, linux-fbdev
From: Andi Kleen <ak@linux.intel.com>
Out of lining _radeon_msleep and radeon_pll_errata_* saves about 40k text.
14193673 2003976 1507328 17704977 10e2811 vmlinux-before-radeon
14152713 2003976 1507328 17664017 10d8811 vmlinux-radeon
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: linux-fbdev@vger.kernel.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
drivers/video/fbdev/aty/radeon_base.c | 57 ++++++++++++++++++++++++++++++++++
drivers/video/fbdev/aty/radeonfb.h | 58 ++---------------------------------
2 files changed, 60 insertions(+), 55 deletions(-)
diff --git a/drivers/video/fbdev/aty/radeon_base.c b/drivers/video/fbdev/aty/radeon_base.c
index 26d80a4..abd89a9 100644
--- a/drivers/video/fbdev/aty/radeon_base.c
+++ b/drivers/video/fbdev/aty/radeon_base.c
@@ -282,6 +282,63 @@ static int backlight = 1;
static int backlight = 0;
#endif
+/* Note about this function: we have some rare cases where we must not schedule,
+ * this typically happen with our special "wake up early" hook which allows us to
+ * wake up the graphic chip (and thus get the console back) before everything else
+ * on some machines that support that mechanism. At this point, interrupts are off
+ * and scheduling is not permitted
+ */
+void _radeon_msleep(struct radeonfb_info *rinfo, unsigned long ms)
+{
+ if (rinfo->no_schedule || oops_in_progress)
+ mdelay(ms);
+ else
+ msleep(ms);
+}
+
+/*
+ * Note about PLL register accesses:
+ *
+ * I have removed the spinlock on them on purpose. The driver now
+ * expects that it will only manipulate the PLL registers in normal
+ * task environment, where radeon_msleep() will be called, protected
+ * by a semaphore (currently the console semaphore) so that no conflict
+ * will happen on the PLL register index.
+ *
+ * With the latest changes to the VT layer, this is guaranteed for all
+ * calls except the actual drawing/blits which aren't supposed to use
+ * the PLL registers anyway
+ *
+ * This is very important for the workarounds to work properly. The only
+ * possible exception to this rule is the call to unblank(), which may
+ * be done at irq time if an oops is in progress.
+ */
+void radeon_pll_errata_after_index(struct radeonfb_info *rinfo)
+{
+ if (!(rinfo->errata & CHIP_ERRATA_PLL_DUMMYREADS))
+ return;
+
+ (void)INREG(CLOCK_CNTL_DATA);
+ (void)INREG(CRTC_GEN_CNTL);
+}
+
+void radeon_pll_errata_after_data(struct radeonfb_info *rinfo)
+{
+ if (rinfo->errata & CHIP_ERRATA_PLL_DELAY) {
+ /* we can't deal with posted writes here ... */
+ _radeon_msleep(rinfo, 5);
+ }
+ if (rinfo->errata & CHIP_ERRATA_R300_CG) {
+ u32 save, tmp;
+ save = INREG(CLOCK_CNTL_INDEX);
+ tmp = save & ~(0x3f | PLL_WR_EN);
+ OUTREG(CLOCK_CNTL_INDEX, tmp);
+ tmp = INREG(CLOCK_CNTL_DATA);
+ OUTREG(CLOCK_CNTL_INDEX, save);
+ }
+}
+
+
/*
* prototypes
*/
diff --git a/drivers/video/fbdev/aty/radeonfb.h b/drivers/video/fbdev/aty/radeonfb.h
index cb84604..bb73446 100644
--- a/drivers/video/fbdev/aty/radeonfb.h
+++ b/drivers/video/fbdev/aty/radeonfb.h
@@ -370,20 +370,7 @@ struct radeonfb_info {
* IO macros
*/
-/* Note about this function: we have some rare cases where we must not schedule,
- * this typically happen with our special "wake up early" hook which allows us to
- * wake up the graphic chip (and thus get the console back) before everything else
- * on some machines that support that mechanism. At this point, interrupts are off
- * and scheduling is not permitted
- */
-static inline void _radeon_msleep(struct radeonfb_info *rinfo, unsigned long ms)
-{
- if (rinfo->no_schedule || oops_in_progress)
- mdelay(ms);
- else
- msleep(ms);
-}
-
+void _radeon_msleep(struct radeonfb_info *rinfo, unsigned long ms);
#define INREG8(addr) readb((rinfo->mmio_base)+addr)
#define OUTREG8(addr,val) writeb(val, (rinfo->mmio_base)+addr)
@@ -408,47 +395,8 @@ static inline void _OUTREGP(struct radeonfb_info *rinfo, u32 addr,
#define OUTREGP(addr,val,mask) _OUTREGP(rinfo, addr, val,mask)
-/*
- * Note about PLL register accesses:
- *
- * I have removed the spinlock on them on purpose. The driver now
- * expects that it will only manipulate the PLL registers in normal
- * task environment, where radeon_msleep() will be called, protected
- * by a semaphore (currently the console semaphore) so that no conflict
- * will happen on the PLL register index.
- *
- * With the latest changes to the VT layer, this is guaranteed for all
- * calls except the actual drawing/blits which aren't supposed to use
- * the PLL registers anyway
- *
- * This is very important for the workarounds to work properly. The only
- * possible exception to this rule is the call to unblank(), which may
- * be done at irq time if an oops is in progress.
- */
-static inline void radeon_pll_errata_after_index(struct radeonfb_info *rinfo)
-{
- if (!(rinfo->errata & CHIP_ERRATA_PLL_DUMMYREADS))
- return;
-
- (void)INREG(CLOCK_CNTL_DATA);
- (void)INREG(CRTC_GEN_CNTL);
-}
-
-static inline void radeon_pll_errata_after_data(struct radeonfb_info *rinfo)
-{
- if (rinfo->errata & CHIP_ERRATA_PLL_DELAY) {
- /* we can't deal with posted writes here ... */
- _radeon_msleep(rinfo, 5);
- }
- if (rinfo->errata & CHIP_ERRATA_R300_CG) {
- u32 save, tmp;
- save = INREG(CLOCK_CNTL_INDEX);
- tmp = save & ~(0x3f | PLL_WR_EN);
- OUTREG(CLOCK_CNTL_INDEX, tmp);
- tmp = INREG(CLOCK_CNTL_DATA);
- OUTREG(CLOCK_CNTL_INDEX, save);
- }
-}
+void radeon_pll_errata_after_index(struct radeonfb_info *rinfo);
+void radeon_pll_errata_after_data(struct radeonfb_info *rinfo);
static inline u32 __INPLL(struct radeonfb_info *rinfo, u32 addr)
{
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
2014-05-16 21:43 ` [PATCH 1/8] ixgbe: Out of line ixgbe_read/write_reg Andi Kleen
2014-05-16 21:43 ` [PATCH 2/8] radeonfb: Out of line errata workarounds Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-17 0:03 ` Dave Jones
2014-05-17 0:03 ` Eric Dumazet
2014-05-16 21:43 ` [PATCH 4/8] e1000e: Out of line __ew32_prepare/__ew32 Andi Kleen
` (4 subsequent siblings)
7 siblings, 2 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Out of lining these two inlines saves ~21k on my vmlinux
14152713 2003976 1507328 17664017 10d8811 vmlinux-before-list
14131431 2008136 1507328 17646895 10d452f vmlinux-list
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
include/linux/list.h | 15 ++++-----------
lib/Makefile | 2 +-
lib/list.c | 22 ++++++++++++++++++++++
3 files changed, 27 insertions(+), 12 deletions(-)
create mode 100644 lib/list.c
diff --git a/include/linux/list.h b/include/linux/list.h
index ef95941..8297885 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -21,11 +21,8 @@
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
-static inline void INIT_LIST_HEAD(struct list_head *list)
-{
- list->next = list;
- list->prev = list;
-}
+/* Out of line to save space */
+void INIT_LIST_HEAD(struct list_head *list);
/*
* Insert a new entry between two known consecutive entries.
@@ -101,12 +98,8 @@ static inline void __list_del_entry(struct list_head *entry)
__list_del(entry->prev, entry->next);
}
-static inline void list_del(struct list_head *entry)
-{
- __list_del(entry->prev, entry->next);
- entry->next = LIST_POISON1;
- entry->prev = LIST_POISON2;
-}
+/* Out of line to save space */
+void list_del(struct list_head *entry);
#else
extern void __list_del_entry(struct list_head *entry);
extern void list_del(struct list_head *entry);
diff --git a/lib/Makefile b/lib/Makefile
index 0cd7b68..8b744f7 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o
+ earlycpio.o list.o
obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/list.c b/lib/list.c
new file mode 100644
index 0000000..298768f
--- /dev/null
+++ b/lib/list.c
@@ -0,0 +1,22 @@
+#include <linux/list.h>
+#include <linux/module.h>
+
+/*
+ * Out of line versions of common list.h functions that bloat the
+ * kernel too much.
+ */
+
+void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+EXPORT_SYMBOL(INIT_LIST_HEAD);
+
+void list_del(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ entry->next = LIST_POISON1;
+ entry->prev = LIST_POISON2;
+}
+EXPORT_SYMBOL(list_del);
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del
2014-05-16 21:43 ` [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del Andi Kleen
@ 2014-05-17 0:03 ` Dave Jones
2014-05-17 2:37 ` Andi Kleen
2014-05-17 0:03 ` Eric Dumazet
1 sibling, 1 reply; 23+ messages in thread
From: Dave Jones @ 2014-05-17 0:03 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, akpm, Andi Kleen
On Fri, May 16, 2014 at 02:43:10PM -0700, Andi Kleen wrote:
> diff --git a/lib/Makefile b/lib/Makefile
> index 0cd7b68..8b744f7 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
> sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
> proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
> is_single_threaded.o plist.o decompress.o kobject_uevent.o \
> - earlycpio.o
> + earlycpio.o list.o
Hey Andi,
Did you test this with CONFIG_DEBUG_LIST ?
Unless I'm missing something, this looks like we'll have duplicate
symbols for list_del if that is set.
Dave
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del
2014-05-17 0:03 ` Dave Jones
@ 2014-05-17 2:37 ` Andi Kleen
0 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-17 2:37 UTC (permalink / raw)
To: Dave Jones, Andi Kleen, linux-kernel, akpm, Andi Kleen
> Did you test this with CONFIG_DEBUG_LIST ?
> Unless I'm missing something, this looks like we'll have duplicate
> symbols for list_del if that is set.
Good point. Will fix.
BTW I only ran it in my limited config. I'm sure running it on
different configs will show up other low hanging fruit ...
-Andi
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del
2014-05-16 21:43 ` [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del Andi Kleen
2014-05-17 0:03 ` Dave Jones
@ 2014-05-17 0:03 ` Eric Dumazet
1 sibling, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2014-05-17 0:03 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, akpm, Andi Kleen
On Fri, 2014-05-16 at 14:43 -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Out of lining these two inlines saves ~21k on my vmlinux
>
> 14152713 2003976 1507328 17664017 10d8811 vmlinux-before-list
> 14131431 2008136 1507328 17646895 10d452f vmlinux-list
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
> +
> +void list_del(struct list_head *entry)
> +{
> + __list_del(entry->prev, entry->next);
> + entry->next = LIST_POISON1;
> + entry->prev = LIST_POISON2;
> +}
> +EXPORT_SYMBOL(list_del);
Have you tried :
CONFIG_DEBUG_LIST=y
Function will be doubly defined/exported then....
BTW, my vmlinux is more like
$ size vmlinux
text data bss dec hex filename
8233991 1743032 1904640 11881663 b54cbf vmlinux
Seems I beat you without even trying ;)
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 4/8] e1000e: Out of line __ew32_prepare/__ew32
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
` (2 preceding siblings ...)
2014-05-16 21:43 ` [PATCH 3/8] list: Out of line INIT_LIST_HEAD and list_del Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-17 3:23 ` Stephen Hemminger
2014-05-16 21:43 ` [PATCH 5/8] x86: Out of line get_dma_ops Andi Kleen
` (3 subsequent siblings)
7 siblings, 1 reply; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen, jeffrey.t.kirsher, netdev
From: Andi Kleen <ak@linux.intel.com>
Out of lining these two common inlines saves about 30k text size,
due to their errata workarounds.
14131431 2008136 1507328 17646895 10d452f vmlinux-before-e1000e
14101415 2004040 1507328 17612783 10cbfef vmlinux-e1000e
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
drivers/net/ethernet/intel/e1000e/e1000.h | 31 ++----------------------------
drivers/net/ethernet/intel/e1000e/netdev.c | 30 +++++++++++++++++++++++++++++
2 files changed, 32 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index 1471c54..cbe25bb 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -573,35 +573,8 @@ static inline u32 __er32(struct e1000_hw *hw, unsigned long reg)
#define er32(reg) __er32(hw, E1000_##reg)
-/**
- * __ew32_prepare - prepare to write to MAC CSR register on certain parts
- * @hw: pointer to the HW structure
- *
- * When updating the MAC CSR registers, the Manageability Engine (ME) could
- * be accessing the registers at the same time. Normally, this is handled in
- * h/w by an arbiter but on some parts there is a bug that acknowledges Host
- * accesses later than it should which could result in the register to have
- * an incorrect value. Workaround this by checking the FWSM register which
- * has bit 24 set while ME is accessing MAC CSR registers, wait if it is set
- * and try again a number of times.
- **/
-static inline s32 __ew32_prepare(struct e1000_hw *hw)
-{
- s32 i = E1000_ICH_FWSM_PCIM2PCI_COUNT;
-
- while ((er32(FWSM) & E1000_ICH_FWSM_PCIM2PCI) && --i)
- udelay(50);
-
- return i;
-}
-
-static inline void __ew32(struct e1000_hw *hw, unsigned long reg, u32 val)
-{
- if (hw->adapter->flags2 & FLAG2_PCIM2PCI_ARBITER_WA)
- __ew32_prepare(hw);
-
- writel(val, hw->hw_addr + reg);
-}
+s32 __ew32_prepare(struct e1000_hw *hw);
+void __ew32(struct e1000_hw *hw, unsigned long reg, u32 val);
#define ew32(reg, val) __ew32(hw, E1000_##reg, (val))
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 3e69386..9b6cd9a 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -124,6 +124,36 @@ static const struct e1000_reg_info e1000_reg_info_tbl[] = {
};
/**
+ * __ew32_prepare - prepare to write to MAC CSR register on certain parts
+ * @hw: pointer to the HW structure
+ *
+ * When updating the MAC CSR registers, the Manageability Engine (ME) could
+ * be accessing the registers at the same time. Normally, this is handled in
+ * h/w by an arbiter but on some parts there is a bug that acknowledges Host
+ * accesses later than it should which could result in the register to have
+ * an incorrect value. Workaround this by checking the FWSM register which
+ * has bit 24 set while ME is accessing MAC CSR registers, wait if it is set
+ * and try again a number of times.
+ **/
+s32 __ew32_prepare(struct e1000_hw *hw)
+{
+ s32 i = E1000_ICH_FWSM_PCIM2PCI_COUNT;
+
+ while ((er32(FWSM) & E1000_ICH_FWSM_PCIM2PCI) && --i)
+ udelay(50);
+
+ return i;
+}
+
+void __ew32(struct e1000_hw *hw, unsigned long reg, u32 val)
+{
+ if (hw->adapter->flags2 & FLAG2_PCIM2PCI_ARBITER_WA)
+ __ew32_prepare(hw);
+
+ writel(val, hw->hw_addr + reg);
+}
+
+/**
* e1000_regdump - register printout routine
* @hw: pointer to the HW structure
* @reginfo: pointer to the register info table
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH 4/8] e1000e: Out of line __ew32_prepare/__ew32
2014-05-16 21:43 ` [PATCH 4/8] e1000e: Out of line __ew32_prepare/__ew32 Andi Kleen
@ 2014-05-17 3:23 ` Stephen Hemminger
0 siblings, 0 replies; 23+ messages in thread
From: Stephen Hemminger @ 2014-05-17 3:23 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, akpm, Andi Kleen, jeffrey.t.kirsher, netdev
On Fri, 16 May 2014 14:43:11 -0700
Andi Kleen <andi@firstfloor.org> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Out of lining these two common inlines saves about 30k text size,
> due to their errata workarounds.
>
> 14131431 2008136 1507328 17646895 10d452f vmlinux-before-e1000e
> 14101415 2004040 1507328 17612783 10cbfef vmlinux-e1000e
>
> Cc: jeffrey.t.kirsher@intel.com
> Cc: netdev@vger.kernel.org
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
Since you are making a formerly private function global, the name
should be unique. I.e prefix it with e1000_
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 5/8] x86: Out of line get_dma_ops
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
` (3 preceding siblings ...)
2014-05-16 21:43 ` [PATCH 4/8] e1000e: Out of line __ew32_prepare/__ew32 Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-16 21:43 ` [PATCH 6/8] ftrace: Out of line ftrace_trigger_soft_disabled Andi Kleen
` (2 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Out of lining the complex version of get_dma_ops saves about 6.8k on
my kernel.
14101415 2004040 1507328 17612783 10cbfef vmlinux-before-dma
14094629 2004040 1507328 17605997 10ca56d vmlinux-dma
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/include/asm/dma-mapping.h | 7 +++----
arch/x86/lib/Makefile | 2 ++
arch/x86/lib/dma.c | 11 +++++++++++
3 files changed, 16 insertions(+), 4 deletions(-)
create mode 100644 arch/x86/lib/dma.c
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 808dae6..314e4bd 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -29,15 +29,14 @@ extern int panic_on_overflow;
extern struct dma_map_ops *dma_ops;
+struct dma_map_ops *__get_dma_ops(struct device *dev);
+
static inline struct dma_map_ops *get_dma_ops(struct device *dev)
{
#ifndef CONFIG_X86_DEV_DMA_OPS
return dma_ops;
#else
- if (unlikely(!dev) || !dev->archdata.dma_ops)
- return dma_ops;
- else
- return dev->archdata.dma_ops;
+ return __get_dma_ops(dev);
#endif
}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index eabcb6e..44dae40 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -44,3 +44,5 @@ else
lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
endif
+
+lib-y += dma.o
diff --git a/arch/x86/lib/dma.c b/arch/x86/lib/dma.c
new file mode 100644
index 0000000..c97b5ae
--- /dev/null
+++ b/arch/x86/lib/dma.c
@@ -0,0 +1,11 @@
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+struct dma_map_ops *__get_dma_ops(struct device *dev)
+{
+ if (unlikely(!dev) || !dev->archdata.dma_ops)
+ return dma_ops;
+ else
+ return dev->archdata.dma_ops;
+}
+EXPORT_SYMBOL(__get_dma_ops);
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 6/8] ftrace: Out of line ftrace_trigger_soft_disabled
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
` (4 preceding siblings ...)
2014-05-16 21:43 ` [PATCH 5/8] x86: Out of line get_dma_ops Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-16 21:43 ` [PATCH 7/8] radeon: Out of line radeon_get_ib_value Andi Kleen
2014-05-16 21:43 ` [PATCH 8/8] Kbuild: add inline-account tool to find inline bloat Andi Kleen
7 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Out of lining this function saves about 14k text
text data bss dec hex filename
14094629 2004040 1507328 17605997 10ca56d vmlinux-before-ftrace
14079650 2008136 1507328 17595114 10c7aea vmlinux-ftrace
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
include/linux/ftrace_event.h | 23 +----------------------
kernel/trace/trace_events_trigger.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+), 22 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index d16da3e..70be665 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -416,28 +416,7 @@ extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *fil
extern void event_triggers_post_call(struct ftrace_event_file *file,
enum event_trigger_type tt);
-/**
- * ftrace_trigger_soft_disabled - do triggers and test if soft disabled
- * @file: The file pointer of the event to test
- *
- * If any triggers without filters are attached to this event, they
- * will be called here. If the event is soft disabled and has no
- * triggers that require testing the fields, it will return true,
- * otherwise false.
- */
-static inline bool
-ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
-{
- unsigned long eflags = file->flags;
-
- if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
- if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
- event_triggers_call(file, NULL);
- if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
- return true;
- }
- return false;
-}
+extern bool ftrace_trigger_soft_disabled(struct ftrace_event_file *file);
/*
* Helper function for event_trigger_unlock_commit{_regs}().
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b47..136c181 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -28,6 +28,31 @@
static LIST_HEAD(trigger_commands);
static DEFINE_MUTEX(trigger_cmd_mutex);
+
+/**
+ * ftrace_trigger_soft_disabled - do triggers and test if soft disabled
+ * @file: The file pointer of the event to test
+ *
+ * If any triggers without filters are attached to this event, they
+ * will be called here. If the event is soft disabled and has no
+ * triggers that require testing the fields, it will return true,
+ * otherwise false.
+ */
+bool
+ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
+{
+ unsigned long eflags = file->flags;
+
+ if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
+ if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
+ event_triggers_call(file, NULL);
+ if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(ftrace_trigger_soft_disabled);
+
static void
trigger_data_free(struct event_trigger_data *data)
{
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH 7/8] radeon: Out of line radeon_get_ib_value
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
` (5 preceding siblings ...)
2014-05-16 21:43 ` [PATCH 6/8] ftrace: Out of line ftrace_trigger_soft_disabled Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-20 16:16 ` Marek Olšák
2014-05-16 21:43 ` [PATCH 8/8] Kbuild: add inline-account tool to find inline bloat Andi Kleen
7 siblings, 1 reply; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen, alexander.deucher, dri-devel
From: Andi Kleen <ak@linux.intel.com>
Saves about 5k of text
text data bss dec hex filename
14080360 2008168 1507328 17595856 10c7dd0 vmlinux-before-radeon
14074978 2008168 1507328 17590474 10c68ca vmlinux-radeon
Cc: alexander.deucher@amd.com
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
drivers/gpu/drm/radeon/radeon.h | 10 +---------
drivers/gpu/drm/radeon/radeon_device.c | 9 +++++++++
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 6852861..8cae409 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -1032,15 +1032,7 @@ struct radeon_cs_parser {
struct ww_acquire_ctx ticket;
};
-static inline u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx)
-{
- struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx];
-
- if (ibc->kdata)
- return ibc->kdata[idx];
- return p->ib.ptr[idx];
-}
-
+u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx);
struct radeon_cs_packet {
unsigned idx;
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 0e770bb..1cbd171 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -112,6 +112,15 @@ bool radeon_is_px(struct drm_device *dev)
return false;
}
+u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx)
+{
+ struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx];
+
+ if (ibc->kdata)
+ return ibc->kdata[idx];
+ return p->ib.ptr[idx];
+}
+
/**
* radeon_program_register_sequence - program an array of registers.
*
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH 7/8] radeon: Out of line radeon_get_ib_value
2014-05-16 21:43 ` [PATCH 7/8] radeon: Out of line radeon_get_ib_value Andi Kleen
@ 2014-05-20 16:16 ` Marek Olšák
2014-05-20 17:04 ` Andi Kleen
2014-05-20 18:14 ` Christian König
0 siblings, 2 replies; 23+ messages in thread
From: Marek Olšák @ 2014-05-20 16:16 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, Deucher, Alexander, akpm, Andi Kleen, dri-devel
I think the function should stay in the header file. It's used in
performance-critical code, so we want it to be inlined.
Marek
On Fri, May 16, 2014 at 11:43 PM, Andi Kleen <andi@firstfloor.org> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Saves about 5k of text
>
> text data bss dec hex filename
> 14080360 2008168 1507328 17595856 10c7dd0 vmlinux-before-radeon
> 14074978 2008168 1507328 17590474 10c68ca vmlinux-radeon
>
> Cc: alexander.deucher@amd.com
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
> drivers/gpu/drm/radeon/radeon.h | 10 +---------
> drivers/gpu/drm/radeon/radeon_device.c | 9 +++++++++
> 2 files changed, 10 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 6852861..8cae409 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -1032,15 +1032,7 @@ struct radeon_cs_parser {
> struct ww_acquire_ctx ticket;
> };
>
> -static inline u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx)
> -{
> - struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx];
> -
> - if (ibc->kdata)
> - return ibc->kdata[idx];
> - return p->ib.ptr[idx];
> -}
> -
> +u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx);
>
> struct radeon_cs_packet {
> unsigned idx;
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index 0e770bb..1cbd171 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -112,6 +112,15 @@ bool radeon_is_px(struct drm_device *dev)
> return false;
> }
>
> +u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx)
> +{
> + struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx];
> +
> + if (ibc->kdata)
> + return ibc->kdata[idx];
> + return p->ib.ptr[idx];
> +}
> +
> /**
> * radeon_program_register_sequence - program an array of registers.
> *
> --
> 1.9.0
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [PATCH 7/8] radeon: Out of line radeon_get_ib_value
2014-05-20 16:16 ` Marek Olšák
@ 2014-05-20 17:04 ` Andi Kleen
2014-05-20 18:14 ` Christian König
1 sibling, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2014-05-20 17:04 UTC (permalink / raw)
To: Marek Olšák
Cc: Andi Kleen, linux-kernel, Deucher, Alexander, akpm, dri-devel
On Tue, May 20, 2014 at 06:16:48PM +0200, Marek Olšák wrote:
> I think the function should stay in the header file. It's used in
> performance-critical code, so we want it to be inlined.
This doesn't make any sense. If it's talking to the hardware it will
be dominated by the cache misses.
-Andi
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 7/8] radeon: Out of line radeon_get_ib_value
2014-05-20 16:16 ` Marek Olšák
2014-05-20 17:04 ` Andi Kleen
@ 2014-05-20 18:14 ` Christian König
1 sibling, 0 replies; 23+ messages in thread
From: Christian König @ 2014-05-20 18:14 UTC (permalink / raw)
To: Marek Olšák, Andi Kleen
Cc: Deucher, Alexander, akpm, Andi Kleen, linux-kernel, dri-devel
Yeah, agree. That function is quite critical for command stream parsing
and patching.
Christian.
Am 20.05.2014 18:16, schrieb Marek Olšák:
> I think the function should stay in the header file. It's used in
> performance-critical code, so we want it to be inlined.
>
> Marek
>
> On Fri, May 16, 2014 at 11:43 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> From: Andi Kleen <ak@linux.intel.com>
>>
>> Saves about 5k of text
>>
>> text data bss dec hex filename
>> 14080360 2008168 1507328 17595856 10c7dd0 vmlinux-before-radeon
>> 14074978 2008168 1507328 17590474 10c68ca vmlinux-radeon
>>
>> Cc: alexander.deucher@amd.com
>> Cc: dri-devel@lists.freedesktop.org
>> Signed-off-by: Andi Kleen <ak@linux.intel.com>
>> ---
>> drivers/gpu/drm/radeon/radeon.h | 10 +---------
>> drivers/gpu/drm/radeon/radeon_device.c | 9 +++++++++
>> 2 files changed, 10 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
>> index 6852861..8cae409 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -1032,15 +1032,7 @@ struct radeon_cs_parser {
>> struct ww_acquire_ctx ticket;
>> };
>>
>> -static inline u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx)
>> -{
>> - struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx];
>> -
>> - if (ibc->kdata)
>> - return ibc->kdata[idx];
>> - return p->ib.ptr[idx];
>> -}
>> -
>> +u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx);
>>
>> struct radeon_cs_packet {
>> unsigned idx;
>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
>> index 0e770bb..1cbd171 100644
>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>> @@ -112,6 +112,15 @@ bool radeon_is_px(struct drm_device *dev)
>> return false;
>> }
>>
>> +u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx)
>> +{
>> + struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx];
>> +
>> + if (ibc->kdata)
>> + return ibc->kdata[idx];
>> + return p->ib.ptr[idx];
>> +}
>> +
>> /**
>> * radeon_program_register_sequence - program an array of registers.
>> *
>> --
>> 1.9.0
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 8/8] Kbuild: add inline-account tool to find inline bloat
2014-05-16 21:43 Fix some common inline bloat Andi Kleen
` (6 preceding siblings ...)
2014-05-16 21:43 ` [PATCH 7/8] radeon: Out of line radeon_get_ib_value Andi Kleen
@ 2014-05-16 21:43 ` Andi Kleen
2014-05-17 8:31 ` Sam Ravnborg
7 siblings, 1 reply; 23+ messages in thread
From: Andi Kleen @ 2014-05-16 21:43 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, Andi Kleen, linux-kbuild, mmarek
From: Andi Kleen <ak@linux.intel.com>
Add a tool to hunt for inline bloat. It uses objdump -S to account
inlines.
Example output:
Total code bytes seen 10463206
Code bytes by functions:
Function Total Avg Num
kmalloc 37132 (0.00%) 11 3310
ixgbe_read_reg 35440 (0.00%) 24 1444
spin_lock 28975 (0.00%) 11 2575
constant_test_bit 26387 (0.00%) 5 4642
arch_spin_unlock 24986 (0.00%) 7 3364
spin_unlock_irqrestore 24928 (0.00%) 11 2258
readl 24584 (0.00%) 4 5344
writel 23199 (0.00%) 6 3643
perf_fetch_caller_regs 22436 (0.00%) 27 821
get_current 22076 (0.00%) 9 2288
_radeon_msleep 19680 (0.00%) 55 353
INIT_LIST_HEAD 19410 (0.00%) 11 1747
list_del 19270 (0.00%) 16 1176
__ew32_prepare 19080 (0.00%) 25 740
__list_add 17830 (0.00%) 12 1406
Cc: linux-kbuild@vger.kernel.org
Cc: mmarek@suse.cz
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
scripts/inline-account.py | 164 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 164 insertions(+)
create mode 100755 scripts/inline-account.py
diff --git a/scripts/inline-account.py b/scripts/inline-account.py
new file mode 100755
index 0000000..2dfbf7c
--- /dev/null
+++ b/scripts/inline-account.py
@@ -0,0 +1,164 @@
+#!/usr/bin/python
+# account code bytes per source code / functions from objdump -Sl output
+# useful to find inline bloat
+# Author: Andi Kleen
+import os, sys, re, argparse, multiprocessing
+from collections import Counter
+
+p = argparse.ArgumentParser(
+ description="""
+Account code bytes per source code / functions from objdump.
+Useful to find inline bloat.
+
+The line numbers are the beginning of a block, so the actual code can be later.
+Line numbers can be a also little off due to objdump bugs
+also some misaccounting can happen due to inexact gcc debug information.
+The number output for functions may account a single large function multiple
+times. program/object files need to be built with -g.
+
+This is somewhat slow due to objdump -S being slow. It helps to have
+plenty of cores.""")
+p.add_argument('--min-bytes', type=int, help='minimum bytes to report', default=100)
+p.add_argument('--threads', '-t', type=int, default=multiprocessing.cpu_count(),
+ help='Number of objdump processes to run')
+p.add_argument('file', help='object file/program as input')
+args = p.parse_args()
+
+def get_syms(fn):
+ f = os.popen("nm --print-size " + fn)
+ syms = []
+ pc = None
+ for l in f:
+ n = l.split()
+ if len(n) > 2 and n[2].upper() == "T":
+ pc = int(n[0], 16)
+ syms.append(pc)
+ ln = int(n[1], 16)
+ f.close()
+ if not pc:
+ sys.exit(fn + " has no symbols")
+ syms.append(pc + ln)
+ return syms
+
+class Account:
+ pass
+
+def add_account(a, b):
+ a.funcbytes += b.funcbytes
+ a.linebytes += b.linebytes
+ a.funccount += b.funccount
+ a.nolinebytes += a.nolinebytes
+ a.nofuncbytes += a.nofuncbytes
+ a.total += b.total
+ return a
+
+# dont add sys.exit here, causes deadlocks
+def account_range(r):
+ a = Account()
+ a.funcbytes = Counter()
+ a.linebytes = Counter()
+ a.funccount = Counter()
+ a.nolinebytes = 0
+ a.nofuncbytes = 0
+ a.total = 0
+
+ line = None
+ func = None
+ codefunc = None
+
+ cmd = ("objdump -Sl %s --start-address=%#x --stop-address=%#x" %
+ (args.file, r[0], r[1]))
+ f = os.popen(cmd)
+ for l in f:
+ # 250: e8 00 00 00 00 callq 255 <proc_skip_spaces+0x5>
+ m = re.match(r'\s*([0-9a-fA-F]+):\s+(.*)', l)
+ if m:
+ #print "iscode", func, l,
+ bytes = len(re.findall(r'[0-9a-f][0-9a-f] ', m.group(2)))
+ if not func:
+ a.nofuncbytes += bytes
+ continue
+ if not line:
+ a.nolinebytes += bytes
+ continue
+ a.total += bytes
+ a.funcbytes[func] += bytes
+ a.linebytes[(file, line)] += bytes
+ codefunc = func
+ continue
+
+ # sysctl_init():
+ m = re.match(r'([a-zA-Z_][a-zA-Z0-9_]*)\(\):$', l)
+ if m:
+ if codefunc and m.group(1) != codefunc:
+ a.funccount[codefunc] += 1
+ codefunc = None
+ func = m.group(1)
+ continue
+
+ # /sysctl.c:1666
+ m = re.match(r'^([^:]+):(\d+)$', l)
+ if m:
+ file, line = m.group(1), int(m.group(2))
+ continue
+ f.close()
+
+ if codefunc:
+ a.funccount[codefunc] += 1
+ return a
+
+# objdump -S is slow, so we parallelize
+
+# split symbol table into chunks for parallelization
+# we split on functions boundaries to avoid mis-accounting
+# assumes functions have roughly similar length
+syms = sorted(get_syms(args.file))
+chunk = min((len(syms) - 1) / args.threads, len(syms) - 1)
+boundaries = [syms[x] for x in range(0, len(syms) - 1, chunk)] + [syms[-1]]
+ranges = [(boundaries[x], boundaries[x+1]) for x in range(0, len(boundaries) - 1)]
+assert ranges[0][0] == syms[0]
+assert ranges[-1][1] == syms[-1]
+
+# map-reduce
+if args.threads == 1:
+ al = map(account_range, ranges)
+else:
+ al = multiprocessing.Pool(args.threads).map(account_range, ranges)
+a = reduce(add_account, al)
+
+print "Total code bytes seen", a.total
+#print "Bytes with no function %d (%.2f%%)" % (a.nofuncbytes, 100.0*(float(a.nofuncbytes)/a.total))
+#print "Bytes with no lines %d (%.2f%%)" % (a.nolinebytes, 100.0*(float(a.nolinebytes)/a.total))
+
+def sort_map(m):
+ return sorted(m.keys(), key=lambda x: m[x], reverse=True)
+
+print "\nCode bytes by functions:"
+print "%-50s %-5s %-5s %-5s %-5s" % ("Function", "Total", "", "Avg", "Num")
+for j in sort_map(a.funcbytes):
+ if a.funcbytes[j] < args.min_bytes:
+ break
+ print "%-50s %-5d (%.2f%%) %-5d %-5d" % (
+ j,
+ a.funcbytes[j],
+ a.funcbytes[j] / float(a.total),
+ a.funcbytes[j] / a.funccount[j],
+ a.funccount[j])
+
+for j in a.linebytes.keys():
+ if a.linebytes[j] < args.min_bytes:
+ del a.linebytes[j]
+
+# os.path.commonprefix fails with >50k entries
+# just use the first 10
+prefix = os.path.commonprefix(map(lambda x: x[0], a.linebytes.keys()[:10]))
+
+print "\nCode bytes by nearby source line blocks:"
+print "prefix", prefix
+
+print "%-50s %-5s" % ("Line", "Total")
+for j in sort_map(a.linebytes):
+ print "%-50s %-5d (%.2f%%)" % (
+ "%s:%d" % (j[0].replace(prefix, ""), j[1]),
+ a.linebytes[j],
+ a.linebytes[j] / float(a.total))
--
1.9.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH 8/8] Kbuild: add inline-account tool to find inline bloat
2014-05-16 21:43 ` [PATCH 8/8] Kbuild: add inline-account tool to find inline bloat Andi Kleen
@ 2014-05-17 8:31 ` Sam Ravnborg
2014-05-17 9:36 ` Sam Ravnborg
0 siblings, 1 reply; 23+ messages in thread
From: Sam Ravnborg @ 2014-05-17 8:31 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, akpm, Andi Kleen, linux-kbuild, mmarek
Hi Andi.
On Fri, May 16, 2014 at 02:43:15PM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Add a tool to hunt for inline bloat. It uses objdump -S to account
> inlines.
I tried this on my sparc32 build - but it failed with:
objdump: can't disassemble for architecture UNKNOWN!
It looks simple to add CROSS_COMPILE support but I did not do so.
My python skills are non-existing.
Sam
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 8/8] Kbuild: add inline-account tool to find inline bloat
2014-05-17 8:31 ` Sam Ravnborg
@ 2014-05-17 9:36 ` Sam Ravnborg
2014-05-17 16:51 ` Andi Kleen
0 siblings, 1 reply; 23+ messages in thread
From: Sam Ravnborg @ 2014-05-17 9:36 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, akpm, Andi Kleen, linux-kbuild, mmarek
On Sat, May 17, 2014 at 10:31:44AM +0200, Sam Ravnborg wrote:
> Hi Andi.
>
> On Fri, May 16, 2014 at 02:43:15PM -0700, Andi Kleen wrote:
> > From: Andi Kleen <ak@linux.intel.com>
> >
> > Add a tool to hunt for inline bloat. It uses objdump -S to account
> > inlines.
> I tried this on my sparc32 build - but it failed with:
> objdump: can't disassemble for architecture UNKNOWN!
>
> It looks simple to add CROSS_COMPILE support but I did not do so.
> My python skills are non-existing.
Patched the calls to nm and objdump - but it gave no output
when I ran the script.
nm --print-size shows following output:
00002910 00000024 r CSWTCH.946
00002bd4 00000024 r CSWTCH.951
U PDE_DATA
U ROOT_DEV
000000fc 00000014 T SyS_accept
00002c98 000001a8 T SyS_accept4
00000fc4 0000008c T SyS_bind
00000eb4 00000094 T SyS_connect
00000d98 00000094 T SyS_getpeername
00000e2c 00000088 T SyS_getsockname
00000c6c 00000090 T SyS_getsockopt
00000f48 0000007c T SyS_listen
00000128 00000018 T SyS_recv
0000142c 000000f0 T SyS_recvfrom
00001920 000000d0 T SyS_recvmmsg
00001238 00000010 T SyS_recvmsg
00000110 00000018 T SyS_send
00001d38 00000010 T SyS_sendmmsg
00001db0 00000010 T SyS_sendmsg
000015f4 000000dc T SyS_sendto
00000cfc 0000009c T SyS_setsockopt
00000c0c 00000060 T SyS_shutdown
00003020 000000b4 T SyS_socket
000007ec 000001f8 T SyS_socketcall
00002e40 000001e0 T SyS_socketpair
000b776c 00000098 t T.1063
000b762c 000000d0 t T.1064
objdump -Sl shows following output:
000000d4 <sock_mmap>:
sock_mmap():
d4: 9d e3 bf a0 save %sp, -96, %sp
d8: c2 06 20 78 ld [ %i0 + 0x78 ], %g1
dc: 94 10 00 19 mov %i1, %o2
e0: 92 10 00 01 mov %g1, %o1
e4: c2 00 60 18 ld [ %g1 + 0x18 ], %g1
e8: c2 00 60 40 ld [ %g1 + 0x40 ], %g1
ec: 9f c0 40 00 call %g1
f0: 90 10 00 18 mov %i0, %o0
f4: 81 c7 e0 08 ret
f8: 91 e8 00 08 restore %g0, %o0, %o0
000000fc <SyS_accept>:
sys_accept():
fc: 96 10 20 00 clr %o3
100: 82 13 c0 00 mov %o7, %g1
104: 40 00 00 00 call 104 <SyS_accept+0x8>
108: 9e 10 40 00 mov %g1, %o7
10c: 01 00 00 00 nop
00000110 <SyS_send>:
SyS_send():
110: 98 10 20 00 clr %o4 ! 0 <sock_from_file-0x4c>
114: 9a 10 20 00 clr %o5
118: 82 13 c0 00 mov %o7, %g1
11c: 40 00 00 00 call 11c <SyS_send+0xc>
120: 9e 10 40 00 mov %g1, %o7
124: 01 00 00 00 nop
00000128 <SyS_recv>:
sys_recv():
128: 98 10 20 00 clr %o4 ! 0 <sock_from_file-0x4c>
12c: 9a 10 20 00 clr %o5
130: 82 13 c0 00 mov %o7, %g1
134: 40 00 00 00 call 134 <SyS_recv+0xc>
138: 9e 10 40 00 mov %g1, %o7
13c: 01 00 00 00 nop
Sam
^ permalink raw reply [flat|nested] 23+ messages in thread