* [PATCH 2/7]: [NET]: Add NAPI_STATE_DISABLE.
From: David Miller @ 2008-01-08 5:38 UTC (permalink / raw)
To: netdev
[NET]: Add NAPI_STATE_DISABLE.
Create a bit to signal that a napi_disable() is in progress.
This sets up infrastructure such that net_rx_action() can generically
break out of the ->poll() loop on a NAPI context that has a pending
napi_disable() yet is being bombed with packets (and thus would
otherwise poll endlessly and not allow the napi_disable() to finish).
Now, what napi_disable() does is first set the NAPI_STATE_DISABLE bit
(to indicate that a disable is pending), then it polls for the
NAPI_STATE_SCHED bit, and once the NAPI_STATE_SCHED bit is acquired
the NAPI_STATE_DISABLE bit is cleared. Here, the test_and_set_bit()
provides the necessary memory barrier between the various bitops.
napi_schedule_prep() now tests for a pending disable as it's first
action and won't try to obtain the NAPI_STATE_SCHED bit if a disable
is pending.
As a result, we can remove the netif_running() check in
netif_rx_schedule_prep() because the NAPI disable pending state serves
this purpose. And, it does so in a NAPI centric manner which is what
we really want.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/netdevice.h | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e393995..b0813c3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -319,21 +319,29 @@ struct napi_struct {
enum
{
NAPI_STATE_SCHED, /* Poll is scheduled */
+ NAPI_STATE_DISABLE, /* Disable pending */
};
extern void FASTCALL(__napi_schedule(struct napi_struct *n));
+static inline int napi_disable_pending(struct napi_struct *n)
+{
+ return test_bit(NAPI_STATE_DISABLE, &n->state);
+}
+
/**
* napi_schedule_prep - check if napi can be scheduled
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
* it as running. This is used as a condition variable
- * insure only one NAPI poll instance runs
+ * insure only one NAPI poll instance runs. We also make
+ * sure there is no pending NAPI disable.
*/
static inline int napi_schedule_prep(struct napi_struct *n)
{
- return !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
+ return !napi_disable_pending(n) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
}
/**
@@ -389,8 +397,10 @@ static inline void napi_complete(struct napi_struct *n)
*/
static inline void napi_disable(struct napi_struct *n)
{
+ set_bit(NAPI_STATE_DISABLE, &n->state);
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
msleep(1);
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
}
/**
@@ -1268,7 +1278,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
static inline int netif_rx_schedule_prep(struct net_device *dev,
struct napi_struct *napi)
{
- return netif_running(dev) && napi_schedule_prep(napi);
+ return napi_schedule_prep(napi);
}
/* Add interface to tail of rx poll list. This assumes that _prep has
--
1.5.4.rc2.38.gd6da3
^ permalink raw reply related
* [PATCH 3/7]: [NET]: Do not check netif_running() and carrier state in ->poll()
From: David Miller @ 2008-01-08 5:39 UTC (permalink / raw)
To: netdev
[NET]: Do not check netif_running() and carrier state in ->poll()
Drivers do this to try to break out of the ->poll()'ing loop
when the device is being brought administratively down.
Now that we have a napi_disable() "pending" state we are going
to solve that problem generically.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/e100.c | 2 +-
drivers/net/e1000/e1000_main.c | 8 +-------
drivers/net/e1000e/netdev.c | 8 +-------
drivers/net/epic100.c | 2 +-
drivers/net/fec_8xx/fec_main.c | 5 -----
drivers/net/fs_enet/fs_enet-main.c | 3 ---
drivers/net/ixgb/ixgb_main.c | 2 +-
drivers/net/ixgbe/ixgbe_main.c | 8 +-------
drivers/net/ixp2000/ixpdev.c | 2 --
drivers/net/myri10ge/myri10ge.c | 2 +-
drivers/net/natsemi.c | 2 +-
drivers/net/qla3xxx.c | 7 +------
drivers/net/s2io.c | 3 ---
drivers/net/tulip/interrupt.c | 5 -----
drivers/net/xen-netfront.c | 5 -----
15 files changed, 9 insertions(+), 55 deletions(-)
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 2b06e4b..68316f1 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -1997,7 +1997,7 @@ static int e100_poll(struct napi_struct *napi, int budget)
tx_cleaned = e100_tx_clean(nic);
/* If no Rx and Tx cleanup work was done, exit polling mode. */
- if((!tx_cleaned && (work_done == 0)) || !netif_running(netdev)) {
+ if((!tx_cleaned && (work_done == 0))) {
netif_rx_complete(netdev, napi);
e100_enable_irq(nic);
}
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 4f37506..9de7144 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -3924,10 +3924,6 @@ e1000_clean(struct napi_struct *napi, int budget)
/* Must NOT use netdev_priv macro here. */
adapter = poll_dev->priv;
- /* Keep link state information with original netdev */
- if (!netif_carrier_ok(poll_dev))
- goto quit_polling;
-
/* e1000_clean is called per-cpu. This lock protects
* tx_ring[0] from being cleaned by multiple cpus
* simultaneously. A failure obtaining the lock means
@@ -3942,9 +3938,7 @@ e1000_clean(struct napi_struct *napi, int budget)
&work_done, budget);
/* If no Tx and not enough Rx work done, exit the polling mode */
- if ((!tx_cleaned && (work_done == 0)) ||
- !netif_running(poll_dev)) {
-quit_polling:
+ if ((!tx_cleaned && (work_done == 0))) {
if (likely(adapter->itr_setting & 3))
e1000_set_itr(adapter);
netif_rx_complete(poll_dev, napi);
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 4fd2e23..dd9698c 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -1389,10 +1389,6 @@ static int e1000_clean(struct napi_struct *napi, int budget)
/* Must NOT use netdev_priv macro here. */
adapter = poll_dev->priv;
- /* Keep link state information with original netdev */
- if (!netif_carrier_ok(poll_dev))
- goto quit_polling;
-
/* e1000_clean is called per-cpu. This lock protects
* tx_ring from being cleaned by multiple cpus
* simultaneously. A failure obtaining the lock means
@@ -1405,9 +1401,7 @@ static int e1000_clean(struct napi_struct *napi, int budget)
adapter->clean_rx(adapter, &work_done, budget);
/* If no Tx and not enough Rx work done, exit the polling mode */
- if ((!tx_cleaned && (work_done < budget)) ||
- !netif_running(poll_dev)) {
-quit_polling:
+ if ((!tx_cleaned && (work_done < budget))) {
if (adapter->itr_setting & 3)
e1000_set_itr(adapter);
netif_rx_complete(poll_dev, napi);
diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c
index ecdd3fc..0b365b8 100644
--- a/drivers/net/epic100.c
+++ b/drivers/net/epic100.c
@@ -1273,7 +1273,7 @@ rx_action:
epic_rx_err(dev, ep);
- if (netif_running(dev) && (work_done < budget)) {
+ if (work_done < budget) {
unsigned long flags;
int more;
diff --git a/drivers/net/fec_8xx/fec_main.c b/drivers/net/fec_8xx/fec_main.c
index 8d2904f..ab9637a 100644
--- a/drivers/net/fec_8xx/fec_main.c
+++ b/drivers/net/fec_8xx/fec_main.c
@@ -476,11 +476,6 @@ static int fec_enet_rx_common(struct fec_enet_private *ep,
__u16 pkt_len, sc;
int curidx;
- if (fpi->use_napi) {
- if (!netif_running(dev))
- return 0;
- }
-
/*
* First, grab all of the stats for the incoming packet.
* These get messed up if we get called due to a busy condition.
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index f2a4d39..3e1a57a 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -96,9 +96,6 @@ static int fs_enet_rx_napi(struct napi_struct *napi, int budget)
u16 pkt_len, sc;
int curidx;
- if (!netif_running(dev))
- return 0;
-
/*
* First, grab all of the stats for the incoming packet.
* These get messed up if we get called due to a busy condition.
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
index bf9085f..a8bef52 100644
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1794,7 +1794,7 @@ ixgb_clean(struct napi_struct *napi, int budget)
ixgb_clean_rx_irq(adapter, &work_done, budget);
/* if no Tx and not enough Rx work done, exit the polling mode */
- if((!tx_cleaned && (work_done == 0)) || !netif_running(netdev)) {
+ if((!tx_cleaned && (work_done == 0))) {
netif_rx_complete(netdev, napi);
ixgb_irq_enable(adapter);
}
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 00bc525..7c31930 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1470,19 +1470,13 @@ static int ixgbe_clean(struct napi_struct *napi, int budget)
struct net_device *netdev = adapter->netdev;
int tx_cleaned = 0, work_done = 0;
- /* Keep link state information with original netdev */
- if (!netif_carrier_ok(adapter->netdev))
- goto quit_polling;
-
/* In non-MSIX case, there is no multi-Tx/Rx queue */
tx_cleaned = ixgbe_clean_tx_irq(adapter, adapter->tx_ring);
ixgbe_clean_rx_irq(adapter, &adapter->rx_ring[0], &work_done,
budget);
/* If no Tx and not enough Rx work done, exit the polling mode */
- if ((!tx_cleaned && (work_done < budget)) ||
- !netif_running(adapter->netdev)) {
-quit_polling:
+ if ((!tx_cleaned && (work_done < budget))) {
netif_rx_complete(netdev, napi);
ixgbe_irq_enable(adapter);
}
diff --git a/drivers/net/ixp2000/ixpdev.c b/drivers/net/ixp2000/ixpdev.c
index 6c0dd49..484cb2b 100644
--- a/drivers/net/ixp2000/ixpdev.c
+++ b/drivers/net/ixp2000/ixpdev.c
@@ -135,8 +135,6 @@ static int ixpdev_poll(struct napi_struct *napi, int budget)
struct net_device *dev = ip->dev;
int rx;
- /* @@@ Have to stop polling when nds[0] is administratively
- * downed while we are polling. */
rx = 0;
do {
ixp2000_reg_write(IXP2000_IRQ_THD_RAW_STATUS_A_0, 0x00ff);
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 8def865..c90958f 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1239,7 +1239,7 @@ static int myri10ge_poll(struct napi_struct *napi, int budget)
/* process as many rx events as NAPI will allow */
work_done = myri10ge_clean_rx_done(mgp, budget);
- if (work_done < budget || !netif_running(netdev)) {
+ if (work_done < budget) {
netif_rx_complete(netdev, napi);
put_be32(htonl(3), mgp->irq_claim);
}
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index 87cde06..c329a4f 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -2266,7 +2266,7 @@ static int natsemi_poll(struct napi_struct *napi, int budget)
/* Reenable interrupts providing nothing is trying to shut
* the chip down. */
spin_lock(&np->lock);
- if (!np->hands_off && netif_running(dev))
+ if (!np->hands_off)
natsemi_irq_enable(dev);
spin_unlock(&np->lock);
diff --git a/drivers/net/qla3xxx.c b/drivers/net/qla3xxx.c
index a579111..cf0774d 100644
--- a/drivers/net/qla3xxx.c
+++ b/drivers/net/qla3xxx.c
@@ -2320,14 +2320,9 @@ static int ql_poll(struct napi_struct *napi, int budget)
unsigned long hw_flags;
struct ql3xxx_port_registers __iomem *port_regs = qdev->mem_map_registers;
- if (!netif_carrier_ok(ndev))
- goto quit_polling;
-
ql_tx_rx_clean(qdev, &tx_cleaned, &rx_cleaned, budget);
- if (tx_cleaned + rx_cleaned != budget ||
- !netif_running(ndev)) {
-quit_polling:
+ if (tx_cleaned + rx_cleaned != budget) {
spin_lock_irqsave(&qdev->hw_lock, hw_flags);
__netif_rx_complete(ndev, napi);
ql_update_small_bufq_prod_index(qdev);
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index 9d80f1c..fa57c49 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -2704,9 +2704,6 @@ static int s2io_poll(struct napi_struct *napi, int budget)
struct XENA_dev_config __iomem *bar0 = nic->bar0;
int i;
- if (!is_s2io_card_up(nic))
- return 0;
-
mac_control = &nic->mac_control;
config = &nic->config;
diff --git a/drivers/net/tulip/interrupt.c b/drivers/net/tulip/interrupt.c
index 0461956..6284afd 100644
--- a/drivers/net/tulip/interrupt.c
+++ b/drivers/net/tulip/interrupt.c
@@ -117,9 +117,6 @@ int tulip_poll(struct napi_struct *napi, int budget)
int received = 0;
#endif
- if (!netif_running(dev))
- goto done;
-
#ifdef CONFIG_TULIP_NAPI_HW_MITIGATION
/* that one buffer is needed for mit activation; or might be a
@@ -261,8 +258,6 @@ int tulip_poll(struct napi_struct *napi, int budget)
* finally: amount of IO did not increase at all. */
} while ((ioread32(tp->base_addr + CSR5) & RxIntr));
-done:
-
#ifdef CONFIG_TULIP_NAPI_HW_MITIGATION
/* We use this simplistic scheme for IM. It's proven by
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 2a8fc43..bca37bf 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -852,11 +852,6 @@ static int xennet_poll(struct napi_struct *napi, int budget)
spin_lock(&np->rx_lock);
- if (unlikely(!netif_carrier_ok(dev))) {
- spin_unlock(&np->rx_lock);
- return 0;
- }
-
skb_queue_head_init(&rxq);
skb_queue_head_init(&errq);
skb_queue_head_init(&tmpq);
--
1.5.4.rc2.38.gd6da3
^ permalink raw reply related
* [PATCH 4/7]: [NETXEN]: Fix ->poll() done logic.
From: David Miller @ 2008-01-08 5:39 UTC (permalink / raw)
To: netdev
[NETXEN]: Fix ->poll() done logic.
If work_done >= budget we should always elide the NAPI
completion.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/netxen/netxen_nic_main.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index a80f0cd..454226f 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -1321,7 +1321,7 @@ static int netxen_nic_poll(struct napi_struct *napi, int budget)
budget / MAX_RCV_CTX);
}
- if (work_done >= budget && netxen_nic_rx_has_work(adapter) != 0)
+ if (work_done >= budget)
done = 0;
if (netxen_process_cmd_ring((unsigned long)adapter) == 0)
--
1.5.4.rc2.38.gd6da3
^ permalink raw reply related
* [PATCH 5/7]: [NET]: Fix drivers to handle napi_disable() disabling interrupts.
From: David Miller @ 2008-01-08 5:40 UTC (permalink / raw)
To: netdev
[NET]: Fix drivers to handle napi_disable() disabling interrupts.
When we add the generic napi_disable_pending() breakout
logic to net_rx_action() it means that napi_disable()
can cause NAPI poll interrupt events to be disabled.
And this is exactly what we want. If a napi_disable()
is pending, and we are looping in the ->poll(), we want
->poll() event interrupts to stay disabled and we want
to complete the NAPI poll ASAP.
When ->poll() break out during device down was being handled on a
per-driver basis, often these drivers would turn interrupts back on
when '!netif_running()' was detected.
And this would just cause a reschedule of the NAPI ->poll() in the
interrupt handler before the napi_disable() could get in there and
grab the NAPI_STATE_SCHED bit.
The vast majority of drivers don't care if napi_disable() might have
the side effect of disabling NAPI ->poll() event interrupts. In all
such cases, when a napi_disable() is performed, the driver just
disabled interrupts or is about to.
However there were three exceptions to this in PCNET32, R8169, and
SKY2. To fix those cases, at the subsequent napi_enable() points, I
added code to ensure that the ->poll() interrupt events are enabled in
the hardware.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/pcnet32.c | 5 +++++
drivers/net/r8169.c | 2 ++
drivers/net/sky2.c | 3 +++
3 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c
index ff92aca..90498ff 100644
--- a/drivers/net/pcnet32.c
+++ b/drivers/net/pcnet32.c
@@ -455,9 +455,14 @@ static void pcnet32_netif_start(struct net_device *dev)
{
#ifdef CONFIG_PCNET32_NAPI
struct pcnet32_private *lp = netdev_priv(dev);
+ ulong ioaddr = dev->base_addr;
+ u16 val;
#endif
netif_wake_queue(dev);
#ifdef CONFIG_PCNET32_NAPI
+ val = lp->a.read_csr(ioaddr, CSR3);
+ val &= 0x00ff;
+ lp->a.write_csr(ioaddr, CSR3, val);
napi_enable(&lp->napi);
#endif
}
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 5863190..af80309 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2398,6 +2398,8 @@ static void rtl8169_wait_for_quiescence(struct net_device *dev)
rtl8169_irq_mask_and_ack(ioaddr);
#ifdef CONFIG_R8169_NAPI
+ tp->intr_mask = 0xffff;
+ RTL_W16(IntrMask, tp->intr_event);
napi_enable(&tp->napi);
#endif
}
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index a74fc11..52ec89b 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1168,6 +1168,7 @@ static void sky2_vlan_rx_register(struct net_device *dev, struct vlan_group *grp
TX_VLAN_TAG_OFF);
}
+ sky2_read32(hw, B0_Y2_SP_LISR);
napi_enable(&hw->napi);
netif_tx_unlock_bh(dev);
}
@@ -2043,6 +2044,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu)
err = sky2_rx_start(sky2);
sky2_write32(hw, B0_IMSK, imask);
+ sky2_read32(hw, B0_Y2_SP_LISR);
napi_enable(&hw->napi);
if (err)
@@ -3861,6 +3863,7 @@ static int sky2_debug_show(struct seq_file *seq, void *v)
last = sky2_read16(hw, Y2_QADDR(rxqaddr[port], PREF_UNIT_PUT_IDX)),
sky2_read16(hw, Y2_QADDR(rxqaddr[port], PREF_UNIT_LAST_IDX)));
+ sky2_read32(hw, B0_Y2_SP_LISR);
napi_enable(&hw->napi);
return 0;
}
--
1.5.4.rc2.38.gd6da3
^ permalink raw reply related
* [PATCH 6/7]:
From: David Miller @ 2008-01-08 5:40 UTC (permalink / raw)
To: netdev
[NET]: Stop polling when napi_disable() is pending.
This finally adds the code in net_rx_action() to break out of the
->poll()'ing loop when a napi_disable() is found to be pending.
Now, even if a device is being flooded with packets it can be cleanly
brought down.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/core/dev.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index be9d301..0879f52 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2207,8 +2207,12 @@ static void net_rx_action(struct softirq_action *h)
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
- if (unlikely(work == weight))
- list_move_tail(&n->poll_list, list);
+ if (unlikely(work == weight)) {
+ if (unlikely(napi_disable_pending(n)))
+ __napi_complete(n);
+ else
+ list_move_tail(&n->poll_list, list);
+ }
netpoll_poll_unlock(have);
}
--
1.5.4.rc2.38.gd6da3
^ permalink raw reply related
* [PATCH 7/7]: [NET]: Make ->poll() breakout consistent in Intel ethernet drivers.
From: David Miller @ 2008-01-08 5:41 UTC (permalink / raw)
To: netdev
[NET]: Make ->poll() breakout consistent in Intel ethernet drivers.
This makes the ->poll() routines of the E100, E1000, E1000E, IXGB, and
IXGBE drivers complete ->poll() consistently.
Now they will all break out when the amount of RX work done is less
than 'budget'.
At a later time, we may want put back code to include the TX work as
well (as at least one other NAPI driver does, but by in large NAPI
drivers do not do this). But if so, it should be done consistently
across the board to all of these drivers.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/e100.c | 7 +++----
drivers/net/e1000/e1000_main.c | 10 +++++-----
drivers/net/e1000e/netdev.c | 8 ++++----
drivers/net/ixgb/ixgb_main.c | 7 +++----
drivers/net/ixgbe/ixgbe_main.c | 8 ++++----
5 files changed, 19 insertions(+), 21 deletions(-)
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 68316f1..b87402b 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -1991,13 +1991,12 @@ static int e100_poll(struct napi_struct *napi, int budget)
struct nic *nic = container_of(napi, struct nic, napi);
struct net_device *netdev = nic->netdev;
unsigned int work_done = 0;
- int tx_cleaned;
e100_rx_clean(nic, &work_done, budget);
- tx_cleaned = e100_tx_clean(nic);
+ e100_tx_clean(nic);
- /* If no Rx and Tx cleanup work was done, exit polling mode. */
- if((!tx_cleaned && (work_done == 0))) {
+ /* If budget not fully consumed, exit the polling mode */
+ if (work_done < budget) {
netif_rx_complete(netdev, napi);
e100_enable_irq(nic);
}
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 9de7144..13d57b0 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -3919,7 +3919,7 @@ e1000_clean(struct napi_struct *napi, int budget)
{
struct e1000_adapter *adapter = container_of(napi, struct e1000_adapter, napi);
struct net_device *poll_dev = adapter->netdev;
- int tx_cleaned = 0, work_done = 0;
+ int work_done = 0;
/* Must NOT use netdev_priv macro here. */
adapter = poll_dev->priv;
@@ -3929,16 +3929,16 @@ e1000_clean(struct napi_struct *napi, int budget)
* simultaneously. A failure obtaining the lock means
* tx_ring[0] is currently being cleaned anyway. */
if (spin_trylock(&adapter->tx_queue_lock)) {
- tx_cleaned = e1000_clean_tx_irq(adapter,
- &adapter->tx_ring[0]);
+ e1000_clean_tx_irq(adapter,
+ &adapter->tx_ring[0]);
spin_unlock(&adapter->tx_queue_lock);
}
adapter->clean_rx(adapter, &adapter->rx_ring[0],
&work_done, budget);
- /* If no Tx and not enough Rx work done, exit the polling mode */
- if ((!tx_cleaned && (work_done == 0))) {
+ /* If budget not fully consumed, exit the polling mode */
+ if (work_done < budget) {
if (likely(adapter->itr_setting & 3))
e1000_set_itr(adapter);
netif_rx_complete(poll_dev, napi);
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index dd9698c..4a6fc74 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -1384,7 +1384,7 @@ static int e1000_clean(struct napi_struct *napi, int budget)
{
struct e1000_adapter *adapter = container_of(napi, struct e1000_adapter, napi);
struct net_device *poll_dev = adapter->netdev;
- int tx_cleaned = 0, work_done = 0;
+ int work_done = 0;
/* Must NOT use netdev_priv macro here. */
adapter = poll_dev->priv;
@@ -1394,14 +1394,14 @@ static int e1000_clean(struct napi_struct *napi, int budget)
* simultaneously. A failure obtaining the lock means
* tx_ring is currently being cleaned anyway. */
if (spin_trylock(&adapter->tx_queue_lock)) {
- tx_cleaned = e1000_clean_tx_irq(adapter);
+ e1000_clean_tx_irq(adapter);
spin_unlock(&adapter->tx_queue_lock);
}
adapter->clean_rx(adapter, &work_done, budget);
- /* If no Tx and not enough Rx work done, exit the polling mode */
- if ((!tx_cleaned && (work_done < budget))) {
+ /* If budget not fully consumed, exit the polling mode */
+ if (work_done < budget) {
if (adapter->itr_setting & 3)
e1000_set_itr(adapter);
netif_rx_complete(poll_dev, napi);
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
index a8bef52..d2fb88d 100644
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1787,14 +1787,13 @@ ixgb_clean(struct napi_struct *napi, int budget)
{
struct ixgb_adapter *adapter = container_of(napi, struct ixgb_adapter, napi);
struct net_device *netdev = adapter->netdev;
- int tx_cleaned;
int work_done = 0;
- tx_cleaned = ixgb_clean_tx_irq(adapter);
+ ixgb_clean_tx_irq(adapter);
ixgb_clean_rx_irq(adapter, &work_done, budget);
- /* if no Tx and not enough Rx work done, exit the polling mode */
- if((!tx_cleaned && (work_done == 0))) {
+ /* If budget not fully consumed, exit the polling mode */
+ if (work_done < budget) {
netif_rx_complete(netdev, napi);
ixgb_irq_enable(adapter);
}
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 7c31930..a564916 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1468,15 +1468,15 @@ static int ixgbe_clean(struct napi_struct *napi, int budget)
struct ixgbe_adapter *adapter = container_of(napi,
struct ixgbe_adapter, napi);
struct net_device *netdev = adapter->netdev;
- int tx_cleaned = 0, work_done = 0;
+ int work_done = 0;
/* In non-MSIX case, there is no multi-Tx/Rx queue */
- tx_cleaned = ixgbe_clean_tx_irq(adapter, adapter->tx_ring);
+ ixgbe_clean_tx_irq(adapter, adapter->tx_ring);
ixgbe_clean_rx_irq(adapter, &adapter->rx_ring[0], &work_done,
budget);
- /* If no Tx and not enough Rx work done, exit the polling mode */
- if ((!tx_cleaned && (work_done < budget))) {
+ /* If budget not fully consumed, exit the polling mode */
+ if (work_done < budget) {
netif_rx_complete(netdev, napi);
ixgbe_irq_enable(adapter);
}
--
1.5.4.rc2.38.gd6da3
^ permalink raw reply related
* Re: [PATCH][XFRM] Statistics: Add outbound-dropping error.
From: David Miller @ 2008-01-08 5:46 UTC (permalink / raw)
To: nakam; +Cc: herbert, netdev
In-Reply-To: <11997665683729-git-send-email-nakam@linux-ipv6.org>
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Tue, 8 Jan 2008 13:29:28 +0900
> [PATCH][XFRM] Statistics: Add outbound-dropping error.
>
> o Increment PolError counter when flow_cache_lookup() returns
> errored pointer.
>
> o Increment NoStates counter at larval-drop.
>
> Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Applied, thanks.
^ permalink raw reply
* Re: [PATCH] intel ethernet drivers: update MAINTAINERS
From: David Miller @ 2008-01-08 5:47 UTC (permalink / raw)
To: auke-jan.h.kok; +Cc: jeff, netdev
In-Reply-To: <20080108005414.31671.37938.stgit@localhost.localdomain>
From: Auke Kok <auke-jan.h.kok@intel.com>
Date: Mon, 07 Jan 2008 16:54:14 -0800
> Unfortunately Jeb decided to move away from our group. We wish
> Jeb good luck with his new group!
>
> Reordered people a bit so most active team members are on top.
>
> Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com>
Applied, thanks.
^ permalink raw reply
* Re: [PATCH] [NET] kaweth was forgotten in msec switchover of usb_start_wait_urb
From: David Miller @ 2008-01-08 5:48 UTC (permalink / raw)
To: Russ.Dill, russd; +Cc: zapman, stephane, bhards, oliver, netdev
In-Reply-To: <134b1d6c0801071520h9932537ke0b5daeaabab289b@mail.gmail.com>
From: "Russell Dill" <russd@asu.edu>
Date: Mon, 7 Jan 2008 16:20:20 -0700
> Back in 2.6.12-pre, usb_start_wait_urb was switched over to take
> milliseconds instead of jiffies. kaweth.c was never updated to match.
>
> Signed-off-by: Russ Dill <Russ.Dill@asu.edu>
Applied, thanks.
^ permalink raw reply
* Re: [RFC PATCH v2 1/2] NET: Clone the sk_buff 'iif' field in __skb_clone()
From: David Miller @ 2008-01-08 5:48 UTC (permalink / raw)
To: jmorris; +Cc: paul.moore, netdev
In-Reply-To: <Xine.LNX.4.64.0801080810010.15645@us.intercode.com.au>
From: James Morris <jmorris@namei.org>
Date: Tue, 8 Jan 2008 08:11:11 +1100 (EST)
> On Mon, 7 Jan 2008, Paul Moore wrote:
>
> > Both NetLabel and SELinux (other LSMs may grow to use it as well) rely on the
> > 'iif' field to determine the receiving network interface of inbound packets.
> > Unfortunately, at present this field is not preserved across a skb clone
> > operation which can lead to garbage values if the cloned skb is sent back
> > through the network stack. This patch corrects this problem by properly
> > copying the 'iif' field in __skb_clone() and removing the 'iif' field
> > assignment from skb_act_clone() since it is no longer needed.
> >
> > Also, while we are here, put the assignments in the same order as the offsets
> > to reduce cacheline bounces.
> >
> > Signed-off-by: Paul Moore <paul.moore@hp.com>
>
> Dave, perhaps this one should pushed to Linus now as a bugfix?
Probably we should, yes.
Ok, that's what I'll do.
^ permalink raw reply
* Please pull 'fixes-jgarzik' branch of wireless-2.6
From: John W. Linville @ 2008-01-08 5:15 UTC (permalink / raw)
To: jeff; +Cc: netdev, linux-wireless
Jeff,
One more fix for 2.6.24. Without this b43 passes the wrong channel
number and frequency to mac80211 for received frames. The patch
is a little big, but it is mostly some definitions related to other
definitions used in the fix itself.
Let me know if this is a problem!
Thanks,
John
---
Individual patches are available here:
http://www.kernel.org/pub/linux/kernel/people/linville/wireless-2.6/fixes-jgarzik
---
The following changes since commit 3ce54450461bad18bbe1f9f5aa3ecd2f8e8d1235:
Linus Torvalds (1):
Linux 2.6.24-rc7
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git fixes-jgarzik
Michael Buesch (1):
b43: Fix rxheader channel parsing
drivers/net/wireless/b43/b43.h | 2 +
drivers/net/wireless/b43/main.h | 20 ++---------
drivers/net/wireless/b43/xmit.c | 27 +++++++++++-----
drivers/net/wireless/b43/xmit.h | 65 +++++++++++++++++++++-----------------
4 files changed, 61 insertions(+), 53 deletions(-)
diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h
index a28ad23..7b6fc1a 100644
--- a/drivers/net/wireless/b43/b43.h
+++ b/drivers/net/wireless/b43/b43.h
@@ -273,6 +273,8 @@ enum {
#define B43_PHYTYPE_A 0x00
#define B43_PHYTYPE_B 0x01
#define B43_PHYTYPE_G 0x02
+#define B43_PHYTYPE_N 0x04
+#define B43_PHYTYPE_LP 0x05
/* PHYRegisters */
#define B43_PHY_ILT_A_CTRL 0x0072
diff --git a/drivers/net/wireless/b43/main.h b/drivers/net/wireless/b43/main.h
index 284d17d..08e2e56 100644
--- a/drivers/net/wireless/b43/main.h
+++ b/drivers/net/wireless/b43/main.h
@@ -39,11 +39,11 @@
#define PAD_BYTES(nr_bytes) P4D_BYTES( __LINE__ , (nr_bytes))
/* Lightweight function to convert a frequency (in Mhz) to a channel number. */
-static inline u8 b43_freq_to_channel_a(int freq)
+static inline u8 b43_freq_to_channel_5ghz(int freq)
{
return ((freq - 5000) / 5);
}
-static inline u8 b43_freq_to_channel_bg(int freq)
+static inline u8 b43_freq_to_channel_2ghz(int freq)
{
u8 channel;
@@ -54,19 +54,13 @@ static inline u8 b43_freq_to_channel_bg(int freq)
return channel;
}
-static inline u8 b43_freq_to_channel(struct b43_wldev *dev, int freq)
-{
- if (dev->phy.type == B43_PHYTYPE_A)
- return b43_freq_to_channel_a(freq);
- return b43_freq_to_channel_bg(freq);
-}
/* Lightweight function to convert a channel number to a frequency (in Mhz). */
-static inline int b43_channel_to_freq_a(u8 channel)
+static inline int b43_channel_to_freq_5ghz(u8 channel)
{
return (5000 + (5 * channel));
}
-static inline int b43_channel_to_freq_bg(u8 channel)
+static inline int b43_channel_to_freq_2ghz(u8 channel)
{
int freq;
@@ -77,12 +71,6 @@ static inline int b43_channel_to_freq_bg(u8 channel)
return freq;
}
-static inline int b43_channel_to_freq(struct b43_wldev *dev, u8 channel)
-{
- if (dev->phy.type == B43_PHYTYPE_A)
- return b43_channel_to_freq_a(channel);
- return b43_channel_to_freq_bg(channel);
-}
static inline int b43_is_cck_rate(int rate)
{
diff --git a/drivers/net/wireless/b43/xmit.c b/drivers/net/wireless/b43/xmit.c
index 0bd6f8a..3307ba1 100644
--- a/drivers/net/wireless/b43/xmit.c
+++ b/drivers/net/wireless/b43/xmit.c
@@ -531,21 +531,32 @@ void b43_rx(struct b43_wldev *dev, struct sk_buff *skb, const void *_rxhdr)
switch (chanstat & B43_RX_CHAN_PHYTYPE) {
case B43_PHYTYPE_A:
status.phymode = MODE_IEEE80211A;
- status.freq = chanid;
- status.channel = b43_freq_to_channel_a(chanid);
- break;
- case B43_PHYTYPE_B:
- status.phymode = MODE_IEEE80211B;
- status.freq = chanid + 2400;
- status.channel = b43_freq_to_channel_bg(chanid + 2400);
+ B43_WARN_ON(1);
+ /* FIXME: We don't really know which value the "chanid" contains.
+ * So the following assignment might be wrong. */
+ status.channel = chanid;
+ status.freq = b43_channel_to_freq_5ghz(status.channel);
break;
case B43_PHYTYPE_G:
status.phymode = MODE_IEEE80211G;
+ /* chanid is the radio channel cookie value as used
+ * to tune the radio. */
status.freq = chanid + 2400;
- status.channel = b43_freq_to_channel_bg(chanid + 2400);
+ status.channel = b43_freq_to_channel_2ghz(status.freq);
+ break;
+ case B43_PHYTYPE_N:
+ status.phymode = 0xDEAD /*FIXME MODE_IEEE80211N*/;
+ /* chanid is the SHM channel cookie. Which is the plain
+ * channel number in b43. */
+ status.channel = chanid;
+ if (chanstat & B43_RX_CHAN_5GHZ)
+ status.freq = b43_freq_to_channel_5ghz(status.freq);
+ else
+ status.freq = b43_freq_to_channel_2ghz(status.freq);
break;
default:
B43_WARN_ON(1);
+ goto drop;
}
dev->stats.last_rx = jiffies;
diff --git a/drivers/net/wireless/b43/xmit.h b/drivers/net/wireless/b43/xmit.h
index 03bddd2..6dc0793 100644
--- a/drivers/net/wireless/b43/xmit.h
+++ b/drivers/net/wireless/b43/xmit.h
@@ -142,49 +142,56 @@ struct b43_rxhdr_fw4 {
} __attribute__ ((__packed__));
/* PHY RX Status 0 */
-#define B43_RX_PHYST0_GAINCTL 0x4000 /* Gain Control */
-#define B43_RX_PHYST0_PLCPHCF 0x0200
-#define B43_RX_PHYST0_PLCPFV 0x0100
-#define B43_RX_PHYST0_SHORTPRMBL 0x0080 /* Received with Short Preamble */
+#define B43_RX_PHYST0_GAINCTL 0x4000 /* Gain Control */
+#define B43_RX_PHYST0_PLCPHCF 0x0200
+#define B43_RX_PHYST0_PLCPFV 0x0100
+#define B43_RX_PHYST0_SHORTPRMBL 0x0080 /* Received with Short Preamble */
#define B43_RX_PHYST0_LCRS 0x0040
-#define B43_RX_PHYST0_ANT 0x0020 /* Antenna */
-#define B43_RX_PHYST0_UNSRATE 0x0010
+#define B43_RX_PHYST0_ANT 0x0020 /* Antenna */
+#define B43_RX_PHYST0_UNSRATE 0x0010
#define B43_RX_PHYST0_CLIP 0x000C
#define B43_RX_PHYST0_CLIP_SHIFT 2
-#define B43_RX_PHYST0_FTYPE 0x0003 /* Frame type */
-#define B43_RX_PHYST0_CCK 0x0000 /* Frame type: CCK */
-#define B43_RX_PHYST0_OFDM 0x0001 /* Frame type: OFDM */
-#define B43_RX_PHYST0_PRE_N 0x0002 /* Pre-standard N-PHY frame */
-#define B43_RX_PHYST0_STD_N 0x0003 /* Standard N-PHY frame */
+#define B43_RX_PHYST0_FTYPE 0x0003 /* Frame type */
+#define B43_RX_PHYST0_CCK 0x0000 /* Frame type: CCK */
+#define B43_RX_PHYST0_OFDM 0x0001 /* Frame type: OFDM */
+#define B43_RX_PHYST0_PRE_N 0x0002 /* Pre-standard N-PHY frame */
+#define B43_RX_PHYST0_STD_N 0x0003 /* Standard N-PHY frame */
/* PHY RX Status 2 */
-#define B43_RX_PHYST2_LNAG 0xC000 /* LNA Gain */
+#define B43_RX_PHYST2_LNAG 0xC000 /* LNA Gain */
#define B43_RX_PHYST2_LNAG_SHIFT 14
-#define B43_RX_PHYST2_PNAG 0x3C00 /* PNA Gain */
+#define B43_RX_PHYST2_PNAG 0x3C00 /* PNA Gain */
#define B43_RX_PHYST2_PNAG_SHIFT 10
-#define B43_RX_PHYST2_FOFF 0x03FF /* F offset */
+#define B43_RX_PHYST2_FOFF 0x03FF /* F offset */
/* PHY RX Status 3 */
-#define B43_RX_PHYST3_DIGG 0x1800 /* DIG Gain */
+#define B43_RX_PHYST3_DIGG 0x1800 /* DIG Gain */
#define B43_RX_PHYST3_DIGG_SHIFT 11
-#define B43_RX_PHYST3_TRSTATE 0x0400 /* TR state */
+#define B43_RX_PHYST3_TRSTATE 0x0400 /* TR state */
/* MAC RX Status */
-#define B43_RX_MAC_BEACONSENT 0x00008000 /* Beacon send flag */
-#define B43_RX_MAC_KEYIDX 0x000007E0 /* Key index */
-#define B43_RX_MAC_KEYIDX_SHIFT 5
-#define B43_RX_MAC_DECERR 0x00000010 /* Decrypt error */
-#define B43_RX_MAC_DEC 0x00000008 /* Decryption attempted */
-#define B43_RX_MAC_PADDING 0x00000004 /* Pad bytes present */
-#define B43_RX_MAC_RESP 0x00000002 /* Response frame transmitted */
-#define B43_RX_MAC_FCSERR 0x00000001 /* FCS error */
+#define B43_RX_MAC_RXST_VALID 0x01000000 /* PHY RXST valid */
+#define B43_RX_MAC_TKIP_MICERR 0x00100000 /* TKIP MIC error */
+#define B43_RX_MAC_TKIP_MICATT 0x00080000 /* TKIP MIC attempted */
+#define B43_RX_MAC_AGGTYPE 0x00060000 /* Aggregation type */
+#define B43_RX_MAC_AGGTYPE_SHIFT 17
+#define B43_RX_MAC_AMSDU 0x00010000 /* A-MSDU mask */
+#define B43_RX_MAC_BEACONSENT 0x00008000 /* Beacon sent flag */
+#define B43_RX_MAC_KEYIDX 0x000007E0 /* Key index */
+#define B43_RX_MAC_KEYIDX_SHIFT 5
+#define B43_RX_MAC_DECERR 0x00000010 /* Decrypt error */
+#define B43_RX_MAC_DEC 0x00000008 /* Decryption attempted */
+#define B43_RX_MAC_PADDING 0x00000004 /* Pad bytes present */
+#define B43_RX_MAC_RESP 0x00000002 /* Response frame transmitted */
+#define B43_RX_MAC_FCSERR 0x00000001 /* FCS error */
/* RX channel */
-#define B43_RX_CHAN_GAIN 0xFC00 /* Gain */
-#define B43_RX_CHAN_GAIN_SHIFT 10
-#define B43_RX_CHAN_ID 0x03FC /* Channel ID */
-#define B43_RX_CHAN_ID_SHIFT 2
-#define B43_RX_CHAN_PHYTYPE 0x0003 /* PHY type */
+#define B43_RX_CHAN_40MHZ 0x1000 /* 40 Mhz channel width */
+#define B43_RX_CHAN_5GHZ 0x0800 /* 5 Ghz band */
+#define B43_RX_CHAN_ID 0x07F8 /* Channel ID */
+#define B43_RX_CHAN_ID_SHIFT 3
+#define B43_RX_CHAN_PHYTYPE 0x0007 /* PHY type */
+
u8 b43_plcp_get_ratecode_cck(const u8 bitrate);
u8 b43_plcp_get_ratecode_ofdm(const u8 bitrate);
--
John W. Linville
linville@tuxdriver.com
^ permalink raw reply related
* Re: [IPV4] ROUTE: ip_rt_dump() is unecessary slow
From: David Miller @ 2008-01-08 5:52 UTC (permalink / raw)
To: dada1; +Cc: netdev
In-Reply-To: <20080107193002.df137d57.dada1@cosmosbay.com>
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 7 Jan 2008 19:30:02 +0100
> I noticed "ip route list cache x.y.z.t" can be *very* slow.
>
> While strace-ing -T it I also noticed that first part of route cache is
> fetched quite fast :
...
> The following patch corrects this performance/latency problem, removing quadratic behavior.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
I think removing quadratic behavior is a bug fix, thus applied
to net-2.6, thanks Eric :-)
^ permalink raw reply
* Re: [PATCH] AX25: nuke user trigable printks
From: David Miller @ 2008-01-08 5:55 UTC (permalink / raw)
To: max; +Cc: netdev
In-Reply-To: <1199728513-25258-1-git-send-email-max@stro.at>
From: maximilian attems <max@stro.at>
Date: Mon, 7 Jan 2008 18:55:13 +0100
> sfuzz trigerrs any of those printk easily.
> things that should have gone in early 2.5.x aka years ago
> should not be thrown so easily to the user.
>
> Signed-off-by: maximilian attems <max@stro.at>
Can you replace the printk()'s with comments at least?
Otherwise nobody is going to remember what we were trying
to accomplish here and it will guarentee that, in fact,
support for these old things will never get removed.
At least if comments are there, someone might think to add
these things to the feature removal schedule and follow
through with it.
Thanks.
^ permalink raw reply
* Re: Top 10 kernel oopses for the week ending January 5th, 2008
From: Al Viro @ 2008-01-08 5:59 UTC (permalink / raw)
To: Linus Torvalds
Cc: Kevin Winchester, J. Bruce Fields, Arjan van de Ven,
Linux Kernel Mailing List, Andrew Morton, NetDev, gregkh, neilb
In-Reply-To: <alpine.LFD.1.00.0801071851120.3148@woody.linux-foundation.org>
On Mon, Jan 07, 2008 at 07:26:12PM -0800, Linus Torvalds wrote:
> I usually just compile a small program like
>
> const char array[]="\xnn\xnn\xnn...";
>
> int main(int argc, char **argv)
> {
> printf("%p\n", array);
> *(int *)0=0;
> }
Heh. I prefer
char main[] = {.....};
for the same thing, with gdb a.out and no running at all.
FWIW, I'm going to go through Arjan's collection and post blow-by-blow
analysis of some of those suckers. Tonight, probably...
Let's take e.g. http://www.kerneloops.org/raw.php?rawid=2618
RIP: 0010:[<ffffffff803b49a1>] [<ffffffff803b49a1>] kref_put+0x31/0x80
RSP: 0000:ffff81007ffe5df0 EFLAGS: 00010202
RAX: 0000000000000000 RBX: 0000000034333545 RCX: ffffffff80606270
RDX: 0000000000000040 RSI: ffffffff803b38b0 RDI: 0000000034333545
RBP: ffff81007ffe5e00 R08: 0000000000000001 R09: 0000000000000000
R10: ffffffff8094c430 R11: 0000000000000000 R12: ffffffff803b38b0
R13: ffff81011ede44d8 R14: ffffffff804d7d50 R15: ffff81011ff210f0
FS: 0000000002024870(0000) GS:ffff81011ff0dd00(0000)
...
Call Trace:
[<ffffffff803b37e9>] kobject_put+0x19/0x20
[<ffffffff803b389b>] kobject_del+0x2b/0x40
[<ffffffff804d7d50>] delayed_delete+0x0/0xb0
[<ffffffff804d7db9>] delayed_delete+0x69/0xb0
[<ffffffff80249775>] run_workqueue+0x175/0x210
[<ffffffff8024a411>] worker_thread+0x71/0xb0
[<ffffffff8024d9e0>] autoremove_wake_function+0x0/0x40
[<ffffffff8024a3a0>] worker_thread+0x0/0xb0
[<ffffffff8024d5fd>] kthread+0x4d/0x80
[<ffffffff8020c4b8>] child_rip+0xa/0x12
[<ffffffff8020bbcf>] restore_args+0x0/0x30
[<ffffffff8024d5b0>] kthread+0x0/0x80
[<ffffffff8020c4ae>] child_rip+0x0/0x12
Code: f0 ff 0b 0f 94 c0 31 d2 84 c0 74 0b 48 89 df 41 ff d4 ba 01
What do we have here? It barfs in kref_put() called from kobject_put().
It's -rc6-mm1 and I don't have -mm at hand. Let's see if we can make
any sense out of it from the mainline - it might be a good idea for the
first pass, unless there are some clear indications to the contrary.
kref_put() is fairly low-level (and deep in call chain, here). And it's
pretty small:
int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
WARN_ON(release == NULL);
WARN_ON(release == (void (*)(struct kref *))kfree);
if (atomic_dec_and_test(&kref->refcount)) {
release(kref);
return 1;
}
return 0;
}
Poking around on the site (I'm not familiar enough with it, so bear with
me) gives a link to posting and an important detail missed in that page:
<1>Unable to handle kernel paging request at 0000000034333545 RIP:
[<ffffffff803b49a1>] kref_put+0x31/0x80
Now, that's interesting - we barf on dereferencing a pointer that (a) has
upper 32 bits zero and (b) doesn't have a bit 7 set in any byte. Smells
like ASCII data misinterpreted as a pointer. Conversion to ASCII gives
"E534", which sure as hell does look like a fragment of some string.
OK, so where does that happen? In this case we have only one candidate,
really - atomic_dec_and_test(&kref->refcount). Before it we only compare
argument with constants, after it we pass argument to another function.
Now, looking at the registers (see above) we notice that this address had
come from rbx. Let's try objdump -d lib/kref.o and see what we've got there
for kref_put():
4a: 55 push %rbp
4b: 48 85 f6 test %rsi,%rsi
4e: 48 c7 c1 00 00 00 00 mov $0x0,%rcx
55: ba 36 00 00 00 mov $0x36,%edx
5a: 48 89 e5 mov %rsp,%rbp
5d: 41 54 push %r12
5f: 49 89 f4 mov %rsi,%r12
62: 53 push %rbx
63: 48 89 fb mov %rdi,%rbx
66: 74 15 je 7d <kref_put+0x33>
68: 48 81 fe 00 00 00 00 cmp $0x0,%rsi
6f: 75 26 jne 97 <kref_put+0x4d>
71: 48 c7 c1 00 00 00 00 mov $0x0,%rcx
78: ba 37 00 00 00 mov $0x37,%edx
7d: 48 c7 c6 00 00 00 00 mov $0x0,%rsi
84: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
8b: 31 c0 xor %eax,%eax
8d: e8 00 00 00 00 callq 92 <kref_put+0x48>
92: e8 00 00 00 00 callq 97 <kref_put+0x4d>
97: f0 ff 0b lock decl (%rbx)
9a: 0f 94 c0 sete %al
9d: 31 d2 xor %edx,%edx
9f: 84 c0 test %al,%al
a1: 74 0b je ae <kref_put+0x64>
a3: 48 89 df mov %rbx,%rdi
a6: 41 ff d4 callq *%r12
a9: ba 01 00 00 00 mov $0x1,%edx
ae: 5b pop %rbx
af: 41 5c pop %r12
b1: c9 leaveq
b2: 89 d0 mov %edx,%eax
b4: c3 retq
It's not necessary identical to what that kernel had; still, not bad for
a starting point. We are even lucky enough to find 'f0 ff' immediately
in there:
97: f0 ff 0b lock decl (%rbx)
Next instructions also match - actually, better than I expected. So.
We even have register allocation matching the reported kernel and it
all makes sense - this is the instruction where it had puked, the address
had, indeed, come from rbx and it's locked decrement of *rbx followed
by some testing and conditional jump. Just what one would expect
from atomic_dec_and_test().
IOW, we have &kref->refcount equal to 0x0000000034333545. What's the
value of kref itself? We could look into definition of struct kref,
or just notice that release(kref) should be right after that, so
we could see what gets passed to it.
a3: 48 89 df mov %rbx,%rdi
a6: 41 ff d4 callq *%r12
is clearly it. So what we are passing is rbx itself, so that offset got to
be zero.
All right, so we have kref_put() getting 0x0000000034333545 instead of a
pointer in the first argument. It's called from kobject_put() and unless
-mm has changes in there, there's no need to guess where in kobject_put()
that had been:
void kobject_put(struct kobject * kobj)
{
if (kobj)
kref_put(&kobj->kref, kobject_release);
}
Aha. Now, we need to work back from &kobj->kref to kobj. Again, assuming
that -mm doesn't change struct kobject in a way that would affect the
offset, we have
struct kobject {
const char * k_name;
struct kref kref;
struct list_head entry;
...
Looks like it's not too likely, unless somebody had been deliberately
rearranging fields (to fit cachelines better, etc.). We'll need to
verify that, of course, but for now it's a good starting assumption.
Very well, we have one pointer in front of it. It's an amd64, so
it's an 8 byte field and no alignment padding is to be expected.
IOW, kobj is 0x0000000034333545 - 8, i.e. 0x000000003433353D. What's _that_
in ASCII? "=534". OK, that makes even more sense for a part of some
string...
Let's check; time to take a look at that patch, after all
struct kobject {
- const char * k_name;
+ const char *name;
struct kref kref;
OK, not a problem. While we are at it, kobject_put() is unchanged by
the patch. Now, back to trace. kobject_put() is called from kobject_del().
Now, that one does differ from mainline:
void kobject_del(struct kobject * kobj)
{
if (!kobj)
return;
sysfs_remove_dir(kobj);
kobj->state_in_sysfs = 0;
kobj_kset_leave(kobj);
kobject_put(kobj->parent);
kobj->parent = NULL;
}
Humm... So we have kobj->parent containing crap. What about the caller?
It's from drivers/md/md.c:
static void delayed_delete(struct work_struct *ws)
{
mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
kobject_del(&rdev->kobj);
}
and it's only used in
static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
char b[BDEVNAME_SIZE];
if (!rdev->mddev) {
MD_BUG();
return;
}
bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
list_del_init(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state"
*/
INIT_WORK(&rdev->del_work, delayed_delete);
schedule_work(&rdev->del_work);
}
Well, that takes care of the rest of trace - we have used INIT_WORK to set
rdev->del_work up, scheduled it for execution and eventually the callback
had been called (asynchronously to us).
So what do we have so far?
unbind_rdev_from_array(rdev);
had been called and rdev->kobj.parent turned to contain a crap value
(0x000000003433353D) instead of a pointer. That's about all we can
get out of trace.
Now, let's see what could have triggered it. Curious... Looking through
diff shows an interesting bit:
- rdev->kobj.parent = &mddev->kobj;
- if ((err = kobject_add(&rdev->kobj)))
+ if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
in bind_rdev_to_array(). At the first sight the changes in kobject_add()
seem to match that. And nothing else in md.c seems to be setting it to
anything non-NULL. Very well, so it's one of the following:
* unbind_rdev_from_array() called on rdev that didn't pass through
bind_rdev_to_array().
* unbind_rdev_from_array() called on rdev that bailed out from
bind_rdev_to_array() before that point.
* mddev value in the above is crap. That's bloody unlikely, BTW -
kobject_add() would increment the refcount of rdev->kobj.parent (or we would
be in far more trouble, since it would not match kobject_del() and _that_
would hurt a lot more than just md.c). So &mddev->kobj would better be
something saner when it went through that point.
* *rdev got corrupted in between.
Actually, looking at the callers of unbind_rdev_from_array()... We always
follow it with export_rdev(). Which does (presumably) final kobject_put()
on &rdev->kobj, freeing rdev itself.
What guarantees that it doesn't happen before we get to callback? AFAICS,
nothing whatsoever...
And if it does happen, we'll get rdev happily freed (by rdev_free(), as
->release() of &rdev->kobj) by the time we get to delayed_delete(). Which
explains what's going on just fine.
BTW, -mm changes in kobject.c explain WTF it doesn't trigger in mainline -
there we managed to get away with that since kobject_add() bumped refcount
of kobject by one and kobject_del() decremented it. That masked the bug.
^ permalink raw reply
* Re: [RFC PATCH v2 1/2] NET: Clone the sk_buff 'iif' field in __skb_clone()
From: David Miller @ 2008-01-08 6:01 UTC (permalink / raw)
To: paul.moore; +Cc: netdev
In-Reply-To: <20080107174741.13488.93482.stgit@flek.americas.hpqcorp.net>
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 07 Jan 2008 12:47:42 -0500
> Both NetLabel and SELinux (other LSMs may grow to use it as well) rely on the
> 'iif' field to determine the receiving network interface of inbound packets.
> Unfortunately, at present this field is not preserved across a skb clone
> operation which can lead to garbage values if the cloned skb is sent back
> through the network stack. This patch corrects this problem by properly
> copying the 'iif' field in __skb_clone() and removing the 'iif' field
> assignment from skb_act_clone() since it is no longer needed.
>
> Also, while we are here, put the assignments in the same order as the offsets
> to reduce cacheline bounces.
>
> Signed-off-by: Paul Moore <paul.moore@hp.com>
Applied to net-2.6 and I think I'll toss this into -stable as well.
Thanks.
^ permalink raw reply
* Re: [RFC PATCH v2 2/2] SELinux: Add network ingress and egress control permission checks
From: David Miller @ 2008-01-08 6:02 UTC (permalink / raw)
To: paul.moore; +Cc: netdev
In-Reply-To: <20080107174748.13488.11389.stgit@flek.americas.hpqcorp.net>
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 07 Jan 2008 12:47:48 -0500
> This patch implements packet ingress/egress controls for SELinux which allow
> SELinux security policy to control the flow of all IPv4 and IPv6 packets into
> and out of the system. Currently SELinux does not have proper control over
> forwarded packets and this patch corrects this problem.
>
> Special thanks to Venkat Yekkirala <vyekkirala@trustedcs.com> whose earlier
> work on this topic eventually led to this patch.
>
> Signed-off-by: Paul Moore <paul.moore@hp.com>
This looks fine, and since it doesn't touch anything under net/
please feel free to merge it however you like.
^ permalink raw reply
* Re: Please pull 'fixes-jgarzik' branch of wireless-2.6
From: David Miller @ 2008-01-08 6:04 UTC (permalink / raw)
To: linville; +Cc: jeff, netdev, linux-wireless
In-Reply-To: <20080108051523.GB3125@tuxdriver.com>
From: "John W. Linville" <linville@tuxdriver.com>
Date: Tue, 8 Jan 2008 00:15:23 -0500
> git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git fixes-jgarzik
Since Jeff is busy and asked me to take in driver fixes
I'll pulled this into my net-2.6 tree.
Thanks!
^ permalink raw reply
* Re: [PATCH][LRO] Fix lro_mgr->features checks
From: David Miller @ 2008-01-08 6:09 UTC (permalink / raw)
To: Brice.Goglin; +Cc: gallatin, themann, torvalds, linux-kernel, netdev
In-Reply-To: <47822A2C.4020303@inria.fr>
From: Brice Goglin <Brice.Goglin@inria.fr>
Date: Mon, 07 Jan 2008 14:33:32 +0100
> [PATCH][LRO] Fix lro_mgr->features checks
>
> lro_mgr->features contains a bitmask of LRO_F_* values which are
> defined as power of two, not as bit indexes.
> They must be checked with x&LRO_F_FOO, not with test_bit(LRO_F_FOO,&x).
>
> Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>
> Acked-by: Andrew Gallatin <gallatin@myri.com>
Applied, thanks a lot.
^ permalink raw reply
* Re: [IPV4] ROUTE: ip_rt_dump() is unecessary slow
From: Eric Dumazet @ 2008-01-08 6:11 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20080107.215253.250616600.davem@davemloft.net>
[-- Attachment #1: Type: text/plain, Size: 821 bytes --]
David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 7 Jan 2008 19:30:02 +0100
>
>> I noticed "ip route list cache x.y.z.t" can be *very* slow.
>>
>> While strace-ing -T it I also noticed that first part of route cache is
>> fetched quite fast :
> ...
>> The following patch corrects this performance/latency problem, removing quadratic behavior.
>>
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>
> I think removing quadratic behavior is a bug fix, thus applied
> to net-2.6, thanks Eric :-)
You are the boss :)
I had another patch to submit, so I based it no net-2.6, please just say me if
you prefer a net-2/6.25 one, as it's not a critical bug fix...
Thank you
[IPV4] ROUTE: fix rcu_dereference() uses in /proc/net/rt_cache
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: route_rcu.patch --]
[-- Type: text/plain, Size: 1011 bytes --]
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d337706..3b7562f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -278,7 +278,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = rcu_dereference(rt_hash_table[st->bucket].chain);
if (r)
break;
rcu_read_unlock_bh();
@@ -288,15 +288,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
- struct rt_cache_iter_state *st = rcu_dereference(seq->private);
+ struct rt_cache_iter_state *st = seq->private;
- r = r->u.dst.rt_next;
+ r = rcu_dereference(r->u.dst.rt_next);
while (!r) {
rcu_read_unlock_bh();
if (--st->bucket < 0)
break;
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = rcu_dereference(rt_hash_table[st->bucket].chain);
}
return r;
}
^ permalink raw reply related
* Re: [Patch] xfrm_policy_destroy: rename and relative fixes
From: Herbert Xu @ 2008-01-08 6:15 UTC (permalink / raw)
To: WANG Cong; +Cc: LKML, David Miller, netdev
In-Reply-To: <20080103120550.GB2498@hacking>
On Thu, Jan 03, 2008 at 08:05:50PM +0800, WANG Cong wrote:
>
> Since __xfrm_policy_destroy is used to destory the resources
> allocated by xfrm_policy_alloc. So using the name
> __xfrm_policy_destroy is not correspond with xfrm_policy_alloc.
> Rename it to xfrm_policy_destroy.
>
> And along with some instances that call xfrm_policy_alloc
> but not using xfrm_policy_destroy to destroy the resource,
> fix them.
>
> Cc: David Miller <davem@davemloft.net>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
This patch looks OK to me.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* Re: [IPV4] ROUTE: ip_rt_dump() is unecessary slow
From: David Miller @ 2008-01-08 6:15 UTC (permalink / raw)
To: dada1; +Cc: netdev
In-Reply-To: <47831412.40606@cosmosbay.com>
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 08 Jan 2008 07:11:30 +0100
> @@ -288,15 +288,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
>
> static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
> {
> - struct rt_cache_iter_state *st = rcu_dereference(seq->private);
> + struct rt_cache_iter_state *st = seq->private;
>
Can you explain to me why this rcu_dereference() can be removed?
The rest of your patch is OK and once I understand the above
I'll add it to net-2.6, thanks!
^ permalink raw reply
* Re: [Patch] xfrm_policy_destroy: rename and relative fixes
From: David Miller @ 2008-01-08 6:34 UTC (permalink / raw)
To: herbert; +Cc: xiyou.wangcong, linux-kernel, netdev
In-Reply-To: <20080108061516.GD2529@gondor.apana.org.au>
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 8 Jan 2008 17:15:16 +1100
> On Thu, Jan 03, 2008 at 08:05:50PM +0800, WANG Cong wrote:
> >
> > Since __xfrm_policy_destroy is used to destory the resources
> > allocated by xfrm_policy_alloc. So using the name
> > __xfrm_policy_destroy is not correspond with xfrm_policy_alloc.
> > Rename it to xfrm_policy_destroy.
> >
> > And along with some instances that call xfrm_policy_alloc
> > but not using xfrm_policy_destroy to destroy the resource,
> > fix them.
> >
> > Cc: David Miller <davem@davemloft.net>
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
>
> This patch looks OK to me.
Applied, thanks everyone.
^ permalink raw reply
* Re: [PATCH net-2.6.25] [IPV4] Remove unused member of dst_entry
From: David Miller @ 2008-01-08 6:37 UTC (permalink / raw)
To: ramirose; +Cc: netdev
In-Reply-To: <eb3ff54b0801060215g55cf862ja793400f060b9e34@mail.gmail.com>
From: "Rami Rosen" <ramirose@gmail.com>
Date: Sun, 6 Jan 2008 12:15:35 +0200
> The info placeholder member of dst_entry seems to be unused in the
> network stack.
>
> Signed-off-by: Rami Rosen <ramirose@gmail.com>
Applied, thanks.
^ permalink raw reply
* Re: [IPV4] ROUTE: ip_rt_dump() is unecessary slow
From: Eric Dumazet @ 2008-01-08 6:37 UTC (permalink / raw)
To: David Miller, Dipankar Sarma; +Cc: netdev
In-Reply-To: <20080107.221537.178212902.davem@davemloft.net>
[-- Attachment #1: Type: text/plain, Size: 1581 bytes --]
David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 08 Jan 2008 07:11:30 +0100
>
>> @@ -288,15 +288,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
>>
>> static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
>> {
>> - struct rt_cache_iter_state *st = rcu_dereference(seq->private);
>> + struct rt_cache_iter_state *st = seq->private;
>>
>
> Can you explain to me why this rcu_dereference() can be removed?
Very good question, but honestly I really dont see why it was there at the
first place :
"struct seq_file" is private to this thread, so seq.private is
also private and cannot change while this thread runs rt_cache_get_next().
Reading it once (as guaranted bu rcu_dereference()) or several time if
compiler really is dumb enough wont change the result...
>
> The rest of your patch is OK and once I understand the above
> I'll add it to net-2.6, thanks!
>
Maybe we can first have an Ack from Dipankar Sarma, I can repost the patch
with a more precise ChangeLog and wait his answer ?
[NET] ROUTE: fix rcu_dereference() uses in /proc/net/rt_cache
In rt_cache_get_next(), no need to guard seq->private by a rcu_dereference()
since seq is private to the thread running this function. Reading seq.private
once (as guaranted bu rcu_dereference()) or several time if compiler really is
dumb enough wont change the result.
But we miss real spots where rcu_dereference() are needed, both in
rt_cache_get_first() and rt_cache_get_next()
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: route_rcu.patch --]
[-- Type: text/plain, Size: 1011 bytes --]
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d337706..3b7562f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -278,7 +278,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = rcu_dereference(rt_hash_table[st->bucket].chain);
if (r)
break;
rcu_read_unlock_bh();
@@ -288,15 +288,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
- struct rt_cache_iter_state *st = rcu_dereference(seq->private);
+ struct rt_cache_iter_state *st = seq->private;
- r = r->u.dst.rt_next;
+ r = rcu_dereference(r->u.dst.rt_next);
while (!r) {
rcu_read_unlock_bh();
if (--st->bucket < 0)
break;
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = rcu_dereference(rt_hash_table[st->bucket].chain);
}
return r;
}
^ permalink raw reply related
* Re: [PACKET]: Fix sparse warnings in af_packet.c
From: David Miller @ 2008-01-08 6:40 UTC (permalink / raw)
To: dada1; +Cc: netdev
In-Reply-To: <20080107120108.06453951.dada1@cosmosbay.com>
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 7 Jan 2008 12:01:08 +0100
> CHECK net/packet/af_packet.c
> net/packet/af_packet.c:1876:14: warning: context imbalance in 'packet_seq_start' - wrong count at exit
> net/packet/af_packet.c:1888:13: warning: context imbalance in 'packet_seq_stop' - unexpected unlock
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Applied.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox